In [1]:
# Download the Dataset
from sklearn.datasets import fetch_openml
data = fetch_openml(name ='credit-g', version = 1)
print(data)

# Reading the Dataset
import pandas as pd
df = pd.DataFrame(data.data, columns = data.feature_names)
df['class'] = data.target

#Does the dataset include any missing values? If so, drop them
before_drop = df.shape
print('Shape Before NA Drop: ', before_drop)
df.dropna
after_drop = df.shape
print("Shape After NA Drop: ", after_drop)

{'data':     checking_status  duration                  credit_history  \
0                <0         6  critical/other existing credit   
1          0<=X<200        48                   existing paid   
2       no checking        12  critical/other existing credit   
3                <0        42                   existing paid   
4                <0        24              delayed previously   
..              ...       ...                             ...   
995     no checking        12                   existing paid   
996              <0        30                   existing paid   
997     no checking        12                   existing paid   
998              <0        45                   existing paid   
999        0<=X<200        45  critical/other existing credit   

                 purpose  credit_amount    savings_status  employment  \
0               radio/tv           1169  no known savings         >=7   
1               radio/tv           5951              <100      1

In [2]:
# Feature Selection
# Choose the features you think are relevant to our analysis! There are A LOT of features in this dataset, but we have to make our models training time reasonable for you.
# You MUST include at least four numeric features and at least three nominal features. You can choose more if you prefer.
print(df.dtypes)
numerical = df[['duration','credit_amount', 'installment_commitment','existing_credits']]
categorical = df[['checking_status', 'credit_history', 'savings_status', 'employment', 'other_parties']]

checking_status           category
duration                     int64
credit_history            category
purpose                   category
credit_amount                int64
savings_status            category
employment                category
installment_commitment       int64
personal_status           category
other_parties             category
residence_since              int64
property_magnitude        category
age                          int64
other_payment_plans       category
housing                   category
existing_credits             int64
job                       category
num_dependents               int64
own_telephone             category
foreign_worker            category
class                     category
dtype: object


In [3]:
# Preprocessing: Perform any needed preprocessing on the chosen features, including:
# Scaling, Encoding, Dealing with NaN values
# Note: Use only the preprocessing steps you think are useful.
categorical = pd.get_dummies(categorical, dtype=int)
credit_df = pd.concat([numerical, categorical], axis = 1)
credit_df

Unnamed: 0,duration,credit_amount,installment_commitment,existing_credits,checking_status_0<=X<200,checking_status_<0,checking_status_>=200,checking_status_no checking,credit_history_all paid,credit_history_critical/other existing credit,...,savings_status_>=1000,savings_status_no known savings,employment_1<=X<4,employment_4<=X<7,employment_<1,employment_>=7,employment_unemployed,other_parties_co applicant,other_parties_guarantor,other_parties_none
0,6,1169,4,2,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
1,48,5951,2,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,12,2096,2,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,1
3,42,7882,2,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,24,4870,3,2,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
996,30,3857,4,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
997,12,804,4,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
998,45,1845,4,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [4]:
#scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
credit_df[['duration', 'credit_amount']] = scaler.fit_transform(credit_df[['duration', 'credit_amount']])
print(credit_df[['duration', 'credit_amount']])

#encoder
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
credit_df.iloc[:, 4:] = encoder.fit_transform(credit_df.iloc[:, 4:])
print(credit_df.iloc[:, 4:])

# nan values
print("Are the any NAN Values?:",df.isnull().values.any())

     duration  credit_amount
0   -1.236478      -0.745131
1    2.248194       0.949817
2   -0.738668      -0.416562
3    1.750384       1.634247
4    0.256953       0.566664
..        ...            ...
995 -0.738668      -0.544162
996  0.754763       0.207612
997 -0.738668      -0.874503
998  1.999289      -0.505528
999  1.999289       0.462457

[1000 rows x 2 columns]
     checking_status_0<=X<200  checking_status_<0  checking_status_>=200  \
0                           0                   1                      0   
1                           1                   0                      0   
2                           0                   0                      0   
3                           0                   1                      0   
4                           0                   1                      0   
..                        ...                 ...                    ...   
995                         0                   0                      0   
996                

In [5]:
# Splitting the Data: Split your data as follows:
# 80% training set
# 10% validation set
# 10% test set

from sklearn.model_selection import train_test_split
encoded_y = df['class'].values
encoded_X = credit_df.iloc[:, 0:]
X_train, X_test, y_train, y_test = train_test_split(encoded_X, encoded_y, test_size=0.1, train_size=0.8, random_state=0)
X_train, X_validate, y_train, y_validate = train_test_split(encoded_X, encoded_y, test_size=0.1,train_size=0.8, random_state=0)

Training Classifiers

In [6]:
# Use the KNN-classifier model to train your data.
# Choose the best k for the k-nearest neighbor (KNN) algorithm by trying different values and validating performance on the validation set.
# Note: choosing the best k is an example of hyper-parameter tuning.
from sklearn.neighbors import KNeighborsClassifier
k = 9
model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train, y_train)

# Classification Metrics
# Print the accuracy score of your final classifier.
y_pred = model.predict(X_test)
print(y_pred)
score = model.score(X_test,y_test)
print(score)

['bad' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good'
 'bad' 'bad' 'good' 'good' 'bad' 'good' 'good' 'good' 'bad' 'good' 'bad'
 'bad' 'good' 'good' 'good' 'bad' 'good' 'bad' 'good' 'good' 'good' 'good'
 'good' 'good' 'good' 'bad' 'good' 'good' 'good' 'good' 'bad' 'bad' 'good'
 'good' 'good' 'good' 'good' 'bad' 'good' 'good' 'good' 'good' 'good'
 'good' 'bad' 'good' 'good' 'good' 'good' 'good' 'bad' 'good' 'bad' 'good'
 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good'
 'bad' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good'
 'good' 'good' 'bad' 'good' 'good' 'good' 'good' 'good' 'good' 'good'
 'good' 'good' 'good' 'good' 'good' 'good']
0.75


In [7]:
# Print the confusion matrix.
from sklearn.metrics import confusion_matrix
conf = confusion_matrix(y_test, y_pred)
print(conf)

[[ 9 16]
 [ 9 66]]


Challenge Yourself (Optional)

In [10]:
#Choose another model (other than k-nearest neighbor (KNN)) and repeat step 6.
from sklearn.ensemble import RandomForestClassifier 
model = RandomForestClassifier(n_estimators=5, criterion='gini')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = model.score(X_test,y_test)
print(score)

0.75
