In [158]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

import timeit

In [159]:
%matplotlib inline

## Part 1: Data exploration and preprocessing

In [160]:
# Load the unprocessed Universal Bank data from CSV and preview the first five rows.
loandata = pd.read_csv(filepath_or_buffer='UniversalBank_unprocessed.csv')
loandata.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,1,39,13.0,58.0,3,2.1,Undergraduate,169,0,1,0,0
1,2,51,25.0,18.0,1,0.3,Advanced,93,0,0,1,0
2,3,43,13.0,38.0,3,2.0,Advanced,0,0,1,0,0
3,4,37,12.0,60.0,4,2.1,Advanced,217,0,1,0,0
4,5,23,,149.0,1,6.33,Undergraduate,305,0,0,1,0


In [161]:
# Print the (rows, columns) shape, then show how many records fall in each
# class of the target column 'Personal Loan'.
print(loandata.shape)
loandata.loc[:, 'Personal Loan'].value_counts()

(1117, 12)


0    637
1    480
Name: Personal Loan, dtype: int64

In [162]:
# Drop the ID column, then count the missing values remaining in each column.
# NOTE: this cell reassigns `loandata`; re-running it without reloading the
# data will fail because 'ID' is already gone.
loandata = loandata.drop(columns=["ID"])
loandata.isnull().sum()

Age              0
Experience       4
Income           3
Family           0
CCAvg            0
Education        0
Mortgage         0
CD Account       0
Online           0
CreditCard       0
Personal Loan    0
dtype: int64

In [163]:
# Frequency of each level of the categorical 'Education' column.
loandata.Education.value_counts()

Undergraduate    389
Advanced         383
Masters          345
Name: Education, dtype: int64

In [164]:
# Preview the first five rows after removing the ID column.
loandata.head(n=5)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,39,13.0,58.0,3,2.1,Undergraduate,169,0,1,0,0
1,51,25.0,18.0,1,0.3,Advanced,93,0,0,1,0
2,43,13.0,38.0,3,2.0,Advanced,0,0,1,0,0
3,37,12.0,60.0,4,2.1,Advanced,217,0,1,0,0
4,23,,149.0,1,6.33,Undergraduate,305,0,0,1,0


In [165]:
# Pairwise (Pearson) correlation between the numeric columns.
# 'Education' still holds strings at this point, so numeric_only=True is passed
# explicitly: relying on the implicit dropping of non-numeric columns emits a
# FutureWarning on pandas 1.5 and raises an error on pandas >= 2.0.
loandata.corr(numeric_only=True)


  loandata.corr()


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Personal Loan
Age,1.0,0.994236,-0.043482,-0.046942,-0.036068,0.020851,0.024537,0.034029,0.024464,-0.029408
Experience,0.994236,1.0,-0.03969,-0.05389,-0.038741,0.026466,0.025706,0.028488,0.033042,-0.031741
Income,-0.043482,-0.03969,1.0,-0.045998,0.629227,0.25096,0.257815,0.016826,-0.013544,0.721258
Family,-0.046942,-0.05389,-0.045998,1.0,-0.012103,0.029502,0.025519,0.024708,0.012454,0.083456
CCAvg,-0.036068,-0.038741,0.629227,-0.012103,1.0,0.131017,0.197364,0.011636,0.00341,0.496695
Mortgage,0.020851,0.026466,0.25096,0.029502,0.131017,1.0,0.14219,0.00424,0.037236,0.19585
CD Account,0.024537,0.025706,0.257815,0.025519,0.197364,0.14219,1.0,0.26287,0.377198,0.349109
Online,0.034029,0.028488,0.016826,0.024708,0.011636,0.00424,0.26287,1.0,0.002696,0.008225
CreditCard,0.024464,0.033042,-0.013544,0.012454,0.00341,0.037236,0.377198,0.002696,1.0,0.008144
Personal Loan,-0.029408,-0.031741,0.721258,0.083456,0.496695,0.19585,0.349109,0.008225,0.008144,1.0


In [166]:
# Age and Experience are almost perfectly correlated (about 0.99 in the
# correlation table), so only Experience is kept and Age is removed.
loandata = loandata.drop(columns=["Age"])

In [167]:
# Preview the first five rows after removing the Age column.
loandata.head(n=5)

Unnamed: 0,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard,Personal Loan
0,13.0,58.0,3,2.1,Undergraduate,169,0,1,0,0
1,25.0,18.0,1,0.3,Advanced,93,0,0,1,0
2,13.0,38.0,3,2.0,Advanced,0,0,1,0,0
3,12.0,60.0,4,2.1,Advanced,217,0,1,0,0
4,,149.0,1,6.33,Undergraduate,305,0,0,1,0


In [168]:
# Separate the predictors from the response.
# Predictors: every column up to and including 'CreditCard' (label-based slice,
# so this relies on the target being positioned after 'CreditCard').
predictors_df_loans = loandata.loc[:, :'CreditCard']
# Response: the 'Personal Loan' column.
response_loans = loandata['Personal Loan']

In [169]:
# One-hot encode the categorical predictors. drop_first=True omits the first
# level of each categorical column so the dummy columns are not redundant.
predictors_df_loans = pd.get_dummies(data=predictors_df_loans, drop_first=True)

In [170]:
# Preview the first five rows of the encoded predictors.
predictors_df_loans.head(n=5)

Unnamed: 0,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,13.0,58.0,3,2.1,169,0,1,0,0,1
1,25.0,18.0,1,0.3,93,0,0,1,0,0
2,13.0,38.0,3,2.0,0,0,1,0,0,0
3,12.0,60.0,4,2.1,217,0,1,0,0,0
4,,149.0,1,6.33,305,0,0,1,0,1


In [171]:
# Fill the missing predictor values with a k-NN imputer (5 nearest neighbours).
# The imputed values come back without pandas labels (by default), so the frame
# is rebuilt with the original column names AND the original row index; keeping
# the index guarantees the rows stay aligned with `response_loans`.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows can influence imputed training values — consider fitting the
# imputer on the training partition only.
imputer = KNNImputer(n_neighbors=5)
predictors_df_loans = pd.DataFrame(
    imputer.fit_transform(predictors_df_loans),
    columns=predictors_df_loans.columns,
    index=predictors_df_loans.index,
)
predictors_df_loans

Unnamed: 0,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,13.0,58.0,3.0,2.10,169.0,0.0,1.0,0.0,0.0,1.0
1,25.0,18.0,1.0,0.30,93.0,0.0,0.0,1.0,0.0,0.0
2,13.0,38.0,3.0,2.00,0.0,0.0,1.0,0.0,0.0,0.0
3,12.0,60.0,4.0,2.10,217.0,0.0,1.0,0.0,0.0,0.0
4,17.8,149.0,1.0,6.33,305.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1112,12.0,123.0,4.0,3.10,253.0,1.0,1.0,1.0,1.0,0.0
1113,13.0,158.0,2.0,2.30,0.0,1.0,1.0,1.0,1.0,0.0
1114,29.0,120.0,4.0,2.70,111.0,1.0,1.0,0.0,1.0,0.0
1115,0.0,179.0,4.0,2.10,0.0,0.0,0.0,0.0,1.0,0.0


## Part 2: $k$-NN

In [172]:
# Split predictors and response into train (70%) and test (30%) partitions.
# The split is stratified on the response and seeded for reproducibility.
X_classifier, y_classifier = predictors_df_loans, response_loans
(
    train_X_classifier,
    test_X_classifier,
    train_y_classifier,
    test_y_classifier,
) = train_test_split(
    X_classifier,
    y_classifier,
    test_size=0.3,
    random_state=616,
    stratify=y_classifier,
)

In [173]:
# Standardize the loan predictors (z-scores). The scaler is fitted on the
# training partition only; the same fitted scaler then transforms both the
# training and the test partitions.
# NOTE(review): rebuilding the frames without an index relabels the rows
# 0..n-1, while the y partitions presumably keep their original labels —
# avoid index-aligned pandas operations between X and y.
z_score_norm1 = preprocessing.StandardScaler().fit(train_X_classifier)
train_X_classifier = pd.DataFrame(
    data=z_score_norm1.transform(train_X_classifier),
    columns=predictors_df_loans.columns,
)
test_X_classifier = pd.DataFrame(
    data=z_score_norm1.transform(test_X_classifier),
    columns=predictors_df_loans.columns,
)
test_X_classifier

Unnamed: 0,Experience,Income,Family,CCAvg,Mortgage,CD Account,Online,CreditCard,Education_Masters,Education_Undergraduate
0,1.434331,0.718119,-0.444965,0.040487,-0.555574,-0.423979,0.817804,-0.628206,1.505916,-0.743427
1,-0.736455,-0.704029,1.283978,-1.174968,-0.555574,-0.423979,0.817804,-0.628206,-0.664048,-0.743427
2,0.566017,0.293321,-0.444965,-1.034723,-0.555574,-0.423979,-1.222787,-0.628206,-0.664048,-0.743427
3,-0.910118,-0.279232,1.283978,-1.221716,-0.555574,-0.423979,0.817804,-0.628206,1.505916,-0.743427
4,1.173837,-0.630151,0.419507,-0.240002,0.204030,2.358607,-1.222787,1.591834,-0.664048,-0.743427
...,...,...,...,...,...,...,...,...,...,...
331,-0.823287,1.567714,-0.444965,1.115698,-0.555574,-0.423979,-1.222787,-0.628206,1.505916,-0.743427
332,-1.344276,0.588833,0.419507,-1.034723,-0.555574,-0.423979,-1.222787,-0.628206,-0.664048,-0.743427
333,0.305522,1.161386,1.283978,0.274229,3.048503,2.358607,0.817804,-0.628206,1.505916,-0.743427
334,-0.475961,0.810466,-1.309436,0.648215,-0.555574,-0.423979,0.817804,-0.628206,-0.664048,1.345122


In [175]:
# Fit a 5-nearest-neighbour classifier on the training partition and report
# its F1 score on that same training data.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_X_classifier, train_y_classifier)
predicted_y_training = knn.predict(train_X_classifier)
f1_score(y_true=train_y_classifier, y_pred=predicted_y_training)

0.9408194233687405

In [177]:
# F1 score of the fitted k-NN classifier on the held-out test partition.
predicted_y_test = knn.predict(test_X_classifier)
f1_score(y_true=test_y_classifier, y_pred=predicted_y_test)

0.9052631578947369

In [182]:
# Sweep k = 1..19: refit k-NN for each k and record the F1 score on both the
# training and the test partitions.
# NOTE(review): the table below is ranked by test-set F1, so choosing k from
# it uses the test partition for model selection.
results = []
for k in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=k)
    knn2.fit(train_X_classifier, train_y_classifier)
    results.append(dict(
        k=k,
        f1_score_train=f1_score(train_y_classifier, knn2.predict(train_X_classifier)),
        f1_test_score=f1_score(test_y_classifier, knn2.predict(test_X_classifier)),
    ))

# Collect the per-k records in a DataFrame and add the train-minus-test gap.
results = pd.DataFrame(results)
results = results.assign(
    Score_diff=results['f1_score_train'] - results['f1_test_score']
)
print(results.sort_values(by=['f1_test_score'], ascending=False))

     k  f1_score_train  f1_test_score  Score_diff
12  13        0.912173       0.924188   -0.012015
10  11        0.922601       0.920290    0.002311
6    7        0.931298       0.908451    0.022847
8    9        0.924731       0.907143    0.017588
4    5        0.940819       0.905263    0.035556
14  15        0.906009       0.901818    0.004191
7    8        0.916535       0.901818    0.014717
2    3        0.955994       0.901408    0.054585
18  19        0.897196       0.901099   -0.003903
11  12        0.908805       0.900369    0.008436
13  14        0.908805       0.899628    0.009177
16  17        0.914110       0.899281    0.014830
9   10        0.916535       0.898876    0.017659
17  18        0.891339       0.892193   -0.000855
15  16        0.902821       0.888889    0.013932
5    6        0.928125       0.885609    0.042516
0    1        1.000000       0.879433    0.120567
3    4        0.935837       0.868914    0.066923
1    2        0.933333       0.844961    0.088372


## Part 3: Logistic regression and model comparison

In [183]:
# Fit a logistic regression (default settings) on the standardized training
# partition and report its F1 score on that same training data.
logistic_model = LogisticRegression().fit(train_X_classifier, train_y_classifier)
predicted_y_training2 = logistic_model.predict(train_X_classifier)
f1_score(y_true=train_y_classifier, y_pred=predicted_y_training2)

0.8796433878157504

In [184]:
# F1 score of the fitted logistic regression on the held-out test partition.
predicted_y_test2 = logistic_model.predict(test_X_classifier)
f1_score(y_true=test_y_classifier, y_pred=predicted_y_test2)

0.8719723183391003