In [115]:
%matplotlib inline
from pathlib import Path

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt

In [116]:
# Read the Universal Bank customer table and append a 1-based row number
# ('Number') derived from the default index.
uni = pd.read_csv('UniversalBank.csv')
uni = uni.assign(Number=uni.index + 1)

In [117]:
# Preview the first five records (head() defaults to n=5).
uni.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Number
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0,1
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0,2
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0,3
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0,4
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1,5


In [118]:
# (rows, columns) of the loaded table, including the added 'Number' column.
uni.shape

(5000, 15)

In [119]:
# Distinct Education codes present in the data.
uni['Education'].unique()

array([1, 2, 3], dtype=int64)

In [120]:
# Column dtypes before any cleaning; Education is still a plain integer here.
uni.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
Number                  int64
dtype: object

In [121]:
# Remove the ID and ZIP Code columns so they are not carried forward.
uni = uni.drop(columns=['ID', 'ZIP Code'])

In [122]:
# Mark Education as categorical so get_dummies will one-hot encode it.
uni = uni.astype({'Education': 'category'})

In [123]:
# One-hot encode Education (the only categorical column in the frame) into
# Education_1 / Education_2 / Education_3, appended after the numeric columns.
uni = pd.get_dummies(uni, columns=['Education'])

In [124]:
# Confirm the dummy columns were added: show the first five rows.
uni.head(5)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Number,Education_1,Education_2,Education_3
0,25,1,49,4,1.6,0,0,1,0,0,0,1,1,0,0
1,45,19,34,3,1.5,0,0,1,0,0,0,2,1,0,0
2,39,15,11,1,1.0,0,0,0,0,0,0,3,1,0,0
3,35,9,100,1,2.7,0,0,0,0,0,0,4,0,1,0
4,35,8,45,4,1.0,0,0,0,0,0,1,5,0,1,0


In [125]:
# Partition the records 60% training / 40% validation; the fixed random_state
# makes the split repeatable.
trainData, validData = train_test_split(uni, test_size=0.4, random_state=26)
print(trainData.shape, validData.shape)

# Predictor values for the customer to be classified. Key order matters:
# it must match the column order the scaler is fit on.
newCustomerProfile = {
    'Age': 40,
    'Experience': 10,
    'Income': 84,
    'Family': 2,
    'CCAvg': 2,
    'Education_1': 0,
    'Education_2': 1,
    'Education_3': 0,
    'Mortgage': 0,
    'Securities Account': 0,
    'CD Account': 0,
    'Online': 1,
    'CreditCard': 1,
}
newCustomer = pd.DataFrame(newCustomerProfile, index=[0])
newCustomer

(3000, 15) (2000, 15)


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education_1,Education_2,Education_3,Mortgage,Securities Account,CD Account,Online,CreditCard
0,40,10,84,2,2,0,1,0,0,0,0,1,1


In [128]:
# Standardize the 13 predictors. The scaler is fit on the training partition
# only; the full dataset and the new customer are then transformed with those
# training-set statistics.
predictors = ['Age', 'Experience', 'Income', 'Family', 'CCAvg',
              'Education_1', 'Education_2', 'Education_3', 'Mortgage',
              'Securities Account', 'CD Account', 'Online', 'CreditCard']
zPredictors = ['z' + name for name in predictors]

scaler = preprocessing.StandardScaler()
scaler.fit(trainData[predictors])  # Note the use of an array of column names

# Transform the full dataset. The scaled frame reuses uni's index so the
# column-wise concat pairs each scaled row with its own outcome/Number by
# label, instead of relying on uni having a default 0..n-1 index.
uniNorm = pd.concat([pd.DataFrame(scaler.transform(uni[predictors]),
                                  columns=zPredictors, index=uni.index),
                     uni[['Personal Loan', 'Number']]], axis=1)

# trainData.index / validData.index are row *labels*, so select with .loc;
# .iloc (positional) only gives the same rows while labels equal positions.
trainNorm = uniNorm.loc[trainData.index]
validNorm = uniNorm.loc[validData.index]
newCustomerNorm = pd.DataFrame(scaler.transform(newCustomer), columns=zPredictors)

In [137]:
# Find the single closest training record to the new customer in standardized
# predictor space. newCustomerNorm carries exactly the z-scored predictor
# columns, so its column list selects the matching features from trainNorm.
featureCols = list(newCustomerNorm.columns)
knn = NearestNeighbors(n_neighbors=1).fit(trainNorm[featureCols])
distances, indices = knn.kneighbors(newCustomerNorm)

# kneighbors returns row positions within the fitted data, hence .iloc.
print(trainNorm.iloc[indices[0]])

          zAge  zExperience   zIncome   zFamily    zCCAvg  zEducation_1  \
4407 -0.719804    -0.613601 -0.042428 -0.340587 -0.122566     -0.856799   

      zEducation_2  zEducation_3  zMortgage  zSecurities Account  zCD Account  \
4407      1.587806     -0.643242  -0.547625            -0.346151    -0.248891   

       zOnline  zCreditCard  Personal Loan  Number  
4407  0.806328     1.549632              0    4408  


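With k = 1, the new customer's nearest neighbor in the training set (customer Number 4408) has Personal Loan = 0, so the new customer is classified as 0, which means the loan was not accepted.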

In [138]:
# Separate standardized predictors from the outcome for both partitions.
zColumns = ['zAge', 'zExperience', 'zIncome', 'zFamily', 'zCCAvg',
            'zEducation_1', 'zEducation_2', 'zEducation_3', 'zMortgage',
            'zSecurities Account', 'zCD Account', 'zOnline', 'zCreditCard']
train_X = trainNorm[zColumns]
train_y = trainNorm[['Personal Loan']]
valid_X = validNorm[zColumns]
valid_y = validNorm[['Personal Loan']]

# fit() expects a 1-D target. Passing the one-column DataFrame directly makes
# scikit-learn emit a DataConversionWarning on every iteration, so flatten it
# once up front (train_y itself is left as a DataFrame).
train_labels = train_y.values.ravel()

#Train a classifier for different values of k
records = []
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors = k).fit(train_X, train_labels)
    records.append({'k': k,
                    'accuracy': accuracy_score(valid_y, knn.predict(valid_X))
                    })
results = pd.DataFrame(records)
print(results)

  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


     k  accuracy
0    1    0.9550
1    2    0.9460
2    3    0.9555
3    4    0.9445
4    5    0.9525
5    6    0.9445
6    7    0.9495
7    8    0.9425
8    9    0.9460
9   10    0.9430
10  11    0.9450
11  12    0.9390
12  13    0.9405
13  14    0.9350


The choice of k that balances between overfitting and ignoring the predictor information is k = 3, which gives the highest validation accuracy in the table above (0.9555).

In [143]:
# Retrain with full dataset
uni_X = uniNorm[['zAge', 'zExperience', 'zIncome', 'zFamily', 'zCCAvg', 'zEducation_1', 'zEducation_2', 'zEducation_3', 'zMortgage', 'zSecurities Account', 'zCD Account', 'zOnline', 'zCreditCard']]
uni_y = uniNorm[['Personal Loan']]

# fit() expects a 1-D target; handing it the one-column DataFrame triggers a
# DataConversionWarning, so flatten the outcome at the call site.
knn = KNeighborsClassifier(n_neighbors = 3).fit(uni_X, uni_y.values.ravel())

# Classify the new customer and list its 3 nearest neighbours.
distances, indices = knn.kneighbors(newCustomerNorm)
print(knn.predict(newCustomerNorm))
print('Distances',distances)
print('Indices', indices)
# Neighbour indices are row positions within the fitted data (uniNorm order).
print(uniNorm.iloc[indices[0], :])

[0]
Distances [[0.47685308 0.49690056 0.638058  ]]
Indices [[4034 4407 3398]]
          zAge  zExperience   zIncome   zFamily    zCCAvg  zEducation_1  \
4034 -0.893355    -0.787261  0.199307 -0.340587 -0.122566     -0.856799   
4407 -0.719804    -0.613601 -0.042428 -0.340587 -0.122566     -0.856799   
3398 -0.459477    -0.526770 -0.240212 -0.340587  0.279333     -0.856799   

      zEducation_2  zEducation_3  zMortgage  zSecurities Account  zCD Account  \
4034      1.587806     -0.643242  -0.547625            -0.346151    -0.248891   
4407      1.587806     -0.643242  -0.547625            -0.346151    -0.248891   
3398      1.587806     -0.643242  -0.547625            -0.346151    -0.248891   

       zOnline  zCreditCard  Personal Loan  Number  
4034  0.806328     1.549632              0    4035  
4407  0.806328     1.549632              0    4408  
3398  0.806328     1.549632              0    3399  


  after removing the cwd from sys.path.


--Solve this

Repartition the data, this time into training, validation, and test sets (50%:30%:20%). Apply the k-NN method with the k chosen above. Compare the confusion matrix of the test set with that of the training and validation sets. Comment on the differences and their reason. 