# Chapter 7: k-Nearest Neighbors (kNN)

> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck 
>
> Code included in
>
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) 
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.

## Import required packages

In [30]:
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from dmba import plotDecisionTree, classificationSummary, regressionSummary
import matplotlib.pylab as plt


## Table 7.1

In [31]:
# Loading the datafile UniversalBank.csv
ubank_df = pd.read_csv('UniversalBank.csv')

# Populating the Number column with [index+1]
ubank_df['Number'] = ubank_df.index + 1

# Displaying the raw data file - optional
ubank_df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Number
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0,1
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0,2
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0,3
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0,4
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0,4996
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0,4997
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0,4998
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0,4999


In [32]:
# Drop the ID and ZIP Code columns
ubank_df = ubank_df.drop(columns=['ID', 'ZIP Code'])

# Replace spaces with underscore in any column names
ubank_df.columns = [c.replace(' ', '_') for c in ubank_df.columns]

# Displaying the raw data file - optional
ubank_df

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard,Number
0,25,1,49,4,1.6,1,0,0,1,0,0,0,1
1,45,19,34,3,1.5,1,0,0,1,0,0,0,2
2,39,15,11,1,1.0,1,0,0,0,0,0,0,3
3,35,9,100,1,2.7,2,0,0,0,0,0,0,4
4,35,8,45,4,1.0,2,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,0,1,0,4996
4996,30,4,15,4,0.4,1,85,0,0,0,1,0,4997
4997,63,39,24,2,0.3,3,0,0,0,0,0,0,4998
4998,65,40,49,3,0.5,2,0,0,0,0,1,0,4999


In [33]:
# Change the 'Education' column heading to 'category', or treat it as such for the time being.
ubank_df['Education'] = ubank_df['Education'].astype('category')
# get_dummies method in Pandas:  work in ubank_df, set delimiter as _, do not get k-1 categories out of k categorical levels
ubank_df = pd.get_dummies(ubank_df, prefix_sep='_', drop_first=False)

In [34]:
# Sorting the data into the training and validation data.  "test_size" = validation data percentage 

# For parts a-c.
trainData, validData = train_test_split(ubank_df, test_size=0.4, random_state=1)

# For part e.  
# trainData, validData = train_test_split(ubank_df, test_size=0.2, train_size=0.5, random_state=1)

# Print the rows,columns of the training and validation data
print(trainData.shape, validData.shape)

# Defining the new customer with the given data
newCust = pd.DataFrame([{'Age':40,  'Experience':10,    'Income':84,
                        'Family':2, 'CCAvg':2,  'Mortgage':0,
                        'Securities Account':0, 'CD Account':0,
                        'Online':1, 'Credit Card':1,    'Education_1':0,
                        'Education_2':1,    'Education_3':0,}],
                        columns=['Age', 'Experience',   'Income',
                        'Family',   'CCAvg',    'Mortgage', 
                        'Securities Account',   'CD Account',
                        'Online',   'Credit Card',      'Education_1',
                        'Education_2',      'Education_3'])

# Display the new customer - optional.
# newCust

(3000, 15) (2000, 15)


## Table 7.2
Initialize normalized training, validation, and complete data frames. Use the training data to learn the transformation.

In [35]:
# Normalize the data in the columns

# Establish "scaler" as the call to StandardScaler
scaler = preprocessing.StandardScaler()
# Find the best fit of the data by supplying the data columns  
scaler.fit(trainData[['Age', 'Experience', 'Income', 'Family',
                        'CCAvg', 'Mortgage', 'Securities_Account',
                        'CD_Account', 'Online', 'CreditCard',
                        'Education_1', 'Education_2', 'Education_3']])  # Note the use of an array of column names

# Transform the full dataset
# Scaler transform on available columns in DataFrame, then concatenate that with ubank_df
educateNorm = pd.concat([pd.DataFrame(
                            scaler.transform(
                                ubank_df[['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage',
                                            'Securities_Account', 'CD_Account', 'Online', 'CreditCard',
                                            'Education_1', 'Education_2', 'Education_3']]), # End of columns to transform into...
                                    columns=['zAge', 'zExperience', 'zIncome', 'zFamily', # these new columns
                                            'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                                            'zCD_Account', 'zOnline', 'zCreditCard',
                                            'zEducation_1', 'zEducation_2', 'zEducation_3']), # end of pd.DataFrame.  Concat...
                       ubank_df['Personal_Loan']], axis=1) # with these original columns
                                    
trainNorm = educateNorm.iloc[trainData.index]
validNorm = educateNorm.iloc[validData.index]

# Create a new DataFrame with newCust data
newEduNorm = pd.DataFrame(scaler.transform(newCust), # newCust has 13 columns in it, while scaler expects 14
                                columns=['zAge', 'zExperience', 'zIncome', 'zFamily',
                                            'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                                            'zCD_Account', 'zOnline', 'zCreditCard',
                                            'zEducation_1', 'zEducation_2', 'zEducation_3'])
# Display the processed data - optional
# educateNorm
# newEduNorm

Use k-nearest neighbour

In [36]:
# Training data
knn = NearestNeighbors(n_neighbors=1)
knn.fit(trainNorm[['zAge', 'zExperience', 'zIncome', 'zFamily', 
                    'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                    'zCD_Account', 'zOnline', 'zCreditCard',
                    'zEducation_1', 'zEducation_2', 'zEducation_3']])
# Let distances & indices = output of knn.kneighbors
distances, indices = knn.kneighbors(newEduNorm)
# Print closest 3 similar records (remember iloc is integer locate)
print(trainNorm.iloc[indices[0], :])  # indices is a list of lists, we are only interested in the first element

          zAge  zExperience   zIncome   zFamily    zCCAvg  zMortgage  \
4407 -0.747929    -0.639658 -0.059674 -0.352127 -0.136574  -0.559242   

      zSecurities_Account  zCD_Account  zOnline  zCreditCard  zEducation_1  \
4407            -0.337025    -0.252646  0.83419      1.53728     -0.838795   

      zEducation_2  zEducation_3  Personal_Loan  
4407      1.591719     -0.660895              0  


## Table 7.3
Initialize a data frame with two columns: `k` and `accuracy`

In [37]:
train_X = trainNorm[['zAge', 'zExperience', 'zIncome', 'zFamily', 
                    'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                    'zCD_Account', 'zOnline', 'zCreditCard',
                    'zEducation_1', 'zEducation_2', 'zEducation_3']]
train_y = trainNorm['Personal_Loan']
valid_X = validNorm[['zAge', 'zExperience', 'zIncome', 'zFamily', 
                    'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                    'zCD_Account', 'zOnline', 'zCreditCard',
                    'zEducation_1', 'zEducation_2', 'zEducation_3']]
valid_y = validNorm['Personal_Loan']

# Train a classifier for different values of k
results = []
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)
    results.append({'k': k, 'accuracy': accuracy_score(valid_y, knn.predict(valid_X)) })

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

     k  accuracy
0    1    0.9545
1    2    0.9500
2    3    0.9535
3    4    0.9495
4    5    0.9565
5    6    0.9495
6    7    0.9520
7    8    0.9460
8    9    0.9475
9   10    0.9435
10  11    0.9465
11  12    0.9435
12  13    0.9450
13  14    0.9435


## Table 7.4

In [38]:
# Retrain with full dataset
ubank_X = educateNorm[['zAge', 'zExperience', 'zIncome', 'zFamily', 
                    'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                    'zCD_Account', 'zOnline', 'zCreditCard',
                    'zEducation_1', 'zEducation_2', 'zEducation_3']]
ubank_y = educateNorm['Personal_Loan']
knn = KNeighborsClassifier(n_neighbors=1).fit(ubank_X, ubank_y)
distances, indices = knn.kneighbors(newEduNorm)
print(knn.predict(newEduNorm))
print('Distances',distances)
print('Indices', indices)
print(educateNorm.iloc[indices[0], :])

[0]
Distances [[0.47859833]]
Indices [[4034]]
          zAge  zExperience   zIncome   zFamily    zCCAvg  zMortgage  \
4034 -0.922251    -0.813928  0.177728 -0.352127 -0.136574  -0.559242   

      zSecurities_Account  zCD_Account  zOnline  zCreditCard  zEducation_1  \
4034            -0.337025    -0.252646  0.83419      1.53728     -0.838795   

      zEducation_2  zEducation_3  Personal_Loan  
4034      1.591719     -0.660895              0  


a.  How would this customer be classified?
    In both the training data and the full dataset, using k = 1 closest neighbor did not accept the loan (Personal_Loan = 0).  Therefore, there is a possible likelihood that the new customer also will not accept the offered loan.

b.  What is a choice of k that balances between overfitting and ignoring the predictor information?
    If the goal is to choose the k that results in the lowest error rate (highest accuracy), then k=5 would be a good choice.  The accuracy was 0.9565, creating an error rate of 0.0435 (1 - 0.9565 = 0.0435).

c.  Show the confusion matrix for the validation data that results from using the best k.


In [39]:
# Retrain with full dataset using k=5
ubank_X = educateNorm[['zAge', 'zExperience', 'zIncome', 'zFamily', 
                    'zCCAvg', 'zMortgage', 'zSecurities_Account', 
                    'zCD_Account', 'zOnline', 'zCreditCard',
                    'zEducation_1', 'zEducation_2', 'zEducation_3']]
ubank_y = educateNorm['Personal_Loan']
knn = KNeighborsClassifier(n_neighbors=5).fit(ubank_X, ubank_y)
distances, indices = knn.kneighbors(newEduNorm)
print(knn.predict(newEduNorm))
print('Distances',distances)
print('Indices', indices)
print(educateNorm.iloc[indices[0], :])

[0]
Distances [[0.47859833 0.49507362 0.63219765 0.70542183 0.83571448]]
Indices [[4034 4407 3398 1630 4127]]
          zAge  zExperience   zIncome   zFamily    zCCAvg  zMortgage  \
4034 -0.922251    -0.813928  0.177728 -0.352127 -0.136574  -0.559242   
4407 -0.747929    -0.639658 -0.059674 -0.352127 -0.136574  -0.559242   
3398 -0.486446    -0.552523 -0.253912 -0.352127  0.265373  -0.559242   
1630 -0.399285    -0.291118  0.544622 -0.352127 -0.079153  -0.559242   
4127 -0.224963    -0.116848  0.177728 -0.352127 -0.079153  -0.559242   

      zSecurities_Account  zCD_Account  zOnline  zCreditCard  zEducation_1  \
4034            -0.337025    -0.252646  0.83419      1.53728     -0.838795   
4407            -0.337025    -0.252646  0.83419      1.53728     -0.838795   
3398            -0.337025    -0.252646  0.83419      1.53728     -0.838795   
1630            -0.337025    -0.252646  0.83419      1.53728     -0.838795   
4127            -0.337025    -0.252646  0.83419      1.53728     -0

Question 1
d.  Repartition the data, this time into training, validation, and test sets (50%, 30%, and 20%).  Apply the k-NN method with the k chosen above (k=5).  Compare the confusion matrix of the test set with that of the training and validation sets.  Comment on the differences and their reason. 

In [40]:


X = ubank_df.drop(columns=['Personal_Loan'])
y = ubank_df['Personal_Loan']

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, train_size=0.5, random_state=1)

smallClassTree = RandomForestClassifier(n_estimators=5000, random_state=1)
smallClassTree.fit(train_X, train_y)
classificationSummary(train_y, smallClassTree.predict(train_X))
classificationSummary(valid_y, smallClassTree.predict(valid_X))

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 2268    0
     1    0  232
Confusion Matrix (Accuracy 0.9780)

       Prediction
Actual   0   1
     0 899   1
     1  21  79
