In [28]:
# some imports to get you going

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

## Part 1: Data exploration and preprocessing

In [30]:
# read in data file

bank_df = pd.read_csv("C:/Users/ankit/Downloads/UniversalBank.csv")

### Basic Exploratory Analysis

In [32]:
# check the head of the data frame

print(bank_df.head(10))

   ID  Age  Experience  Income  ZIP Code  Family  CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4    1.6          1         0   
1   2   45          19      34     90089       3    1.5          1         0   
2   3   39          15      11     94720       1    1.0          1         0   
3   4   35           9     100     94112       1    2.7          2         0   
4   5   35           8      45     91330       4    1.0          2         0   
5   6   37          13      29     92121       4    0.4          2       155   
6   7   53          27      72     91711       2    1.5          2         0   
7   8   50          24      22     93943       1    0.3          3         0   
8   9   35          10      81     90089       3    0.6          2       104   
9  10   34           9     180     93023       1    8.9          3         0   

   Personal Loan  CD Account  Online  CreditCard  
0              0           0       0           0  
1              0 

In [33]:
# print the data types of each column

print(bank_df.dtypes)

ID                 int64
Age                int64
Experience         int64
Income             int64
ZIP Code           int64
Family             int64
CCAvg            float64
Education          int64
Mortgage           int64
Personal Loan      int64
CD Account         int64
Online             int64
CreditCard         int64
dtype: object


In [34]:
# print the shape of the data frame 

print(bank_df.shape)

(5000, 13)


In [35]:
# check for null values

missing_values = bank_df.isnull().sum()
print(missing_values)

ID               0
Age              0
Experience       0
Income           0
ZIP Code         0
Family           0
CCAvg            0
Education        0
Mortgage         0
Personal Loan    0
CD Account       0
Online           0
CreditCard       0
dtype: int64


In [36]:
# Look at the frequency of each occurence in the "Personal Loan" column

print(bank_df["Personal Loan"].value_counts(dropna=False))

Personal Loan
0    4520
1     480
Name: count, dtype: int64


In [37]:
# Look at the correlation between all of the predictors

predictors_df = bank_df[['ID', 'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'CD Account', 'Online', 'CreditCard']]
response_df = bank_df['Personal Loan']

predictors_df.corr(numeric_only=True)

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard
ID,1.0,-0.008473,-0.008326,-0.017695,-0.016797,-0.024675,0.021463,-0.01392,-0.006909,-0.002528,0.017028
Age,-0.008473,1.0,0.994215,-0.055269,-0.046418,-0.052012,0.041334,-0.012539,0.008043,0.013702,0.007681
Experience,-0.008326,0.994215,1.0,-0.046574,-0.052563,-0.050077,0.013152,-0.010582,0.010353,0.013898,0.008967
Income,-0.017695,-0.055269,-0.046574,1.0,-0.157501,0.645984,-0.187524,0.206806,0.169738,0.014206,-0.002385
Family,-0.016797,-0.046418,-0.052563,-0.157501,1.0,-0.109275,0.064929,-0.020445,0.01411,0.010354,0.011588
CCAvg,-0.024675,-0.052012,-0.050077,0.645984,-0.109275,1.0,-0.136124,0.109905,0.136534,-0.003611,-0.006689
Education,0.021463,0.041334,0.013152,-0.187524,0.064929,-0.136124,1.0,-0.033327,0.013934,-0.015004,-0.011014
Mortgage,-0.01392,-0.012539,-0.010582,0.206806,-0.020445,0.109905,-0.033327,1.0,0.089311,-0.005995,-0.007231
CD Account,-0.006909,0.008043,0.010353,0.169738,0.01411,0.136534,0.013934,0.089311,1.0,0.17588,0.278644
Online,-0.002528,0.013702,0.013898,0.014206,0.010354,-0.003611,-0.015004,-0.005995,0.17588,1.0,0.00421


### Data Preprocessing


#### Drop Predictors

In [40]:
# drop 'ID' and "Experience'

predictors_df = predictors_df.drop(['ID', 'Experience'], axis=1)
predictors_df.head()

Unnamed: 0,Age,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard
0,25,49,4,1.6,1,0,0,0,0
1,45,34,3,1.5,1,0,0,0,0
2,39,11,1,1.0,1,0,0,0,0
3,35,100,1,2.7,2,0,0,0,0
4,35,45,4,1.0,2,0,0,0,1


#### Deal with NA values


In [42]:
# drop na values

bank_df = bank_df.dropna()

predictors_df = bank_df[['Age', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'CD Account', 'Online', 'CreditCard']]
response_df = bank_df['Personal Loan']

print(predictors_df.isnull().sum())

Age           0
Income        0
Family        0
CCAvg         0
Education     0
Mortgage      0
CD Account    0
Online        0
CreditCard    0
dtype: int64


#### Deal with categorical variables

In [44]:
# flag categorical varibales

print(predictors_df['Education'].value_counts())
predictors_df1 = pd.get_dummies(predictors_df) 
predictors_df1.corr()

Education
1    2096
3    1501
2    1403
Name: count, dtype: int64


Unnamed: 0,Age,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard
Age,1.0,-0.055269,-0.046418,-0.052012,0.041334,-0.012539,0.008043,0.013702,0.007681
Income,-0.055269,1.0,-0.157501,0.645984,-0.187524,0.206806,0.169738,0.014206,-0.002385
Family,-0.046418,-0.157501,1.0,-0.109275,0.064929,-0.020445,0.01411,0.010354,0.011588
CCAvg,-0.052012,0.645984,-0.109275,1.0,-0.136124,0.109905,0.136534,-0.003611,-0.006689
Education,0.041334,-0.187524,0.064929,-0.136124,1.0,-0.033327,0.013934,-0.015004,-0.011014
Mortgage,-0.012539,0.206806,-0.020445,0.109905,-0.033327,1.0,0.089311,-0.005995,-0.007231
CD Account,0.008043,0.169738,0.01411,0.136534,0.013934,0.089311,1.0,0.17588,0.278644
Online,0.013702,0.014206,0.010354,-0.003611,-0.015004,-0.005995,0.17588,1.0,0.00421
CreditCard,0.007681,-0.002385,0.011588,-0.006689,-0.011014,-0.007231,0.278644,0.00421,1.0


In [45]:
# shape of the predictors dataframe

print(predictors_df1.shape)

(5000, 9)


#### Normalize data

In [47]:
from sklearn import preprocessing

In [48]:
# normalize predictors using Z-score normalization

z_score_norm = preprocessing.StandardScaler()
predictor_df_normalized = z_score_norm.fit_transform(predictors_df1)
predictor_df_normalized = pd.DataFrame(predictor_df_normalized, columns = predictors_df1.columns)
predictor_df_normalized.head(10)

Unnamed: 0,Age,Income,Family,CCAvg,Education,Mortgage,CD Account,Online,CreditCard
0,-1.774417,-0.538229,1.397414,-0.193385,-1.049078,-0.555524,-0.25354,-1.216618,-0.645314
1,-0.029524,-0.864109,0.525991,-0.250611,-1.049078,-0.555524,-0.25354,-1.216618,-0.645314
2,-0.552992,-1.363793,-1.216855,-0.536736,-1.049078,-0.555524,-0.25354,-1.216618,-0.645314
3,-0.90197,0.569765,-1.216855,0.436091,0.141703,-0.555524,-0.25354,-1.216618,-0.645314
4,-0.90197,-0.62513,1.397414,-0.536736,0.141703,-0.555524,-0.25354,-1.216618,1.549632
5,-0.727481,-0.972736,1.397414,-0.880087,0.141703,0.968512,-0.25354,0.821951,-0.645314
6,0.668434,-0.038545,-0.345432,-0.250611,0.141703,-0.555524,-0.25354,0.821951,-0.645314
7,0.4067,-1.124814,-1.216855,-0.937312,1.332484,-0.555524,-0.25354,-1.216618,1.549632
8,-0.90197,0.156983,0.525991,-0.765637,0.141703,0.467055,-0.25354,0.821951,-0.645314
9,-0.989215,2.307795,-1.216855,3.984049,1.332484,-0.555524,-0.25354,-1.216618,-0.645314


## Part 2: $k$-NN

In [50]:
# partition data into train and test sets

X_classifier = predictor_df_normalized
y_classifier = response_df
train_X_classifier, test_X_classifier, train_y_classifier, test_y_classifier = train_test_split(X_classifier, 
                                                            y_classifier, test_size=0.3, random_state=616, 
                                                                                    stratify = y_classifier)

In [51]:
# train the k-NN model and look at performance on train data

knn = KNeighborsClassifier(n_neighbors=5).fit(train_X_classifier, train_y_classifier)
predicted_y_training = knn.predict(train_X_classifier)
f1_score(train_y_classifier, predicted_y_training)

0.8440677966101695

In [52]:
# performance of k-NN on test data

predicted_y_test = knn.predict(test_X_classifier)
f1_score(test_y_classifier, predicted_y_test)

0.7563025210084033

#### Let's find the optimal value of $k$

In [54]:
# train a classifier for different values of k

results = []
for k in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=k).fit(train_X_classifier, train_y_classifier)
    results.append({
        'k': k,
        'f1_score_train': f1_score(train_y_classifier, knn2.predict(train_X_classifier)),
        'f1_score_test': f1_score(test_y_classifier, knn2.predict(test_X_classifier))
    })

# Convert results to a pandas data frame
#results = pd.DataFrame(results)
print(results)

[{'k': 1, 'f1_score_train': 1.0, 'f1_score_test': 0.789272030651341}, {'k': 2, 'f1_score_train': 0.8433734939759037, 'f1_score_test': 0.7105263157894737}, {'k': 3, 'f1_score_train': 0.8910569105691057, 'f1_score_test': 0.7704918032786885}, {'k': 4, 'f1_score_train': 0.8112874779541446, 'f1_score_test': 0.7280701754385965}, {'k': 5, 'f1_score_train': 0.8440677966101695, 'f1_score_test': 0.7563025210084033}, {'k': 6, 'f1_score_train': 0.7841726618705036, 'f1_score_test': 0.7248908296943232}, {'k': 7, 'f1_score_train': 0.8166089965397924, 'f1_score_test': 0.7350427350427351}, {'k': 8, 'f1_score_train': 0.7695099818511797, 'f1_score_test': 0.6756756756756757}, {'k': 9, 'f1_score_train': 0.7972027972027972, 'f1_score_test': 0.7217391304347827}, {'k': 10, 'f1_score_train': 0.738404452690167, 'f1_score_test': 0.684931506849315}, {'k': 11, 'f1_score_train': 0.7603603603603604, 'f1_score_test': 0.6936936936936937}, {'k': 12, 'f1_score_train': 0.6934865900383141, 'f1_score_test': 0.6635944700460

In [55]:
# Using max() to find optimal k and F1 test score

optimal_entry = max(results, key=lambda x: x['f1_score_test'])
optimal_k = optimal_entry['k']
optimal_f1_test = optimal_entry['f1_score_test']

print(optimal_entry)
print("Optimal k:", optimal_k)
print("F1 score of test data:", optimal_f1_test)

{'k': 1, 'f1_score_train': 1.0, 'f1_score_test': 0.789272030651341}
Optimal k: 1
F1 score of test data: 0.789272030651341


## Part 3: Logistic regression and model comparison

In [57]:
# train the LR model

logistic_model = LogisticRegression(solver='lbfgs').fit(train_X_classifier, train_y_classifier)

In [58]:
# F1 score of train data set

predicted_y_training = logistic_model.predict(train_X_classifier)
f1_train_lr = f1_score(train_y_classifier, predicted_y_training)
f1_train_lr

0.7241962774957699

In [59]:
# F1 score of the test data set

predicted_y_test = logistic_model.predict(test_X_classifier)
f1_test_lr = f1_score(test_y_classifier, predicted_y_test)
f1_test_lr

0.7054263565891473

#### Let's compare $k$-NN to logisitic regression

In [61]:
# comparison data frame of f1 scores of logistic regression and k-nn

comparison = [
    {
        'Model': 'Logistic Regression',
        'F1_Train': f1_train_lr,
        'F1_Test': f1_test_lr
    },
    {
        'Model': f'k-NN (k={optimal_k})',
        'F1_Train': optimal_entry['f1_score_train'],
        'F1_Test': optimal_entry['f1_score_test']
    }
]

comparison_df = pd.DataFrame(comparison)
print("\n", comparison_df)



                  Model  F1_Train   F1_Test
0  Logistic Regression  0.724196  0.705426
1           k-NN (k=1)  1.000000  0.789272
