In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so this also hides library deprecation and
# data-conversion warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw dataset from a path relative to the current working directory.
# Later cells read the "balance", "income" and "default" columns from this frame.
default = pd.read_csv("data/default.csv")

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the two numeric predictors with min-max scaling and store the
# results next to the raw columns as norm_balance / norm_income.
# NOTE(review): the scaler is fit on the whole frame, i.e. before the
# train/test split done later in the notebook, so held-out rows contribute
# to the fitted min/max.
features_to_scale = ["balance", "income"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(default[features_to_scale])

# scaled_values has one column per entry of features_to_scale; transposing
# lets each scaled column be unpacked straight into its destination.
default["norm_balance"], default["norm_income"] = scaled_values.T

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the partition
# repeatable across runs.
default_train, default_test = train_test_split(
    default,
    test_size=0.2,
    random_state=100,
)

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

# 5-fold cross-validation over the training rows, in their existing order.
# random_state is intentionally not passed: it only has an effect when
# shuffle=True, and current scikit-learn releases raise ValueError when it is
# combined with shuffle=False. The folds themselves are unchanged.
k_fold = KFold(n_splits=5, shuffle=False)
features = ["norm_balance","norm_income"]
target = ["default"]
knn_k_vals = [1,3,5,7,9,11]

# One entry per value in knn_k_vals: accuracy averaged over the five folds.
avg_train_accuracy = []
avg_val_accuracy = []
X = default_train[features]
y = default_train[target]

# Evaluate each candidate number of neighbours.
for k in knn_k_vals:
    # Per-fold scores for the current k.
    train_accuracy = []
    val_accuracy = []
    for train, val in k_fold.split(X, y):
        # y is a one-column DataFrame; flatten the labels to 1-D so the
        # classifier receives the shape it expects instead of relying on an
        # implicit conversion.
        y_train = y.iloc[train].values.ravel()
        y_val = y.iloc[val].values.ravel()

        # Fit on the training part of the fold, then score both parts.
        model = KNeighborsClassifier(n_neighbors=k,metric="euclidean")
        model.fit(X.iloc[train], y_train)
        train_accuracy.append(model.score(X.iloc[train], y_train))
        val_accuracy.append(model.score(X.iloc[val], y_val))
    avg_train_accuracy.append(np.mean(train_accuracy))
    avg_val_accuracy.append(np.mean(val_accuracy))

In [6]:
# Summarise the cross-validation results, one row per candidate k.
# k is stored as float so that all three columns share the same dtype.
performance_scores = pd.DataFrame(
    {
        "k": np.asarray(knn_k_vals, dtype=float),
        "avg_train_accuracy": avg_train_accuracy,
        "avg_val_accuracy": avg_val_accuracy,
    },
    columns=["k", "avg_train_accuracy", "avg_val_accuracy"],
)
performance_scores

Unnamed: 0,k,avg_train_accuracy,avg_val_accuracy
0,1.0,1.0,0.9545
1,3.0,0.977688,0.968
2,5.0,0.975313,0.97075
3,7.0,0.974531,0.97125
4,9.0,0.974781,0.9725
5,11.0,0.974469,0.972625


In [7]:
# Select the k of every row whose mean validation accuracy equals the column
# maximum. The result is a Series and holds more than one entry on ties.
top_val_accuracy = performance_scores["avg_val_accuracy"].max()
is_best = performance_scores["avg_val_accuracy"] == top_val_accuracy
best_k = performance_scores.loc[is_best, "k"]
best_k

5    11.0
Name: k, dtype: float64

#### Using the best k found to train a model

In [13]:
features = ["norm_balance","norm_income"]
target = ["default"]

# Take the neighbour count from the cross-validation result instead of
# hard-coding it, so this cell stays consistent if the scores change.
# best_k is a Series of float k values (one per tied row); the first entry is
# used and cast to int because n_neighbors must be an integer.
final_k = int(best_k.iloc[0])

# default_train[target] is a one-column DataFrame; flatten the labels to 1-D
# before passing them to the classifier.
y_train_final = default_train[target].values.ravel()
y_test_final = default_test[target].values.ravel()

# Refit on the full training split and report accuracy on both splits.
model = KNeighborsClassifier(n_neighbors = final_k, metric="euclidean")
model.fit(default_train[features], y_train_final)
train_accuracy = model.score(default_train[features], y_train_final)
test_accuracy = model.score(default_test[features], y_test_final)
print(train_accuracy,test_accuracy)

0.97475 0.9715
