In [9]:
from pydataset import data
# Load the 'iris' dataset through pydataset's `data` helper; later cells slice it with `.iloc`.
df = data('iris')
import numpy as np
import pandas as pd

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [12]:
# Features are every column except the last; the last column is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.3
)

In [14]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits.
sc = StandardScaler().fit(x_train)
train_scale, test_scale = sc.transform(x_train), sc.transform(x_test)

In [15]:
# Re-wrap the scaled values as DataFrames that carry the original feature names.
train_scale = pd.DataFrame(train_scale, columns=x_train.columns)
test_scale = pd.DataFrame(test_scale, columns=x_test.columns)

In [17]:
# Classifier created with no arguments, i.e. the library's default hyperparameters.
knn_model = KNeighborsClassifier()

In [18]:
# Fit on the scaled training features and their labels.
knn_model.fit(train_scale, y_train)

KNeighborsClassifier()

In [19]:
# Predict labels for both splits with the fitted classifier.
pred_train = knn_model.predict(train_scale)
pred_test = knn_model.predict(test_scale)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
# Accuracy of the fitted classifier on each split (true labels first).
train_accu, test_accu = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, pred_train), (y_test, pred_test))
)
print('Accuracy of Train Data: ', train_accu)
print('Accuracy of Test Data: ', test_accu)

Accuracy of Train Data:  0.9714285714285714
Accuracy of Test Data:  0.9777777777777777


In [22]:
from sklearn.metrics import confusion_matrix

In [23]:
# Confusion matrix for the training split (true labels first, predictions second).
confusion_matrix(y_train, pred_train)

array([[34,  0,  0],
       [ 0, 31,  1],
       [ 0,  2, 37]], dtype=int64)

In [24]:
# Confusion matrix for the held-out test split (true labels first, predictions second).
confusion_matrix(y_test, pred_test)

array([[16,  0,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]], dtype=int64)

In [25]:
# Show the class labels stored on the fitted classifier.
knn_model.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [26]:
# Show the neighbour-search algorithm setting of the classifier.
knn_model.algorithm

'auto'

In [None]:
# NOTE(review): unexecuted scratch cell — builds a default classifier without assigning it; candidate for removal.
KNeighborsClassifier()

In [27]:
# Explicit neighbour-search algorithms to compare against each other.
algo = ['ball_tree', 'kd_tree', 'brute']

In [30]:
# Fit one classifier per search algorithm (all other settings left at their
# defaults) and record train/test accuracy in the same order as `algo`.
train_accu, test_accu = [], []
for search_algo in algo:
    knn = KNeighborsClassifier(algorithm=search_algo).fit(train_scale, y_train)
    train_accu.append(knn.score(train_scale, y_train))
    test_accu.append(knn.score(test_scale, y_test))
    

In [31]:
# Display the per-algorithm training accuracies.
train_accu

[0.9714285714285714, 0.9714285714285714, 0.9714285714285714]

In [32]:
# Display the per-algorithm test accuracies.
test_accu

[0.9777777777777777, 0.9777777777777777, 0.9777777777777777]

In [33]:
# Candidate neighbour counts: 1 through 69 inclusive.
k = list(range(1,70))

In [34]:
# Sweep every candidate in `k`, recording train/test accuracy for each value.
train_accu, test_accu = [], []
for n_neighbors in k:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
    knn.fit(train_scale, y_train)
    train_accu.append(knn.score(train_scale, y_train))
    test_accu.append(knn.score(test_scale, y_test))

In [38]:
import plotly.express as px

In [41]:
import matplotlib.pyplot as plt

In [44]:
%matplotlib qt
plt.plot(k, train_accu, label='Accuracy of Train')
plt.plot(k, test_accu, label = 'Accuracy of Test')
plt.legend()
plt.xlabel('Number neighbours')
plt.ylabel('Accuracy of the Model')
plt.show()

In [45]:
# Re-run the sweep on the narrow window k = 5, 6, 7 only.
train_accu, test_accu = [], []
for n_neighbors in range(5, 8):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
    knn.fit(train_scale, y_train)
    train_accu.append(knn.score(train_scale, y_train))
    test_accu.append(knn.score(test_scale, y_test))

In [46]:
# Display the training accuracies for k = 5, 6, 7.
train_accu

[0.9714285714285714, 0.9809523809523809, 0.9714285714285714]

In [47]:
# Display the test accuracies for k = 5, 6, 7.
test_accu

[0.9777777777777777, 0.9777777777777777, 0.9777777777777777]

In [48]:
# Train-minus-test accuracy for each k, paired position by position.
[train_score - test_score for train_score, test_score in zip(train_accu, test_accu)]

[-0.006349206349206327, 0.0031746031746031633, -0.006349206349206327]

In [52]:
from sklearn.model_selection import GridSearchCV

# Cross-validated (cv=5) grid search over the search algorithm and neighbour count.
knncv = KNeighborsClassifier()
param_grid = {
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'n_neighbors': [6, 7, 8],
}
grid_knn = GridSearchCV(estimator=knncv, param_grid=param_grid, cv=5)
grid_knn.fit(train_scale, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [6, 7, 8]})

In [53]:
# Show the parameter combination the grid search selected.
grid_knn.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 6}

In [54]:
# Training misclassification rate (share of wrong predictions) for k = 1..69.
train_error = []
for n_neighbors in range(1, 70):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1).fit(train_scale, y_train)
    pred_train = knn.predict(train_scale)
    train_error.append(np.mean(pred_train != y_train))

In [55]:
# Test misclassification rate (share of wrong predictions) for k = 1..69.
test_error = []
for n_neighbors in range(1, 70):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1).fit(train_scale, y_train)
    pred_test = knn.predict(test_scale)
    test_error.append(np.mean(pred_test != y_test))

In [62]:
# Draw both error curves with the same dashed/marker styling; only the colours
# and legend labels differ between the train and test series.
for errors, line_color, face_color, series_label in (
    (train_error, 'red', 'green', 'Error of Train'),
    (test_error, 'blue', 'red', 'Error of Test'),
):
    plt.plot(range(1, 70), errors, color=line_color, linestyle='dashed',
             marker='o', markerfacecolor=face_color, markersize=5,
             label=series_label)
plt.title('Error Rates vs Neighbours')
plt.xlabel('Number of Neighbours')
plt.ylabel('Error rates for Train and Test')
plt.legend()
plt.show()

In [63]:
# Load the 'Boston' dataset; this rebinds `df`, replacing the frame loaded earlier.
df = data('Boston')

In [64]:
# Last column is the target, everything before it is a feature.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# 70/30 split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.3
)

# Scaler is fitted on the training features only and reused for the test
# features; results are wrapped as DataFrames that keep the feature names.
sc = StandardScaler().fit(x_train)
train_scale = pd.DataFrame(sc.transform(x_train), columns=x_train.columns)
test_scale = pd.DataFrame(sc.transform(x_test), columns=x_test.columns)

In [66]:
from sklearn.neighbors import KNeighborsRegressor

In [67]:
# Regressor created with no arguments, i.e. the library's default hyperparameters.
knn_reg = KNeighborsRegressor()

In [68]:
# Fit on the scaled training features and the training targets.
knn_reg.fit(train_scale, y_train)

KNeighborsRegressor()

In [None]:
# Inspect the hyperparameters of the fitted regressor. The original cell was an
# unfinished attribute access (`knn_reg.`), which is a SyntaxError on Run All.
knn_reg.get_params()

In [69]:
from sklearn.metrics import mean_squared_error

In [70]:
# Predict targets for both splits with the fitted regressor.
pred_train = knn_reg.predict(train_scale)
pred_test = knn_reg.predict(test_scale)

In [71]:
# Root of the mean squared error between actual and predicted targets, per split.
for rmse_label, actual, predicted in (
    ('RMSE of Train:', y_train, pred_train),
    ('RMSE of Test:', y_test, pred_test),
):
    print(rmse_label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE of Train: 3.624613285431503
RMSE of Test: 5.305116447950581


In [76]:
# Sweep k = 1..199 and record the train/test RMSE of each fitted regressor.
rmse_train, rmse_test = [], []
for n_neighbors in range(1, 200):
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(train_scale, y_train)
    rmse_train.append(np.sqrt(mean_squared_error(y_train, knn.predict(train_scale))))
    rmse_test.append(np.sqrt(mean_squared_error(y_test, knn.predict(test_scale))))

In [77]:
%matplotlib qt
plt.plot(range(1,200), rmse_train, label='RMSE of Train')
plt.plot(range(1,200), rmse_test, label = 'RMSE of Test')
plt.legend()
plt.xlabel('Number neighbours')
plt.ylabel('RMSE of the Model')
plt.show()

In [79]:
rmse_train = []
rmse_test = []
for i in range(50,100):
    knn = KNeighborsRegressor(n_neighbors=i).fit(train_scale, y_train)
    rmse_tr = np.sqrt(mean_squared_error(y_train, knn.predict(train_scale)))
    rmse_te = np.sqrt(mean_squared_error(y_test, knn.predict(test_scale)))
    rmse_train.append(rmse_tr)
    rmse_test.append(rmse_te)
%matplotlib qt
plt.plot(range(50,100), rmse_train, label='RMSE of Train')
plt.plot(range(50,100), rmse_test, label = 'RMSE of Test')
plt.legend()
plt.xlabel('Number neighbours')
plt.ylabel('RMSE of the Model')
plt.show()

In [80]:
# Fit a regressor with k = 75 and report its RMSE on both splits.
final_model = KNeighborsRegressor(n_neighbors=75)
final_model.fit(train_scale, y_train)
pred_train, pred_test = final_model.predict(train_scale), final_model.predict(test_scale)
for rmse_label, actual, predicted in (
    ('RMSE of Train:', y_train, pred_train),
    ('RMSE of Test:', y_test, pred_test),
):
    print(rmse_label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE of Train: 6.1128872271909325
RMSE of Test: 6.618840106157361


In [86]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

# 10 splits repeated 10 times, seeded so the folds are repeatable.
rk = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)

# Grid search over the search algorithm and every k from 1 to 99, scored with
# the repeated k-fold splitter above.
knncv = KNeighborsRegressor()
param_grid = {
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'n_neighbors': list(range(1, 100)),
}
grid_knn = GridSearchCV(estimator=knncv, param_grid=param_grid, cv=rk)
grid_knn.fit(train_scale, y_train)

GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=10, random_state=0),
             estimator=KNeighborsRegressor(),
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...]})

In [87]:
# Show the parameter combination the regressor grid search selected.
grid_knn.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 2}

In [91]:
# Refit the final regressor with the 'ball_tree' search algorithm and k = 75,
# then report RMSE on both splits.
# NOTE(review): the grid search output shown in this notebook reports
# n_neighbors=2, yet 75 is kept here — confirm this is intentional.
final_model = KNeighborsRegressor(algorithm='ball_tree', n_neighbors=75)
final_model.fit(train_scale, y_train)
pred_train, pred_test = final_model.predict(train_scale), final_model.predict(test_scale)
for rmse_label, actual, predicted in (
    ('RMSE of Train:', y_train, pred_train),
    ('RMSE of Test:', y_test, pred_test),
):
    print(rmse_label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE of Train: 6.1128872271909325
RMSE of Test: 6.618840106157361


In [90]:
# NOTE(review): hard-coded literals that match no RMSE output shown in this notebook —
# presumably train/test RMSE from an earlier run; confirm the source or recompute from variables.
5.6118235673809655-6.255759455767226

-0.6439358883862605

In [92]:
# Train-minus-test RMSE gap for the current `final_model` predictions. Computed
# from the live variables rather than pasted print-outs, so the value cannot go
# stale when the notebook is re-run.
np.sqrt(mean_squared_error(y_train, pred_train)) - np.sqrt(mean_squared_error(y_test, pred_test))

-0.5059528789664283