In [68]:
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from math import sqrt
import matplotlib.pylab as plt

In [69]:
bosknn = pd.read_csv('BostonHousing.csv')
bosknn['Number'] = bosknn.index + 1

In [70]:
bosknn.drop(['CAT. MEDV'], axis = 1, inplace = True)

In [71]:
bosknn.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,Number
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,1
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,2
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,3
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,5


In [72]:
bosknn.shape


(506, 14)

In [73]:
trainData, validData = train_test_split(bosknn, test_size = 0.4, random_state = 26)
print(trainData.shape, validData.shape)

(303, 14) (203, 14)


In [74]:
scaler = preprocessing.StandardScaler()
scaler.fit(trainData[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']])
           
#Transform the full dataset
bosknn = pd.concat([pd.DataFrame(scaler.transform(bosknn[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']]),
                                 columns = ['zCRIM', 'zZN', 'zINDUS', 'zCHAS', 'zNOX', 'zRM', 'zAGE', 'zDIS', 'zRAD', 'zTAX', 'zPTRATIO', 'zLSTAT']),
                                 bosknn[['MEDV', 'Number']]], axis = 1)
trainNorm = bosknn.iloc[trainData.index]
validNorm = bosknn.iloc[validData.index]

In [75]:
train_X = trainNorm[['zCRIM', 'zZN', 'zINDUS', 'zCHAS', 'zNOX', 'zRM', 'zAGE', 'zDIS', 'zRAD', 'zTAX', 'zPTRATIO', 'zLSTAT']]
train_y = trainNorm['MEDV']
valid_X = validNorm[['zCRIM', 'zZN', 'zINDUS', 'zCHAS', 'zNOX', 'zRM', 'zAGE', 'zDIS', 'zRAD', 'zTAX', 'zPTRATIO', 'zLSTAT']]
valid_y = validNorm['MEDV']    

In [76]:
rmse_val = [] #to store rmse values for different k
for K in range(5):
    K = K+1
    model = KNeighborsRegressor(n_neighbors = K)

    model.fit(train_X, train_y)  #fit the model
    pred=model.predict(valid_X) #make prediction on test set
    error = sqrt(mean_squared_error(valid_y,pred)) #calculate rmse
    rmse_val.append(error) #store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 5.27314508745023
RMSE value for k=  2 is: 4.668888750815089
RMSE value for k=  3 is: 4.7489082085789445
RMSE value for k=  4 is: 5.017560296603246
RMSE value for k=  5 is: 5.110195353577408


The best value for k is when k = 2

The RMSE value decreases as we increase the k value from 1 to 2. At k= 2, the RMSE is approximately 4.67, and shoots up on further increasing the k value. We can safely say that k=2 will give us the best result in this case.

In [77]:
newHouse = pd.DataFrame({'CRIM':0.2, 'ZN':0, 'INDUS':7, 'CHAS':0, 'NOX':0.538, 'RM':6, 'AGE':62, 'DIS':4.7, 'RAD':4, 'TAX':307, 'PTRATIO':21, 'LSTAT':10}, index = [0])

In [78]:
newHouseNorm = pd.DataFrame(scaler.transform(newHouse), columns = ['zCRIM', 'zZN', 'zINDUS', 'zCHAS', 'zNOX', 'zRM', 'zAGE', 'zDIS', 'zRAD', 'zTAX', 'zPTRATIO', 'zLSTAT'])

In [79]:
knn = KNeighborsRegressor(n_neighbors = 2)
predNH = model.predict(newHouseNorm)
print(predNH)

[19.64]


In [81]:
rmse_val = [] #to store rmse values for different k
for K in range(5):
    K = K+1
    model = KNeighborsRegressor(n_neighbors = K)

    model.fit(train_X, train_y)  #fit the model
    pred=model.predict(train_X) #make prediction on test set
    error = sqrt(mean_squared_error(train_y,pred)) #calculate rmse
    rmse_val.append(error) #store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 0.0
RMSE value for k=  2 is: 2.401001922328073
RMSE value for k=  3 is: 3.1611943916287135
RMSE value for k=  4 is: 3.718019851636314
RMSE value for k=  5 is: 4.159139127731649


The validation data error is overly optimistic compared to the error rate of the training data because it is less prone to overfitting.