In [1]:
import numpy as np
import pandas as pd

In [2]:
# Boston Housing data, read from a CSV hosted on GitHub (requires network access).
DATA_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
# Features are every column except the last; the target is the last column (medv).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [6]:

from sklearn.preprocessing import StandardScaler

# Standardize the feature columns so they share a common scale before the
# distance-based KNN model sees them.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the held-out folds contribute to the scaling statistics (data leakage).
# Consider wrapping StandardScaler and the regressor in a Pipeline instead.
scaler = StandardScaler()
scaler.fit(X)
x = scaler.transform(X)

In [7]:
# Inspect the standardized feature matrix (a NumPy array, shown truncated).
x

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

# Without hyperparameter tuning

In [9]:
from sklearn.model_selection import cross_val_score,KFold
from sklearn.neighbors import KNeighborsRegressor

In [10]:
# Baseline model: KNN regression with fixed, untuned settings
# (5 neighbours, uniform weights, Minkowski distance with p=2).
knn = KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    metric='minkowski',
    p=2,
)

# Shuffled 5-fold splitter with a fixed seed, so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# One R² value per fold.
scores = cross_val_score(estimator=knn, X=x, y=y, scoring='r2', cv=kfold)

In [11]:
# Average R² across the folds: the untuned baseline score.
scores.mean()

np.float64(0.7312777271205292)

# With hyperparameter tuning

### 1. GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

# Estimator to tune; the search supplies the hyperparameter values.
knn = KNeighborsRegressor()

# Candidate values per hyperparameter; the grid is their full cross product.
para_grid = dict(
    n_neighbors=[1, 3, 5, 7, 10, 12, 15, 17, 20],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

In [14]:
# Exhaustive search over para_grid, scored by R² on the kfold splitter.
# refit=True retrains the best configuration on the full data afterwards.
grid_cv = GridSearchCV(
    estimator=knn,
    param_grid=para_grid,
    scoring='r2',
    cv=kfold,
    refit=True,
    verbose=2,
)

In [15]:
# Run the grid search on the standardized features; verbose=2 logs every fit.
grid_cv.fit(x,y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=2, weights=uniform; total

In [17]:
# Best mean cross-validated R² found by the grid search.
grid_cv.best_score_

np.float64(0.8107708570758332)

In [20]:
# Per-candidate search results. Despite the df_ prefix this is not yet a
# DataFrame; it is wrapped in pd.DataFrame where it is displayed.
df_grid = grid_cv.cv_results_

In [21]:
# Rank every grid candidate by its mean cross-validated R², best first.
result_columns = [
    'param_algorithm',
    'param_n_neighbors',
    'param_p',
    'param_weights',
    'mean_test_score',
]
pd.DataFrame(df_grid)[result_columns].sort_values('mean_test_score', ascending=False)

Unnamed: 0,param_algorithm,param_n_neighbors,param_p,param_weights,mean_test_score
5,ball_tree,3,1,distance,0.810771
41,kd_tree,3,1,distance,0.810771
77,brute,3,1,distance,0.810771
9,ball_tree,5,1,distance,0.806288
45,kd_tree,5,1,distance,0.806288
...,...,...,...,...,...
28,ball_tree,17,1,uniform,0.695784
64,kd_tree,17,1,uniform,0.695784
32,ball_tree,20,1,uniform,0.679702
68,kd_tree,20,1,uniform,0.679702


### 2. RandomizedSearchCV

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search (same values as the grid search);
# only a sample of these combinations is evaluated.
para_grid = {
    'n_neighbors': [1, 3, 5, 7, 10, 12, 15, 17, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

# Fresh estimator for the search to configure.
knn = KNeighborsRegressor()

In [22]:
# Randomized search: evaluate n_iter=10 combinations sampled from para_grid
# (108 possible combinations), scored by R² on the same kfold splitter.
# random_state pins which candidates are drawn; without it every run samples
# a different subset, so best_score_ and best_params_ are not reproducible.
random_cv = RandomizedSearchCV(
    estimator=knn,
    param_distributions=para_grid,
    n_iter=10,
    scoring='r2',
    cv=kfold,
    refit=True,
    verbose=2,
    random_state=1,
)


In [23]:
# Run the randomized search on the standardized features and target.
random_cv.fit(x,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END algorithm=brute, n_neighbors=5, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=5, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=5, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=5, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=5, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=12, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=10, p=1, weights=uniform; total time=   0.0s
[CV]

In [24]:
# Best mean cross-validated R² among the sampled candidates.
random_cv.best_score_

np.float64(0.8032121892231553)

In [25]:
# Hyperparameter combination that achieved best_score_.
random_cv.best_params_

{'weights': 'distance', 'p': 2, 'n_neighbors': 3, 'algorithm': 'kd_tree'}