In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.preprocessing import scale
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn import model_selection


In [18]:
# Load the Hitters data and keep only the rows without missing values.
hit = pd.read_csv('Hitters.csv')
df = hit.dropna()

# Categorical columns are one-hot encoded; 'Salary' is the regression target.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])
y = df['Salary']

# Remaining predictors cast to float, plus one indicator column per categorical
# variable (the complementary dummy of each pair is left out).
X_ = df.drop(columns=categorical_cols + ['Salary']).astype('float')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# 75/25 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [19]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: a KNN regressor constructed without arguments, fitted on the training split.
baseline_knn = KNeighborsRegressor()
knn_model = baseline_knn.fit(X_train, y_train)

In [20]:
# Show the neighbor count used by the model that was constructed without arguments.
knn_model.n_neighbors

5

In [21]:
# Display the fitted model's predictions for the held-out test rows.
knn_model.predict(X_test)

array([ 510.3334,  808.3334,  772.5   ,  125.5   , 1005.    ,  325.5   ,
        216.5   ,  101.5   ,  982.    ,  886.6666,  590.    ,  901.6666,
        831.6666,  157.5   ,  393.    , 1005.    ,  735.5   ,   97.    ,
        884.4   ,  302.    ,  450.    ,  817.6666,  832.6666,  392.3334,
        528.    ,   81.6   ,  735.    ,  470.    ,  722.5   ,  101.    ,
         90.5   ,   74.6   ,  748.3334,  217.    ,  280.5334, 1044.5   ,
        955.    ,  232.    ,   78.6   ,  529.    ,   77.6   ,  106.5   ,
        516.6666,  593.6666, 1005.    ,  649.1666,  715.    ,  101.5   ,
        134.5   ,  810.    ,  743.    ,  521.3334,  664.3334,  195.    ,
        102.4   ,  728.5   ,  488.    ,  962.5   ,  230.8334, 1040.    ,
        885.    ,  542.    ,  720.4   ,  571.    ,  735.    ,   81.6   ])

In [22]:
# Keep the test-set predictions so the error metric can be computed from them.
y_pred = knn_model.predict(X_test)

In [23]:
# Test-set RMSE of the baseline model: square root of the mean squared error.
baseline_test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(baseline_test_mse)

426.6570764525201

In [24]:
# Training-set RMSE for k = 1..10.
# These are in-sample errors: the model is scored on the same rows it was fitted on
# (k = 1 reports 0.0 because each training row is its own nearest neighbor), so they
# show how the fit loosens as k grows but are not an estimate of test error.
RMSE = []

for k in range(1, 11):
    # Loop-local names on purpose: rebinding `knn_model` / `y_pred` here would replace
    # the default-k model and the test-set predictions created in the cells above,
    # breaking any re-run of the test RMSE cell (train-length vs. test-length arrays).
    model_k = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    train_pred = model_k.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    RMSE.append(rmse)
    print('RMSE for k =', k, 'is', rmse)

RMSE for k = 1 is 0.0
RMSE for k = 2 is 179.52761335480352
RMSE for k = 3 is 205.20157172291863
RMSE for k = 4 is 220.5139794876305
RMSE for k = 5 is 239.6467132541376
RMSE for k = 6 is 243.5904190007242
RMSE for k = 7 is 258.1478781634636
RMSE for k = 8 is 266.05374203349805
RMSE for k = 9 is 269.73782093553376
RMSE for k = 10 is 271.2798300436963


<h4>TUNING</h4>

In [16]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Search grid: every integer neighbor count from 1 through 29.
knn_params = {'n_neighbors': np.arange(1, 30)}

In [26]:
# Unfitted estimator that the grid search receives as its base model.
knn = KNeighborsRegressor()

In [27]:
# Exhaustive search over knn_params, each candidate evaluated with 10-fold CV.
knn_cv_model = GridSearchCV(
    estimator=knn,
    param_grid=knn_params,
    cv=10,
)

In [28]:
# Run the cross-validated search on the training split only; the test split stays unseen.
knn_cv_model.fit(X_train, y_train)

In [30]:
# The search was created with the default refit=True, so best_estimator_ is already a
# KNeighborsRegressor carrying the selected n_neighbors and refitted on all of
# X_train / y_train. Reuse it instead of fitting an identical model a second time.
knn_tuned = knn_cv_model.best_estimator_

In [31]:
# Test-set RMSE of the tuned model, computed the same way as for the baseline.
tuned_test_pred = knn_tuned.predict(X_test)
tuned_test_mse = mean_squared_error(y_test, tuned_test_pred)
np.sqrt(tuned_test_mse)

413.7094731463598