# Environment

In [None]:
# Toggle: when True the hyperparameter grid search is built with spark_sklearn
# (which needs a SparkContext); when False it uses scikit-learn locally.
use_spark = False
# Number of parallel jobs for the local (non-Spark) grid search; the same value
# is also handed to the final KNeighborsRegressor in the "Testing" section.
n_jobs = 4

# Imports

In [None]:
import numpy as np
# Seed NumPy's global RNG right after import so that any code relying on it
# in later cells produces the same numbers on every run.
np.random.seed(1)

In [None]:
import os
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Make sure the output directory for CSVs and figures exists.
# `exist_ok=True` replaces the separate exists() check, so the cell is safe to
# re-run and is not exposed to a check-then-create race.
os.makedirs('results', exist_ok=True)

# $k$NN 

In [None]:
# Untuned k-nearest-neighbours regressor with default settings; it is the
# template estimator handed to the grid search below.
clf = KNeighborsRegressor()
clf

In [None]:
# Load the engineered training set. The last column is used as the regression
# target; every column before it is used as a feature.
train = pd.read_csv('../feature_engineering/train.gz')
X_train = train.iloc[:, :-1].values
y_train = train.iloc[:, -1].values

In [None]:
# Load the engineered hold-out set with the same layout as the training set:
# features are all columns but the last, the target is the last column.
test = pd.read_csv('../feature_engineering/test.gz')
X_test = test.iloc[:, :-1].values
y_test = test.iloc[:, -1].values

## Grid Search

In [None]:
# 5-fold cross-validation splitter; rows are shuffled before splitting and the
# fixed random_state keeps the fold assignment identical between runs.
cv = KFold(n_splits=5, shuffle=True, random_state=1)
cv

In [None]:
# Hyperparameter grid searched exhaustively:
#   n_neighbors - every 4th value from 1 up to 97
#   metric      - candidate distance functions
#   weights     - equal vote vs. inverse-distance weighting of neighbours
params = dict(
    n_neighbors=np.arange(1, 100, 4),
    metric=["euclidean", "manhattan", "chebyshev", "hamming", "canberra", "braycurtis"],
    weights=["uniform", "distance"],
)

In [None]:
# Arguments common to both GridSearchCV implementations.
kwargs = dict(estimator=clf, param_grid=params, cv=cv)

# Pick the grid-search backend according to the `use_spark` flag.
if not use_spark:
    # Local search with scikit-learn, parallelised over `n_jobs` jobs.
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(n_jobs=n_jobs, **kwargs)
else:
    # Distributed search: spark_sklearn's GridSearchCV takes the SparkContext
    # as its first positional argument.
    # NOTE(review): `sc` is not defined anywhere in this notebook - presumably
    # it is injected by a PySpark kernel; confirm before enabling `use_spark`.
    from spark_sklearn import GridSearchCV
    grid = GridSearchCV(sc, **kwargs)

grid

In [None]:
# Run the full cross-validated search on the training data. The fitted object
# is assigned to `_` only to keep its repr out of the cell output.
_ = grid.fit(X_train, y_train)

In [None]:
# Collect every constructor parameter of the best estimator found by the
# search into a Series, persist it to disk, and display it.
best = pd.Series(grid.best_estimator_.get_params())
best.to_csv('results/knn-params.csv')
best

In [None]:
# Tabulate the cross-validation results for every parameter combination and
# save the table next to the other result files.
# Note: the name `results` is rebound to the train/test MSE summary in the
# final "Results" section of this notebook.
results = grid.cv_results_
df = pd.DataFrame(results)
df.to_csv('results/knn-grid.csv')
df

# Testing

In [None]:
# Rebuild the winning configuration for the final fit, overriding only `n_jobs`.
# `get_params()` already contains an `n_jobs` entry, so the override has to be
# merged into the dict: passing `n_jobs=` as an extra keyword next to the
# unpacked params raises "TypeError: ... got multiple values for keyword
# argument 'n_jobs'".
best_params = {**grid.best_estimator_.get_params(), 'n_jobs': n_jobs}
clf = KNeighborsRegressor(**best_params)
clf

In [None]:
# Fit the tuned regressor on the complete training set (no folds held out).
clf.fit(X_train, y_train)

## Training Accuracy

In [None]:
# In-sample predictions: the model is queried with the same rows it was fit on.
# NOTE(review): every training row is presumably its own nearest neighbour, so
# this error is likely optimistic (close to 0 for k=1 or distance weighting).
train_pred = clf.predict(X_train)

In [None]:
# Mean squared error between the training targets and the in-sample predictions.
train_err = mean_squared_error(y_train, train_pred)
train_err

In [None]:
# Stack targets and predictions as two labelled rows, then transpose so each
# becomes a column ('Truth', 'Predictions') with one row per sample.
train_df = pd.DataFrame([y_train, train_pred], index=['Truth', 'Predictions']).T

In [None]:
# Plot truth vs. prediction over the training rows and export the figure in
# two vector formats.
ax = train_df.plot(figsize=(16, 4))
# Raw string: '\M' is not a valid escape sequence, so the non-raw literal
# triggered an invalid-escape warning. The label text itself is unchanged.
# NOTE(review): with a single '$' this presumably renders literally as
# '$\MWHr'; if '$/MWHr' was intended, confirm and adjust the label.
ax.set_ylabel(r'$\MWHr')
ax.set_xlabel('Time Step')
plt.savefig('results/knn-train.pdf')
plt.savefig('results/knn-train.svg')

## Testing Accuracy

In [None]:
# Out-of-sample predictions on the hold-out features.
test_pred = clf.predict(X_test)

In [None]:
# Mean squared error between the hold-out targets and their predictions.
test_err = mean_squared_error(y_test, test_pred)
test_err

In [None]:
# Stack targets and predictions as two labelled rows, then transpose so each
# becomes a column ('Truth', 'Predictions') with one row per sample.
test_df = pd.DataFrame([y_test, test_pred], index=['Truth', 'Predictions']).T

In [None]:
# Plot truth vs. prediction over the hold-out rows and export the figure in
# two vector formats.
ax = test_df.plot(figsize=(16, 4))
# Raw string: '\M' is not a valid escape sequence, so the non-raw literal
# triggered an invalid-escape warning. The label text itself is unchanged.
# NOTE(review): with a single '$' this presumably renders literally as
# '$\MWHr'; if '$/MWHr' was intended, confirm and adjust the label.
ax.set_ylabel(r'$\MWHr')
ax.set_xlabel('Time Step')
plt.savefig('results/knn-test.pdf')
plt.savefig('results/knn-test.svg')

## Results

In [None]:
# Summarise both mean squared errors in one labelled Series. This rebinds
# `results`, which earlier held the grid search's cv_results_.
results = pd.Series([train_err, test_err], index=['Train', 'Test'])
results

In [None]:
# Persist the train/test MSE summary alongside the other result files.
results.to_csv('results/knn-mse.csv')