## Notebook that will evaluate the KNN model

In [1]:
#importing libraires to use
import sklearn.preprocessing as skl_pre
import sklearn.neighbors as skl_nb
import sklearn.model_selection as skl_ms
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import copy

#need to get the processing python file in another directory:
#process_data is imported from the parent directory, so that directory has to be on sys.path first
import sys
import os
#'..' is resolved by os.path.abspath against the current working directory,
#so this assumes the kernel was started in this notebook's folder -- TODO confirm
module_path = os.path.abspath(os.path.join('..'))
#guard so that re-running this cell does not append the same path a second time
if module_path not in sys.path:
    sys.path.append(module_path)
from process_data import process_data

In [2]:
#getting the percentage splits of our training and testing data
#NOTE: the settings below are commented out and are NOT passed to process_data();
#the call further down takes no arguments, so process_data() uses its own defaults
# split_prec = {
#     'train': 0.8,
#     'test': 0.2,
# }
#whatever scaler we will use
#scaler = skl_pre.StandardScaler()

#get the training and testing data
#NOTE(review): no random seed is visible in this cell -- confirm process_data() makes the split reproducible
X_train, X_test, Y_train, Y_test = process_data()
#quick look at the training features and labels (the test set is deliberately not printed)
print(X_train)
# print(X_test)
print(Y_train)
# print(Y_test)

Dropped columns: ['snow']
New columns: ['good_weather' 'is_day' 'temp_fahrenheit']
Split: "train" 	[Size: 1280] 	[Prec: 0.8]
	X: (1280, 17)
	Y: (1280,)
Split: "test" 	[Size: 320] 	[Prec: 0.2]
	X: (320, 17)
	Y: (320,)
      hour_of_day  day_of_week     month  holiday  weekday  summertime  \
0        0.652174     0.166667  0.272727      0.0      1.0         1.0   
1        0.000000     0.000000  0.545455      0.0      1.0         1.0   
2        0.826087     0.333333  0.818182      0.0      1.0         1.0   
3        0.000000     1.000000  0.454545      0.0      0.0         1.0   
4        0.869565     0.166667  0.727273      0.0      1.0         1.0   
...           ...          ...       ...      ...      ...         ...   
1275     1.000000     0.833333  0.636364      0.0      0.0         1.0   
1276     0.000000     0.833333  1.000000      0.0      0.0         0.0   
1277     0.304348     0.333333  0.727273      0.0      1.0         1.0   
1278     0.478261     0.500000  0.181818   

### Training: we will only be working with X_train from here on down
Now that we have our training and test data, we will run a KNN model on the training data with k-fold cross validation for hyperparameter tuning (what is a good value of k, which distance measure to use, etc.). Once we have a decent value of k we will "retrain" the model on the entire training data set and evaluate that model on the test data, which has never been seen; the error from that evaluation is our estimate of $E_{new}$.

In [3]:
#returns a trained KNN classifier that uses cross validation and grid search to find the best hyperparameters
def get_fitted_KNN(X, Y, param_grid=None, cv=25):
    """Grid-search a KNeighborsClassifier with cross validation and return the fitted search.

    Parameters
    ----------
    X : feature matrix handed to GridSearchCV.fit.
    Y : target labels handed to GridSearchCV.fit.
    param_grid : dict mapping hyperparameter names to candidate values.
        When None, the grid is n_neighbors in 1..20 and p in {1, 2, 3}.
    cv : cross-validation setting forwarded unchanged to GridSearchCV (default: 25 folds).

    Returns
    -------
    The fitted GridSearchCV object; callers read best_params_ and best_score_ from it.
    """
    #build the default grid per call so that no mutable default is shared between calls
    if param_grid is None:
        param_grid = {"n_neighbors": list(range(1, 21)), "p": [1, 2, 3]}
    #running grid search over a KNN model with cross validation
    classifier = skl_ms.GridSearchCV(skl_nb.KNeighborsClassifier(), param_grid, cv=cv)
    #fit every hyperparameter combination and keep the best one
    classifier.fit(X, Y)
    return classifier


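Below we can see the estimate of $E_{new}$ obtained from the training data only, using cross validation and grid search in one step (the printed score is the mean cross-validated accuracy, so the estimated error is 1 minus that score).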

In [4]:
#flatten the training labels into a 1-D numpy array before handing them to the grid search
y_train_flat = np.array(Y_train.values.tolist()).reshape(-1)
#grid search with cross validation over all feature columns
classifier = get_fitted_KNN(X_train, y_train_flat)
#winning hyperparameters first, then their mean cross-validated score
print(classifier.best_params_, classifier.best_score_, sep="\n")

{'n_neighbors': 9, 'p': 1}
0.8639517345399698


Now that we have a way to hyperparameter tune and cross validate at the same time we will execute a semi-gridsearch method to determine which features we should use.  

Since the grid search in sklearn uses cross validation, we can treat the score it returns as an estimate of $E_{new}$ for the "training" data (even though it is not a valid estimate of the true $E_{new}$, because the same score is also used to pick the hyperparameters and the features). This is the score we want to increase because we are focusing on accuracy.  

So we will start with all the features and remove each one individually and see if the score increases. The feature removal that increases the score the most will be used and we will continue to do this until we plateau. Then that final model will be the one we use on the true test set to get our approximation of $E_{new}$

In [None]:
#this function will work through the training data and try to find a good column selection
def find_good_column_selection(df=None, num_folds=10, verbose=True):
    """Greedy backward elimination of feature columns, scored by get_fitted_KNN.

    Starting from every column except the target "increase_stock", each pass fits one
    grid-searched KNN per candidate single-column removal and keeps the removal with the
    highest cross-validated score. Passes repeat until no removal beats the best score
    found so far, or until only one feature column is left.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus the "increase_stock" column.
        When None, X_train.join(Y_train) is built from the notebook globals at call time.
    num_folds : currently unused -- get_fitted_KNN decides its own number of folds.
        Kept so that existing calls keep working.
    verbose : when True (default) print the progress messages for every candidate.

    Returns
    -------
    tuple of (best column list, best hyperparameter dict, best cross-validated score).

    Raises
    ------
    ValueError if df has no "increase_stock" column.
    """
    #build the default frame when the function is CALLED, not when the def cell is executed,
    #otherwise re-running the data cell would leave this function working on stale data
    if df is None:
        df = X_train.join(Y_train)
    ycols = "increase_stock"
    #get the columns for column selection
    xcols = df.columns.tolist()
    if ycols not in xcols:
        raise ValueError('df must contain the target column "increase_stock"')
    xcols.remove(ycols)
    #Need to get a baseline classifier with all the columns
    baseline = get_fitted_KNN(df[xcols], df[ycols])
    #these are the results we keep track of; the baseline is fitted to all the columns
    best_score = baseline.best_score_
    best_column_selection = xcols
    best_hyperparameters = baseline.best_params_
    #this variable will tell us when to stop
    score_increases = True
    #also stop at a single column: removing it would leave nothing to fit on
    while score_increases and len(xcols) > 1:
        #we are slowly going to remove columns to try and increase the score
        score_increases = False
        #loop through all the columns, removing one at a time
        for col in xcols:
            testing_cols = xcols.copy()
            testing_cols.remove(col)
            if verbose:
                print("Trying a new column selection")
                print("Xcols",xcols)
                print("testingcols",testing_cols)
            #get our new fitted model with only one column removed
            model = get_fitted_KNN(df[testing_cols], df[ycols])
            score = model.best_score_
            #best_score is updated inside the pass, so only the best removal of the pass survives
            if score > best_score:
                if verbose:
                    print("Found a better column selection")
                best_score = score
                best_column_selection = testing_cols
                best_hyperparameters = model.best_params_
                score_increases = True
        #the next pass starts from the best selection found so far
        xcols = best_column_selection
    return (best_column_selection,best_hyperparameters,best_score)
results = find_good_column_selection()

Trying a new column selection
Xcols ['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday', 'summertime', 'temp', 'dew', 'humidity', 'precip', 'snowdepth', 'windspeed', 'cloudcover', 'visibility', 'temp_fahrenheit', 'good_weather', 'is_day']
testingcols ['day_of_week', 'month', 'holiday', 'weekday', 'summertime', 'temp', 'dew', 'humidity', 'precip', 'snowdepth', 'windspeed', 'cloudcover', 'visibility', 'temp_fahrenheit', 'good_weather', 'is_day']
Trying a new column selection
Xcols ['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday', 'summertime', 'temp', 'dew', 'humidity', 'precip', 'snowdepth', 'windspeed', 'cloudcover', 'visibility', 'temp_fahrenheit', 'good_weather', 'is_day']
testingcols ['hour_of_day', 'month', 'holiday', 'weekday', 'summertime', 'temp', 'dew', 'humidity', 'precip', 'snowdepth', 'windspeed', 'cloudcover', 'visibility', 'temp_fahrenheit', 'good_weather', 'is_day']
Found a better column selection
Trying a new column selection
Xcols ['hour_of_day', 'd

In [None]:
#the best parameters for knn, as returned by find_good_column_selection:
#a tuple of (selected feature columns, best hyperparameters, best cross-validated score)
results

(['hour_of_day',
  'day_of_week',
  'holiday',
  'weekday',
  'temp',
  'dew',
  'humidity',
  'precip',
  'snowdepth',
  'windspeed',
  'cloudcover',
  'visibility',
  'temp_fahrenheit',
  'good_weather',
  'is_day'],
 {'n_neighbors': 15, 'p': 1},
 0.8796983408748115)

In [None]:
#show the training features
#NOTE(review): the saved output of this cell has negative, standardized-looking values while the
#saved output of the data-loading cell has values between 0 and 1 -- the saved outputs may come
#from different runs; confirm with Restart Kernel -> Run All
X_train

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,snowdepth,windspeed,cloudcover,visibility,temp_fahrenheit,good_weather,is_day
0,0.558988,-1.017343,-0.709870,-0.184189,0.643712,0.751527,1.073206,0.506614,-0.935864,-0.130133,-0.105593,0.664882,-1.233517,0.283313,1.068775,0.782355,1.267645
1,-1.625091,-1.516270,0.152211,-0.184189,0.643712,0.751527,1.277959,1.445451,0.574511,-0.130133,-0.105593,-0.392781,-1.233517,0.283313,1.248354,1.310216,-0.788864
2,1.141408,-0.518416,1.014292,-0.184189,0.643712,0.751527,0.631369,1.015983,0.966110,-0.130133,-0.105593,-0.724097,0.506560,0.283313,0.649758,0.935465,-0.788864
3,-1.625091,1.477292,-0.135149,-0.184189,-1.553491,0.751527,1.040876,1.335587,0.820569,-0.000790,-0.105593,-0.086951,0.956106,-0.857198,1.008915,1.190262,-0.788864
4,1.287014,-1.017343,0.726932,-0.184189,0.643712,0.751527,0.340403,-0.332346,-1.268303,-0.130133,-0.105593,0.014993,-1.123424,0.283313,0.350459,0.183458,-0.788864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1275,1.723829,0.978365,0.439572,-0.184189,-1.553491,0.751527,0.685251,1.075909,0.980768,-0.130133,-0.105593,0.282594,0.506560,0.283313,0.709617,0.965596,-0.788864
1276,-1.625091,0.978365,1.589013,-0.184189,-1.553491,-1.330625,-1.243744,-0.781789,0.644141,-0.130133,-0.105593,-1.654331,-1.979701,0.283313,-1.265751,-1.155562,-0.788864
1277,-0.605854,-0.518416,0.726932,-0.184189,0.643712,0.751527,0.825346,1.355562,1.398543,-0.130133,-0.105593,-0.507467,-0.178461,0.283313,0.829337,1.149995,-0.788864
1278,-0.023433,-0.019489,-0.997230,-0.184189,0.643712,0.751527,-0.554047,0.117097,1.498536,-0.073222,-0.105593,-0.239866,1.078431,0.156589,-0.547435,0.175148,1.267645


In [None]:
#training knn model with the best parameters
#the hyperparameters are read from results[1] instead of being hard-coded, so this cell
#stays in sync if the column-selection search is re-run and picks different values
best_knn = skl_nb.KNeighborsClassifier(**results[1]).fit(
    X_train[results[0]],
    np.array(Y_train.values.tolist()).reshape(-1,),
)
#score on the same data the model was fitted on, i.e. the training score
best_knn.score(X_train[results[0]],Y_train)

0.88984375

In [None]:
#score of best_knn on the held-out test set, using the selected columns.
#This is the basis for the E_new estimate (not E_train): score() returns the mean accuracy,
#so the estimated error is 1 minus the value shown
best_knn.score(X_test[results[0]],Y_test)

0.878125

In [None]:
#baseline for comparison: the KNN tuned on ALL feature columns.
#`classifier` is the grid search fitted earlier on the full X_train; with GridSearchCV's default
#refit=True its best_estimator_ has already been refitted on the whole training set.
#NOTE(review): the original line was an incomplete assignment (a SyntaxError); this completion
#is inferred from the variable name -- confirm it matches the intended comparison
knn_with_all_columns = classifier.best_estimator_
#test-set score of the all-columns baseline
knn_with_all_columns.score(X_test, Y_test)

In [None]:
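#unnecessary code: manual k-fold loop kept commented out for reference only;
#get_fitted_KNN already cross validates inside GridSearchCV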

#this will run k_folds cross validation with the passed hyperparameters
#returns the approximation of the accuracy of the model
# def run_k_folds(df = X_train.join(Y_train), folds = 5):
#     #setting up our kfolds cross validation
#     kf = skl_ms.KFold(folds, shuffle= True)
#     #getting the column list so we can separate into X and Y
#     cols = df.columns.tolist()
#     ycol = "increase_stock"
#     #only the x columns
#     cols.remove(ycol)
#     #will be the mean accuracy of the model. This is what we return
#     score = 0
#     parameters = []
#     # print(df)
#     # print(cols)
#     #loop through all the training and test data
#     for i, (train_index, test_index) in enumerate(kf.split(df)):
#         #get the specific training data
#         train = df.loc[train_index]
#         # print(train)
#         #get the specific testing data
#         test = df.loc[test_index]
#         #make a KNN classifier and fit it 
#         classifier = get_fitted_KNN(X = train[cols], Y=train[ycol])
#         #get the score using the k folds test data set
#         print(classifier.best_params_)
#         parameters.append(classifier.best_params_)
#         score += classifier.score(test[cols], test[ycol])
#     print(parameters)
#     print(Counter(parameters))
#     return score/folds
