In [1]:
# General libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier 

## Preliminary work on the dataset

In [2]:
yahoo_df = pd.read_csv("../data/OIL.csv")
yahoo_df = yahoo_df.set_index("Date")


X = yahoo_df[['ROC_1', 'ROC_4', 'ROC_5', 'ROC_30', 'Return', '2 Day ROI', '3 Day ROI',
       '4 Day ROI', '30 Day ROI', 'RSI_1', 'RSI_2', 'RSI_3', 'RSI_4',
       'MACDs_12_26_9', 'SRSI_14', 'SRSI_30', 'CCI', 'Previous_differenced',
       'principalDf']]
y = yahoo_df["Up down"]

# Split in 80/20 the dataframeX
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.20,
                                                          shuffle=False)

# Split in 75/25 the remaining 80 %
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80,
                                                      test_size = 0.25, shuffle=False)


# Normalize

In [3]:
def shift_norm(df):
    #df = data
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())
    return df

In [4]:
X_train_80 = shift_norm(X_train_80)
X_test = shift_norm(X_test)

In [5]:
X_train = shift_norm(X_train)
X_valid = shift_norm(X_valid)

# kNN

In [6]:
# Train and fit
kNN = KNeighborsClassifier()
kNN.fit(X_train, y_train)

# Predict
y_pred_train = kNN.predict(X_train)
y_pred_valid = kNN.predict(X_valid)

# Compute Accuracy
train_acc = accuracy_score(y_train,y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)

print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

Train Accuracy: 0.68830 - Validation Accuracy: 0.50000


### Tuning parameters

In [7]:
knn = KNeighborsClassifier()

k_range = list(range(1,31))
weight_options = ["uniform", "distance"]
param_grid = dict(n_neighbors = k_range, weights = weight_options)
  
# defining parameter range
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
  
# fitting the model for grid search
grid_search=grid.fit(X_train, y_train)

In [8]:
print(grid_search.best_estimator_)

KNeighborsClassifier()


In [9]:
# Train and fit
kNN = KNeighborsClassifier(n_neighbors=2)
kNN.fit(X_train_80, y_train_80)

# Predict
y_pred_test = kNN.predict(X_test)
y_prob = kNN.predict_proba(X_test)[:,1]

# Finally evaluate on test
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

# Store the Test Accuracy 
kNN_accuracy_test = test_acc

Test Accuracy: 0.48808


# Save data prediction

In [10]:
df_prediction = pd.read_csv("../data/model_accuracy_oil.csv", index_col = False)
df_prediction["y"] = y_test.values
df_prediction["kNN_pred"] = y_pred_test
df_prediction["kNN_accuracy"] = [kNN_accuracy_test for x in range(len(y_pred_test))]
df_prediction["kNN_prob"] = y_prob

In [11]:
df_prediction = df_prediction.drop(columns=["Unnamed: 0"])
df_prediction

Unnamed: 0,ANN_pred,ANN_accuracy,ANN_prob,y,kNN_pred,kNN_accuracy,kNN_prob
0,0,0.469018,0.404047,0,1,0.488084,1.0
1,0,0.469018,0.342436,1,0,0.488084,0.0
2,0,0.469018,0.339922,1,0,0.488084,0.0
3,0,0.469018,0.404326,0,1,0.488084,1.0
4,0,0.469018,0.361674,0,1,0.488084,1.0
...,...,...,...,...,...,...,...
1044,0,0.469018,0.448136,0,0,0.488084,0.5
1045,0,0.469018,0.448477,1,0,0.488084,0.5
1046,0,0.469018,0.438599,0,0,0.488084,0.5
1047,0,0.469018,0.417918,1,0,0.488084,0.0


In [12]:
df_prediction.to_csv("../data/model_accuracy_oil.csv")