In [1]:
# General libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Models libraries
from sklearn import svm
# Models tuning libraries
from sklearn.model_selection import GridSearchCV

## Preliminary work on the dataset

In [2]:
yahoo_df = pd.read_csv("../data/GOLD.csv")
yahoo_df = yahoo_df.set_index("Date")

X = yahoo_df[['ROC_3', 'Momentum', 'RSI_2', 'ATR_14']]

y = yahoo_df["Up down"]

# Split in 80/20 the dataframeX
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.20,
                                                          shuffle=False)

# Split in 75/25 the remaining 80 %
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80,
                                                      test_size = 0.25, shuffle=False)

# Normalize

In [3]:
def shift_norm(df):
    #df = data
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())
    return df

In [4]:
X_train_80 = shift_norm(X_train_80)
X_test = shift_norm(X_test)

In [5]:
X_train = shift_norm(X_train)
X_valid = shift_norm(X_valid)

# Support Vector Machine (SVC)

In [6]:
SVC = svm.SVC(random_state = 42, kernel='linear')

# Fit the model
SVC.fit(X_train, y_train)

# Predict
y_pred_train = SVC.predict(X_train)
y_pred_valid = SVC.predict(X_valid)

# Compute accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

Train Accuracy: 0.60808 - Validation Accuracy: 0.53477


In [7]:
y_pred_test = SVC.predict(X_test)
# Finally evaluate on test
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

# Store the Test Accuracy 
SVC_accuracy_test = test_acc

Test Accuracy: 0.53427


### Tuning parametres with GridSearchCV(VERY SLOW!!)

We make parameters tuning using the GridSearchCV instance. This one allows us to fit on a dataset all the possible combinations of parameter values. Moreover it evaluates and retaines the best combination of parameters.

# Applying Gridsearchcv for parameters tuning
# 'poly' --> molto lento
# Setting C parameters
parameters = [{'C': [0.1, 1.0, 10, 50, 100 ],
              'gamma': [1,0.1,0.001],
                'kernel': ['linear','rbf', 'sigmoid',"poly"]
              }] 
best_SVC = GridSearchCV(estimator = svm.SVC(),refit=True,verbose=2,
                             param_grid = parameters, scoring = 'accuracy')

# Fit the model
best_SVC.fit(X_train, y_train)

print(best_SVC.best_estimator_)

In [8]:
SVC = svm.SVC(C=10, gamma=1, kernel='linear',probability=True)

# Fit the model
SVC.fit(X_train_80, y_train_80)

# Predict
y_pred_train = SVC.predict(X_train)
y_pred_valid = SVC.predict(X_valid)

# Compute accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

Train Accuracy: 0.60808 - Validation Accuracy: 0.53477


In [9]:
y_pred_test = SVC.predict(X_test)
y_prob = SVC.predict_proba(X_test)[:,1]

# Finally evaluate on test
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

# Store the Test Accuracy 
SVC_accuracy_test = test_acc

Test Accuracy: 0.53427


# Predict on all data 

In [19]:
df_prediction = pd.read_csv("../data/model_accuracy_gold.csv", index_col = False)
df_prediction =  df_prediction.drop(columns=["Unnamed: 0"])

df_prediction

Unnamed: 0,LogReg_pred,LogReg_accuracy,LogReg_prob,RNN_pred,RNN_accuracy,RNN_prob,LSTM_price_accuracy,RF_pred,RF_accuracy,RF_prob,SVC_pred,SVC_accuracy,SVC_prob
0,0,0.534272,0.435015,0,0.539906,0.440501,0.829887,0,0.537089,0.348535,0,0.534272,0.402208
1,0,0.534272,0.438317,0,0.539906,0.438469,0.829887,0,0.537089,0.321573,0,0.534272,0.425238
2,0,0.534272,0.441080,0,0.539906,0.438368,0.829887,1,0.537089,0.586142,0,0.534272,0.411844
3,0,0.534272,0.435446,0,0.539906,0.437083,0.829887,0,0.537089,0.413046,0,0.534272,0.406224
4,0,0.534272,0.430490,0,0.539906,0.403022,0.829887,0,0.537089,0.437003,0,0.534272,0.391746
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060,1,0.534272,0.528832,0,0.539906,0.486045,0.829887,0,0.537089,0.337631,0,0.534272,0.384227
1061,1,0.534272,0.523879,1,0.539906,0.510491,0.829887,0,0.537089,0.305151,0,0.534272,0.392646
1062,1,0.534272,0.521895,0,0.539906,0.478003,0.829887,0,0.537089,0.498127,0,0.534272,0.408522
1063,1,0.534272,0.517769,0,0.539906,0.462106,0.829887,1,0.537089,0.564314,0,0.534272,0.426515


In [16]:
df_prediction["SVC_pred"] = y_pred_test
df_prediction["SVC_accuracy"] = [SVC_accuracy_test for x in range(len(y_pred_test))]
df_prediction["SVC_prob"] = y_prob


In [17]:
df_prediction

Unnamed: 0,LogReg_pred,LogReg_accuracy,LogReg_prob,RNN_pred,RNN_accuracy,RNN_prob,LSTM_price_accuracy,RF_pred,RF_accuracy,RF_prob,SVC_pred,SVC_accuracy,SVC_prob
0,0,0.534272,0.435015,0,0.539906,0.440501,0.829887,0,0.537089,0.348535,0,0.534272,0.402208
1,0,0.534272,0.438317,0,0.539906,0.438469,0.829887,0,0.537089,0.321573,0,0.534272,0.425238
2,0,0.534272,0.441080,0,0.539906,0.438368,0.829887,1,0.537089,0.586142,0,0.534272,0.411844
3,0,0.534272,0.435446,0,0.539906,0.437083,0.829887,0,0.537089,0.413046,0,0.534272,0.406224
4,0,0.534272,0.430490,0,0.539906,0.403022,0.829887,0,0.537089,0.437003,0,0.534272,0.391746
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060,1,0.534272,0.528832,0,0.539906,0.486045,0.829887,0,0.537089,0.337631,0,0.534272,0.384227
1061,1,0.534272,0.523879,1,0.539906,0.510491,0.829887,0,0.537089,0.305151,0,0.534272,0.392646
1062,1,0.534272,0.521895,0,0.539906,0.478003,0.829887,0,0.537089,0.498127,0,0.534272,0.408522
1063,1,0.534272,0.517769,0,0.539906,0.462106,0.829887,1,0.537089,0.564314,0,0.534272,0.426515


In [18]:
df_prediction.to_csv("../data/model_accuracy_gold.csv")

In [14]:
# Plot the confusion matrix
#conf_stat = confusion_matrix(y_test, y_pred_test)
#ut.plot_confusion_matrix(conf_stat, Linearmodel_nt.classes_)