In [1]:
# General libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Import Logistic Regression function
from sklearn.linear_model import LogisticRegression


# To Do

## Preliminary work on the dataset

'ROC_1', 'ROC_4', 'ROC_7', 'Momentum', '1 Day ROI', '3 Day ROI',
       '5 Day ROI', '20 Day ROI', '6_day_RSI', 'MACD_12_26', 'SRSI_30',
       'Williams_1', 'Williams_3', 'Williams_14', 'ATR_14', 'CCI'

In [2]:
yahoo_df = pd.read_csv("../data/NSQ.csv")
yahoo_df = yahoo_df.set_index("Date")

X = yahoo_df[['ROC_30', '4 Day ROI', 'EMA_12', 'MACD_12_26_9', 'SRSI_30',
       'Williams_14', 'ATR_14', 'Previous_differenced']]
y = yahoo_df["Up down"]

# Split in 80/20 the dataframeX
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.20,
                                                          shuffle=False)

# Split in 75/25 the remaining 80 %
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80,
                                                      test_size = 0.25, shuffle=False)


# Normalize

In [3]:
def shift_norm(df):
    #df = data
    for column in df:
        x = 0
        #df[column]=((df[column]-df[column].mean())/df[column].std())
    return df

In [4]:
X_train_80 = shift_norm(X_train_80)
X_test = shift_norm(X_test)

In [5]:
X_train = shift_norm(X_train)
X_valid = shift_norm(X_valid)

# Logistic Regression

<b>Definition:</b> Stochastic gradient descent is a simple and very efficient approach to fit linear models. It is particularly useful when the number of samples is very large.

<strong>Advantages:</strong> Efficiency and ease of implementation.

<strong>Disadvantages:</strong> It equires a number of hyper-parameters and it is sensitive to feature scaling.

In [6]:
# Fit the model
lr = LogisticRegression(random_state = 42)
lr.fit(X_train, y_train)

# Predict
y_pred_train = lr.predict(X_train)
y_pred_valid = lr.predict(X_valid)

# Compute Accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

Train Accuracy: 0.54474 - Validation Accuracy: 0.55860


# Tuning

In [7]:
# Create param grid.
param_grid = dict()
param_grid['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
param_grid['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
param_grid['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]

# Create grid search object
lr_tuned = GridSearchCV(estimator = LogisticRegression(), param_grid = param_grid)

# Fit the model
lr_tuned.fit(X_train, y_train)

# Predict
y_pred_train = lr_tuned.predict(X_train)
y_pred_valid = lr_tuned.predict(X_valid)

# Compute Accuracy
y_train_acc = accuracy_score(y_train, y_pred_train)
y_valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(y_train_acc, y_valid_acc))









Train Accuracy: 0.53529 - Validation Accuracy: 0.55860


270 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\enric\anaconda3\envs\data_science\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\enric\anaconda3\envs\data_science\lib\site-packages\sklearn\linear_model\_logistic.py", line 1094, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\enric\anaconda3\envs\data_science\lib\site-packages\sklearn\linear_model\_logistic.py", line 78, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solve

In [8]:
print(lr_tuned.best_estimator_)

LogisticRegression(C=1e-05, penalty='l1', solver='liblinear')


In [9]:
lr = LogisticRegression(C=1000, solver='liblinear')
#C=0.0001
#C=1e-05, penalty='l1', solver='liblinear'
lr.fit(X_train_80, y_train_80)

In [10]:
y_pred_test = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:,1]

# Finally evaluate on test
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

# Store the Test Accuracy 
lr_accuracy_test = test_acc

Test Accuracy: 0.52741


# Save data Prediction

In [11]:
y_pred_test = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:,1]

# Finally evaluate on test
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

# Store the Test Accuracy 
lr_accuracy_test = test_acc

Test Accuracy: 0.52741


In [12]:
df_prediction = pd.read_csv("../data/model_accuracy_nsq.csv", index_col = False)
#df_prediction =  df_prediction.drop(columns=["Unnamed: 0"])
df_prediction

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,RF_pred,RF_accuracy,RF_prob,SVC_pred,SVC_accuracy,SVC_prob,XGB_pred,XGB_accuracy,...,ANN_accuracy,ANN_prob,LGBM_pred,LGBM_accuracy,LGBM_prob,y,kNN_pred,kNN_accuracy,kNN_prob,LSTM_price_accuracy
0,0,0,1,0.523629,0.657543,1,0.558601,0.544082,1,0.533081,...,0.441399,0.464561,1,0.529301,0.532322,0,0,0.482987,0.5,0.624764
1,1,1,0,0.523629,0.324182,1,0.558601,0.544076,0,0.533081,...,0.441399,0.464505,1,0.529301,0.518503,1,0,0.482987,0.5,0.624764
2,2,2,0,0.523629,0.496864,1,0.558601,0.545460,1,0.533081,...,0.441399,0.464505,1,0.529301,0.515129,1,1,0.482987,1.0,0.624764
3,3,3,1,0.523629,0.574333,1,0.558601,0.545009,1,0.533081,...,0.441399,0.464505,1,0.529301,0.531008,1,0,0.482987,0.5,0.624764
4,4,4,0,0.523629,0.421569,1,0.558601,0.543872,0,0.533081,...,0.441399,0.464505,1,0.529301,0.529314,0,0,0.482987,0.0,0.624764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,1053,1053,1,0.523629,0.679418,1,0.558601,0.539406,1,0.533081,...,0.441399,0.464011,1,0.529301,0.510938,0,1,0.482987,1.0,0.624764
1054,1054,1054,1,0.523629,0.555801,1,0.558601,0.537904,1,0.533081,...,0.441399,0.464421,1,0.529301,0.511045,1,0,0.482987,0.5,0.624764
1055,1055,1055,1,0.523629,0.573323,1,0.558601,0.540966,1,0.533081,...,0.441399,0.463568,1,0.529301,0.509250,0,0,0.482987,0.0,0.624764
1056,1056,1056,1,0.523629,0.719503,1,0.558601,0.536726,1,0.533081,...,0.441399,0.464013,1,0.529301,0.510938,1,1,0.482987,1.0,0.624764


In [13]:
df_prediction["LogReg_pred"] = y_pred_test
df_prediction["LogReg_accuracy"] = [lr_accuracy_test for x in range(len(y_pred_test))]
df_prediction["LogReg_prob"] = y_prob
df_prediction

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,RF_pred,RF_accuracy,RF_prob,SVC_pred,SVC_accuracy,SVC_prob,XGB_pred,XGB_accuracy,...,LGBM_accuracy,LGBM_prob,y,kNN_pred,kNN_accuracy,kNN_prob,LSTM_price_accuracy,LogReg_pred,LogReg_accuracy,LogReg_prob
0,0,0,1,0.523629,0.657543,1,0.558601,0.544082,1,0.533081,...,0.529301,0.532322,0,0,0.482987,0.5,0.624764,1,0.52741,0.547267
1,1,1,0,0.523629,0.324182,1,0.558601,0.544076,0,0.533081,...,0.529301,0.518503,1,0,0.482987,0.5,0.624764,1,0.52741,0.553199
2,2,2,0,0.523629,0.496864,1,0.558601,0.545460,1,0.533081,...,0.529301,0.515129,1,1,0.482987,1.0,0.624764,1,0.52741,0.552663
3,3,3,1,0.523629,0.574333,1,0.558601,0.545009,1,0.533081,...,0.529301,0.531008,1,0,0.482987,0.5,0.624764,1,0.52741,0.545378
4,4,4,0,0.523629,0.421569,1,0.558601,0.543872,0,0.533081,...,0.529301,0.529314,0,0,0.482987,0.0,0.624764,1,0.52741,0.539814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,1053,1053,1,0.523629,0.679418,1,0.558601,0.539406,1,0.533081,...,0.529301,0.510938,0,1,0.482987,1.0,0.624764,1,0.52741,0.610656
1054,1054,1054,1,0.523629,0.555801,1,0.558601,0.537904,1,0.533081,...,0.529301,0.511045,1,0,0.482987,0.5,0.624764,1,0.52741,0.580581
1055,1055,1055,1,0.523629,0.573323,1,0.558601,0.540966,1,0.533081,...,0.529301,0.509250,0,0,0.482987,0.0,0.624764,1,0.52741,0.534213
1056,1056,1056,1,0.523629,0.719503,1,0.558601,0.536726,1,0.533081,...,0.529301,0.510938,1,1,0.482987,1.0,0.624764,1,0.52741,0.515443


In [14]:
df_prediction.to_csv("../data/model_accuracy_nsq.csv")