# Logistic Regression

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,zero_one_loss,recall_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit

- Logistic regression with a traditional (chronological, unshuffled) train/test split

In [9]:
def logi(stock, freq):
    """Fit a logistic regression on a chronological split and print a report.

    Reads the encoded price CSV and the merged predictor CSV for the given
    stock and frequency, labels each predictor row with the following row's
    ``direction2`` value, trains on the first 75% of rows (no shuffling) and
    prints a classification report for the remaining 25%.

    Args:
        stock: Ticker used as the CSV file name, e.g. ``'AAPL'``.
        freq: Frequency sub-directory name, e.g. ``'Weekly'``.
    """
    price_path = '../../encode_price/' + freq + '/' + stock + '.csv'
    predictor_path = '../../predictors/Merged/' + freq + '/' + stock + '.csv'

    prices = pd.read_csv(price_path)
    in_window = (prices['Date'] >= '2010-01-01') & (prices['Date'] <= '2020-02-01')
    prices = prices[in_window]
    # The label for row t is the direction at t+1; the last row has no
    # successor, so it is dropped.
    labels = prices.direction2.shift(-1).values[:-1]

    # NOTE(review): the date window is applied to the price file only; the
    # predictor file is presumably already limited to the same dates - confirm.
    feature_frame = pd.read_csv(predictor_path, index_col='Date')
    feature_frame.fillna(0, inplace=True)
    # Drop the last predictor row to match the shifted labels.
    features = feature_frame.values[:-1]

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, shuffle=False
    )

    scaler = MinMaxScaler()
    classifier = LogisticRegression(dual=True, penalty='l2', solver='liblinear')
    model = make_pipeline(scaler, classifier)
    model.fit(X_train, y_train)

    report = classification_report(y_test, model.predict(X_test))
    print(stock, '/', freq, '\n', report)

- Logistic regression with cross-validation using `TimeSeriesSplit` (tscv)

In [10]:
def logi_cv(stock, freq, cv=5):
    """Evaluate a logistic regression with time-series cross-validation.

    Reads the encoded price CSV and the merged predictor CSV for the given
    stock and frequency, labels each predictor row with the following row's
    ``direction2`` value, and fits a MinMaxScaler + LogisticRegression
    pipeline on each ``TimeSeriesSplit`` fold. The weighted-average F1-score
    of every fold is averaged and printed.

    Args:
        stock: Ticker used as the CSV file name, e.g. ``'AAPL'``.
        freq: Frequency sub-directory name, e.g. ``'Weekly'``.
        cv: Number of splits passed to ``TimeSeriesSplit``.

    Returns:
        The mean of the per-fold weighted-average F1-scores (also printed).

    Raises:
        ValueError: If the number of predictor rows differs from the number
            of labels.
    """
    price = pd.read_csv('../../encode_price/'+freq+'/'+stock+'.csv')
    price = price[(price['Date']>='2010-01-01') & (price['Date']<='2020-02-01')]
    # The label for row t is the direction at t+1; the last row has no
    # successor, so it is dropped.
    y = price.direction2.shift(-1).values[:-1]

    # NOTE(review): the date window is applied to the price file only; the
    # predictor file is presumably already limited to the same dates - confirm.
    predictors = pd.read_csv('../../predictors/Merged/'+freq+'/'+stock+'.csv',index_col='Date')
    predictors.fillna(0,inplace=True)
    # Drop the last predictor row to match the shifted labels.
    X = predictors.values[:-1,]

    # The fold indices below are derived from X alone, so a longer y would be
    # silently truncated (and possibly misaligned). Fail loudly instead.
    if len(X) != len(y):
        raise ValueError(
            'Predictors and labels have different lengths for '
            + stock + '/' + freq + ': ' + str(len(X)) + ' vs ' + str(len(y))
        )

    tscv = TimeSeriesSplit(n_splits=cv)
    scores = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # A fresh pipeline per fold so the scaler is fitted on that fold's
        # training rows only.
        pipe = make_pipeline(MinMaxScaler(),LogisticRegression(dual=True,penalty = 'l2',solver='liblinear'))
        pipe.fit(X_train,y_train)
        y_pred = pipe.predict(X_test)
        score = classification_report(y_test,y_pred,output_dict=True)
        scores.append(score['weighted avg']['f1-score'])

    average_score = np.mean(scores)
    print(stock,'/',freq,'\n','Average F1-score:', average_score)
    return average_score

In [12]:
# Tickers and sampling frequencies to evaluate.
stock_list = ['AAPL','MSFT','GOOG','AMZN','NVDA','BRK-B','TSLA','META','JNJ','V']
freq_list = ['Daily','Weekly','Monthly']

import sys

import warnings
from contextlib import redirect_stdout
from sklearn.exceptions import UndefinedMetricWarning

# Silence sklearn's undefined-metric warnings while the reports are generated.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)


# Capture everything logi() prints into the result file. redirect_stdout
# restores the previous sys.stdout on exit, including when logi() raises,
# and does not assume the previous stream was sys.__stdout__ (it is not
# inside a notebook kernel).
with open('./result of logi/result_of_logi.txt', 'w') as f, redirect_stdout(f):
    for name in stock_list:
        for freq in freq_list:
            logi(name, freq)


In [13]:
from contextlib import redirect_stdout

# Capture everything logi_cv() prints into the result file. redirect_stdout
# restores the previous sys.stdout on exit, including when logi_cv() raises,
# and does not assume the previous stream was sys.__stdout__ (it is not
# inside a notebook kernel).
with open('./result of logi/result_of_logi_with_cv.txt', 'w') as f, redirect_stdout(f):
    for name in stock_list:
        for freq in freq_list:
            logi_cv(name, freq, 3)

You can see the results in the result folder under Classification.

In general, the monthly F1 score is higher than the daily and weekly ones, and for some stocks the monthly F1 is nearly 0.8, which indicates the model could be useful for identifying the trend of the stock price. On average, the monthly F1 is around 0.6, which is still good for a trading strategy. In addition, our model gives robustly strong results for Visa among the stocks we tested, and we believe we could use this model to find suitable stocks and build a corresponding trading strategy based on them.