# Logistic Regression

In [2]:
import pandas as pd
import numpy as np

## Taking the AAPL for example to investigate the effect

In [75]:
freq = 'Daily'
stock = 'AAPL'
price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
y = price.direction.shift(-1).values[:-1]
predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv',index_col='Date')
predictors.fillna(0,inplace=True)
predictors = predictors.drop(['gdp','adjusted_close'],axis=1)
X = predictors.values[:-1,]
X


array([[ 6.79374263e+00, -1.18868843e-01,  4.01859581e-02, ...,
         5.05057466e+01,  1.40579557e+23,  9.48200377e+06],
       [ 4.18722538e+00, -5.36824720e-02, -7.59893880e-02, ...,
         5.92613609e+01,  1.37823095e+23,  1.82290811e+07],
       [ 3.94897208e+00,  1.74515808e-01,  1.22371094e-01, ...,
         6.66248645e+01,  1.35120681e+23,  2.72652181e+07],
       ...,
       [ 8.49680526e+00,  9.33175468e-01, -6.63043092e-02, ...,
         0.00000000e+00,  2.97160000e+02,  1.75108076e+11],
       [ 1.29197633e+01,  4.08612913e-01,  4.67014344e-02, ...,
         0.00000000e+00,  3.04880000e+02,  1.77411455e+11],
       [ 1.03502457e+01, -6.48851599e-01,  3.30228381e-01, ...,
         0.00000000e+00,  3.04880000e+02,  1.75079001e+11]])

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [65]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)
sm = SMOTE(sampling_strategy='minority',random_state=42,k_neighbors=5)
X_res,y_res = sm.fit_resample(X_train,y_train)

## SMOTE Balanced Dataset

In [66]:
pipe = make_pipeline(StandardScaler(),LogisticRegression())
pipe.fit(X_res,y_res)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.56      0.59      0.57       288
         1.0       0.64      0.62      0.63       346

    accuracy                           0.61       634
   macro avg       0.60      0.60      0.60       634
weighted avg       0.61      0.61      0.61       634



## Original Dataset

In [67]:
pipe = make_pipeline(StandardScaler(),LogisticRegression())
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.55      0.56      0.55       288
         1.0       0.63      0.61      0.62       346

    accuracy                           0.59       634
   macro avg       0.59      0.59      0.59       634
weighted avg       0.59      0.59      0.59       634



In [72]:
from sklearn.kernel_approximation import PolynomialCountSketch
# Approximates feature map of a Polynomial kernel by approximation via Tensor Sketch.
poly_feature = PolynomialCountSketch(degree=1, random_state=2)
X_features = poly_feature.fit_transform(X_train)
X_features_t = poly_feature.fit_transform(X_test)
#X_test = poly_feature.fit_transform(X_test)
# Fit a Logistic Regression
clf = LogisticRegression()
clf.fit(X_features, y_train)

# Evaluate the KLR in-sample
clf.score(X_features, y_train)
#y_pred = clf.predict(X_features_t)
#print(classification_report(y_test,y_pred))

0.5252365930599369

In [73]:
from sklearn.kernel_approximation import RBFSampler
rbf_feature = RBFSampler(gamma=1, random_state=123)
X_features = rbf_feature.fit_transform(X_train)
clf = LogisticRegression()
clf.fit(X_features, y_train)
clf.score(X_features, y_train)

0.5935856992639327

In [74]:
from sklearn.kernel_approximation import PolynomialCountSketch,AdditiveChi2Sampler
adchi2 = AdditiveChi2Sampler(sample_steps=2)
X_features = adchi2.fit_transform(X_train)
clf = LogisticRegression()
clf.fit(X_features, y_train)
clf.score(X_features, y_train)

ValueError: Negative values in data passed to X in AdditiveChi2Sampler.fit

In [76]:
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt

# Fit LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Evaluate the LDA in-sample
lda.score(X_train, y_train)
y_pred = lda.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.53      0.58      0.55       288
         1.0       0.62      0.58      0.60       346

    accuracy                           0.58       634
   macro avg       0.58      0.58      0.58       634
weighted avg       0.58      0.58      0.58       634



In [77]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# Evaluate the QDA in-sample
qda.score(X_train, y_train)

0.47791798107255523