# Logistic Regression

In [2]:
import pandas as pd
import numpy as np

## Taking the AAPL for example to investigate the effect

In [96]:
freq = 'Daily'
stock = 'AAPL'
price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
y = price.direction.shift(-1).values[:-1]
predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv',index_col='Date')
predictors.fillna(0,inplace=True)
#predictors = predictors.drop(['gdp','adjusted_close'],axis=1)
X = predictors.values[:-1,]



In [97]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [98]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)
sm = SMOTE(sampling_strategy='minority',random_state=42,k_neighbors=5)
X_res,y_res = sm.fit_resample(X_train,y_train)

## SMOTE Balanced Dataset

In [99]:
pipe = make_pipeline(StandardScaler(),LogisticRegression())
pipe.fit(X_res,y_res)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.45      1.00      0.62       288
         1.0       0.00      0.00      0.00       346

    accuracy                           0.45       634
   macro avg       0.23      0.50      0.31       634
weighted avg       0.21      0.45      0.28       634



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Original Dataset

In [100]:
pipe = make_pipeline(StandardScaler(),LogisticRegression())
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.45      0.91      0.60       288
         1.0       0.42      0.05      0.09       346

    accuracy                           0.44       634
   macro avg       0.43      0.48      0.35       634
weighted avg       0.43      0.44      0.32       634



In [94]:
from sklearn.kernel_approximation import PolynomialCountSketch
# Approximates feature map of a Polynomial kernel by approximation via Tensor Sketch.
poly_feature = PolynomialCountSketch(degree=1, random_state=2)
X_features = poly_feature.fit_transform(X_train)
X_features_t = poly_feature.fit_transform(X_test)
#X_test = poly_feature.fit_transform(X_test)
# Fit a Logistic Regression
clf = LogisticRegression()
clf.fit(X_features, y_train)

# Evaluate the KLR in-sample
clf.score(X_features, y_train)
#y_pred = clf.predict(X_features_t)
#print(classification_report(y_test,y_pred))

0.5199789695057834

In [95]:
from sklearn.kernel_approximation import RBFSampler
rbf_feature = RBFSampler(gamma=1, random_state=123)
X_features = rbf_feature.fit_transform(X_train)
clf = LogisticRegression()
clf.fit(X_features, y_train)
clf.score(X_features, y_train)

0.6035751840168244

In [74]:
from sklearn.kernel_approximation import PolynomialCountSketch,AdditiveChi2Sampler
adchi2 = AdditiveChi2Sampler(sample_steps=2)
X_features = adchi2.fit_transform(X_train)
clf = LogisticRegression()
clf.fit(X_features, y_train)
clf.score(X_features, y_train)

ValueError: Negative values in data passed to X in AdditiveChi2Sampler.fit

In [76]:
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt

# Fit LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Evaluate the LDA in-sample
lda.score(X_train, y_train)
y_pred = lda.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.53      0.58      0.55       288
         1.0       0.62      0.58      0.60       346

    accuracy                           0.58       634
   macro avg       0.58      0.58      0.58       634
weighted avg       0.58      0.58      0.58       634



In [102]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# Evaluate the QDA in-sample
qda.score(X_test, y_test)

0.45425867507886436