# Logistic Regression

In [22]:
import pandas as pd
import numpy as np

## Taking the AAPL for example to investigate the effect

In [23]:
freq = 'Daily'
stock = 'AAPL'
price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
y = price.direction.shift(-1).values[:-1]
predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv',index_col='Date')
predictors.fillna(0,inplace=True)
X = predictors.values[:-1,]
X.shape


(2536, 36)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)
sm = SMOTE(sampling_strategy='minority',random_state=42,k_neighbors=5)
X_res,y_res = sm.fit_resample(X_train,y_train)

## SMOTE Balanced Dataset

In [28]:
pipe = make_pipeline(StandardScaler(),LogisticRegression(penalty = 'l1',solver='liblinear'))
pipe.fit(X_res,y_res)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.45      0.98      0.62       288
         1.0       0.40      0.01      0.02       346

    accuracy                           0.45       634
   macro avg       0.43      0.50      0.32       634
weighted avg       0.42      0.45      0.29       634



## Original Dataset

In [29]:
pipe = make_pipeline(StandardScaler(),LogisticRegression(penalty = 'l1',solver='liblinear'))
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.41      0.08      0.14       288
         1.0       0.54      0.90      0.68       346

    accuracy                           0.53       634
   macro avg       0.47      0.49      0.41       634
weighted avg       0.48      0.53      0.43       634



In [21]:
from sklearn.kernel_approximation import PolynomialCountSketch
# Approximates feature map of a Polynomial kernel by approximation via Tensor Sketch.
poly_feature = PolynomialCountSketch(degree=1, random_state=2)
X_features = poly_feature.fit_transform(X_train)
X_features_t = poly_feature.fit_transform(X_test)
#X_test = poly_feature.fit_transform(X_test)
# Fit a Logistic Regression
clf = LogisticRegression()
clf.fit(X_features, y_train)

# Evaluate the KLR in-sample
clf.score(X_features_t, y_train)
#y_pred = clf.predict(X_features_t)
#print(classification_report(y_test,y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [1902, 634]

In [91]:
from sklearn.kernel_approximation import RBFSampler
rbf_feature = RBFSampler(gamma=1, random_state=123)
X_features = rbf_feature.fit_transform(X_train)
clf = LogisticRegression()
clf.fit(X_features, y_train)
clf.score(X_features, y_train)

0.5977917981072555

In [15]:
from sklearn.kernel_approximation import PolynomialCountSketch,AdditiveChi2Sampler
adchi2 = AdditiveChi2Sampler(sample_steps=2)
X_features = adchi2.fit_transform(X_train)
clf = LogisticRegression()
clf.fit(X_features, y_train)
clf.score(X_features, y_train)

ValueError: Negative values in data passed to X in AdditiveChi2Sampler.fit

In [16]:
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt

# Fit LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Evaluate the LDA in-sample
lda.score(X_train, y_train)
y_pred = lda.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.45      0.92      0.60       288
         1.0       0.42      0.05      0.09       346

    accuracy                           0.44       634
   macro avg       0.44      0.48      0.34       634
weighted avg       0.43      0.44      0.32       634



In [20]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Fit QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

# Evaluate the QDA in-sample
qda.score(X_test, y_test)

0.45425867507886436