In [158]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn. metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline


In [159]:
df = pd.read_csv("df_m3_lower.csv", sep=";")

features = ['buy_ratio','sell_ratio','volume_mean','buy_mean','sell_mean','volatility','rsi2','rsi1','rsi0','std2','std1','std0','len']

label = 'is_lower'
X, y = df[features].values, df[label].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

print('Training cases: %d\nTest cases: %d' % (X_train.shape[0], X_test.shape[0]))

Training cases: 97
Test cases: 42


In [160]:
model = make_pipeline(PolynomialFeatures(),
                     RobustScaler(),
                     SGDClassifier(random_state=0))

params = {
    'polynomialfeatures__degree': [1,2,3,4],
    'sgdclassifier__penalty': ['l1','l2']
}

grid = GridSearchCV(model, param_grid=params, cv=4)

grid.fit(X_train, y_train)
grid.best_params_

{'polynomialfeatures__degree': 2, 'sgdclassifier__penalty': 'l1'}

In [161]:
predictions = grid.predict(X_test)
cm = confusion_matrix(y_test, predictions)

print ('Confusion Matrix:\n',cm, '\n')
print('Accuracy:', accuracy_score(y_test, predictions))
print("Overall Precision:",precision_score(y_test, predictions))
print("Overall Recall:",recall_score(y_test, predictions))
print(classification_report(y_test, predictions))

Confusion Matrix:
 [[18  7]
 [ 5 12]] 

Accuracy: 0.7142857142857143
Overall Precision: 0.631578947368421
Overall Recall: 0.7058823529411765
              precision    recall  f1-score   support

         0.0       0.78      0.72      0.75        25
         1.0       0.63      0.71      0.67        17

    accuracy                           0.71        42
   macro avg       0.71      0.71      0.71        42
weighted avg       0.72      0.71      0.72        42



In [162]:
chi2(X, y)

(array([1.45923332e-04, 7.67337382e-03, 1.42975200e-01, 1.41756054e-01,
        1.70405589e-01, 2.18004680e-03, 1.94718809e+01, 3.04774942e+00,
        4.47913760e-02, 1.03341698e+00, 1.31762020e+00, 1.85206859e+00,
        5.43894675e+03]),
 array([9.90361890e-01, 9.30196354e-01, 7.05340994e-01, 7.06541461e-01,
        6.79751599e-01, 9.62759526e-01, 1.02091570e-05, 8.08491074e-02,
        8.32388110e-01, 3.09357467e-01, 2.51019597e-01, 1.73542996e-01,
        0.00000000e+00]))

In [163]:
selector = SelectKBest(chi2, k=5)
selector.fit_transform(X, y)
selector.get_support()

array([False, False, False, False, False, False,  True,  True, False,
       False,  True,  True,  True])

In [164]:
np.array(features)[selector.get_support()]

array(['rsi2', 'rsi1', 'std1', 'std0', 'len'], dtype='<U11')

In [165]:
selector = SelectFromModel(SGDClassifier(random_state=0),
                          threshold='mean')
selector.fit_transform(X, y)
selector.get_support()
np.array(features)[selector.get_support()]

array(['rsi1', 'rsi0', 'len'], dtype='<U11')

In [166]:
selector.estimator_.coef_

array([[-8.00962230e+01, -1.28004423e+02, -4.82078511e+02,
        -4.84313958e+02, -5.11715122e+02,  1.47946969e-01,
         1.79499163e+03, -6.77738080e+03, -2.65494248e+03,
        -3.56719359e+02, -3.25075173e+02, -2.64254550e+02,
         1.30034130e+04]])