In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn. metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier, StackingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
import matplotlib
from sklearn.decomposition import PCA
import seaborn as sns
%matplotlib inline


In [8]:
# Read the semicolon-delimited dataset from the working directory.
df = pd.read_csv("df_m1_lower_.csv", sep=";")

# Input columns handed to the models. The order is kept stable because it
# fixes the column order of X.
# Candidates currently left out:
#   'rsi2_long', 'rsi1_long', 'rsi0_long', 'diff_h_l_20', 'diff_h_l_10'
features = [
    # diff_* columns
    'diff_h_l_0', 'diff_o_c_0', 'diff_h_o_0', 'diff_c_l_0',
    # vol_* columns
    'vol_total_0', 'vol_buy_0', 'vol_sell_0', 'vol_other_0',
    # nb_* columns
    'nb_tick_total_0', 'nb_tick_buy_0', 'nb_tick_sell_0', 'nb_tick_other_0',
    'nb_price_changed_0',
    # rsi* columns
    'rsi2', 'rsi1', 'rsi0',
    # std* columns
    'std2', 'std1', 'std0',
    'std2_long', 'std1_long', 'std0_long',
]

# Target column.
label = 'is_lower'

# Plain arrays for scikit-learn: feature matrix and target vector.
X = df[features].values
y = df[label].values

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=0,
)

print('Training cases: %d\nTest cases: %d' % (len(X_train), len(X_test)))

Training cases: 296
Test cases: 128


In [9]:
# Three base classifiers and a soft-voting ensemble built from them.
#
# random_state is pinned on the two tree-based learners. Without it, fitting
# the same depth-6 tree on the same split can give different test accuracies
# from one fit to the next, so the reported scores cannot be reproduced or
# compared across cells. The seed matches the one used for the train/test split.
clf1 = DecisionTreeClassifier(max_depth=6, random_state=0)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = AdaBoostClassifier(n_estimators=50, random_state=0)

# Soft voting over the three models, weighted 3:2:1 (tree : knn : adaboost).
eclf = VotingClassifier(estimators=[("dt", clf1), ("knn", clf2), ("ada", clf3)],
                        voting="soft",
                        weights=[3, 2, 1])

# Fit every model on the training split and report its test-set accuracy.
for model in (clf1, clf2, clf3, eclf):
    model.fit(X_train, y_train)
    print(model.__class__.__name__, model.score(X_test, y_test))

DecisionTreeClassifier 0.640625
KNeighborsClassifier 0.59375
AdaBoostClassifier 0.5703125
VotingClassifier 0.65625


In [10]:
# Evaluate the fitted voting ensemble on the held-out test split.
predictions = eclf.predict(X_test)
cm = confusion_matrix(y_test, predictions)

print('Confusion Matrix:\n', cm, '\n')

# Scalar metrics, printed in a fixed order: accuracy, precision, recall.
for heading, metric in (
    ('Accuracy:', accuracy_score),
    ("Overall Precision:", precision_score),
    ("Overall Recall:", recall_score),
):
    print(heading, metric(y_test, predictions))

# Per-class breakdown, then the estimator's own score on the same split.
print(classification_report(y_test, predictions))
print(eclf.score(X_test, y_test))

Confusion Matrix:
 [[47 25]
 [19 37]] 

Accuracy: 0.65625
Overall Precision: 0.5967741935483871
Overall Recall: 0.6607142857142857
              precision    recall  f1-score   support

         0.0       0.71      0.65      0.68        72
         1.0       0.60      0.66      0.63        56

    accuracy                           0.66       128
   macro avg       0.65      0.66      0.65       128
weighted avg       0.66      0.66      0.66       128

0.65625


In [11]:
# Stand-alone depth-6 decision tree, evaluated on the held-out test split.
#
# random_state is pinned so this fit is reproducible: an unseeded tree with
# the same hyper-parameters and the same data can produce a different
# confusion matrix each time the cell is run. The seed matches the one used
# for the train/test split.
clf = DecisionTreeClassifier(max_depth=6, random_state=0)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions)

print('Confusion Matrix:\n', cm, '\n')
print('Accuracy:', accuracy_score(y_test, predictions))
print("Overall Precision:", precision_score(y_test, predictions))
print("Overall Recall:", recall_score(y_test, predictions))
print(classification_report(y_test, predictions))
# The estimator's own score on the same split, kept as a cross-check
# against the accuracy line above.
print(clf.score(X_test, y_test))

Confusion Matrix:
 [[45 27]
 [17 39]] 

Accuracy: 0.65625
Overall Precision: 0.5909090909090909
Overall Recall: 0.6964285714285714
              precision    recall  f1-score   support

         0.0       0.73      0.62      0.67        72
         1.0       0.59      0.70      0.64        56

    accuracy                           0.66       128
   macro avg       0.66      0.66      0.66       128
weighted avg       0.67      0.66      0.66       128

0.65625


In [12]:
# Grid-search the degree of a polynomial feature expansion placed in front of
# a depth-6 decision tree, then evaluate the refitted best pipeline on the
# held-out test split.
#
# The tree is seeded so the cross-validated ranking of degrees (and therefore
# the selected degree) is the same on every run.
model = make_pipeline(PolynomialFeatures(),
                      DecisionTreeClassifier(max_depth=6, random_state=0))

# make_pipeline names each step after its lower-cased class name, hence the
# 'polynomialfeatures__' prefix.
params = {
    'polynomialfeatures__degree': [1, 2, 3, 4, 5, 6]
}

grid = GridSearchCV(model, param_grid=params, cv=6)

grid.fit(X_train, y_train)

# A bare expression in the middle of a cell is evaluated but never displayed,
# so report the search outcome explicitly.
print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)

predictions = grid.predict(X_test)
cm = confusion_matrix(y_test, predictions)

print('Confusion Matrix:\n', cm, '\n')
print('Accuracy:', accuracy_score(y_test, predictions))
print("Overall Precision:", precision_score(y_test, predictions))
print("Overall Recall:", recall_score(y_test, predictions))
print(classification_report(y_test, predictions))


Confusion Matrix:
 [[47 25]
 [18 38]] 

Accuracy: 0.6640625
Overall Precision: 0.6031746031746031
Overall Recall: 0.6785714285714286
              precision    recall  f1-score   support

         0.0       0.72      0.65      0.69        72
         1.0       0.60      0.68      0.64        56

    accuracy                           0.66       128
   macro avg       0.66      0.67      0.66       128
weighted avg       0.67      0.66      0.67       128

