Importing useful librairies and setting plot parameters

In [1]:
import sys
import pandas as pd
import numpy as np
import ta
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.utils import shuffle
import pandas as pd
import glob
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")
sys.path.append('./functions')
from indicators import *

plt.rcParams['figure.figsize'] = (30, 15)
plt.rcParams['figure.facecolor'] = "white"
plt.rcParams['axes.facecolor'] = "white"
plt.rcParams['axes.edgecolor'] = "black"


Loading csv files w/ cryptocurrency data and merge them together

Loading indicators, cleaning dataframe

In [2]:
# 1d or 4h
path = r'data/1d'
all_files = glob.glob(path + '/*.csv')
li = []
for filename in all_files:
    data = pd.read_csv(filename, index_col=None, header=0)
    li.append(data)

frame = pd.concat(li, axis=0, ignore_index=True)

data = frame.set_index(frame['date'])
data.index = pd.to_datetime(data.index, unit='ms')
del data['date']

# POSITIVE BREAKOUT
get_indicators(data)

del data['open']
del data['high']
del data['low']

data['pct_change'] = data.close.pct_change().shift(-1)
data['previous_pct_change'] = data.close.pct_change()
data['2previous_pct_change'] = data.close.pct_change().shift(1)
del data['close']

data = data.dropna()


Defining 3 binary variables (% change between two days)

In [3]:
df = data.copy()

for index, row in df.iterrows():
    df['pct_change'][index] = np.where(
        df['pct_change'][index] > 0.6, 1, 0)  # asset up, define by how much
    df['previous_pct_change'][index] = np.where(
        df['previous_pct_change'][index] > 0, 1, 0)
    df['2previous_pct_change'][index] = np.where(
        df['2previous_pct_change'][index] > 0, 1, 0)

df_ = df.dropna()

# Counting values for each class
df_['pct_change'].value_counts()


0.0    30134
1.0       26
Name: pct_change, dtype: int64

Selecting dataset

In [4]:
dataset = df_.copy()


Defining predictor and target variables, randomizing the data

In [5]:
X = dataset.drop('pct_change', axis=1)
y = dataset['pct_change']

X, y = shuffle(X, y)


Splitting dataset into train and test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)


Over sampling minority class to deal w/ imbalanced classes

In [7]:
# create two different dataframe of majority and minority class
df_majority = X[(X['pct_change'] == 0)]
df_minority = X[(X['pct_change'] == 1)]
# upsample minority class
minority_upsampled = resample(df_minority,
                              replace=True,    # sample with replacement
                              # to match majority class
                              n_samples=len(df_majority),
                              random_state=0)  # reproducible results
# Combine majority class with upsampled minority class
oversampled = pd.concat([minority_upsampled, df_majority])
oversampled['pct_change'].value_counts()

# from imblearn.over_sampling import SMOTE
# sm = SMOTE(sampling_strategy='minority', random_state=0)
# Fit the model to generate the data.
# oversampled_X, oversampled_Y = sm.fit_resample(
#    X.drop('pct_change', axis=1), X['pct_change'])
# oversampledSmote = pd.concat(
#    [pd.DataFrame(oversampled_X), pd.DataFrame(oversampled_Y)], axis=1)"""

1.0    21096
0.0    21096
Name: pct_change, dtype: int64

Splitting again

In [8]:
X = oversampled.drop('pct_change', axis=1)
y = oversampled['pct_change']
X, y = shuffle(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)


Data scaling

In [9]:
feature_scaler = StandardScaler()
X_train = pd.DataFrame(feature_scaler.fit_transform(X_train))
X_test = pd.DataFrame(feature_scaler.transform(X_test))
X_train.columns = X.columns
X_test.columns = X.columns
X = pd.DataFrame(feature_scaler.fit_transform(X))
X.columns = X_train.columns


Defining categorical and numerical variables

In [10]:
cat_data = []
num_data = []

for i, c in enumerate(X_train.dtypes):
    if c == object:
        cat_data.append(X_train.iloc[:, i])
    else:
        num_data.append(X_train.iloc[:, i])

cat_data = pd.DataFrame(cat_data).transpose()
num_data = pd.DataFrame(num_data).transpose()

# Identify Numeric features
num_features = num_data.columns.values
# Identify Categorical features
cat_features = cat_data.columns.values


Finding highly correlated features

In [12]:
def find_correlated_features(df, threshold):
    # Get correlation matrix
    corr_matrix = df.corr().abs()
    # Take half of the matrix to prevent doubling results
    corr_matrix = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # Restructure correlation matrix to dataframe
    df = corr_matrix.stack().reset_index()
    df.columns = ['feature1', 'feature2', 'corr_coef']
    # Apply filter and sort coefficients
    df = df[df.corr_coef >= threshold].sort_values(
        'corr_coef', ascending=False)
    return df


find_correlated_features(X_train, .9)


Unnamed: 0,feature1,feature2,corr_coef
261,stochOsc,willR,1.0
95,kama,VWAP,0.998006
604,VWAP,PSAR,0.996188
111,kama,PSAR,0.994637
649,bbWidth,DONC,0.99258
96,kama,ATR,0.964302
589,VWAP,ATR,0.96316
275,stochOsc,dcPer,0.9629
368,willR,dcPer,0.9629
624,ATR,PSAR,0.958852


Dropping highly correlated features

In [13]:
correlated_features = set()
correlation_matrix = X_train.corr()

for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

print(correlated_features)
X_train.drop(labels=correlated_features, axis=1, inplace=True)
X_test.drop(labels=correlated_features, axis=1, inplace=True)
X.drop(labels=correlated_features, axis=1, inplace=True)


{'VI', 'willR', 'dcPer', 'bbPer', 'OBV', 'TSI', 'bbWidth', 'DONC', 'CCI', 'ATR', 'VWAP', 'PSAR'}


Shortlist of ML algorithms to be tested:

- Logistic Regression
- SVM
- Random Forest

Evaluation metrics

- Precision/Specificity: how many selected instances are relevant.
- Recall/Sensitivity: how many relevant instances are selected.
- F1 score: harmonic mean of precision and recall.
- AUC: relation between true-positive rate and false positive rate.

Training and evaluating on the training set

In [14]:
model = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = model.predict(X_test)
# print(confusion_matrix(y_test, predictions))
print('Accuracy : {} %'.format(
    round(accuracy_score(y_test, predictions).mean()*100, 2)))
print(classification_report(y_test, predictions))


Accuracy : 92.79 %
              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92      6333
         1.0       0.87      1.00      0.93      6325

    accuracy                           0.93     12658
   macro avg       0.94      0.93      0.93     12658
weighted avg       0.94      0.93      0.93     12658



In [15]:
model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
predictions = model.predict(X_test)
# print(confusion_matrix(y_test, predictions))
print('Accuracy : {} %'.format(
    round(accuracy_score(y_test, predictions).mean()*100, 2)))
print(classification_report(y_test, predictions))


Accuracy : 99.99 %
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6333
         1.0       1.00      1.00      1.00      6325

    accuracy                           1.00     12658
   macro avg       1.00      1.00      1.00     12658
weighted avg       1.00      1.00      1.00     12658



Evaluation using Cross-Validation

In [None]:
def cross_validation(model, X_train, y_train, _cv=10):
    _scoring = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                             X=X_train,
                             y=y_train,
                             cv=_cv,
                             scoring=_scoring,
                             return_train_score=True)

    return {"Mean Training Accuracy": results['train_accuracy'].mean()*100,
            "Mean Training Precision": results['train_precision'].mean(),
            "Mean Training Recall": results['train_recall'].mean(),
            "Mean Training F1 Score": results['train_f1'].mean(),
            "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
            "Mean Validation Precision": results['test_precision'].mean(),
            "Mean Validation Recall": results['test_recall'].mean(),
            "Mean Validation F1 Score": results['test_f1'].mean()
            }

In [16]:
model = LogisticRegression(random_state=0)
# print(cross_val_score(model, X_train, y_train, cv=10))
cv_results = cross_validate(model, X_train, y_train, cv=10)
print('Accuracy mean : {} %'.format(
    round(cv_results['test_score'].mean()*100, 2)))
print('Accuracy std : {}'.format(
    cv_results['test_score'].std()))
print('\n')
scores = cross_validate(model, X_train, y_train, cv=10,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)
print('test_neg_mean_squared_error mean : {}'.format(
    scores['test_neg_mean_squared_error'].mean()))
print('train_r2 mean : {}'.format(
    scores['train_r2'].mean()))
print('\n')
cross_validation(model, X_train, y_train, _cv=10)


Accuracy mean : 92.86 %
Accuracy std : 0.0046600368951547075


test_neg_mean_squared_error mean : -0.07137509311417123
train_r2 mean : 0.7164097042959995




{'Mean Training Accuracy': 92.91024313179452,
 'Mean Training Precision': 0.8758464481017905,
 'Mean Training Recall': 1.0,
 'Mean Training F1 Score': 0.9338138217457074,
 'Mean Validation Accuracy': 92.86249068858288,
 'Mean Validation Precision': 0.875170112059147,
 'Mean Validation Recall': 1.0,
 'Mean Validation F1 Score': 0.9334145856499816}

In [17]:
model = RandomForestClassifier(random_state=0)
# print(cross_val_score(model, X_train, y_train, cv=10))
cv_results = cross_validate(model, X_train, y_train, cv=10)
print('Accuracy mean : {} %'.format(
    round(cv_results['test_score'].mean()*100, 2)))
print('Accuracy std : {}'.format(
    cv_results['test_score'].std()))
print('\n')
scores = cross_validate(model, X_train, y_train, cv=10,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)
print('test_neg_mean_squared_error mean : {}'.format(
    scores['test_neg_mean_squared_error'].mean()))
print('train_r2 mean : {}'.format(
    scores['train_r2'].mean()))
print('\n')
cross_validation(model, X_train, y_train, _cv=10)


Accuracy mean : 100.0 %
Accuracy std : 0.0


test_neg_mean_squared_error mean : 0.0
train_r2 mean : 1.0




{'Mean Training Accuracy': 100.0,
 'Mean Training Precision': 1.0,
 'Mean Training Recall': 1.0,
 'Mean Training F1 Score': 1.0,
 'Mean Validation Accuracy': 100.0,
 'Mean Validation Precision': 1.0,
 'Mean Validation Recall': 1.0,
 'Mean Validation F1 Score': 1.0}

-----