# Data Preprocessing by Scientist

In [22]:
import sqlite3 as db
import numpy as np
from pathlib import Path
import pandas as pd

In [28]:
def read_to_data_frame(uri):
    """Load every row of the ``Sample`` table from a SQLite database.

    Args:
        uri: Location of the SQLite database file; passed straight to
            ``sqlite3.connect``.

    Returns:
        A pandas DataFrame holding the result of ``select * from Sample``.

    Raises:
        Exception: With the message "Could not execute the query" when the
            read fails. The underlying error is attached as ``__cause__``.
    """
    conn = db.connect(uri)
    try:
        return pd.read_sql("select * from Sample", conn)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit pass
        # through, and chain the original error so the real reason for the
        # failure (missing table, unreadable file, ...) stays in the traceback.
        raise Exception("Could not execute the query") from exc
    finally:
        # Always release the connection, whether the read succeeded or not.
        conn.close()

In [29]:
# SQLite file at ../db/index.db, resolved against the current working directory.
URI = Path().resolve().parent / "db" / "index.db"

In [30]:
# Read the full Sample table from the SQLite file at URI into a DataFrame.
raw_data = read_to_data_frame(URI)

In [31]:
# Preview the first rows of the table that was just loaded.
raw_data.head()

Unnamed: 0,high,low,open,close,volume,adj_close
0,11206.439453,10089.314453,10796.930664,10583.134766,29378589324,10583.134766
1,10912.188477,9737.884766,10588.683594,10801.677734,31015895223,10801.677734
2,11968.078125,10818.15625,10818.15625,11961.269531,30796494294,11961.269531
3,12006.075195,11166.569336,11972.71875,11215.4375,25920294033,11215.4375
4,11395.661133,10874.964844,11203.102539,10978.459961,23838480210,10978.459961


# Preprocessing

In [32]:
def digitize(n):
    """Map a number to a binary flag: 1 when it is strictly positive, else 0."""
    return 1 if n > 0 else 0

In [35]:
def rolling_window(a, window):
    """Return every length-``window`` slice along the last axis of ``a``.

    The result is a strided *view* onto ``a`` (no data is copied): its last
    axis has length ``window`` and the axis before it enumerates the
    ``a.shape[-1] - window + 1`` possible start positions.

    Args:
        a: NumPy array with at least one dimension.
        window: Number of consecutive elements per slice; must satisfy
            ``1 <= window <= a.shape[-1]``.

    Returns:
        A read-only array of shape
        ``a.shape[:-1] + (a.shape[-1] - window + 1, window)``.

    Raises:
        ValueError: If ``window`` is smaller than 1 or larger than the last
            axis of ``a``.
    """
    if window < 1 or window > a.shape[-1]:
        raise ValueError(
            "window must be between 1 and %d, got %r" % (a.shape[-1], window)
        )
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    # Repeating the last stride makes consecutive windows start one element apart.
    strides = a.strides + (a.strides[-1],)
    # The windows overlap in memory, so a write through the view would change
    # several windows at once; hand back a read-only view to rule that out.
    return np.lib.stride_tricks.as_strided(
        a, shape=shape, strides=strides, writeable=False
    )

def prepare_training_data(data):
    """Derive the price move and its binary label for each row.

    Args:
        data: DataFrame with numeric ``open`` and ``close`` columns.

    Returns:
        A new DataFrame with the columns of ``data`` plus:
        ``delta`` (``close - open``) and ``to_predict`` (1 where ``delta`` is
        strictly positive, otherwise 0). The input frame is left untouched.
    """
    delta = data['close'] - data['open']
    # .assign builds a new frame, so re-running this cell never alters the
    # caller's data; the vectorised comparison replaces a per-row apply.
    return data.assign(
        delta=delta,
        to_predict=(delta > 0).astype('int64'),
    )

In [36]:
# Add the 'delta' and 'to_predict' columns used to build features and labels.
prepared_training_data_df = prepare_training_data(raw_data)

In [37]:
# Preview the first rows, now including 'delta' and 'to_predict'.
prepared_training_data_df.head()

Unnamed: 0,high,low,open,close,volume,adj_close,delta,to_predict
0,11206.439453,10089.314453,10796.930664,10583.134766,29378589324,10583.134766,-213.795898,0
1,10912.188477,9737.884766,10588.683594,10801.677734,31015895223,10801.677734,212.994141,1
2,11968.078125,10818.15625,10818.15625,11961.269531,30796494294,11961.269531,1143.113281,1
3,12006.075195,11166.569336,11972.71875,11215.4375,25920294033,11215.4375,-757.28125,0
4,11395.661133,10874.964844,11203.102539,10978.459961,23838480210,10978.459961,-224.642578,0


In [39]:
btc_mat = prepared_training_data_df.to_numpy()
WINDOW_SIZE = 14

# Pick the label column by name instead of by position in btc_mat, so a
# change in column order cannot silently feed a different column into X.
labels = prepared_training_data_df['to_predict'].to_numpy()

# Each row of X holds WINDOW_SIZE consecutive labels. The last window is
# dropped because no label follows it. Cast to float so X keeps the float
# dtype it had when it was sliced out of btc_mat.
X = rolling_window(labels.astype(float), WINDOW_SIZE)[:-1, :]

# Y[i] is the label that comes right after the window stored in X[i].
Y = labels[WINDOW_SIZE:]

assert len(X) == len(Y), "features and labels must stay aligned"

In [40]:
# Inspect the feature matrix: one row per window of WINDOW_SIZE consecutive labels.
X

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

In [41]:
# Inspect the target vector: the 'to_predict' values from position WINDOW_SIZE onward.
Y

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1])

# Training with MLflow

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import mlflow.sklearn
import mlflow

In [53]:
with mlflow.start_run():
    # Hold out 25% of the samples, stratified on Y, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=0.25, random_state=4284
    )

    clf = RandomForestClassifier(
        n_estimators=50,
        criterion='gini',
        bootstrap=True,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        random_state=4284,
        verbose=0,
    )
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Log the fitted model to the active run under "model_random_forest".
    mlflow.sklearn.log_model(clf, "model_random_forest")

    # (metric key, scoring function, class treated as positive), logged in this order.
    metric_specs = (
        ("precision_label_0", precision_score, 0),
        ("recall_label_0", recall_score, 0),
        ("f1score_label_0", f1_score, 0),
        ("precision_label_1", precision_score, 1),
        ("recall_label_1", recall_score, 1),
        ("f1score_label_1", f1_score, 1),
    )
    for metric_key, scorer, positive_class in metric_specs:
        mlflow.log_metric(
            metric_key, scorer(y_test, predicted, pos_label=positive_class)
        )

In [54]:
from sklearn.linear_model import LogisticRegression

In [55]:
with mlflow.start_run():
    # Hold out 25% of the samples, stratified on Y, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=0.25, random_state=4284
    )

    clf = LogisticRegression(verbose=0, random_state=4284)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Log the fitted model to the active run under "model_logistic".
    mlflow.sklearn.log_model(clf, "model_logistic")

    # (metric key, scoring function, class treated as positive), logged in this order.
    metric_specs = (
        ("precision_label_0", precision_score, 0),
        ("recall_label_0", recall_score, 0),
        ("f1score_label_0", f1_score, 0),
        ("precision_label_1", precision_score, 1),
        ("recall_label_1", recall_score, 1),
        ("f1score_label_1", f1_score, 1),
    )
    for metric_key, scorer, positive_class in metric_specs:
        mlflow.log_metric(
            metric_key, scorer(y_test, predicted, pos_label=positive_class)
        )

In [56]:
from sklearn import svm

In [57]:
with mlflow.start_run():
    # Hold out 25% of the samples, stratified on Y, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=0.25, random_state=4284
    )

    clf = svm.SVC(verbose=0, random_state=4284)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Log the fitted model to the active run under "model_svm".
    mlflow.sklearn.log_model(clf, "model_svm")

    # (metric key, scoring function, class treated as positive), logged in this order.
    metric_specs = (
        ("precision_label_0", precision_score, 0),
        ("recall_label_0", recall_score, 0),
        ("f1score_label_0", f1_score, 0),
        ("precision_label_1", precision_score, 1),
        ("recall_label_1", recall_score, 1),
        ("f1score_label_1", f1_score, 1),
    )
    for metric_key, scorer, positive_class in metric_specs:
        mlflow.log_metric(
            metric_key, scorer(y_test, predicted, pos_label=positive_class)
        )