# Classification

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import re

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import set_style

## import model objects
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix

## Read file

In [2]:
df = pd.read_csv('../Data/dataset_others_class.csv')

In [3]:
df = df.rename(columns={'DNZOY_Close_pred_class':'Prediction'})

In [4]:
df

Unnamed: 0,Date,ALV_Open,ALV_Close,ALV_High,ALV_Low,ALV_Volume,ALV_Change,ALV_Gain,ALV_Loss,ALV_Avg_Gain,...,DNZOY_RSI,DNZOY_k_percent,DNZOY_r_percent,DNZOY_MACD,DNZOY_MACD_EMA,DNZOY_ROC,DNZOY_PVT,DNZOY_Bollinger_Upper,DNZOY_Bollinger_Lower,Prediction
0,2019-11-22 00:00:00+00:00,80.980003,81.940002,82.019997,80.669998,361200.0,2.520004,2.520004,0.000000,0.556429,...,43.573069,9.649165,-90.350835,-0.024566,-0.008884,-2.023243,-152.544319,11.842262,11.222738,0
1,2019-11-25 00:00:00+00:00,82.250000,82.769997,82.860001,81.900002,356100.0,0.829994,0.829994,0.000000,0.485714,...,42.593532,6.140317,-93.859683,-0.031145,-0.013378,-2.489270,-162.386372,11.841552,11.224448,0
2,2019-11-26 00:00:00+00:00,82.580002,82.860001,83.470001,82.269997,347000.0,0.090004,0.090004,0.000000,0.492143,...,40.244797,0.000000,-100.000000,-0.038886,-0.018517,-3.947370,-215.467450,11.847754,11.201746,1
3,2019-11-27 00:00:00+00:00,82.760002,82.910004,83.360001,82.449997,252800.0,0.050003,0.050003,0.000000,0.384286,...,43.819375,7.758622,-92.241378,-0.041372,-0.023115,-3.810337,-122.404982,11.848555,11.185945,0
4,2019-11-29 00:00:00+00:00,82.360001,81.720001,82.389999,81.449997,218400.0,-1.190002,0.000000,1.190002,0.358572,...,35.402535,2.127706,-97.872294,-0.053469,-0.029215,-3.778450,-286.137099,11.855746,11.140755,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,2024-10-18 00:00:00+00:00,100.059998,99.519997,101.239998,98.099998,2354200.0,5.629997,5.629997,0.000000,0.792143,...,33.078516,10.227235,-89.772765,-0.229944,-0.153725,-4.976145,-333708.504661,15.434049,13.693951,0
1233,2024-10-21 00:00:00+00:00,99.559998,96.800003,100.099998,96.660004,1381400.0,-2.719994,0.000000,2.719994,0.792143,...,28.537447,2.793305,-97.206695,-0.249203,-0.172821,-6.385873,-335921.415099,15.433712,13.585288,0
1234,2024-10-22 00:00:00+00:00,97.169998,97.400002,97.739998,96.599998,966500.0,0.599998,0.599998,0.000000,0.810000,...,25.966725,2.564101,-97.435899,-0.269430,-0.192142,-6.938772,-336840.858332,15.407882,13.479118,1
1235,2024-10-23 00:00:00+00:00,97.389999,96.500000,98.080002,95.709999,468700.0,-0.900002,0.000000,0.900002,0.810000,...,36.523327,12.820505,-87.179495,-0.269443,-0.207602,-5.722072,-335645.537861,15.380513,13.410487,0


In [5]:
open_cols = [col for col in df.columns if '_Open' in col]
close_cols = [col for col in df.columns if '_Close' in col]
high_cols = [col for col in df.columns if '_High' in col]
low_cols = [col for col in df.columns if '_Low' in col]
vol_cols = [col for col in df.columns if '_Volume' in col]

In [6]:
close_cols

['ALV_Close',
 'F_Close',
 'GM_Close',
 'TM_Close',
 'TRYUSD=X_Close',
 'WFC_Close',
 'X_Close',
 'ALI=F_Close']

In [7]:
df.rename(columns={'DNZOY_Close_pred_class':'Prediction'})

Unnamed: 0,Date,ALV_Open,ALV_Close,ALV_High,ALV_Low,ALV_Volume,ALV_Change,ALV_Gain,ALV_Loss,ALV_Avg_Gain,...,DNZOY_RSI,DNZOY_k_percent,DNZOY_r_percent,DNZOY_MACD,DNZOY_MACD_EMA,DNZOY_ROC,DNZOY_PVT,DNZOY_Bollinger_Upper,DNZOY_Bollinger_Lower,Prediction
0,2019-11-22 00:00:00+00:00,80.980003,81.940002,82.019997,80.669998,361200.0,2.520004,2.520004,0.000000,0.556429,...,43.573069,9.649165,-90.350835,-0.024566,-0.008884,-2.023243,-152.544319,11.842262,11.222738,0
1,2019-11-25 00:00:00+00:00,82.250000,82.769997,82.860001,81.900002,356100.0,0.829994,0.829994,0.000000,0.485714,...,42.593532,6.140317,-93.859683,-0.031145,-0.013378,-2.489270,-162.386372,11.841552,11.224448,0
2,2019-11-26 00:00:00+00:00,82.580002,82.860001,83.470001,82.269997,347000.0,0.090004,0.090004,0.000000,0.492143,...,40.244797,0.000000,-100.000000,-0.038886,-0.018517,-3.947370,-215.467450,11.847754,11.201746,1
3,2019-11-27 00:00:00+00:00,82.760002,82.910004,83.360001,82.449997,252800.0,0.050003,0.050003,0.000000,0.384286,...,43.819375,7.758622,-92.241378,-0.041372,-0.023115,-3.810337,-122.404982,11.848555,11.185945,0
4,2019-11-29 00:00:00+00:00,82.360001,81.720001,82.389999,81.449997,218400.0,-1.190002,0.000000,1.190002,0.358572,...,35.402535,2.127706,-97.872294,-0.053469,-0.029215,-3.778450,-286.137099,11.855746,11.140755,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,2024-10-18 00:00:00+00:00,100.059998,99.519997,101.239998,98.099998,2354200.0,5.629997,5.629997,0.000000,0.792143,...,33.078516,10.227235,-89.772765,-0.229944,-0.153725,-4.976145,-333708.504661,15.434049,13.693951,0
1233,2024-10-21 00:00:00+00:00,99.559998,96.800003,100.099998,96.660004,1381400.0,-2.719994,0.000000,2.719994,0.792143,...,28.537447,2.793305,-97.206695,-0.249203,-0.172821,-6.385873,-335921.415099,15.433712,13.585288,0
1234,2024-10-22 00:00:00+00:00,97.169998,97.400002,97.739998,96.599998,966500.0,0.599998,0.599998,0.000000,0.810000,...,25.966725,2.564101,-97.435899,-0.269430,-0.192142,-6.938772,-336840.858332,15.407882,13.479118,1
1235,2024-10-23 00:00:00+00:00,97.389999,96.500000,98.080002,95.709999,468700.0,-0.900002,0.000000,0.900002,0.810000,...,36.523327,12.820505,-87.179495,-0.269443,-0.207602,-5.722072,-335645.537861,15.380513,13.410487,0


In [8]:
df = df.drop(columns=open_cols)
df = df.drop(columns=close_cols)
df = df.drop(columns=high_cols)
df = df.drop(columns=low_cols)
df = df.drop(columns=vol_cols)

df

Unnamed: 0,Date,ALV_Change,ALV_Gain,ALV_Loss,ALV_Avg_Gain,ALV_Avg_Loss,ALV_SMA,ALV_EMA,ALV_change_in_price,ALV_RSI,...,DNZOY_change_in_price,DNZOY_RSI,DNZOY_k_percent,DNZOY_r_percent,DNZOY_MACD,DNZOY_MACD_EMA,DNZOY_ROC,DNZOY_PVT,DNZOY_Bollinger_Upper,Prediction
0,2019-11-22 00:00:00+00:00,2.520004,2.520004,0.000000,0.556429,0.612857,82.027999,81.850204,2.520004,51.234749,...,-0.175000,43.573069,9.649165,-90.350835,-0.024566,-0.008884,-2.023243,-152.544319,11.842262,0
1,2019-11-25 00:00:00+00:00,0.829994,0.829994,0.000000,0.485714,0.612857,82.092499,81.937803,0.829994,55.655907,...,-0.020000,42.593532,6.140317,-93.859683,-0.031145,-0.013378,-2.489270,-162.386372,11.841552,0
2,2019-11-26 00:00:00+00:00,0.090004,0.090004,0.000000,0.492143,0.601428,82.249999,82.025631,0.090004,56.153297,...,-0.045000,40.244797,0.000000,-100.000000,-0.038886,-0.018517,-3.947370,-215.467450,11.847754,1
3,2019-11-27 00:00:00+00:00,0.050003,0.050003,0.000000,0.384286,0.601428,82.491000,82.109857,0.050003,56.466315,...,0.045000,43.819375,7.758622,-92.241378,-0.041372,-0.023115,-3.810337,-122.404982,11.848555,0
4,2019-11-29 00:00:00+00:00,-1.190002,0.000000,1.190002,0.358572,0.686429,82.685000,82.072728,-1.190002,47.211290,...,-0.155000,35.402535,2.127706,-97.872294,-0.053469,-0.029215,-3.778450,-286.137099,11.855746,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,2024-10-18 00:00:00+00:00,5.629997,5.629997,0.000000,0.792143,0.495000,93.786499,94.758369,5.629997,72.013171,...,0.160000,33.078516,10.227235,-89.772765,-0.229944,-0.153725,-4.976145,-333708.504661,15.434049,0
1233,2024-10-21 00:00:00+00:00,-2.719994,0.000000,2.719994,0.792143,0.547143,93.984000,94.952811,-2.719994,57.640864,...,-0.160000,28.537447,2.793305,-97.206695,-0.249203,-0.172821,-6.385873,-335921.415099,15.433712,0
1234,2024-10-22 00:00:00+00:00,0.599998,0.599998,0.000000,0.810000,0.547143,94.204500,95.185876,0.599998,59.688594,...,-0.099999,25.966725,2.564101,-97.435899,-0.269430,-0.192142,-6.938772,-336840.858332,15.407882,1
1235,2024-10-23 00:00:00+00:00,-0.900002,0.000000,0.900002,0.810000,0.575714,94.374500,95.311031,-0.900002,55.080078,...,0.160000,36.523327,12.820505,-87.179495,-0.269443,-0.207602,-5.722072,-335645.537861,15.380513,0


In [9]:
#correlation = df.loc[:, df.columns != 'Date'].corr()
#plt.figure(figsize=(15,15))
#plt.title('Correlation Matrix')
#sns.heatmap(correlation, vmax=1, square=True,annot=True)

## Classification models

In [10]:
tree = DecisionTreeClassifier(
    #max_depth = 10, 
    min_samples_leaf = 5, # minimum number of samples in each leaf, to prevent overfitting
    random_state= 216)

rf = RandomForestClassifier(
    n_estimators = 500, # number of trees in ensemble
    #max_depth = 10, # max_depth of each tree
    min_samples_leaf = 5, 
    max_features = 2, # default is round(sqrt(num_features)), which in this case is 1.
    bootstrap= True, # sampling with replacement
    max_samples = 500, # number of training samples selected with replacement to build tree
    random_state = 216 # for consistency
    )

rand_first_clf = RandomForestClassifier(n_estimators = 100, oob_score = True, criterion = "gini", random_state = 0)

In [11]:
Y = df['Prediction']
X = df.loc[:, (df.columns != 'Date') & (df.columns != 'Prediction')]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=1)

In [13]:
## Fit the models
tree.fit(X_train, Y_train)
print(accuracy_score(Y_test, tree.predict(X_test)))

## rf and et may take slightly longer because we are fitting many trees
rf.fit(X_train, Y_train)
print(accuracy_score(Y_test, rf.predict(X_test)))

rand_first_clf.fit(X_train, Y_train)
print(accuracy_score(Y_test, rand_first_clf.predict(X_test)))

0.5161290322580645
0.5040322580645161
0.5483870967741935


In [15]:
## Confusion_matrix
print('Decision Tree')
print(confusion_matrix(Y_test, tree.predict(X_test)))
print('---------------------')
print('Random Forest 1')
print(confusion_matrix(Y_test, rf.predict(X_test)))
print('---------------------')
print('Random Forest 2')
print(confusion_matrix(Y_test, rand_first_clf.predict(X_test)))

Decision Tree
[[72 53]
 [67 56]]
---------------------
Random Forest 1
[[61 64]
 [59 64]]
---------------------
Random Forest 2
[[72 53]
 [59 64]]
