<a href="https://colab.research.google.com/github/DanielDekhtyar/AI-Accelerator/blob/main/Final%20Project/notebooks/Classical%20ML%20models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the S&P 500 price data from a CSV file into a DataFrame.
# The path is relative, so 'snp_data.csv' must be in the notebook's working directory.
snp = pd.read_csv('snp_data.csv')

In [3]:
# Preview the first three rows to check the columns that were loaded
snp.head(3)

Unnamed: 0,Date,Close,High,Low,Open,Volume,Change,label
0,2009-02-18,58.63269,59.307826,58.076262,59.196539,362964800,-0.140957,0.0
1,2009-02-19,58.002068,59.46362,57.88336,59.233626,316867500,-0.630623,-1.0
2,2009-02-20,57.438248,58.120798,56.214104,56.926337,477176600,-0.56382,-1.0


In [4]:
# Build the feature matrix X and the target vector y for the models.
# Dropped columns:
#   'Date'   - a date string, not a numeric feature the models can train on as-is
#   'Change' - per the authors' note it gives away the label (data leakage)
#   'label'  - the prediction target itself
X = snp.drop(columns=['Date', 'Change', 'label'])
y = snp['label']

In [5]:
# Show the first feature row to confirm which columns remain after the drop
X.head(1)

Unnamed: 0,Close,High,Low,Open,Volume
0,58.63269,59.307826,58.076262,59.196539,362964800


In [6]:
# Show the first target value
y.head(1)

Unnamed: 0,label
0,0.0


In [7]:
# We tried splitting the data 80/20 chronologically instead of randomly (to preserve the order of the dates), but we found it was less effective than train_test_split

#split_size = int(len(snp) * 0.8)
#X_train = X.iloc[:split_size]
#X_test = X.iloc[split_size:]
#y_train = y.iloc[:split_size]
#y_test = y.iloc[split_size:]

In [8]:
# Random 80/20 train/test split; the fixed seed keeps the partition reproducible.
# NOTE(review): the rows are time-ordered, and a shuffled split puts days that come
# after some test days into the training set. Scores from this split may therefore be
# optimistic compared with a chronological split (shuffle=False) - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Display the training features; the row index is out of order because the split shuffles the rows
X_train

Unnamed: 0,Close,High,Low,Open,Volume
942,110.514877,111.685880,110.170000,110.314364,123018300
598,104.617867,104.758488,102.946106,103.188275,202385700
2674,265.760864,268.943806,264.450804,268.934571,122539500
2210,231.186447,231.648402,230.982107,231.346351,52274900
789,111.004318,111.385282,110.591600,111.290044,146896000
...,...,...,...,...,...
1638,172.968475,176.631869,172.909108,175.122413,194327900
1095,129.333939,130.581847,128.395960,129.260523,162262200
1130,137.628906,138.493469,137.596274,138.273246,79829200
1294,154.852386,154.902046,153.172178,153.619138,100254000


In [10]:
# Standardize the features. The scaler is fitted on the training rows only, and the same
# fitted parameters are then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Display the standardized training features (the scaler returns a NumPy array, not a DataFrame)
X_train_scaled

array([[-0.82709007, -0.81833901, -0.82149358, -0.82959922, -0.15850414],
       [-0.91729984, -0.92385328, -0.93252366, -0.93862118,  0.78499381],
       [ 1.54779199,  1.57692837,  1.54977724,  1.59712953, -0.16419598],
       ...,
       [-0.41231206, -0.41002036, -0.39995622, -0.4018566 , -0.67192434],
       [-0.14883515, -0.16009383, -0.16055711, -0.16707999, -0.42911991],
       [-0.85351986, -0.85908827, -0.86494169, -0.85772397,  0.02982123]])

In [12]:
# K-Nearest Neighbors classifier with k = 12
knn = KNeighborsClassifier(n_neighbors=12).fit(X_train_scaled, y_train)

# Predict on both splits so the train/test accuracy gap can be inspected
y_train_pred = knn.predict(X_train_scaled)
y_test_pred = knn.predict(X_test_scaled)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Printed in order: train accuracy, test accuracy, train - test gap
print(train_accuracy, test_accuracy, train_accuracy - test_accuracy, sep="\n")

# Poor results: the test accuracy is low and below the train accuracy

0.5741878841088675
0.48596491228070177
0.08822297182816569


In [13]:
# Random Forest classifier
rf = RandomForestClassifier(
    n_estimators=200,       # number of trees in the forest
    max_depth=40,
    min_samples_split=4,
    max_samples=200,        # rows drawn to train each tree
    random_state=42,        # fixed seed so the forest is reproducible
).fit(X_train_scaled, y_train)

# Predict on both splits so the train/test accuracy gap can be inspected
y_train_pred = rf.predict(X_train_scaled)
y_test_pred = rf.predict(X_test_scaled)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Printed in order: train accuracy, test accuracy, train - test gap
print(train_accuracy, test_accuracy, train_accuracy - test_accuracy, sep="\n")

# Poor results: the test accuracy is low and below the train accuracy

0.6062335381913959
0.5
0.10623353819139592


In [14]:
# Decision Tree classifier with the default (unconstrained) growth settings.
# random_state is fixed, matching the seed used for the split and the random forest,
# so repeated runs build the same tree; without it scikit-learn may pick differently
# among equally good candidate splits from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

# Predict on both splits so the train/test accuracy gap can be inspected
y_train_pred = dt.predict(X_train_scaled)
y_test_pred = dt.predict(X_test_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Printed in order: train accuracy, test accuracy, train - test gap
print(train_accuracy)
print(test_accuracy)
print(train_accuracy - test_accuracy)

# The decision tree is VERY overfitted: it fits the training set far better than the test set

1.0
0.4192982456140351
0.5807017543859649


In [15]:
# Logistic Regression classifier with scikit-learn's default settings
lr = LogisticRegression().fit(X_train_scaled, y_train)

# Predict on both splits so the train/test accuracy gap can be inspected
y_train_pred = lr.predict(X_train_scaled)
y_test_pred = lr.predict(X_test_scaled)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Printed in order: train accuracy, test accuracy, train - test gap
print(train_accuracy, test_accuracy, train_accuracy - test_accuracy, sep="\n")

# Poor results: train and test accuracy are close to each other, but both are low

0.5447761194029851
0.5543859649122806
-0.009609845509295556
