Baseline Modelling

In [1]:
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm
from scipy.stats import pearsonr
from statsmodels.api import tsa
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [2]:
#reading csv with columns because of the index column
df = pd.read_csv("../data/FinalData.csv",usecols=['date', 'open', 'high', 'low', 'close', 'Volume BTC','Volume USD'])
#converting date column to date time
df['date'] = pd.to_datetime(df['date'])
#setting date as index for easier timer series analysis
df.set_index('date', inplace=True)

In [3]:
# Simple Moving Averages (SMA)
df['SMA_20'] = df['close'].rolling(window=20).mean()
df['SMA_50'] = df['close'].rolling(window=50).mean()

# Exponential Moving Averages (EMA)
df['EMA_20'] = df['close'].ewm(span=20, adjust=False).mean()
df['EMA_50'] = df['close'].ewm(span=50, adjust=False).mean()

In [4]:
#dropping NA values because of moving average creation
df.dropna(inplace=True)

In [5]:
# Defining target variable and features
target = 'close'
features = ['open', 'high', 'low', 'Volume BTC', 'Volume USD', 'SMA_20', 'SMA_50']

In [6]:
X = df[features]
y = df[target]

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

Building some baseline models and evaluating them

In [8]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [9]:
m2_lr = mean_squared_error(y_test, y_pred_lr)
ma_lr =  mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

In [10]:
print(f'For our linear regression model the R squared value is {r2_lr}')
print(f'For our linear regression model the mean squared error is {m2_lr}')
print(f'For our linear regression model the mean absolute error is {ma_lr}')

For our linear regression model the R squared value is 0.9999781528126838
For our linear regression model the mean squared error is 7414.478632507588
For our linear regression model the mean absolute error is 40.42892872276092


In [11]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [12]:
print(f"DT R^2 score on training set: {dt_model.score(X_train, y_train):0.3f}")
print(f"DT R^2 score on test set: {dt_model.score(X_test, y_test):0.3f}")
print(f'For our decision tree model the mean squared error is {mean_squared_error(y_test, y_pred_dt)}')
print(f'For our decision tree model the mean absolute error is {mean_absolute_error(y_test, y_pred_dt)}')

DT R^2 score on training set: 1.000
DT R^2 score on test set: 1.000
For our decision tree model the mean squared error is 18220.243790678684
For our decision tree model the mean absolute error is 60.32519661947799


Sprint three stuff below please ignore it is work in progress

Finding the best parameters

In [13]:
estimators = [('dim_reducer', PCA()),
              ('model', DecisionTreeRegressor())]
pipe = Pipeline(estimators)

In [14]:
param_grid = [
    {
        'dim_reducer': [PCA(), None],
        'model': [DecisionTreeRegressor()],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    },
    {
        'dim_reducer': [PCA(), None],
        'model': [RandomForestRegressor()],
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    },
    {
        'dim_reducer': [PCA(), None],
        'model': [GradientBoostingRegressor()],
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.05],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.8, 0.9, 1.0],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }
]

In [15]:
grid = GridSearchCV(pipe, param_grid, cv=5)
fitted_grid = grid.fit(X_train, y_train)


KeyboardInterrupt: 

In [None]:
best_model = fitted_grid.best_estimator_
best_params = fitted_grid.best_params_

In [None]:
print("Best Parameters:", best_params)
print("Best Model:", best_model)