In [1]:
import datetime
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scalecast import GridGenerator
from scalecast.Forecaster import Forecaster
from scalecast.SeriesTransformer import SeriesTransformer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, kpss
from tqdm.notebook import tqdm as log_progress

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn diagnostics
# (e.g. SettingWithCopyWarning, deprecation notices); consider narrowing it to
# the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
## Load the cleaned dataset.
# The location can be overridden with the CRYPTO_DATA_PATH environment variable
# so the notebook runs on machines other than the author's; when the variable
# is not set, the original absolute path is used unchanged.
file_path = os.environ.get(
    "CRYPTO_DATA_PATH",
    "/Users/justinfarnan_hakkoda/capstone_project/Capstone_Project/Cleaned_Data/cleaned_crypto_updated_module5.csv",
)
df = pd.read_csv(file_path)
# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0.1,Unnamed: 0,time,product_id,low,close,volume,volatility,pct_change,day_of_week,day_name,...,volatility_7d,volatility_14d,volatility_30d,momentum_1d,momentum_7d,momentum_30d,RSI_14,pct_change_lag_1,pct_change_lag_3,pct_change_lag_7
0,0,2021-07-16,ADA-USD,1.1661,1.1714,36764910.0,6.783295,0.001366,3,Thursday,...,0.145345,0.143173,0.260852,0.001366,0.031159,0.851289,70.002537,0.001366,0.001366,0.001366
1,1,2021-07-16,COMP-USD,367.1,367.89,23815.54,11.822392,0.030036,3,Thursday,...,0.171204,0.18125,0.28605,0.030036,0.136318,0.329202,58.238869,0.030036,0.030036,0.030036
2,2,2021-07-16,ETH-USD,1850.15,1877.22,184221.7,6.226522,0.012002,3,Thursday,...,0.136244,0.136448,0.220291,0.012002,0.132425,0.763741,88.586851,0.012002,0.012002,0.012002
3,3,2021-07-16,LINK-USD,15.13583,15.23963,2246281.0,7.91942,0.007881,3,Thursday,...,0.159056,0.224341,0.282708,0.007881,0.083315,0.832218,78.511587,0.007881,0.007881,0.007881
4,4,2021-07-16,LTC-USD,120.44,120.68,130554.6,6.127532,-0.005469,3,Thursday,...,0.137388,0.13971,0.222501,-0.005469,0.029831,0.530991,78.981735,-0.005469,-0.005469,-0.005469


In [5]:
# Remove the saved-index column ('Unnamed: 0') and the textual 'day_name'
# column; the numeric 'day_of_week' column is kept.
# errors="ignore" makes the cell safe to re-run: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0', 'day_name'], errors='ignore')
df.head()

Unnamed: 0,time,product_id,low,close,volume,volatility,pct_change,day_of_week,volume_lag_1,volume_lag_3,...,volatility_7d,volatility_14d,volatility_30d,momentum_1d,momentum_7d,momentum_30d,RSI_14,pct_change_lag_1,pct_change_lag_3,pct_change_lag_7
0,2021-07-16,ADA-USD,1.1661,1.1714,36764910.0,6.783295,0.001366,2,36764910.0,36764910.0,...,0.145345,0.143173,0.260852,0.001366,0.031159,0.851289,70.002537,0.001366,0.001366,0.001366
1,2021-07-16,COMP-USD,367.1,367.89,23815.54,11.822392,0.030036,2,23815.54,23815.54,...,0.171204,0.18125,0.28605,0.030036,0.136318,0.329202,58.238869,0.030036,0.030036,0.030036
2,2021-07-16,ETH-USD,1850.15,1877.22,184221.7,6.226522,0.012002,2,184221.7,184221.7,...,0.136244,0.136448,0.220291,0.012002,0.132425,0.763741,88.586851,0.012002,0.012002,0.012002
3,2021-07-16,LINK-USD,15.13583,15.23963,2246281.0,7.91942,0.007881,2,2246281.0,2246281.0,...,0.159056,0.224341,0.282708,0.007881,0.083315,0.832218,78.511587,0.007881,0.007881,0.007881
4,2021-07-16,LTC-USD,120.44,120.68,130554.6,6.127532,-0.005469,2,130554.6,130554.6,...,0.137388,0.13971,0.222501,-0.005469,0.029831,0.530991,78.981735,-0.005469,-0.005469,-0.005469


In [6]:
# Print a summary of `df`: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10764 entries, 0 to 10763
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              10764 non-null  object 
 1   product_id        10764 non-null  object 
 2   low               10764 non-null  float64
 3   close             10764 non-null  float64
 4   volume            10764 non-null  float64
 5   volatility        10764 non-null  float64
 6   pct_change        10764 non-null  float64
 7   day_of_week       10764 non-null  int64  
 8   volume_lag_1      10764 non-null  float64
 9   volume_lag_3      10764 non-null  float64
 10  volume_lag_7      10764 non-null  float64
 11  RSI               10764 non-null  float64
 12  MACD              10764 non-null  float64
 13  Signal_Line       10764 non-null  float64
 14  volatility_7d     10764 non-null  float64
 15  volatility_14d    10764 non-null  float64
 16  volatility_30d    10764 non-null  float6

In [11]:
# Use the 'time' column as a datetime index, then standardise the float columns.
#
# The index promotion is guarded so the cell can be re-run: after the first
# execution 'time' is no longer a column, and an unguarded df['time'] lookup
# would raise KeyError.
if 'time' in df.columns:
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time')

# Standardise every float64 column (zero mean, unit variance) across all
# products together. StandardScaler works column by column, so one call over
# all float columns gives the same values as scaling them one at a time.
# NOTE(review): this also rescales the 'pct_change' target, and the per-product
# loop further down standardises the non-target float columns a second time -
# confirm that both are intended.
scaler = StandardScaler()
float_columns = [name for name in df.columns if df[name].dtype == 'float']
if float_columns:
    df[float_columns] = scaler.fit_transform(df[float_columns])
df.head()


Unnamed: 0_level_0,product_id,low,close,volume,volatility,pct_change,day_of_week,volume_lag_1,volume_lag_3,volume_lag_7,...,volatility_7d,volatility_14d,volatility_30d,momentum_1d,momentum_7d,momentum_30d,RSI_14,pct_change_lag_1,pct_change_lag_3,pct_change_lag_7
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-07-16,ADA-USD,-0.330402,-0.33035,0.929873,-0.107785,0.011618,2,0.929625,0.928968,0.928107,...,0.566046,-0.230912,0.190376,0.011618,0.198782,2.436595,1.540159,0.012317,0.012046,0.009119
2021-07-16,COMP-USD,-0.299691,-0.300313,-0.371395,0.827461,0.614296,2,-0.371503,-0.371845,-0.372274,...,0.983513,0.267894,0.453747,0.614296,1.033567,0.860587,0.63791,0.615253,0.615804,0.614281
2021-07-16,ETH-USD,-0.175229,-0.176687,-0.365714,-0.211121,0.235195,2,-0.365822,-0.366166,-0.366596,...,0.419129,-0.319,-0.233571,0.235195,1.00266,2.172317,2.965538,0.23599,0.236025,0.233618
2021-07-16,LINK-USD,-0.32923,-0.329198,-0.292681,0.103077,0.148581,2,-0.292798,-0.293159,-0.293614,...,0.787406,0.83238,0.418822,0.148581,0.612812,2.379027,2.192786,0.149339,0.149255,0.146647
2021-07-16,LTC-USD,-0.320392,-0.320562,-0.367615,-0.229494,-0.132058,2,-0.367723,-0.368066,-0.368496,...,0.437592,-0.27627,-0.210469,-0.132058,0.188237,1.469722,2.228845,-0.131421,-0.131887,-0.13515


In [16]:
# Show the column labels of `df` after 'time' has been moved into the index.
df.columns

Index(['product_id', 'low', 'close', 'volume', 'volatility', 'pct_change',
       'day_of_week', 'volume_lag_1', 'volume_lag_3', 'volume_lag_7', 'RSI',
       'MACD', 'Signal_Line', 'volatility_7d', 'volatility_14d',
       'volatility_30d', 'momentum_1d', 'momentum_7d', 'momentum_30d',
       'RSI_14', 'pct_change_lag_1', 'pct_change_lag_3', 'pct_change_lag_7'],
      dtype='object')

In [27]:
# Build one scalecast Forecaster per cryptocurrency (one group per 'product_id').
# `results_dict` is meant to collect per-product results; nothing is stored in
# it by the live code below.
results_dict = {}
for product_id, group in df.groupby('product_id'):
    # Work on an independent copy so the column assignments below never write
    # into (or warn about) data derived from `df`.
    group = group.copy()

    # Standardise the float64 features within this product only. The target
    # 'pct_change' is deliberately excluded from this per-product scaling.
    scaler = StandardScaler()
    feature_columns = [
        column for column in group.columns
        if column != 'pct_change' and group[column].dtype == 'float64'
    ]
    if feature_columns:
        group[feature_columns] = scaler.fit_transform(group[feature_columns])

    # Candidate regressors: every column except the target and 'product_id',
    # which is a string label that is constant within a group.
    explanatory_vars = [
        col for col in group.columns if col not in ('pct_change', 'product_id')
    ]

    # Initialize the Forecaster on the target series.
    # Regressors are intentionally NOT passed here: extra keyword arguments such
    # as Xvars=[...] are only stored as attributes and do not register any
    # regressors (the printed object reports Xvars=[]). Add them afterwards
    # through Forecaster's own Xvar methods.
    f = Forecaster(
        y=group['pct_change'],                  # Target variable
        current_dates=group.index,              # Date index
        future_dates=24,                        # Number of periods to forecast
        test_length=12,                         # Length of test set
        cis=False,                              # Confidence intervals (True/False)
        metrics=['rmse', 'mae', 'mape', 'r2'],  # Metrics to evaluate
    )
    print(f)
#  # Set explanatory variables
#     # explanatory_vars = [col for col in group if col != 'pct_change']
#     # f.set_Xvars(explanatory_vars)
#    # Set estimator (e.g., XGBoost)
#     f.set_estimator('xgboost')

#     # Manually set the hyperparameters
#     f.manual_forecast(
#         hyperparameters={
#             'n_estimators': 100,
#             'learning_rate': 0.1,
#             'max_depth': 5,
#             'subsample': 0.8,
#             'colsample_bytree': 0.8,
#             'min_child_weight': 1,
#             'gamma': 0
#         }
#     )

#     # Automatically select best variables
#     f.auto_Xvar_select()

#     # Perform cross-validation and hyperparameter tuning
#     f.cross_validate(k=3)

#     # Automatically generate forecast
#     f.auto_forecast()

#     # Export results
#     results = f.export(['lvl_fcsts', 'model_summaries'])

#     # Store results in the dictionary
#     results_dict[product_id] = results

#     # Plot results
#     fig, ax = plt.subplots(2, 1, figsize=(12, 6))
#     f.plot_test_set(models=['xgboost'], order_by='TestSetRMSE', ax=ax[0])
#     f.plot(models=['xgboost'], order_by='TestSetRMSE', ax=ax[1])
#     plt.show()

# # Access results for a specific cryptocurrency
# for product_id, res in results_dict.items():
#     print(f"Results for {product_id}:")
#     print(res['model_summaries'])

Forecaster(
    DateStartActuals=2021-07-16T00:00:00.000000000
    DateEndActuals=2024-07-08T00:00:00.000000000
    Freq=None
    N_actuals=1084
    ForecastLength=24
    Xvars=[]
    TestLength=12
    ValidationMetric=rmse
    ForecastsEvaluated=[]
    CILevel=None
    CurrentEstimator=mlr
    GridsFile=Grids
)
Forecaster(
    DateStartActuals=2021-09-30T00:00:00.000000000
    DateEndActuals=2024-07-08T00:00:00.000000000
    Freq=None
    N_actuals=1008
    ForecastLength=24
    Xvars=[]
    TestLength=12
    ValidationMetric=rmse
    ForecastsEvaluated=[]
    CILevel=None
    CurrentEstimator=mlr
    GridsFile=Grids
)
Forecaster(
    DateStartActuals=2021-07-16T00:00:00.000000000
    DateEndActuals=2024-07-08T00:00:00.000000000
    Freq=None
    N_actuals=1084
    ForecastLength=24
    Xvars=[]
    TestLength=12
    ValidationMetric=rmse
    ForecastsEvaluated=[]
    CILevel=None
    CurrentEstimator=mlr
    GridsFile=Grids
)
Forecaster(
    DateStartActuals=2021-07-16T00:00:00.00000