# **Bitcoin price forecasting - Random Forest**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [1]:
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
SLOW_OPERATION = False

In [2]:
#Install some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [3]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=b1d2fa636bd25323b23b146fb7a6e0f877f07a7bc87e95d72ddd67eb56893a46
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [4]:
# Define GDrive paths
GDRIVE_DIR = "/content/drive"

GDRIVE_DATASET_RAW_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"

GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1h"
GDRIVE_DATASET_NAME_TRAIN = GDRIVE_DATASET_NAME + "_train"
GDRIVE_DATASET_NAME_VALID = GDRIVE_DATASET_NAME + "_valid"

GDRIVE_DATASET_NAME_EXT = "/" + GDRIVE_DATASET_NAME + ".parquet"
GDRIVE_DATASET_NAME_EXT_TRAIN  = "/" + GDRIVE_DATASET_NAME_TRAIN + ".parquet"
GDRIVE_DATASET_NAME_EXT_VALID = "/" + GDRIVE_DATASET_NAME_VALID + ".parquet"

GDRIVE_DATASET = GDRIVE_DATASET_RAW_DIR + GDRIVE_DATASET_NAME_EXT
GDRIVE_DATASET_TRAIN = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TRAIN
GDRIVE_DATASET_VALID = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_VALID

In [5]:
# Point Colaboratory to our Google Drive
from google.colab import drive

drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# Training the model ❗

In [6]:
# Load datasets into pyspark dataframe objects
train_df = spark.read.load(GDRIVE_DATASET_TRAIN,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

valid_df = spark.read.load(GDRIVE_DATASET_VALID,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [7]:
# Function that create a simple rf model (with no hyperparameter tuning)
def simple_random_forest_model(train, featureCol, labelCol):
  rf = RandomForestRegressor(featuresCol=featureCol, labelCol=labelCol)
  rf_model = rf.fit(train)
  return rf_model

In [8]:
# Define the evaluation metrics
# Notice that r2_adj metric is included when calculating r2
metrics = ['mse', 'rmse', 'mae', 'r2']

In [9]:
# Function to compute the r2 adjusted metric
# r2 is the r2 metric, n is the number of observations, k is the number of features
def compute_r2adj(r2, n, k):
  return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

In [10]:
# Function to evaluate a model
def evaluate_models(predictions, modelName, typeName, label, prediction, metrics):
  r2 = None
  for metric in metrics:
    evaluator = RegressionEvaluator(labelCol=label, predictionCol=prediction, metricName=metric)
    evaluation = evaluator.evaluate(predictions)
    print(metric.upper()+' for '+modelName+' on '+typeName+' set: '+str(evaluation))
    if metric == 'r2':
      print('R2_adj'+' for '+modelName+' on '+typeName+' set: '+str(compute_r2adj(evaluation, predictions.count(), len(predictions.columns))))

In [11]:
# Function that create simple models (without hyperparameter tuning) and evaluate them
def test_best_features(train_data, valid_data, features, featureCol, labelCol, metrics = ['rmse', 'r2']):
  # Train the models
  rf = simple_random_forest_model(train_data, featureCol, labelCol)

  # Training set evaluation
  rf_training = rf.transform(train_data)
  evaluate_models(rf_training, 'random forest regression', 'training', labelCol, 'prediction', metrics)

  # validation set evaluation
  rf_predictions = rf.transform(valid_data)
  evaluate_models(rf_predictions, 'random forest regression', 'validation', labelCol, 'prediction', metrics)

  return rf_training, rf_predictions

In [12]:
# Retrieve all / cor_matrix / gb features
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"

GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_GB_FEATURES_NAME = "gb_features"

GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = "/" + GDRIVE_COR_MATRIX_FEATURES_NAME + ".json"
GDRIVE_GB_FEATURES_NAME_EXT = "/" + GDRIVE_GB_FEATURES_NAME + ".json"

GDRIVE_COR_MATRIX_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_COR_MATRIX_FEATURES_NAME_EXT
GDRIVE_GB_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_GB_FEATURES_NAME_EXT

In [13]:
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns
gb_features = spark.read.json(GDRIVE_GB_FEATURES).columns

all_features = train_df.columns[1:-2]

# Set the depended variable
dep_var = 'market-price'

In [14]:
# Return the dataset with the selected features
def select_features(dataset, features):
  vectorAssembler = VectorAssembler(
    inputCols = features,
    outputCol = 'features')

  dataset = vectorAssembler.transform(dataset)
  dataset = dataset.select(['timestamp','index', 'features', dep_var])
  return dataset

In [15]:
def show_results(train, valid, training, predictions):
  trace1 = go.Scatter(
      x = train['timestamp'],
      y = train['market-price'].astype(float),
      mode = 'lines',
      name = 'Train'
  )

  trace2 = go.Scatter(
      x = valid['timestamp'],
      y = valid['market-price'].astype(float),
      mode = 'lines',
      name = 'Validation'
  )

  trace3 = go.Scatter(
      x = training['timestamp'],
      y = training['prediction'].astype(float),
      mode = 'lines',
      name = 'Training'
  )

  trace4 = go.Scatter(
      x = predictions['timestamp'],
      y = predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Prediction'
  )

  layout = dict(
      title='Train, valid and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3, trace4]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, valid and prediction set with Rangeslider")

In [16]:
# valid performances with all the features
rf_training, rf_predictions = test_best_features(select_features(train_df, all_features), select_features(valid_df, all_features), all_features, 'features', dep_var)

RMSE for random forest regression on training set: 1030.237617924664
R2 for random forest regression on training set: 0.9961708375803587
R2_adj for random forest regression on training set: 0.9961704771614641
RMSE for random forest regression on validation set: 10889.513912811677
R2 for random forest regression on validation set: -0.4071755930701335
R2_adj for random forest regression on validation set: -0.40788236132381006


In [17]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [18]:
# valid features with the corr matrix features
rf_training, rf_predictions = test_best_features(select_features(train_df, cor_matrix_features), select_features(valid_df, cor_matrix_features), cor_matrix_features, 'features', dep_var)

RMSE for random forest regression on training set: 1027.2996858734302
R2 for random forest regression on training set: 0.9961926457112859
R2_adj for random forest regression on training set: 0.9961922873450758
RMSE for random forest regression on validation set: 10626.445542270025
R2 for random forest regression on validation set: -0.3400078651620926
R2_adj for random forest regression on validation set: -0.3406808977412801


In [19]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [20]:
# valid performances with gb features
rf_training, rf_predictions = test_best_features(select_features(train_df, gb_features), select_features(valid_df, gb_features), gb_features, 'features', dep_var)

RMSE for random forest regression on training set: 946.9678396558386
R2 for random forest regression on training set: 0.9967648126123174
R2_adj for random forest regression on training set: 0.9967645081011648
RMSE for random forest regression on validation set: 11461.241369860953
R2 for random forest regression on validation set: -0.5588151681255777
R2_adj for random forest regression on validation set: -0.5595980988981171


In [21]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

# Import my utils ❗

In [63]:
import sys
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"
sys.path.append(GDRIVE_UTILITIES_DIR)

import utilities

import importlib
importlib.reload(utilities)

<module 'utilities' from '/content/drive/MyDrive/BDC/project/utilities/utilities.py'>

# Parameter tuning ❗

In [23]:
combined_df = train_df.union(valid_df)

In [24]:
# def show_results(train, valid, pred):
#   trace1 = go.Scatter(
#       x = train['timestamp'],
#       y = train['market-price'].astype(float),
#       mode = 'lines',
#       name = 'Train'
#   )

#   trace2 = go.Scatter(
#       x = valid['timestamp'],
#       y = valid['market-price'].astype(float),
#       mode = 'lines',
#       name = 'Valid'
#   )


#   trace3 = go.Scatter(
#       x = pred['timestamp'],
#       y = pred['prediction'].astype(float),
#       mode = 'lines',
#       name = 'Prediction'
#   )


#   layout = dict(
#       title='Train, valid and prediction set with Rangeslider',
#       xaxis=dict(
#           rangeselector=dict(
#               buttons=list([
#                   #change the count to desired amount of months.
#                   dict(count=1,
#                       label='1m',
#                       step='month',
#                       stepmode='backward'),
#                   dict(count=6,
#                       label='6m',
#                       step='month',
#                       stepmode='backward'),
#                   dict(count=12,
#                       label='1y',
#                       step='month',
#                       stepmode='backward'),
#                   dict(count=36,
#                       label='3y',
#                       step='month',
#                       stepmode='backward'),
#                   dict(step='all')
#               ])
#           ),
#           rangeslider=dict(
#               visible = True
#           ),
#           type='date'
#       )
#   )

#   data = [trace1,trace2,trace3]
#   fig = dict(data=data, layout=layout)
#   iplot(fig, filename = "Train, valid and prediction set with Rangeslider")

In [25]:
# '''
# Description: Split and keep the original time-series order
# Args:
#     dataSet: The dataSet which needs to be splited
#     proportion: A number represents the split proportion

# Return:
#     train_data: The train dataSet
#     test_data: The test dataSet
# '''
# def trainSplit(dataset, proportion):
#     records_num = dataset.count()
#     split_point = int(records_num * proportion)

#     train_data = dataset.filter(dataset.index < split_point)
#     test_data = dataset.filter(dataset.index >= split_point)

#     return (train_data,test_data)

In [26]:
# import time
# from sklearn.metrics import mean_absolute_percentage_error
# from itertools import product

# '''
# Description: Use Grid Search to tune the Model
# Args:
#     dataSet: The dataSet which needs to be splited
#     proportion_lst: A list represents the split proportion
#     feature_col: The column name of features
#     label_col: The column name of label
#     ml_model: The module to use
#     params: Parameters which want to test
#     assembler: An assembler to dataSet
# Return:
#     results_df: The best result in a pandas dataframe
# '''
# def autoTuning(dataSet, proportion_lst, feature_col, label_col, ml_model, params, assembler):

#     # Initialize the best result for comparison
#     result_best = {"RMSE": float('inf')}
#     predictions_best = pd.DataFrame()

#     # Try different proportions
#     for proportion in proportion_lst:
#         # Split the dataSet
#         train_data,test_data = trainSplit(dataSet, proportion)

#         # Cache it
#         train_data.cache()
#         test_data.cache()

#         # ALL combination of params
#         param_lst = [dict(zip(params, param)) for param in product(*params.values())]

#         for param in param_lst:
#             # Chosen Model
#             # if ml_model == "LinearRegression":
#             #     model = LinearRegression(featuresCol=feature_col, \
#             #                              labelCol=label_col, \
#             #                              maxIter=param['maxIter'], \
#             #                              regParam=param['regParam'], \
#             #                              elasticNetParam=param['elasticNetParam'])

#             # elif ml_model == "GeneralizedLinearRegression":
#             #     model = GeneralizedLinearRegression(featuresCol=feature_col, \
#             #                                         labelCol=label_col, \
#             #                                         maxIter=param['maxIter'], \
#             #                                         regParam=param['regParam'], \
#             #                                         family=param['family'], \
#             #                                         link=param['link'])

#             # elif ml_model == "DecisionTree":
#             #     model = DecisionTreeRegressor(featuresCol=feature_col, \
#             #                                   labelCol=label_col, \
#             #                                   maxDepth = param["maxDepth"])

#             if ml_model == "RandomForest":
#                 model = RandomForestRegressor(featuresCol=feature_col, \
#                                               labelCol=label_col, \
#                                               numTrees = param["numTrees"], \
#                                               maxDepth = param["maxDepth"])

#             # elif ml_model == "GBTRegression":
#             #     model = GBTRegressor(featuresCol=feature_col, \
#             #                          labelCol=label_col, \
#             #                          maxIter = param['maxIter'], \
#             #                          maxDepth = param['maxDepth'], \
#             #                          stepSize = param['stepSize'])

#             # Chain assembler and model in a Pipeline
#             pipeline = Pipeline(stages=[assembler, model])
#             # Train a model and calculate running time
#             start = time.time()
#             pipeline_model = pipeline.fit(train_data)
#             end = time.time()

#             # Make predictions
#             predictions = pipeline_model.transform(test_data)

#             # Compute test error by several evaluators
#             # https://spark.apache.org/docs/3.1.1/mllib-evaluation-metrics.html#regression-model-evaluation
#             # https://spark.apache.org/docs/3.1.1/api/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.html
#             rmse_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='rmse')
#             mae_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='mae')
#             r2_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='r2')
#             var_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='var')

#             predictions_pd = predictions.select(dep_var,"prediction").toPandas()
#             mape = mean_absolute_percentage_error(predictions_pd[dep_var], predictions_pd["prediction"])

#             rmse = rmse_evaluator.evaluate(predictions)
#             mae = mae_evaluator.evaluate(predictions)
#             var = var_evaluator.evaluate(predictions)
#             r2 = r2_evaluator.evaluate(predictions)
#             # Adjusted R-squared
#             n = predictions.count()
#             p = len(predictions.columns)
#             adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)

#             # Use dict to store each result
#             results = {
#                 "Model": ml_model,
#                 "Proportion": proportion,
#                 "Parameters": [list(param.values())],
#                 "RMSE": rmse,
#                 "MAPE":mape,
#                 "MAE": mae,
#                 "Variance": var,
#                 "R2": r2,
#                 "Adjusted_R2": adj_r2,
#                 "Time": end - start,
#                 "Predictions": predictions.select(dep_var,"prediction",'timestamp')
#             }

#             # # Virtualization
#             # show_results(train_data.toPandas(), test_data.toPandas(), predictions.toPandas())

#             # Only store the lowest RMSE
#             if results['RMSE'] < result_best['RMSE']:
#                 result_best = results
#                 predictions_best = predictions.toPandas()

#         # Release Cache
#         train_data.unpersist()
#         test_data.unpersist()

#     # Transform dict to pandas dataframe
#     results_df = pd.DataFrame(result_best)
#     return results_df, predictions_best.toPandas()

In [28]:
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns
gb_features = spark.read.json(GDRIVE_GB_FEATURES).columns

all_features = train_df.columns[1:-2]

# Set the depended variable
dep_var = 'market-price'

In [29]:
vector_assembler = VectorAssembler(
  inputCols = cor_matrix_features,
  outputCol = 'features')

In [27]:
# Split proportion list
proportion_lst = [0.6, 0.7, 0.8, 0.9]

In [None]:
# RandomForest
rf_params = {
    'numTrees' : [3, 5, 10, 20, 30],# Number of trees to train, >=1, default:20
    'maxDepth' : [3, 5, 10] # Maximum depth of the tree, <=30, default:5
}

In [30]:
result_rf, train_rf, valid_rf = utilities.autoTuning(combined_df, proportion_lst, "features", dep_var, "RandomForest", rf_params, vector_assembler)
result_rf


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'd

Unnamed: 0,Model,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time,Predictions
0,RandomForest,0.9,"[3, 5]",2298.536834,0.0978,1901.84965,6451369.0,0.113702,0.111024,2.712295,"DataFrame[market-price: double, prediction: do..."


In [31]:
# Virtualization
utilities.show_results(train_rf, valid_rf, predictions_rf)

# Time Series Cross Validation ❗

In [32]:
# '''
# Description: Blocked Time Series Cross Validation
# Args:
#     num: Number of DataSet
#     n_splits: Split times
# Return:
#     split_position_df: All set of splits position in a Pandas dataframe
# '''
# def blockedTsCrossValidation(num, n_splits):
#     kfold_size = num // n_splits

#     split_position_lst = []
#     # Calculate the split position for each time
#     for i in range(n_splits):
#         # Calculate the start/split/end point for each fold
#         start = i * kfold_size
#         end = start + kfold_size
#         # Manually set train-test split proportion in each fold
#         split = int(0.8 * (end - start)) + start
#         split_position_lst.append((start,split,end))

#     # Transform the split position list to a Pandas Dataframe
#     split_position_df = pd.DataFrame(split_position_lst,columns=['start','split','end'])
#     return split_position_df

In [33]:
# '''
# Description: Cross Validation on Time Series data
# Args:
#     dataSet: The dataSet which needs to be splited
#     feature_col: The column name of features
#     label_col: The column name of label
#     ml_model: The module to use
#     params: Parameters which want to test
#     assembler: An assembler to dataSet
#     cv_info: The type of Cross Validation
# Return:
#     tsCv_df: All the splits performance of each model in a pandas dataframe
# '''
# def tsCrossValidation(dataSet, feature_col, label_col, ml_model, params, assembler, cv_info):

#     # Get the number of samples
#     num = dataSet.count()

#     # Save results in a list
#     result_lst = []

#     # Initialize the best result for comparison
#     result_best = {"RMSE": float('inf')}
#     predictions_best = pd.DataFrame()

#     # ALL combination of params
#     param_lst = [dict(zip(params, param)) for param in product(*params.values())]

#     for param in param_lst:
#         # Chosen Model
#         # if ml_model == "LinearRegression":
#         #     model = LinearRegression(featuresCol=feature_col, \
#         #                              labelCol=label_col, \
#         #                              maxIter=param['maxIter'], \
#         #                              regParam=param['regParam'], \
#         #                              elasticNetParam=param['elasticNetParam'])

#         # elif ml_model == "GeneralizedLinearRegression":
#         #     model = GeneralizedLinearRegression(featuresCol=feature_col, \
#         #                                         labelCol=label_col, \
#         #                                         maxIter=param['maxIter'], \
#         #                                         regParam=param['regParam'], \
#         #                                         family=param['family'], \
#         #                                         link=param['link'])

#         # elif ml_model == "DecisionTree":
#         #     model = DecisionTreeRegressor(featuresCol=feature_col, \
#         #                                   labelCol=label_col, \
#         #                                   maxDepth = param["maxDepth"])

#         if ml_model == "RandomForest":
#             model = RandomForestRegressor(featuresCol=feature_col, \
#                                           labelCol=label_col, \
#                                           numTrees = param["numTrees"], \
#                                           maxDepth = param["maxDepth"])

#         # elif ml_model == "GBTRegression":
#         #     model = GBTRegressor(featuresCol=feature_col, \
#         #                          labelCol=label_col, \
#         #                          maxIter = param['maxIter'], \
#         #                          maxDepth = param['maxDepth'], \
#         #                          stepSize = param['stepSize'])

#         # # Identify the type of Cross Validation
#         # if cv_info['cv_type'] == 'mulTs':
#         #     split_position_df = mulTsCrossValidation(num, cv_info['kSplits'])
#         if cv_info['cv_type'] == 'blkTs':
#              split_position_df = blockedTsCrossValidation(num, cv_info['kSplits'])
#         # elif cv_info['cv_type'] == 'wfTs':
#         #     split_position_df = wfTsCrossValidation(num, cv_info['min_obser'], cv_info['expand_window'])

#         for position in split_position_df.itertuples():
#             # Get the start/split/end position from a kind of Time Series Cross Validation
#             start = getattr(position, 'start')
#             splits = getattr(position, 'split')
#             end = getattr(position, 'end')
#             idx  = getattr(position, 'Index')

#             # Train/Test size
#             train_size = splits - start
#             test_size = end - splits

#             # Get training data and test data
#             train_data = dataSet.filter(dataSet.index.between(start, splits-1))
#             test_data = dataSet.filter(dataSet.index.between(splits, end-1))

#             # Cache it
#             train_data.cache()
#             test_data.cache()

#             # Chain assembler and model in a Pipeline
#             pipeline = Pipeline(stages=[assembler, model])
#             # Train a model and calculate running time
#             start = time.time()
#             pipeline_model = pipeline.fit(train_data)
#             end = time.time()

#             # Make predictions
#             predictions = pipeline_model.transform(test_data)

#             # Compute test error by several evaluator
#             rmse_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='rmse')
#             mae_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='mae')
#             r2_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='r2')
#             var_evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName='var')

#             predictions_pd = predictions.select("timestamp",dep_var,"prediction").toPandas()
#             mape = mean_absolute_percentage_error(predictions_pd[dep_var], predictions_pd["prediction"])

#             rmse = rmse_evaluator.evaluate(predictions)
#             mae = mae_evaluator.evaluate(predictions)
#             var = var_evaluator.evaluate(predictions)
#             r2 = r2_evaluator.evaluate(predictions)
#             # Adjusted R-squared
#             n = predictions.count()
#             p = len(predictions.columns)
#             adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)

#             # Use dict to store each result
#             results = {
#                 "Model": ml_model,
#                 'CV_type': cv_info['cv_type'],
#                 "Splits": idx + 1,
#                 "Train&Test": (train_size,test_size),
#                 "Parameters": list(param.values()),
#                 "RMSE": rmse,
#                 "MAPE": mape,
#                 "MAE": mae,
#                 "Variance": var,
#                 "R2": r2,
#                 "Adjusted_R2": adj_r2,
#                 "Time": end - start
#             }

#             # # Virtualization
#             # show_results(train_data.toPandas(), test_data.toPandas(), predictions.toPandas())

#             # Store each splits result
#             result_lst.append(results)

#             # Only store the lowest RMSE
#             if results['RMSE'] < result_best['RMSE']:
#                 result_best = results
#                 predictions_best = predictions.toPandas()

#             # Release Cache
#             train_data.unpersist()
#             test_data.unpersist()

#     # Transform dict to pandas dataframe
#     tsCv_df = pd.DataFrame(result_lst)
#     return tsCv_df, predictions_best

In [46]:
## Cross Validation Parameter
# Multiple Splits Time Series Cross Validation
mul_cv = {'cv_type':'mulTs',
          'kSplits': 5}

# Blocked Time Series Cross Validation
blk_cv = {'cv_type':'blkTs',
          'kSplits': 10}

# Walk Forward Validation, Last 50 steps
wf_cv = {'cv_type':'wfTs',
         'min_obser': 4856359,
         'expand_window': 1}

In [37]:
rf_params = {
    'numTrees' : [3],# Number of trees to train, >=1, default:20
    'maxDepth' : [5] # Maximum depth of the tree, <=30, default:5
}

In [38]:
rf_mul_cv, rf_train_mul_cv, rf_valid_mul_cv, rf_predictions_mul_cv = utilities.tsCrossValidation(combined_df, "features", dep_var, "RandomForest", rf_params, vector_assembler, mul_cv)
rf_mul_cv


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'd

Unnamed: 0,Model,CV_type,Splits,Train&Test,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,RandomForest,mulTs,1,"(10518, 10514)","[3, 5]",6663.805267,0.658074,5117.231575,26122290.0,-1.418102,-1.422481,1.490921
1,RandomForest,mulTs,2,"(21033, 10514)","[3, 5]",1518.943539,0.271398,1227.691962,3651191.0,0.525014,0.524154,1.147429
2,RandomForest,mulTs,3,"(31548, 10514)","[3, 5]",1153.59928,0.083675,818.445476,1884745.0,0.417131,0.416076,1.258619
3,RandomForest,mulTs,4,"(42062, 10514)","[3, 5]",30867.715704,0.612987,27977.58832,785408900.0,-3.683208,-3.691688,1.32051
4,RandomForest,mulTs,5,"(52577, 10511)","[3, 5]",9985.247958,0.421047,8862.773256,109671000.0,-0.056551,-0.058465,1.661134


In [43]:
utilities.show_results(rf_train_mul_cv, rf_valid_mul_cv, rf_predictions_mul_cv)

In [39]:
rf_blk_cv, rf_train_blk_cv, rf_valid_blk_cv, rf_predictions_blk_cv = utilities.tsCrossValidation(combined_df, "features", dep_var, "RandomForest", rf_params, vector_assembler, blk_cv)
rf_blk_cv


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'd

Unnamed: 0,Model,CV_type,Splits,Train&Test,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,RandomForest,blkTs,1,"(5046, 1262)","[3, 5]",41.514211,0.066578,38.910492,1758.913,-2.397126,-2.449095,1.101145
1,RandomForest,blkTs,2,"(5046, 1262)","[3, 5]",807.506186,0.305547,646.036363,417285.7,-1.773244,-1.815669,0.854992
2,RandomForest,blkTs,3,"(5046, 1262)","[3, 5]",2546.912761,0.22986,2173.232315,5608485.0,-0.24611,-0.265173,0.666342
3,RandomForest,blkTs,4,"(5046, 1262)","[3, 5]",181.613354,0.017601,109.498147,3003.894,0.224217,0.212349,0.724453
4,RandomForest,blkTs,5,"(5046, 1262)","[3, 5]",2428.404626,0.203653,2236.838744,5007279.0,-5.397307,-5.495172,0.676696
5,RandomForest,blkTs,6,"(5046, 1262)","[3, 5]",1500.386714,0.207516,1308.867094,1774136.0,-1.747933,-1.789971,0.803422
6,RandomForest,blkTs,7,"(5046, 1262)","[3, 5]",9560.543547,0.234802,6901.529444,48037360.0,-0.887838,-0.916718,0.772151
7,RandomForest,blkTs,8,"(5046, 1262)","[3, 5]",2755.999237,0.049712,2356.984082,9106707.0,-0.155055,-0.172725,0.82326
8,RandomForest,blkTs,9,"(5046, 1262)","[3, 5]",10633.472925,0.367977,9528.223132,90720850.0,-3.880989,-3.955658,1.544493
9,RandomForest,blkTs,10,"(5046, 1262)","[3, 5]",2295.851243,0.089766,2090.678969,4488361.0,-4.379926,-4.462228,1.599468


In [40]:
utilities.show_results(rf_train_blk_cv, rf_valid_blk_cv, rf_predictions_blk_cv)

In [64]:
rf_wf_cv, rf_train_wf_cv, rf_valid_wf_cv, rf_predictions_wf_cv = utilities.tsCrossValidation(combined_df, "features", dep_var, "RandomForest", rf_params, vector_assembler, wf_cv)
rf_wf_cv

Empty DataFrame
Columns: [start, split, end]
Index: []


In [52]:
utilities.show_results(rf_train_wf_cv, rf_valid_wf_cv, rf_predictions_wf_cv)

KeyError: ignored

# Hyperparameter tuning ❗

In [None]:
# Hyperparameter tuning for the model
def random_forest_cross_val(dataset, k_fold=5):
    rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var)
    pipeline = Pipeline(stages=[rf])

    # Default (too much memory!!)
    # param_grid = ParamGridBuilder()\
    # .addGrid(rf.maxDepth, [8, 9, 10]) \
    # .addGrid(rf.numTrees, [40, 60, 80]) \
    # .build()

    param_grid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [7, 8, 9]) \
    .addGrid(rf.numTrees, [40, 60, 80]) \
    .addGrid(rf.minInstancesPerNode, [1, 5, 10]) \
    .build()

    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=RegressionEvaluator(labelCol=dep_var),
                               numFolds=k_fold,
                               collectSubModels=True
                               )

    # Run cross-validation, and choose the best set of parameters.
    cv_model = cross_val.fit(dataset)

    return cv_model

In [None]:
# Execute cross validation with random forest
cv_rf_models = random_forest_cross_val(select_features(train_df, cor_matrix_features))

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Py4JError: ignored

In [None]:
# Summarizes all the models trained during cross validation
def summarize_rf_models(cv_models):
    for k, models in enumerate(cv_models):
        print("*************** Fold #{:d} ***************\n".format(k+1))
        for i, m in enumerate(models):
            print("--- Model #{:d} out of {:d} ---".format(i+1, len(models)))
            print("\tParameters: maxDepth=[{:.3f}]; numTrees=[{:.3f}] ".format(m.stages[-1]._java_obj.getMaxDepth(), m.stages[-1]._java_obj.getNumTrees()))
            print("\tModel summary: {}\n".format(m.stages[-1]))
        print("***************************************\n")

In [None]:
# Call the function above|
summarize_rf_models(cv_rf_models.subModels)

In [None]:
# Summarize average error
for i, avg_rmse in enumerate(cv_rf_models.avgMetrics):
    print("Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:3f}".format(i+1, avg_rmse))

In [None]:
# Get the best model to extract best hyperparameters
best_rf_model_params = cv_rf_models.bestModel.stages[-1].extractParamMap()

print('Best parameters for random forest regressor:')
for param, value in best_rf_model_params.items():
    print(param.name, "=", value)

In [None]:
# bootstrap: Se impostato su True, il campionamento con sostituzione viene utilizzato per creare i sottoinsiemi di dati per l'addestramento di ciascun albero nella foresta. Se impostato su False, il campionamento senza sostituzione viene utilizzato.
# cacheNodeIds: Se impostato su True, gli ID dei nodi per ciascuna istanza vengono memorizzati nella cache, il che può velocizzare l'addestramento di alberi più profondi.
# checkpointInterval: Determina la frequenza con cui i checkpoint vengono creati durante l'addestramento. Un valore più basso può causare un sovraccarico, mentre un valore più alto può ridurre il rischio di perdita di dati in caso di guasti.
# featureSubsetStrategy: Specifica il numero di funzionalità da considerare per le divisioni in ciascun nodo dell'albero. Le opzioni supportate includono "auto", "all", "sqrt", "log2" e valori numerici.
# featuresCol: Il nome della colonna che contiene le funzionalità utilizzate per l'addestramento del modello.
# impurity: La misura dell'impurità utilizzata per dividere i nodi durante l'addestramento. Per i problemi di regressione, l'impurità comune è "variance".
# labelCol: Il nome della colonna che contiene i valori target (etichette) per l'addestramento del modello.
# leafCol: Non è un parametro valido per il Random Forest in PySpark. Potrebbe essere un errore di battitura o un parametro specifico per un'altra libreria.
# maxBins: Il numero massimo di contenitori utilizzati per dividere le funzionalità continue e categoriche.
# maxDepth: La profondità massima degli alberi nella foresta casuale.
# maxMemoryInMB: La quantità massima di memoria (in MB) allocata per l'aggregazione degli istogrammi durante l'addestramento.
# minInfoGain: Il guadagno di informazione minimo richiesto per dividere un nodo.
# minInstancesPerNode: Il numero minimo di istanze per nodo richiesto per dividere un nodo.
# minWeightFractionPerNode: La frazione minima del peso totale delle istanze richiesta per dividere un nodo.
# numTrees: Il numero di alberi nella foresta casuale.
# predictionCol: Il nome della colonna che conterrà le previsioni generate dal modello.
# seed: Il seme utilizzato per generare numeri casuali durante l'addestramento, che può essere utile per garantire la riproducibilità dei risultati.
# subsamplingRate: La frazione delle istanze da utilizzare per l'addestramento di ciascun albero. Un valore di 1.0 indica che tutte le istanze vengono utilizzate.

In [None]:
# Fit a model with best parameters
rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var, maxDepth=7, numTrees=50, minInstancesPerNode=1)
pipeline = Pipeline(stages=[rf])

rf_model = pipeline.fit(select_features(valid_df, cor_matrix_features))

In [None]:
# Training set evaluation
rf_training = rf_model.transform(select_features(train_df, cor_matrix_features))
evaluate_models(rf_training, 'random forest regressor', 'training', dep_var, 'prediction', metrics)

# Validation set evaluation
rf_predictions = rf_model.transform(select_features(valid_df, cor_matrix_features))
evaluate_models(rf_predictions, 'random forest regressor', 'validation', dep_var, 'prediction', metrics)

In [None]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

In [None]:
GDRIVE_MODEL_NAME = "random_forest"
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = GDRIVE_MODELS_DIR + "/" + GDRIVE_MODEL_NAME

In [None]:
# Save the RF best model
rf_model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)