# **Bitcoin price forecasting - Random Forest**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [1]:
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
SLOW_OPERATION = False

In [2]:
#Install some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [3]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=057838920495dff36ab27a79c49add680ad5320dd67830a606a35e4c79be45ef
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [4]:
# Define GDrive paths
GDRIVE_DIR = "/content/drive"

GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"

GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1h"
GDRIVE_DATASET_NAME_TEST = GDRIVE_DATASET_NAME + "_test"

GDRIVE_DATASET_NAME_EXT_TEST = "/" + GDRIVE_DATASET_NAME_TEST + ".parquet"

GDRIVE_DATASET_TEST = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TEST

In [5]:
# Point Colaboratory to our Google Drive
from google.colab import drive

drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# Testing the models ❗

In [6]:
# Load datasets into pyspark dataframe objects
test_df = spark.read.load(GDRIVE_DATASET_TEST,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [7]:
# Upload the 3 templates saved in Google Drive
from pyspark.ml import PipelineModel

rf_path = "/content/drive/MyDrive/BDC/project/models/random_forest"
lr_path = "/content/drive/MyDrive/BDC/project/models/linear_regression"
gbt_path = "/content/drive/MyDrive/BDC/project/models/gradient_boosting_tree"

rf = PipelineModel.load(rf_path)
lr = PipelineModel.load(lr_path)
gbt = PipelineModel.load(gbt_path)

In [8]:
# Retrieve all / cor_matrix / gb features
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"

GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_GB_FEATURES_NAME = "gb_features"

GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = "/" + GDRIVE_COR_MATRIX_FEATURES_NAME + ".json"
GDRIVE_GB_FEATURES_NAME_EXT = "/" + GDRIVE_GB_FEATURES_NAME + ".json"

GDRIVE_COR_MATRIX_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_COR_MATRIX_FEATURES_NAME_EXT
GDRIVE_GB_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_GB_FEATURES_NAME_EXT

In [9]:
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns
gb_features = spark.read.json(GDRIVE_GB_FEATURES).columns

all_features = test_df.columns[1:-2]

# Set the depended variable
dep_var = 'market-price'

In [10]:
# Define the evaluation metrics
# Notice that r2_adj metric is included when calculating r2
metrics = ['mse', 'rmse', 'mae', 'r2']

In [11]:
# Function to compute the r2 adjusted metric
# r2 is the r2 metric, n is the number of observations, k is the number of features
def compute_r2adj(r2, n, k):
  return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

In [12]:
# Function to evaluate a model
overall_metrics = []

def evaluate_models(predictions, modelName, typeName, label, prediction, metrics):
  model_metrics = {'model': modelName}

  r2 = None
  for metric in metrics:
    evaluator = RegressionEvaluator(labelCol=label, predictionCol=prediction, metricName=metric)
    evaluation = evaluator.evaluate(predictions)
    model_metrics[metric] = evaluation
    print(metric.upper()+' for '+modelName+' on '+typeName+' set: '+str(evaluation))
    if metric == 'r2':
      model_metrics['r2_adj'] = compute_r2adj(evaluation, predictions.count(), len(predictions.columns))
      print('R2_adj'+' for '+modelName+' on '+typeName+' set: '+str(compute_r2adj(evaluation, predictions.count(), len(predictions.columns))))
  return model_metrics

In [13]:
# Return the dataset with the selected features
def select_features(dataset, features):
  vectorAssembler = VectorAssembler(
    inputCols = features,
    outputCol = 'features')

  dataset = vectorAssembler.transform(dataset)
  dataset = dataset.select(['timestamp','index', 'features', dep_var])
  return dataset

In [14]:
# Testing set evaluation
rf_predictions = rf.transform(select_features(test_df, cor_matrix_features))
rf_metrics = evaluate_models(rf_predictions, 'random forest', 'testing', dep_var, 'prediction', metrics)

overall_metrics.append(rf_metrics)

MSE for random forest on testing set: 20135235.053964578
RMSE for random forest on testing set: 4487.230220744706
MAE for random forest on testing set: 3964.0711293646177
R2 for random forest on testing set: -0.44720116264813115
R2_adj for random forest on testing set: -0.44862866428814696


In [15]:
# Testing set evaluation
lr_predictions = lr.transform(select_features(test_df, cor_matrix_features))
lr_metrics = evaluate_models(lr_predictions, 'linear regression', 'testing', dep_var, 'prediction', metrics)

overall_metrics.append(lr_metrics)

MSE for linear regression on testing set: 164056.06784868002
RMSE for linear regression on testing set: 405.0383535526976
MAE for linear regression on testing set: 323.75175329083817
R2 for linear regression on testing set: 0.988208623763578
R2_adj for linear regression on testing set: 0.9881969928933507


In [16]:
# Testing set evaluation
gbt_predictions = gbt.transform(select_features(test_df, cor_matrix_features))
gbt_metrics = evaluate_models(gbt_predictions, 'gradient boosting', 'testing', dep_var, 'prediction', metrics)

overall_metrics.append(gbt_metrics)

MSE for gradient boosting on testing set: 45000377.77112805
RMSE for gradient boosting on testing set: 6708.232089837683
MAE for gradient boosting on testing set: 5355.889452395793
R2 for gradient boosting on testing set: -2.2343600089813083
R2_adj for gradient boosting on testing set: -2.2375503423892598


In [17]:
def show_results(test, rf_pred, lr_pred, gbt_pred):
  trace1 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = 'Test'
  )

  trace2 = go.Scatter(
      x = rf_pred['timestamp'],
      y = rf_pred['market-price'].astype(float),
      mode = 'lines',
      name = 'Random Forest predictions'
  )

  trace3 = go.Scatter(
      x = lr_pred['timestamp'],
      y = lr_pred['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression prediction'
  )

  trace4 = go.Scatter(
      x = gbt_pred['timestamp'],
      y = gbt_pred['prediction'].astype(float),
      mode = 'lines',
      name = 'Gradient Boosting Tree prediction'
  )


  layout = dict(
      title='Test and predictions set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3, trace4]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Test and predictions set with Rangeslider")

In [18]:
show_results(test_df.toPandas(), rf_predictions.toPandas(), lr_predictions.toPandas(), gbt_predictions.toPandas())


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



In [19]:
# Show the best model
metrics = ['mse', 'rmse', 'mae', 'r2', 'r2_adj']
best_model = max(overall_metrics, key=lambda x: (x['mse'], x['rmse'], x['mae'], -x['r2']))

print("Best Model:", best_model['model'])

TypeError: ignored

# OLD ❗

In [None]:
# Function that create a simple rf model (with no hyperparameter tuning)
def simple_random_forest_model(train, featureCol, labelCol):
  rf = RandomForestRegressor(featuresCol=featureCol, labelCol=labelCol)
  rf_model = rf.fit(train)
  return rf_model

In [None]:
# Define the evaluation metrics
# Notice that r2_adj metric is included when calculating r2
metrics = ['mse', 'rmse', 'mae', 'r2']

In [None]:
# Function to compute the r2 adjusted metric
# r2 is the r2 metric, n is the number of observations, k is the number of features
def compute_r2adj(r2, n, k):
  return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

In [None]:
# Function to evaluate a model
def evaluate_models(predictions, modelName, typeName, label, prediction, metrics):
  r2 = None
  for metric in metrics:
    evaluator = RegressionEvaluator(labelCol=label, predictionCol=prediction, metricName=metric)
    evaluation = evaluator.evaluate(predictions)
    print(metric.upper()+' for '+modelName+' on '+typeName+' set: '+str(evaluation))
    if metric == 'r2':
      print('R2_adj'+' for '+modelName+' on '+typeName+' set: '+str(compute_r2adj(evaluation, predictions.count(), len(predictions.columns))))

In [None]:
# Function that create simple models (without hyperparameter tuning) and evaluate them
def test_best_features(train_data, test_data, features, featureCol, labelCol, metrics = ['rmse', 'r2']):
  # Train the models
  rf = simple_random_forest_model(train_data, featureCol, labelCol)

  # Training set evaluation
  rf_training = rf.transform(train_data)
  evaluate_models(rf_training, 'random forest regression', 'training', labelCol, 'prediction', metrics)

  # Testing set evaluation
  rf_predictions = rf.transform(test_data)
  evaluate_models(rf_predictions, 'random forest regression', 'testing', labelCol, 'prediction', metrics)

  return rf_training, rf_predictions

In [None]:
# Retrieve all / cor_matrix / gb features
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"

GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_GB_FEATURES_NAME = "gb_features"

GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = "/" + GDRIVE_COR_MATRIX_FEATURES_NAME + ".json"
GDRIVE_GB_FEATURES_NAME_EXT = "/" + GDRIVE_GB_FEATURES_NAME + ".json"

GDRIVE_COR_MATRIX_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_COR_MATRIX_FEATURES_NAME_EXT
GDRIVE_GB_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_GB_FEATURES_NAME_EXT

In [None]:
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns
gb_features = spark.read.json(GDRIVE_GB_FEATURES).columns

all_features = train_df.columns[1:-2]

# Set the depended variable
dep_var = 'market-price'

In [None]:
# Return the dataset with the selected features
def select_features(dataset, features):
  vectorAssembler = VectorAssembler(
    inputCols = features,
    outputCol = 'features')

  dataset = vectorAssembler.transform(dataset)
  dataset = dataset.select(['timestamp','index', 'features', dep_var])
  return dataset

In [None]:
def show_results(train, test, training, predictions):
  trace1 = go.Scatter(
      x = train['timestamp'],
      y = train['market-price'].astype(float),
      mode = 'lines',
      name = 'Train'
  )

  trace2 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = 'Test'
  )

  trace3 = go.Scatter(
      x = training['timestamp'],
      y = training['prediction'].astype(float),
      mode = 'lines',
      name = 'Training'
  )

  trace4 = go.Scatter(
      x = predictions['timestamp'],
      y = predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Prediction'
  )

  layout = dict(
      title='Train, test and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3, trace4]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, test and prediction set with Rangeslider")

In [None]:
# Test performances with all the features
rf_training, rf_predictions = test_best_features(select_features(train_df, all_features), select_features(valid_df, all_features), all_features, 'features', dep_var)

In [None]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

In [None]:
# Test features with the corr matrix features
rf_training, rf_predictions = test_best_features(select_features(train_df, cor_matrix_features), select_features(valid_df, cor_matrix_features), cor_matrix_features, 'features', dep_var)

In [None]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

In [None]:
# Test performances with gb features
rf_training, rf_predictions = test_best_features(select_features(train_df, gb_features), select_features(valid_df, gb_features), gb_features, 'features', dep_var)

In [None]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

## Hyperparameter tuning ❗

In [None]:
# Hyperparameter tuning for the model
def random_forest_cross_val(dataset, k_fold=5):
    rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var)
    pipeline = Pipeline(stages=[rf])

    # Default (too much memory!!)
    # param_grid = ParamGridBuilder()\
    # .addGrid(rf.maxDepth, [8, 9, 10]) \
    # .addGrid(rf.numTrees, [40, 60, 80]) \
    # .build()

    param_grid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [5, 6, 7]) \
    .addGrid(rf.numTrees, [30, 50, 70]) \
    .build()

    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=RegressionEvaluator(labelCol=dep_var),
                               numFolds=k_fold,
                               collectSubModels=True
                               )

    # Run cross-validation, and choose the best set of parameters.
    cv_model = cross_val.fit(dataset)

    return cv_model

In [None]:
# Execute cross validation with random forest
cv_rf_models = random_forest_cross_val(select_features(train_df, cor_matrix_features))

In [None]:
# Summarizes all the models trained during cross validation
def summarize_rf_models(cv_models):
    for k, models in enumerate(cv_models):
        print("*************** Fold #{:d} ***************\n".format(k+1))
        for i, m in enumerate(models):
            print("--- Model #{:d} out of {:d} ---".format(i+1, len(models)))
            print("\tParameters: maxDepth=[{:.3f}]; numTrees=[{:.3f}] ".format(m.stages[-1]._java_obj.getMaxDepth(), m.stages[-1]._java_obj.getNumTrees()))
            print("\tModel summary: {}\n".format(m.stages[-1]))
        print("***************************************\n")

In [None]:
# Call the function above|
summarize_rf_models(cv_rf_models.subModels)

In [None]:
# Summarize average error
for i, avg_rmse in enumerate(cv_rf_models.avgMetrics):
    print("Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:3f}".format(i+1, avg_rmse))

In [None]:
# Get the best model to extract best hyperparameters
best_rf_model_params = cv_rf_models.bestModel.stages[-1].extractParamMap()
# parameters = [] ❗

print('Best parameters for random forest regressor:')
for param, value in best_rf_model_params.items():
    # parameters.append(param.name + "=" + str(value)) ❗
    print(param.name, "=", value)

In [None]:
# parameters = [parameter.replace("'", "") for parameter in parameters] ❗
# parameters ❗

In [None]:
# Fit a model with best parameters
rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var, maxDepth=7, numTrees=50)
rf_model = rf.fit(select_features(valid_df, cor_matrix_features))

In [None]:
# Training set evaluation
rf_training = rf_model.transform(select_features(train_df, cor_matrix_features))
evaluate_models(rf_training, 'random forest regressor', 'training', dep_var, 'prediction', metrics)

# Testing set evaluation
rf_predictions = rf_model.transform(select_features(valid_df, cor_matrix_features))
evaluate_models(rf_predictions, 'random forest regressor', 'testing', dep_var, 'prediction', metrics)

In [None]:
show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

In [None]:
GDRIVE_MODEL_NAME = "random_forest"
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = GDRIVE_MODELS_DIR + "/" + GDRIVE_MODEL_NAME

In [None]:
# Save the RF best model
rf_model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)