# **Bitcoin price forecasting - Gradient Boosted Tree**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [None]:
# Path to a Java 8 JDK; only referenced by the commented-out os.environ["JAVA_HOME"] setup in the imports cell
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
# Flag presumably meant to toggle slow operations
# NOTE(review): it is not read anywhere in the visible notebook — confirm it is still needed
SLOW_OPERATION = False

In [None]:
# Import some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [None]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Build the Spark configuration: local master on all cores, explicit memory limits
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Start the context from that configuration, then get (or create) the session
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=05f18bbf3f33bd2f33ab6c5048c20dce198162584e1ac6d19997d034ecb7835d
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [None]:
# Google Drive layout used by this notebook (mount point, model and dataset locations)
GDRIVE_DIR = "/content/drive"

# Where the trained model is (optionally) stored
GDRIVE_MODEL_NAME = "gradient_boosting_tree"
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{GDRIVE_MODEL_NAME}"

# Dataset directories
GDRIVE_DATASET_RAW_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Dataset base names (full set, train split, test split)
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1h"
GDRIVE_DATASET_NAME_TRAIN = f"{GDRIVE_DATASET_NAME}_train"
GDRIVE_DATASET_NAME_TEST = f"{GDRIVE_DATASET_NAME}_test"

# File names, with leading slash and parquet extension
GDRIVE_DATASET_NAME_EXT = f"/{GDRIVE_DATASET_NAME}.parquet"
GDRIVE_DATASET_NAME_EXT_TRAIN = f"/{GDRIVE_DATASET_NAME_TRAIN}.parquet"
GDRIVE_DATASET_NAME_EXT_TEST = f"/{GDRIVE_DATASET_NAME_TEST}.parquet"

# Full paths: the raw dataset lives in the raw dir, the splits in the output dir
GDRIVE_DATASET = f"{GDRIVE_DATASET_RAW_DIR}{GDRIVE_DATASET_NAME_EXT}"
GDRIVE_DATASET_TRAIN = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_TRAIN}"
GDRIVE_DATASET_TEST = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_TEST}"

In [None]:
# Point Colaboratory to our Google Drive
# NOTE(review): google.colab is presumably only importable inside a Colab runtime
from google.colab import drive

# Mount the Drive at GDRIVE_DIR; force_remount=True is passed on every run
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# Training the model ❗

In [None]:
# Load the train / test datasets into PySpark DataFrames.
# Parquet files carry their own schema, so the CSV-only reader options
# (sep, inferSchema, header) are not needed here.
train_df = spark.read.parquet(GDRIVE_DATASET_TRAIN)

test_df = spark.read.parquet(GDRIVE_DATASET_TEST)

In [None]:
# Build and fit a GBT regressor with default hyperparameters (no tuning)
def simple_gbt_model(train, featureCol, labelCol):
  """Fit a ``GBTRegressor`` on ``train`` and return the fitted model.

  featureCol is the name of the feature-vector column, labelCol the name of
  the label column; every other hyperparameter keeps its default value.
  """
  return GBTRegressor(featuresCol=featureCol, labelCol=labelCol).fit(train)

In [None]:
# Define the evaluation metrics
# Note: there is no separate 'r2_adj' entry; evaluate_models prints the adjusted R2 whenever 'r2' is evaluated
metrics = ['mse', 'rmse', 'mae', 'r2']

In [None]:
# Adjusted R2: corrects the plain R2 for the number of features used
def compute_r2adj(r2, n, k):
  """Return the adjusted R2.

  r2 is the plain R2 score, n is the number of observations and k is the
  number of features. Raises ZeroDivisionError when n - k - 1 == 0.
  """
  sample_dof = n - 1
  residual_dof = n - k - 1
  return 1 - (1 - r2) * (sample_dof / residual_dof)

In [None]:
# Number of predictors stored in the feature-vector column
def _count_features(predictions, featuresCol):
  # Read the length of the first feature vector when the column is present
  if featuresCol in predictions.columns:
    first_row = predictions.select(featuresCol).first()
    if first_row is not None:
      try:
        return len(first_row[0])
      except TypeError:
        # Not a sized value (e.g. null or scalar): use the fallback below
        pass
  # Fallback: previous behaviour (number of DataFrame columns)
  return len(predictions.columns)

# Function to evaluate a model
def evaluate_models(predictions, modelName, typeName, label, prediction, metrics, featuresCol='features'):
  """Print every requested regression metric for a DataFrame of predictions.

  Args:
    predictions: DataFrame holding the label and prediction columns.
    modelName: model name used in the printed messages.
    typeName: name of the evaluated split (e.g. 'training'), used in the messages.
    label: name of the label column.
    prediction: name of the prediction column.
    metrics: iterable of RegressionEvaluator metric names.
    featuresCol: name of the assembled feature-vector column; the length of
      its vectors is used as the number of predictors k of the adjusted R2.

  When 'r2' is among the metrics, the adjusted R2 is printed right after it.
  """
  for metric in metrics:
    evaluator = RegressionEvaluator(labelCol=label, predictionCol=prediction, metricName=metric)
    evaluation = evaluator.evaluate(predictions)
    print(metric.upper()+' for '+modelName+' on '+typeName+' set: '+str(evaluation))
    if metric == 'r2':
      # k must be the number of predictors, not the number of DataFrame columns
      n_features = _count_features(predictions, featuresCol)
      print('R2_adj'+' for '+modelName+' on '+typeName+' set: '+str(compute_r2adj(evaluation, predictions.count(), n_features)))

In [None]:
# Fit an untuned GBT model and report its metrics on both the training and the testing set
def test_best_features(train_data, test_data, features, featureCol, labelCol, metrics = ['rmse', 'r2']):
  """Train a default GBT model and evaluate it on both splits.

  train_data / test_data must already contain the featureCol vector column
  and the labelCol column. The features argument is accepted but not used by
  the body. Returns (training predictions, testing predictions).
  """
  model_name = 'gradient boosted tree regression'
  fitted = simple_gbt_model(train_data, featureCol, labelCol)

  # Training split first, then testing split, so the printed order is stable
  scored = []
  for split_name, split_data in (('training', train_data), ('testing', test_data)):
    split_predictions = fitted.transform(split_data)
    evaluate_models(split_predictions, model_name, split_name, labelCol, 'prediction', metrics)
    scored.append(split_predictions)

  return scored[0], scored[1]

In [None]:
# Locations of the JSON files holding the cor_matrix and gb feature lists
GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_GB_FEATURES_NAME = "gb_features"

GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = f"/{GDRIVE_COR_MATRIX_FEATURES_NAME}.json"
GDRIVE_GB_FEATURES_NAME_EXT = f"/{GDRIVE_GB_FEATURES_NAME}.json"

GDRIVE_COR_MATRIX_FEATURES = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_COR_MATRIX_FEATURES_NAME_EXT}"
GDRIVE_GB_FEATURES = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_GB_FEATURES_NAME_EXT}"

# The feature names are the column names of each JSON file once read by Spark
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns
gb_features = spark.read.json(GDRIVE_GB_FEATURES).columns

# Every training column except the first one and the last two
all_features = train_df.columns[1:-2]

# Set the dependent variable
dep_var = 'market-price'

In [None]:
# Assemble the chosen feature columns into one vector and keep only the columns used downstream
def select_features(dataset, features):
  """Return ``dataset`` with the given feature columns packed into 'features'.

  The result keeps only 'timestamp', 'index', the assembled 'features'
  vector and the dependent variable (dep_var).
  """
  assembler = VectorAssembler(inputCols = features, outputCol = 'features')
  kept_columns = ['timestamp','index', 'features', dep_var]
  return assembler.transform(dataset).select(kept_columns)

In [None]:
def show_results(train, test, training, predictions):
  """Plot actual and predicted prices over time with a range slider.

  All four arguments are column-indexable frames (the callers pass pandas
  DataFrames obtained with toPandas()). train / test supply the actual
  'market-price' values, training / predictions supply the model's
  'prediction' values; every frame needs a 'timestamp' column.
  """
  # (frame, value column, legend name) for each line, in drawing order
  series_specs = [
      (train, 'market-price', 'Train'),
      (test, 'market-price', 'Test'),
      (training, 'prediction', 'Training'),
      (predictions, 'prediction', 'Prediction'),
  ]
  data = [
      go.Scatter(
          x = frame['timestamp'],
          y = frame[value_col].astype(float),
          mode = 'lines',
          name = legend_name
      )
      for frame, value_col, legend_name in series_specs
  ]

  # Range-selector buttons; change the counts to the desired amount of months
  month_windows = [(1, '1m'), (6, '6m'), (12, '1y'), (36, '3y')]
  buttons = [
      dict(count=months, label=caption, step='month', stepmode='backward')
      for months, caption in month_windows
  ]
  buttons.append(dict(step='all'))

  layout = dict(
      title='Train, test and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(buttons=buttons),
          rangeslider=dict(visible = True),
          type='date'
      )
  )

  iplot(dict(data=data, layout=layout), filename = "Train, test and prediction set with Rangeslider")

In [None]:
# Test performances with all features (all_features)
gbt_training, gbt_predictions = test_best_features(select_features(train_df, all_features), select_features(test_df, all_features), all_features, 'features', dep_var)

RMSE for gradient boosted tree regression on training set: 465.5991789779761
R2 for gradient boosted tree regression on training set: 0.9957789373567789
R2_adj for gradient boosted tree regression on training set: 0.9957786754327648
RMSE for gradient boosted tree regression on testing set: 10680.803314478148
R2 for gradient boosted tree regression on testing set: 0.38932782478059647
R2_adj for gradient boosted tree regression on testing set: 0.3891762104563451


In [None]:
# Plot the actual prices together with the training-set and testing-set predictions (all_features run)
show_results(train_df.toPandas(), test_df.toPandas(), gbt_training.toPandas(), gbt_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Test performances with the features listed in cor_matrix_features
gbt_training, gbt_predictions = test_best_features(select_features(train_df, cor_matrix_features), select_features(test_df, cor_matrix_features), cor_matrix_features, 'features', dep_var)

RMSE for gradient boosted tree regression on training set: 614.9921936690923
R2 for gradient boosted tree regression on training set: 0.9926356131804731
R2_adj for gradient boosted tree regression on training set: 0.9926351562079235
RMSE for gradient boosted tree regression on testing set: 5830.521322933046
R2 for gradient boosted tree regression on testing set: 0.8180235985209241
R2_adj for gradient boosted tree regression on testing set: 0.8179784184222402


In [None]:
# Plot the actual prices together with the training-set and testing-set predictions (cor_matrix_features run)
show_results(train_df.toPandas(), test_df.toPandas(), gbt_training.toPandas(), gbt_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Test performances with the features listed in gb_features
gbt_training, gbt_predictions = test_best_features(select_features(train_df, gb_features), select_features(test_df, gb_features), gb_features, 'features', dep_var)

RMSE for gradient boosted tree regression on training set: 627.8319494380522
R2 for gradient boosted tree regression on training set: 0.9923248970284066
R2_adj for gradient boosted tree regression on training set: 0.992324420775399
RMSE for gradient boosted tree regression on testing set: 5620.664662994159
R2 for gradient boosted tree regression on testing set: 0.8308875241796402
R2_adj for gradient boosted tree regression on testing set: 0.8308455378655679


In [None]:
# Plot the actual prices together with the training-set and testing-set predictions (gb_features run)
show_results(train_df.toPandas(), test_df.toPandas(), gbt_training.toPandas(), gbt_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

# Hyperparameter tuning ❗

In [None]:
# Hyperparameter tuning for the model
def gbt_cross_val(dataset, k_fold=5):
    """Grid-search a GBT regressor with k-fold cross validation.

    Args:
        dataset: DataFrame with a 'features' vector column and the dep_var label column.
        k_fold: number of cross-validation folds.

    Returns:
        The result of fitting the CrossValidator on ``dataset`` (sub-models are collected).
    """
    regressor = GBTRegressor(featuresCol='features', labelCol=dep_var)

    # Alternative grid kept for reference (labelled "Default", very slow!!):
    #   maxDepth [6, 7, 8], maxIter [11, 12, 13], stepSize [0.15, 0.2, 0.25]
    grid_builder = ParamGridBuilder()
    for hyperparam, candidates in (
        (regressor.maxDepth, [5, 6, 7]),
        (regressor.maxIter, [8, 9, 10]),
        (regressor.stepSize, [0.1, 0.15, 0.2]),
    ):
        grid_builder = grid_builder.addGrid(hyperparam, candidates)
    search_grid = grid_builder.build()

    # No metricName is passed, so the evaluator's default metric is used
    scorer = RegressionEvaluator(labelCol=dep_var)

    validator = CrossValidator(
        estimator=Pipeline(stages=[regressor]),
        estimatorParamMaps=search_grid,
        evaluator=scorer,
        numFolds=k_fold,
        collectSubModels=True,
    )

    # Run cross-validation; the returned model exposes the best parameter set
    return validator.fit(dataset)

In [None]:
# Execute cross validation with gradient-boosted trees on the training set restricted to
# cor_matrix_features (k_fold is left at its default of 5)
cv_gbt_models = gbt_cross_val(select_features(train_df, cor_matrix_features))

In [None]:
# Summarizes all the models trained during cross validation
def summarize_gbt_models(cv_models):
    """Print the hyperparameters and summary of every cross-validation sub-model.

    Args:
        cv_models: one sequence of fitted pipeline models per fold; the last
            stage of each pipeline is read for maxDepth, maxIter and stepSize.
    """
    for k, models in enumerate(cv_models):
        print("*************** Fold #{:d} ***************\n".format(k+1))
        for i, m in enumerate(models):
            # The regressor is the last (here: only) stage of the fitted pipeline
            gbt_stage = m.stages[-1]
            print("--- Model #{:d} out of {:d} ---".format(i+1, len(models)))
            # Public getters instead of the private _java_obj; integer params are printed as integers
            print("\tParameters: maxDepth=[{:d}]; maxIter=[{:d}]; stepSize=[{:.3f}] ".format(int(gbt_stage.getMaxDepth()), int(gbt_stage.getMaxIter()), gbt_stage.getStepSize()))
            print("\tModel summary: {}\n".format(gbt_stage))
        print("***************************************\n")

In [None]:
# Print the per-fold summary of every sub-model collected during cross validation
summarize_gbt_models(cv_gbt_models.subModels)

*************** Fold #1 ***************

--- Model #1 out of 27 ---
	Parameters: maxDepth=[5.000]; maxIter=[8.000; stepSize=[0.100] 
	Model summary: GBTRegressionModel: uid=GBTRegressor_59cd56f80fee, numTrees=8, numFeatures=7

--- Model #2 out of 27 ---
	Parameters: maxDepth=[5.000]; maxIter=[8.000; stepSize=[0.150] 
	Model summary: GBTRegressionModel: uid=GBTRegressor_59cd56f80fee, numTrees=8, numFeatures=7

--- Model #3 out of 27 ---
	Parameters: maxDepth=[5.000]; maxIter=[8.000; stepSize=[0.200] 
	Model summary: GBTRegressionModel: uid=GBTRegressor_59cd56f80fee, numTrees=8, numFeatures=7

--- Model #4 out of 27 ---
	Parameters: maxDepth=[5.000]; maxIter=[9.000; stepSize=[0.100] 
	Model summary: GBTRegressionModel: uid=GBTRegressor_59cd56f80fee, numTrees=9, numFeatures=7

--- Model #5 out of 27 ---
	Parameters: maxDepth=[5.000]; maxIter=[9.000; stepSize=[0.150] 
	Model summary: GBTRegressionModel: uid=GBTRegressor_59cd56f80fee, numTrees=9, numFeatures=7

--- Model #6 out of 27 ---
	P

In [None]:
# Summarize average error: one averaged metric value is printed per model setting
for i, avg_rmse in enumerate(cv_gbt_models.avgMetrics, start=1):
    # '{:.3f}' (three decimals) replaces '{:3f}', which only set a minimum width of 3
    print("Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:.3f}".format(i, avg_rmse))

Avg. RMSE computed across k-fold cross validation for model setting #1: 688.139858
Avg. RMSE computed across k-fold cross validation for model setting #2: 659.128581
Avg. RMSE computed across k-fold cross validation for model setting #3: 636.708316
Avg. RMSE computed across k-fold cross validation for model setting #4: 678.205582
Avg. RMSE computed across k-fold cross validation for model setting #5: 646.556829
Avg. RMSE computed across k-fold cross validation for model setting #6: 630.932071
Avg. RMSE computed across k-fold cross validation for model setting #7: 672.082659
Avg. RMSE computed across k-fold cross validation for model setting #8: 641.050712
Avg. RMSE computed across k-fold cross validation for model setting #9: 625.940029
Avg. RMSE computed across k-fold cross validation for model setting #10: 609.173434
Avg. RMSE computed across k-fold cross validation for model setting #11: 597.667064
Avg. RMSE computed across k-fold cross validation for model setting #12: 587.395478
A

In [None]:
# Take the last pipeline stage of the best cross-validated model and list its parameters
best_gbt_stage = cv_gbt_models.bestModel.stages[-1]
best_gbt_model_params = best_gbt_stage.extractParamMap()

print("Best parameters for gradient-boosted trees:")
for hyperparam in best_gbt_model_params:
    print(hyperparam.name, "=", best_gbt_model_params[hyperparam])

Best parameters for gradient-boosted trees:
cacheNodeIds = False
checkpointInterval = 10
featureSubsetStrategy = all
featuresCol = features
impurity = variance
labelCol = market-price
leafCol = 
lossType = squared
maxBins = 32
maxDepth = 7
maxIter = 10
maxMemoryInMB = 256
minInfoGain = 0.0
minInstancesPerNode = 1
minWeightFractionPerNode = 0.0
predictionCol = prediction
seed = -1133332458936899016
stepSize = 0.2
subsamplingRate = 1.0
validationTol = 0.01


In [None]:
# Fit a model with the best parameters; the values are hard-coded here and match
# the maxDepth / maxIter / stepSize printed by the cross-validation cell
best_hyperparams = dict(maxDepth=7, maxIter=10, stepSize=0.2)
gbt = GBTRegressor(featuresCol='features', labelCol=dep_var, **best_hyperparams)

train_selected = select_features(train_df, cor_matrix_features)
gbt_model = gbt.fit(train_selected)

In [None]:
model_label = 'gradient-boosted tree regressor'

# Score both splits with the tuned model
gbt_training = gbt_model.transform(select_features(train_df, cor_matrix_features))
gbt_predictions = gbt_model.transform(select_features(test_df, cor_matrix_features))

# Report the metrics: training set first, then testing set
for split_label, split_scored in (('training', gbt_training), ('testing', gbt_predictions)):
    evaluate_models(split_scored, model_label, split_label, dep_var, 'prediction', metrics)

MSE for gradient-boosted tree regressor on training set: 317196.0271469096
RMSE for gradient-boosted tree regressor on training set: 563.2015865983597
MAE for gradient-boosted tree regressor on training set: 146.9228566070433
R2 for gradient-boosted tree regressor on training set: 0.9938237463302984
R2_adj for gradient-boosted tree regressor on training set: 0.9938233630834028
MSE for gradient-boosted tree regressor on testing set: 45209870.799276486
RMSE for gradient-boosted tree regressor on testing set: 6723.828581937265
MAE for gradient-boosted tree regressor on testing set: 5222.3375855479935
R2 for gradient-boosted tree regressor on testing set: 0.7579898600828516
R2_adj for gradient-boosted tree regressor on testing set: 0.7579297751382374


In [None]:
# Plot the actual prices together with the tuned model's training-set and testing-set predictions
show_results(train_df.toPandas(), test_df.toPandas(), gbt_training.toPandas(), gbt_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Save the best model
# gbt_model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)