# **Bitcoin price forecasting - Random Forest**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



In [1]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
# The google.colab package is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dependencies, Libraries and Tools

In [2]:
# Path of the Java 8 JDK; in the visible notebook it is referenced only by the
# commented-out environment setup at the end of the imports cell.
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
# NOTE(review): presumably a switch for expensive steps; no visible cell reads it — confirm or remove.
SLOW_OPERATION = False

In [3]:
import warnings

# Hide FutureWarning messages for the rest of the session to keep cell outputs readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Import the general-purpose libraries used throughout the notebook
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [5]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Spark configuration for a single-machine ("local[*]") run
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the one already running in this kernel.
# Calling the SparkContext constructor directly raises
# "Cannot run multiple SparkContexts at once" when this cell is executed a second
# time; getOrCreate keeps the cell re-runnable. Note that when a context already
# exists it is returned as-is and `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=26c3d4b2c2fa5bda0f6be66555d7805ed8413e7875bdf6d321efd155fc85034b
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [6]:
# Google Drive mount point and the dataset folders below it
GDRIVE_DIR = "/content/drive"

GDRIVE_DATASET_RAW_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Base dataset name and the names of its train / validation splits
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_TRAIN = f"{GDRIVE_DATASET_NAME}_train"
GDRIVE_DATASET_NAME_VALID = f"{GDRIVE_DATASET_NAME}_valid"

# File names (with leading slash and .parquet extension) of the two splits
# GDRIVE_DATASET_NAME_EXT = "/" + GDRIVE_DATASET_NAME + ".parquet"
GDRIVE_DATASET_NAME_EXT_TRAIN = f"/{GDRIVE_DATASET_NAME_TRAIN}.parquet"
GDRIVE_DATASET_NAME_EXT_VALID = f"/{GDRIVE_DATASET_NAME_VALID}.parquet"

# Full paths of the two splits inside the output folder
# GDRIVE_DATASET = GDRIVE_DATASET_RAW_DIR + GDRIVE_DATASET_NAME_EXT
GDRIVE_DATASET_TRAIN = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_TRAIN}"
GDRIVE_DATASET_VALID = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_VALID}"

In [7]:
# Point Colaboratory to our Google Drive.
# The drive was already mounted by the first cell of the notebook; force_remount=True
# asks Colab to mount it again at GDRIVE_DIR instead of keeping the existing mount.
from google.colab import drive

drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [8]:
# Load the train and validation splits into PySpark DataFrames.
# Parquet files store their own schema and have no delimiter or header row, so the
# CSV-style reader options (sep / inferSchema / header) passed previously had no
# effect; the dedicated parquet reader is used instead.
train_df = spark.read.parquet(GDRIVE_DATASET_TRAIN)

valid_df = spark.read.parquet(GDRIVE_DATASET_VALID)

# Import my utilities ❗

In [9]:
import sys
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"
# Make the project's helper module importable; the guard avoids stacking a duplicate
# sys.path entry every time this cell is re-run.
if GDRIVE_UTILITIES_DIR not in sys.path:
    sys.path.append(GDRIVE_UTILITIES_DIR)

import shutil
# Remove cached bytecode so the module is re-read from its source file.
# The cache folder does not exist on a runtime where the module was never imported,
# in which case there is nothing to delete; any other failure still surfaces.
try:
    shutil.rmtree(GDRIVE_UTILITIES_DIR + '/__pycache__')
except FileNotFoundError:
    pass

import utilities

import importlib
# Reload so that edits made to utilities.py during the session are picked up
importlib.reload(utilities)

<module 'utilities' from '/content/drive/MyDrive/BDC/project/utilities/utilities.py'>

# Training simple model ❗

In [10]:
# Google Drive location of the "cor_matrix_features" JSON file
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"
GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = f"/{GDRIVE_COR_MATRIX_FEATURES_NAME}.json"
GDRIVE_COR_MATRIX_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_COR_MATRIX_FEATURES_NAME_EXT}"

In [11]:
# Name of the dependent (target) variable
dep_var = 'market-price'

# Two candidate feature sets:
#  - all_features: every training column except the first one and the last two
#  - cor_matrix_features: the column names of the JSON file at GDRIVE_COR_MATRIX_FEATURES
all_features = train_df.columns[1:-2]
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns

In [12]:
# Train on the training set using all the features and report the performance on
# both the training and the validation set (metrics are printed by the helper).
rf_training, rf_predictions = utilities.test_best_features(train_df, valid_df, 'random forest regression', all_features, 'features', dep_var)

RMSE for random forest regression on training set: 1003.1558435016956
R2 for random forest regression on training set: 0.9963694964669548
R2_adj for random forest regression on training set: 0.9963693256149057
-----
RMSE for random forest regression on validation set: 12181.70865588765
R2 for random forest regression on validation set: -0.7609155988683847
R2_adj for random forest regression on validation set: -0.7613576845278716


In [13]:
# Show the results of the model above; every Spark DataFrame is first collected to
# the driver as a pandas DataFrame via toPandas().
utilities.show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [14]:
# Same evaluation as before, restricted to the correlation-matrix features
# (metrics for the training and validation set are printed by the helper).
rf_training, rf_predictions = utilities.test_best_features(train_df, valid_df, 'random forest regression', cor_matrix_features, 'features', dep_var)

RMSE for random forest regression on training set: 1003.1513485366692
R2 for random forest regression on training set: 0.9963695290021785
R2_adj for random forest regression on training set: 0.9963693581516604
-----
RMSE for random forest regression on validation set: 9657.562198716525
R2 for random forest regression on validation set: -0.10676942077022566
R2_adj for random forest regression on validation set: -0.10704728013474907


In [15]:
# Show the results of the correlation-matrix-features model; every Spark DataFrame
# is first collected to the driver as a pandas DataFrame via toPandas().
utilities.show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

# Hyperparameter tuning ❗

In [16]:
# Define the evaluation metrics.
# Note: the adjusted R2 ("R2_adj") needs no entry of its own; it is reported
# together with 'r2' by the evaluation helper.
metrics = ['mse', 'rmse', 'mae', 'r2']

In [None]:
# Hyperparameter grid explored by the cross validation below (3 x 3 = 9 settings).
# The estimator is created in this cell because the grid is keyed by its Param
# objects, and `rf` was otherwise only defined in a later cell: on a fresh kernel
# run top to bottom this cell failed with a NameError.
# NOTE(review): Params are bound to the estimator instance they come from — confirm
# that utilities.random_forest_cross_val resolves this grid against a matching estimator.
rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var)

param_grid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [1, 2, 3]) \
    .addGrid(rf.numTrees, [4, 5, 6]) \
    .build()

In [17]:
# Execute cross validation with random forest over param_grid.
# The training set is first passed through utilities.select_features together with
# the correlation-matrix features and the target column.
cv_rf_models = utilities.random_forest_cross_val(utilities.select_features(train_df, cor_matrix_features, dep_var), dep_var, param_grid)

In [18]:
# Print, fold by fold, the parameters and the summary of every model fitted during
# cross validation (the sub-models kept by the cross-validation result).
utilities.summarize_rf_models(cv_rf_models.subModels)

*************** Fold #1 ***************

--- Model #1 out of 9 ---
	Parameters: maxDepth=[1.000]; numTrees=[4.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_a91a2bb5beff, numTrees=4, numFeatures=7

--- Model #2 out of 9 ---
	Parameters: maxDepth=[1.000]; numTrees=[5.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_a91a2bb5beff, numTrees=5, numFeatures=7

--- Model #3 out of 9 ---
	Parameters: maxDepth=[1.000]; numTrees=[6.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_a91a2bb5beff, numTrees=6, numFeatures=7

--- Model #4 out of 9 ---
	Parameters: maxDepth=[2.000]; numTrees=[4.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_a91a2bb5beff, numTrees=4, numFeatures=7

--- Model #5 out of 9 ---
	Parameters: maxDepth=[2.000]; numTrees=[5.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_a91a2bb5beff, numTrees=5, numFeatures=7

--- Model #6 out of 9 --

In [19]:
# Report the cross-validated average metric of every hyperparameter setting,
# numbering the settings from 1 in the order they appear in avgMetrics.
# NOTE(review): '{:3f}' is a minimum field width of 3 with the default six decimals,
# not three decimals ('{:.3f}') — confirm which one was intended.
avg_line_template = "Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:3f}"
for setting_number, setting_avg in enumerate(cv_rf_models.avgMetrics, start=1):
    print(avg_line_template.format(setting_number, setting_avg))

Avg. RMSE computed across k-fold cross validation for model setting #1: 5994.184357
Avg. RMSE computed across k-fold cross validation for model setting #2: 6063.187676
Avg. RMSE computed across k-fold cross validation for model setting #3: 5898.548838
Avg. RMSE computed across k-fold cross validation for model setting #4: 3464.199065
Avg. RMSE computed across k-fold cross validation for model setting #5: 3437.251476
Avg. RMSE computed across k-fold cross validation for model setting #6: 3417.095764
Avg. RMSE computed across k-fold cross validation for model setting #7: 2069.070222
Avg. RMSE computed across k-fold cross validation for model setting #8: 2055.239986
Avg. RMSE computed across k-fold cross validation for model setting #9: 2148.298239


In [20]:
# Parameter map of the last stage of the best cross-validated pipeline
best_rf_model_params = cv_rf_models.bestModel.stages[-1].extractParamMap()

# List every parameter as "<name> = <value>"
print('Best parameters for random forest regressor:')
for param in best_rf_model_params:
    print(param.name, "=", best_rf_model_params[param])

Best parameters for random forest regressor:
bootstrap = True
cacheNodeIds = False
checkpointInterval = 10
featureSubsetStrategy = auto
featuresCol = features
impurity = variance
labelCol = market-price
leafCol = 
maxBins = 32
maxDepth = 3
maxMemoryInMB = 256
minInfoGain = 0.0
minInstancesPerNode = 1
minWeightFractionPerNode = 0.0
numTrees = 5
predictionCol = prediction
seed = -2418255558323656717
subsamplingRate = 1.0


In [21]:
# bootstrap: If True, sampling with replacement is used to build the data subset each tree of the forest is trained on. If False, sampling without replacement is used.
# cacheNodeIds: If True, the node IDs of each instance are cached, which can speed up the training of deeper trees.
# checkpointInterval: How often (in iterations) checkpoints are created during training. Checkpointing more often adds overhead; checkpointing less often means more work has to be redone after a failure.
# featureSubsetStrategy: The number of features to consider for the splits at each tree node. Supported options include "auto", "all", "sqrt", "log2" and numeric values.
# featuresCol: The name of the column holding the features used to train the model.
# impurity: The impurity measure used to split nodes during training. For regression problems the usual impurity is "variance".
# labelCol: The name of the column holding the target values (labels) used to train the model.
# leafCol: The name of the output column holding, for each instance, the index of the leaf it reaches in each tree. The empty string (the value shown above) means no such column is produced.
# maxBins: The maximum number of bins used to discretize continuous features and to split categorical ones.
# maxDepth: The maximum depth of the trees in the random forest.
# maxMemoryInMB: The maximum amount of memory (in MB) allocated to histogram aggregation during training.
# minInfoGain: The minimum information gain required to split a node.
# minInstancesPerNode: The minimum number of instances each child node must have after a split.
# minWeightFractionPerNode: The minimum fraction of the total instance weight each child node must have after a split.
# numTrees: The number of trees in the random forest.
# predictionCol: The name of the column that will hold the predictions produced by the model.
# seed: The seed used to generate random numbers during training; fixing it helps make results reproducible.
# subsamplingRate: The fraction of the instances used to train each tree. A value of 1.0 means all instances are used.

In [22]:
# Fit the final random forest on the training set (correlation-matrix features only).
# NOTE(review): maxDepth=10 / numTrees=80 are hard-coded and are not the values chosen
# by the cross validation in this notebook (the recorded run selected maxDepth=3,
# numTrees=5 from a grid of maxDepth 1-3 and numTrees 4-6) — confirm where they come from.
# NOTE(review): no seed is set, so repeated runs may give slightly different models.
rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var, maxDepth=10, numTrees=80)
pipeline = Pipeline(stages=[rf])

rf_model = pipeline.fit(utilities.select_features(train_df, cor_matrix_features, dep_var))

In [23]:
# Score the fitted pipeline on both splits: training first, validation second.
# Each split goes through utilities.select_features with the correlation-matrix
# features before being transformed by the model.
train_selected = utilities.select_features(train_df, cor_matrix_features, dep_var)
rf_training = rf_model.transform(train_selected)
utilities.evaluate_models(rf_training, 'random forest regressor', 'training', dep_var, 'prediction', metrics)

valid_selected = utilities.select_features(valid_df, cor_matrix_features, dep_var)
rf_predictions = rf_model.transform(valid_selected)
utilities.evaluate_models(rf_predictions, 'random forest regressor', 'validation', dep_var, 'prediction', metrics)

MSE for random forest regressor on training set: 268881.9301775978
RMSE for random forest regressor on training set: 518.5382629831648
MAE for random forest regressor on training set: 245.52946730461437
R2 for random forest regressor on training set: 0.9990299554809132
R2_adj for random forest regressor on training set: 0.9990299098304705
MSE for random forest regressor on validation set: 90916519.31194498
RMSE for random forest regressor on validation set: 9535.015433230561
MAE for random forest regressor on validation set: 8347.073184474806
R2 for random forest regressor on validation set: -0.07885958489923262
R2_adj for random forest regressor on validation set: -0.07913043737585923


In [24]:
# Show the results of the final model; every Spark DataFrame is first collected to
# the driver as a pandas DataFrame via toPandas().
utilities.show_results(train_df.toPandas(), valid_df.toPandas(), rf_training.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [25]:
# Where the fitted model is stored on Google Drive.
# Despite its name, GDRIVE_MODEL_NAME_EXT is the full destination path
# (models folder + model name), not a file extension.
GDRIVE_MODEL_NAME = "random_forest"
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{GDRIVE_MODEL_NAME}"

In [26]:
# Save the fitted random forest pipeline to Google Drive.
# overwrite() replaces whatever was previously saved at GDRIVE_MODEL_NAME_EXT.
rf_model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)