# **Bitcoin price forecasting - Random Forest Regressor**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



In [1]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dependencies, Libraries and Tools

In [2]:
# Notebook-wide constants
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"  # only referenced by the commented-out JDK setup in the imports cell
MODEL_NAME = "Random Forest Regressor"  # passed to the `utilities` helpers and used to build the model save path
FEATURES_LABEL = "features"  # passed to the `utilities` helpers; presumably the assembled features column name — TODO confirm
SLOW_OPERATION = False  # not referenced anywhere in the visible part of this notebook

In [3]:
import warnings

# Silence FutureWarning messages for the rest of the notebook
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Import some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [5]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Build the Spark configuration: local mode on every available core,
# with enlarged driver/executor memory and result-size limits.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the running one: calling the SparkContext
# constructor a second time (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create the session on top of the active context
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=224de4a5e430d74f0e3176220b989f7cb37b945ccf0db865ecef4fa5792c25aa
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [6]:
# Google Drive locations of the datasets used by this notebook
GDRIVE_DIR = "/content/drive"

# Dataset folders (raw input, temporary files, processed output)
GDRIVE_DATASET_RAW_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Base dataset name and the names of its train / validation splits
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_TRAIN = f"{GDRIVE_DATASET_NAME}_train"
GDRIVE_DATASET_NAME_VALID = f"{GDRIVE_DATASET_NAME}_valid"

# File names (with leading slash and parquet extension)
# GDRIVE_DATASET_NAME_EXT = f"/{GDRIVE_DATASET_NAME}.parquet"
GDRIVE_DATASET_NAME_EXT_TRAIN = f"/{GDRIVE_DATASET_NAME_TRAIN}.parquet"
GDRIVE_DATASET_NAME_EXT_VALID = f"/{GDRIVE_DATASET_NAME_VALID}.parquet"

# Full paths of the train / validation parquet files (both live in the output folder)
# GDRIVE_DATASET = f"{GDRIVE_DATASET_RAW_DIR}{GDRIVE_DATASET_NAME_EXT}"
GDRIVE_DATASET_TRAIN = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_TRAIN}"
GDRIVE_DATASET_VALID = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_VALID}"

In [7]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# force_remount=True remounts Drive even though the first cell of the notebook already mounted it
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [8]:
# Load the train / validation datasets into PySpark DataFrames.
# Parquet files carry their own schema, so the CSV-only reader options
# (sep, inferSchema, header) that were passed before had no effect and are dropped.
train_df = spark.read.parquet(GDRIVE_DATASET_TRAIN)
valid_df = spark.read.parquet(GDRIVE_DATASET_VALID)

# Mark both DataFrames for caching (lazy: they are materialised on the first action)
train_df.cache()
valid_df.cache()

DataFrame[market-price: double, market-cap: double, total-bitcoins: double, trade-volume: double, blocks-size: double, avg-block-size: double, n-transactions-total: double, n-transactions-per-block: double, hash-rate: double, difficulty: double, miners-revenue: double, transaction-fees-usd: double, n-unique-addresses: double, n-transactions: double, estimated-transaction-volume-usd: double, timestamp: timestamp_ntz, index: int]

# Import my utilities ❗

In [9]:
import importlib
import shutil
import sys

# Make the project's utilities folder importable (added only once, so re-running
# this cell does not keep appending the same entry to sys.path)
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"
if GDRIVE_UTILITIES_DIR not in sys.path:
    sys.path.append(GDRIVE_UTILITIES_DIR)

# Drop stale bytecode so that the latest utilities.py is picked up.
# ignore_errors=True: on a fresh run the __pycache__ folder does not exist yet,
# and an unguarded rmtree would raise FileNotFoundError.
shutil.rmtree(GDRIVE_UTILITIES_DIR + '/__pycache__', ignore_errors=True)

import utilities

# Reload in case the module was already imported in this kernel and edited since
importlib.reload(utilities)

<module 'utilities' from '/content/drive/MyDrive/BDC/project/utilities/utilities.py'>

# Training simple model ❗

In [10]:
# Location of the stored feature selections (the correlation-matrix one is used below)
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"

# Base name, file name (leading slash + .json) and full path of the correlation-matrix features file
GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = f"/{GDRIVE_COR_MATRIX_FEATURES_NAME}.json"
GDRIVE_COR_MATRIX_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_COR_MATRIX_FEATURES_NAME_EXT}"

In [11]:
# All candidate predictors: every column except the first one and the last two
# (per the schema printed when the data was loaded: `market-price` first, `timestamp` and `index` last)
all_features = train_df.columns[1:-2]

# Only the column names of the stored JSON file are used here
# (presumably the features picked through the correlation matrix — TODO confirm)
cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns

# Set the dependent variable (the regression target)
dep_var = 'market-price'

In [12]:
# Validation performance of a simple model that uses all the features
utilities.train_valid_simple_model(train_df, valid_df, MODEL_NAME, all_features, FEATURES_LABEL, dep_var)

Output hidden; open in https://colab.research.google.com to view.

In [13]:
# Validation performance of a simple model that uses only the correlation-matrix features
utilities.train_valid_simple_model(train_df, valid_df, MODEL_NAME, cor_matrix_features, FEATURES_LABEL, dep_var)

Output hidden; open in https://colab.research.google.com to view.

# Hyperparameter tuning ❗

In [14]:
# Stack the training and validation rows into a single DataFrame
# (DataFrame.union matches columns by position, not by name)
combined_df = train_df.union(valid_df)

# Release Cache
train_df.unpersist()
valid_df.unpersist()

DataFrame[market-price: double, market-cap: double, total-bitcoins: double, trade-volume: double, blocks-size: double, avg-block-size: double, n-transactions-total: double, n-transactions-per-block: double, hash-rate: double, difficulty: double, miners-revenue: double, transaction-fees-usd: double, n-unique-addresses: double, n-transactions: double, estimated-transaction-volume-usd: double, timestamp: timestamp_ntz, index: int]

In [15]:
# Restrict the combined data to the correlation-matrix features and the dependent variable.
# NOTE(review): combined_df is reassigned from itself, so re-running this cell feeds the
# already-reduced DataFrame back into select_features — confirm that this is harmless.
combined_df = utilities.select_features(combined_df, cor_matrix_features, dep_var)

In [16]:
# Split proportion list, passed to utilities.autoTuning below
# (presumably each value is the training fraction of a train/test split — TODO confirm)
proportion_lst = [0.6, 0.7, 0.8, 0.9]

In [17]:
# Random Forest hyper-parameter grid explored during tuning.
# Grid tried previously: numTrees [3, 5, 10, 20, 30], maxDepth [3, 5, 10].
params = dict(
    numTrees=[5, 10, 15, 20, 25],  # number of trees to train (>= 1, default 20)
    maxDepth=[2, 3, 5, 7, 10],     # maximum depth of each tree (<= 30, default 5)
)

In [18]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Apache Spark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler,StandardScaler
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Python
import numpy as np
import pandas as pd
from itertools import product
import time

# Graph packages
# https://plotly.com/python/getting-started/#jupyterlab-support
# https://plotly.com/python/time-series/
import plotly.express as px

# Scikit-learn
from sklearn.metrics import mean_absolute_percentage_error

# Import some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Hyperparameter tuning through the project's autoTuning helper, fed with the split
# proportions and the parameter grid defined above; `results` is shown as the cell output
results = utilities.autoTuning(combined_df, proportion_lst, MODEL_NAME, FEATURES_LABEL, dep_var, params)
results

Unnamed: 0,Model,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time,Predictions
0,RandomForestRegressor,0.9,"[5, 7]",1962.557567,0.082887,1626.444485,2227240.0,0.353816,0.35356,1.734956,"DataFrame[market-price: double, prediction: do..."


# Time Series Cross Validation ❗

In [20]:
## Cross-validation configurations handed to utilities.tsCrossValidation

# Multiple-splits time series cross validation
mul_cv = dict(cv_type='mulTs', kSplits=5)

# Blocked time series cross validation
blk_cv = dict(cv_type='blkTs', kSplits=10)

In [21]:
# Random Forest parameters used for the cross-validation runs (a single combination).
# NOTE(review): numTrees=3 is outside the grid explored in the tuning cell, and the recorded
# tuning output reports "[5, 7]" as parameters — confirm that these values are intended.
params = dict(
    numTrees=[3],  # number of trees to train (>= 1, default 20)
    maxDepth=[5],  # maximum depth of each tree (<= 30, default 5)
)

In [22]:
# Time series cross validation with the multiple-splits configuration (mul_cv).
# The helper returns two values: the results table (displayed below) and, presumably,
# the fitted model of each split — TODO confirm against utilities.tsCrossValidation
results_mul_cv, trained_models_mul_cv = utilities.tsCrossValidation(combined_df, MODEL_NAME, FEATURES_LABEL, dep_var, params, mul_cv)
results_mul_cv

Unnamed: 0,Model,CV_type,Splits,Train&Test,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,RandomForestRegressor,mulTs,1,"(21030, 21029)","[3, 5]",6641.378964,0.65391,5092.078011,25813850.0,-1.401121,-1.401692,1.299022
1,RandomForestRegressor,mulTs,2,"(42059, 21029)","[3, 5]",1479.693836,0.260847,1169.092236,3663778.0,0.548557,0.54845,1.385547
2,RandomForestRegressor,mulTs,3,"(63088, 21029)","[3, 5]",554.318518,0.054507,462.486426,1576508.0,0.865451,0.865419,2.710768
3,RandomForestRegressor,mulTs,4,"(84117, 21029)","[3, 5]",32405.532593,0.641482,29369.478619,863577100.0,-4.154333,-4.155558,1.518844
4,RandomForestRegressor,mulTs,5,"(105146, 21029)","[3, 5]",3328.657099,0.109108,2737.973204,124834500.0,0.882719,0.882692,1.624759


In [23]:
# Time series cross validation with the blocked configuration (blk_cv).
# The helper returns two values: the results table (displayed below) and, presumably,
# the fitted model of each split — TODO confirm against utilities.tsCrossValidation
results_blk_cv, trained_models_blk_cv = utilities.tsCrossValidation(combined_df, MODEL_NAME, FEATURES_LABEL, dep_var, params, blk_cv)
results_blk_cv

Unnamed: 0,Model,CV_type,Splits,Train&Test,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,RandomForestRegressor,blkTs,1,"(10093, 2524)","[3, 5]",41.370781,0.067677,39.583738,1766.638,-2.389177,-2.395906,0.997831
1,RandomForestRegressor,blkTs,2,"(10093, 2524)","[3, 5]",789.729177,0.293568,625.243746,390428.1,-1.646616,-1.651871,1.028131
2,RandomForestRegressor,blkTs,3,"(10093, 2524)","[3, 5]",1695.428514,0.149317,1496.778192,5169077.0,0.442809,0.441702,1.929907
3,RandomForestRegressor,blkTs,4,"(10093, 2524)","[3, 5]",187.70014,0.017029,104.889441,2172.494,0.19528,0.193682,1.098866
4,RandomForestRegressor,blkTs,5,"(10093, 2524)","[3, 5]",2246.070599,0.184045,2031.952877,4128832.0,-4.507456,-4.518392,0.980583
5,RandomForestRegressor,blkTs,6,"(10093, 2524)","[3, 5]",1125.265076,0.150064,934.993608,995857.9,-0.560731,-0.56383,0.977488
6,RandomForestRegressor,blkTs,7,"(10093, 2524)","[3, 5]",9581.453001,0.236592,6946.136318,48864470.0,-0.882148,-0.885886,1.158237
7,RandomForestRegressor,blkTs,8,"(10093, 2524)","[3, 5]",1809.485065,0.030741,1458.993779,5886727.0,0.502215,0.501226,0.929572
8,RandomForestRegressor,blkTs,9,"(10093, 2524)","[3, 5]",10935.158184,0.38032,9847.511378,96892490.0,-4.17509,-4.185367,1.016454
9,RandomForestRegressor,blkTs,10,"(10093, 2524)","[3, 5]",1409.330521,0.053991,1258.132695,1807133.0,-1.058137,-1.062224,1.852441


# Model Comparison Table

In [24]:
# Columns of the Model Comparison Table: descriptive fields (model_info) and metrics (evaluator_lst)
model_info = ['Model','CV_type','Parameters']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2','Time']

# The cross-validation results that we would like to compare
comparison_lst = [results_mul_cv, results_blk_cv]

In [25]:
# Show the Comparison Table: one modelComparison result per cross-validation run, stacked with pd.concat
pd.concat([utilities.modelComparison(cv_result ,model_info,evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,CV_type,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,RandomForestRegressor,mulTs,"[3, 5]",8881.916202,0.343971,7766.221699,203893100.0,-0.651745,-0.652138,1.707788
0,RandomForestRegressor,blkTs,"[3, 5]",2982.099106,0.156334,2474.421577,16413890.0,-1.407905,-1.412687,1.196951


In [26]:
# Google Drive folder holding the saved models, and the sub-folder dedicated to this model
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{MODEL_NAME}"

In [30]:
# Persist every model returned by the blocked cross validation, one folder per model
# (<model folder>/<MODEL_NAME>_<position>), overwriting any previous save
for i, model in enumerate(trained_models_blk_cv):
    model.write().overwrite().save(f"{GDRIVE_MODEL_NAME_EXT}/{MODEL_NAME}_{i}")

In [28]:
# # Save the best models
# for i, model in enumerate(trained_models_blk_cv):
#     joblib.dump(model, f"{GDRIVE_MODEL_NAME_EXT}/{MODEL_NAME}_{i}.joblib")

In [29]:
# import pickle

# for i, model in enumerate(trained_models_blk_cv):
#     file_path = f"{GDRIVE_MODEL_NAME_EXT}/{MODEL_NAME}_{i}.pickle"
#     with open(file_path, 'wb') as file:
#         pickle.dump(model, file)

TypeError: ignored