# **Bitcoin price forecasting - Generalized Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [1]:
# Notebook-wide settings.
# Model identifier handed to every `utilities` helper called in this notebook.
MODEL_NAME = "GeneralizedLinearRegression"
# JVM install location (only stored here; not referenced again in the visible cells).
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
# Switch declared for costly steps (not read by any visible cell).
SLOW_OPERATION = False

In [2]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Import some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=dc360271f11f1a4bf9b2f852072053db3e4837ef11ac8007419a06bb9677a551
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [5]:
# Google Drive mount point and the folder holding the processed datasets
GDRIVE_DIR = "/content/drive"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Base name of the dataset
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"

# File name of the "_eng" variant (leading slash + .parquet extension)
GDRIVE_DATASET_NAME_EXT_ENG = f"/{GDRIVE_DATASET_NAME}_eng.parquet"

# Full path of the "_eng" parquet dataset
GDRIVE_DATASET_NAME_ENG = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_ENG}"


In [6]:
# Make Google Drive reachable from the Colab runtime.
from google.colab import drive

# force_remount=True re-mounts even when the drive is already mounted.
drive.mount(mountpoint=GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [7]:
# Load the dataset into a PySpark DataFrame.
# Parquet files embed their own schema and column names, so no CSV-style
# options (`sep`, `inferSchema`, `header`) are needed by the reader.
df = spark.read.parquet(GDRIVE_DATASET_NAME_ENG)

# Import my utilities

In [8]:
import sys
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"
# Register the utilities folder only once, so re-running the cell does not
# keep appending the same entry to sys.path.
if GDRIVE_UTILITIES_DIR not in sys.path:
    sys.path.append(GDRIVE_UTILITIES_DIR)

import shutil
# Remove the bytecode cache of the utilities folder before importing.
# On a fresh run the cache folder may not exist yet: that is not an error.
try:
    shutil.rmtree(GDRIVE_UTILITIES_DIR + '/__pycache__')
except FileNotFoundError:
    pass

import utilities

import importlib
# Reload so that edits made to utilities.py are picked up when the cell is re-run
importlib.reload(utilities)

<module 'utilities' from '/content/drive/MyDrive/BDC/project/utilities/utilities.py'>

# Loading features

In [9]:
# Folder holding the JSON feature lists
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"

# Base names of the three feature sets
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"

# File names: leading slash + base name + .json extension
GDRIVE_ALL_FEATURES_NAME_EXT = f"/{GDRIVE_ALL_FEATURES_NAME}.json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = f"/{GDRIVE_MORE_REL_FEATURES_NAME}.json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = f"/{GDRIVE_LESS_REL_FEATURES_NAME}.json"

# Full paths of the three JSON files
GDRIVE_ALL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_ALL_FEATURES_NAME_EXT}"
GDRIVE_MORE_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_MORE_REL_FEATURES_NAME_EXT}"
GDRIVE_LESS_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_LESS_REL_FEATURES_NAME_EXT}"

In [10]:
# Target variable name and features label, both handed to the `utilities` helpers.
# (FEATURES_LABEL is presumably the name of the assembled feature column — confirm in utilities.py.)
TARGET_VAL, FEATURES_LABEL = "market-price", "features"

In [11]:
# Load the list with all the features from its JSON file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the locale default.
with open(GDRIVE_ALL_FEATURES, "r", encoding="utf-8") as f:
    all_features = json.load(f)
print(all_features)

['total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [12]:
# Load the list of the more relevant features from its JSON file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the locale default.
with open(GDRIVE_MORE_REL_FEATURES, "r", encoding="utf-8") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [13]:
# Load the list of the less relevant features from its JSON file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the locale default.
with open(GDRIVE_LESS_REL_FEATURES, "r", encoding="utf-8") as f:
    less_rel_features = json.load(f)
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


# Evaluation of a simple model

In [14]:
# Fetch the default hyperparameters that `utilities` provides for MODEL_NAME.
# Each value is a list; in the recorded run every list holds a single entry.
params = utilities.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [25],
 'regParam': [0],
 'family': ['gaussian'],
 'link': ['identity']}

In [15]:
# Validation performance of the simple model, using all the features and the current `params`
simple_res_all, simple_pred_all = utilities.evaluate_simple_model(
    df,
    all_features,
    params,
    GDRIVE_ALL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,all_features,"[25, 0, gaussian, identity]",843.020822,0.023818,629.717382,76020470.0,0.990671,0.990669,6.527013


In [16]:
# Display the predictions of the simple model trained on all the features
# (presumably rendered as a plot by `utilities.show_results`; no text output was recorded).
utilities.show_results(simple_pred_all, MODEL_NAME, TARGET_VAL)

In [17]:
# Validation performance of the simple model, using only the more relevant features
simple_res_more_rel, simple_pred_more_rel = utilities.evaluate_simple_model(
    df,
    more_rel_features,
    params,
    GDRIVE_MORE_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,more_rel_features,"[25, 0, gaussian, identity]",652.761659,0.01778,475.443556,79876610.0,0.994407,0.994406,1.434517


In [18]:
# Display the predictions of the simple model trained on the more relevant features
# (presumably rendered as a plot by `utilities.show_results`; no text output was recorded).
utilities.show_results(simple_pred_more_rel, MODEL_NAME, TARGET_VAL)

In [19]:
# Validation performance of the simple model, using only the less relevant features
simple_res_less_rel, simple_pred_less_rel = utilities.evaluate_simple_model(
    df,
    less_rel_features,
    params,
    GDRIVE_LESS_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,less_rel_features,"[25, 0, gaussian, identity]",40956.685757,1.560198,37050.205975,1652957000.0,-21.01904,-21.023229,1.328509


In [20]:
# Display the predictions of the simple model trained on the less relevant features
# (presumably rendered as a plot by `utilities.show_results`; no text output was recorded).
utilities.show_results(simple_pred_less_rel, MODEL_NAME, TARGET_VAL)

# Hyperparameter tuning

In [21]:
# Feature set (and its label) used by the hyperparameter tuning and cross validation cells.
# In the recorded simple-model runs, `more_rel_features` gave the lowest RMSE
# (652.76, against 843.02 for all features and 40956.69 for the less relevant ones).
# NOTE: the spelling `choosen_features` is the name the later cells reference.
choosen_features = more_rel_features
CHOSEN_FEATURES_LABEL = GDRIVE_MORE_REL_FEATURES_NAME

In [22]:
# Split proportions handed to `utilities.autoTuning`
# (presumably the training fraction of each train/validation split — confirm in utilities.py).
PORTION_LIST = [0.6, 0.7, 0.8, 0.9]

In [23]:
# Fetch the hyperparameter grid passed to the tuning step.
# Each value is a list; in the recorded run most parameters list several candidate values.
# NOTE: this rebinds `params`, replacing the value assigned by any earlier cell.
params = utilities.get_simple_model_params(MODEL_NAME)
params

{'maxIter': [5, 10, 50, 80],
 'regParam': [0, 0.1, 0.2],
 'family': ['gaussian', 'gamma'],
 'link': ['identity', 'inverse']}

In [24]:
# Run the tuning helper over the parameter grid and the split proportions
hyp_res = utilities.autoTuning(
    df,
    choosen_features,
    params,
    CHOSEN_FEATURES_LABEL,
    PORTION_LIST,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
hyp_res

Unnamed: 0,Model,Type,Features,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,autotuning,more_rel_features,0.9,"[5, 0.2, gaussian, identity]",507.805627,0.015979,364.640616,23092590.0,0.987814,0.987809,0.205595


# Cross validation

In [26]:
# Fetch the tuned hyperparameters that `utilities` provides for MODEL_NAME.
# NOTE: this rebinds `params`, which the cross validation and final training cells
# below read, so this cell has to run before them.
params = utilities.get_tuned_model_params(MODEL_NAME)
params

{'maxIter': [5],
 'regParam': [0.2],
 'family': ['gaussian'],
 'link': ['identity']}

In [27]:
## Cross validation settings passed to utilities.tsCrossValidation

# Multiple Splits Time Series Cross Validation: 5 splits
mul_cv = dict(cv_type='mulTs', kSplits=5)

# Blocked Time Series Cross Validation: 10 splits
blk_cv = dict(cv_type='blkTs', kSplits=10)

In [28]:
# Multiple splits time series cross validation with the current `params`
mul_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    mul_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
mul_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,mulTs,more_rel_features,1,"(21904, 21904)","[5, 0.2, gaussian, identity]",499.769752,0.04524,347.324638,14785820.0,0.984677,0.984674,0.888706
1,GeneralizedLinearRegression,mulTs,more_rel_features,2,"(43808, 21904)","[5, 0.2, gaussian, identity]",2001.872507,0.291597,1730.393131,11741000.0,0.36239,0.362244,0.84722
2,GeneralizedLinearRegression,mulTs,more_rel_features,3,"(65712, 21904)","[5, 0.2, gaussian, identity]",318.131714,0.027601,248.529799,14347480.0,0.993203,0.993201,1.023068
3,GeneralizedLinearRegression,mulTs,more_rel_features,4,"(87616, 21904)","[5, 0.2, gaussian, identity]",2155.710338,0.04314,1907.354184,88823150.0,0.945527,0.945514,0.967914
4,GeneralizedLinearRegression,mulTs,more_rel_features,5,"(109520, 21904)","[5, 0.2, gaussian, identity]",1021.720224,0.039884,907.994967,45161430.0,0.975376,0.97537,2.095411


In [31]:
# Blocked time series cross validation with the current `params`.
# The second return value gets its own name so that the value returned by the
# multiple splits run (`trained_models_mul_cv`) is not overwritten.
blk_cv_res, trained_models_blk_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    blk_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
blk_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,blkTs,more_rel_features,1,"(10513, 2629)","[5, 0.2, gaussian, identity]",8.751516,0.013598,8.05298,318.1647,0.673677,0.673055,0.735632
1,GeneralizedLinearRegression,blkTs,more_rel_features,2,"(10513, 2629)","[5, 0.2, gaussian, identity]",127.424087,0.044432,108.648456,112960.4,0.880088,0.87986,0.694652
2,GeneralizedLinearRegression,blkTs,more_rel_features,3,"(10513, 2629)","[5, 0.2, gaussian, identity]",397.1169,0.03687,322.28376,1245765.0,0.893134,0.89293,0.673364
3,GeneralizedLinearRegression,blkTs,more_rel_features,4,"(10513, 2629)","[5, 0.2, gaussian, identity]",118.245315,0.021867,89.546605,1144839.0,0.986643,0.986617,0.651285
4,GeneralizedLinearRegression,blkTs,more_rel_features,5,"(10513, 2629)","[5, 0.2, gaussian, identity]",248.400505,0.018987,199.128183,639842.5,0.920165,0.920013,0.568532
5,GeneralizedLinearRegression,blkTs,more_rel_features,6,"(10513, 2629)","[5, 0.2, gaussian, identity]",508.968419,0.051833,489.114172,310067.0,-2.531269,-2.538001,0.667535
6,GeneralizedLinearRegression,blkTs,more_rel_features,7,"(10513, 2629)","[5, 0.2, gaussian, identity]",1232.574629,0.020243,1014.573509,29940130.0,0.950062,0.949967,0.962579
7,GeneralizedLinearRegression,blkTs,more_rel_features,8,"(10513, 2629)","[5, 0.2, gaussian, identity]",2490.503742,0.041989,2194.663373,55888120.0,0.845396,0.845102,1.498923
8,GeneralizedLinearRegression,blkTs,more_rel_features,9,"(10513, 2629)","[5, 0.2, gaussian, identity]",897.258624,0.037776,779.083138,4253195.0,0.741217,0.740724,0.665532
9,GeneralizedLinearRegression,blkTs,more_rel_features,10,"(10513, 2629)","[5, 0.2, gaussian, identity]",956.713742,0.029552,819.176071,2436790.0,0.564788,0.563958,0.571508


# Comparison table

In [34]:
# Columns of the Model Comparison Table: model description fields and evaluation metrics
model_info = [
    'Model',
    'Type',
    'Features',
    'Parameters',
]
evaluator_lst = [
    'RMSE',
    'MAPE',
    'MAE',
    'Variance',
    'R2',
    'Adjusted_R2',
    'Time',
]

# Result tables to compare: simple models, tuning and both cross validations
comparison_lst = [
    simple_res_all,
    simple_res_more_rel,
    simple_res_less_rel,
    hyp_res,
    mul_cv_res,
    blk_cv_res,
]

In [35]:
# Build one comparison frame per result table, then stack them into the Comparison Table
comparison_frames = [
    utilities.modelComparison(result_table, model_info, evaluator_lst)
    for result_table in comparison_lst
]
pd.concat(comparison_frames)

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,all_features,"[25, 0, gaussian, identity]",843.020822,0.023818,629.717382,76020470.0,0.990671,0.990669,6.527013
0,GeneralizedLinearRegression,simple,more_rel_features,"[25, 0, gaussian, identity]",652.761659,0.01778,475.443556,79876610.0,0.994407,0.994406,1.434517
0,GeneralizedLinearRegression,simple,less_rel_features,"[25, 0, gaussian, identity]",40956.685757,1.560198,37050.205975,1652957000.0,-21.01904,-21.023229,1.328509
0,GeneralizedLinearRegression,autotuning,more_rel_features,"[5, 0.2, gaussian, identity]",507.805627,0.015979,364.640616,23092590.0,0.987814,0.987809,0.205595
0,GeneralizedLinearRegression,mulTs,more_rel_features,"[5, 0.2, gaussian, identity]",1199.440907,0.089492,1028.319344,34971780.0,0.852234,0.852201,1.164464
0,GeneralizedLinearRegression,blkTs,more_rel_features,"[5, 0.2, gaussian, identity]",698.595748,0.031715,602.427025,9597203.0,0.49239,0.491422,0.768954


# Training the final model

In [36]:
# Train the final model on the same feature selection used for tuning and cross validation
# (`choosen_features`), so that changing the selection in one place keeps every step consistent.
model = utilities.train_final_model(df, choosen_features, params, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)

In [37]:
# Google Drive folder for saved models, and the path of this notebook's model inside it
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{MODEL_NAME}"

In [38]:
# Save the trained model to Google Drive, overwriting anything already stored at that path
model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)