# **Bitcoin price forecasting - Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# GDrive root
GDRIVE_DIR = "/content/drive"

# Dataset
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_ENG = GDRIVE_DATASET_NAME + "_train_valid"
GDRIVE_DATASET_NAME_EXT_ENG  = "/" + GDRIVE_DATASET_NAME_ENG + ".parquet"
GDRIVE_DATASET_NAME_ENG = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_ENG

# Features
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"
GDRIVE_ALL_FEATURES_NAME_EXT = "/" + GDRIVE_ALL_FEATURES_NAME + ".json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = "/" + GDRIVE_MORE_REL_FEATURES_NAME + ".json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = "/" + GDRIVE_LESS_REL_FEATURES_NAME + ".json"
GDRIVE_ALL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_ALL_FEATURES_NAME_EXT
GDRIVE_MORE_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_MORE_REL_FEATURES_NAME_EXT
GDRIVE_LESS_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_LESS_REL_FEATURES_NAME_EXT

# Others
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
MODEL_NAME = "GeneralizedLinearRegression"
SLOW_OPERATION = True

In [2]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Define GDrive paths
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [3]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=5869c795fa6f1f3744d5e6ce02b7a59fd02ba4c5a42a5de6d69bc68f3a220464
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [133]:
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

import sys
sys.path.append(GDRIVE_UTILITIES_DIR)

from imports import *
import utilities, parameters

importlib.reload(utilities)
importlib.reload(parameters)

<module 'parameters' from '/content/drive/MyDrive/BDC/project/utilities/parameters.py'>

## Create the pyspark session

In [98]:
# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

ValueError: ignored

## Loading dataset

In [100]:
# Load datasets into pyspark dataframe objects
df = spark.read.load(GDRIVE_DATASET_NAME_ENG,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [101]:
def dataset_info(dataset):
  dataset.show(3)

  # Get the number of rows
  num_rows = dataset.count()

  # Get the number of columns
  num_columns = len(dataset.columns)

  # Print the shape of the DataFrame
  print("Shape:", (num_rows, num_columns))

  # Print the schema of the DataFrame
  dataset.printSchema()

In [102]:
if SLOW_OPERATION:
  dataset_info(df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|
+-------------------+---+------------------+--------------+---------------

## Loading features

In [103]:
# Set the features label
FEATURES_LABEL = "features"

# Set the target variable
TARGET_VAL = 'market-price'

In [104]:
# Loading correlation matrix features
with open(GDRIVE_ALL_FEATURES, "r") as f:
    all_features = json.load(f)
print(all_features)

['total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [105]:
# Loading correlation matrix features
with open(GDRIVE_MORE_REL_FEATURES, "r") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [106]:
# Loading correlation matrix features
with open(GDRIVE_LESS_REL_FEATURES, "r") as f:
    less_rel_features = json.load(f)
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


## Evaluate simple model

In [107]:
# Get default params
params = parameters.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [25],
 'regParam': [0],
 'family': ['gaussian'],
 'link': ['identity']}

In [108]:
# Valid performances with all the features
simple_res_all, simple_pred_all = utilities.evaluate_simple_model(df, all_features, params, GDRIVE_ALL_FEATURES_NAME, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,all_features,"[25, 0, gaussian, identity]",4288.585096,0.124091,3461.845227,150196500.0,0.900484,0.90046,0.924207


In [109]:
utilities.show_results(simple_pred_all, MODEL_NAME, TARGET_VAL)

In [110]:
# Valid performances with the corr matrix features
simple_res_more_rel, simple_pred_more_rel = utilities.evaluate_simple_model(df, more_rel_features, params, GDRIVE_MORE_REL_FEATURES_NAME, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,more_rel_features,"[25, 0, gaussian, identity]",9026.635555,0.219481,6967.905283,36506310.0,0.559123,0.559019,1.026095


In [111]:
utilities.show_results(simple_pred_more_rel, MODEL_NAME, TARGET_VAL)

In [112]:
# Valid performances with the corr matrix features
simple_res_less_rel, simple_pred_less_rel = utilities.evaluate_simple_model(df, less_rel_features, params, GDRIVE_LESS_REL_FEATURES_NAME, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,less_rel_features,"[25, 0, gaussian, identity]",14186.315186,0.314145,10900.364514,65994100.0,-0.088943,-0.0892,1.429001


In [113]:
utilities.show_results(simple_pred_less_rel, MODEL_NAME, TARGET_VAL)

## Hyperparameter tuning

In [134]:
choosen_features = more_rel_features
CHOSEN_FEATURES_LABEL = GDRIVE_MORE_REL_FEATURES_NAME

In [135]:
# Split proportion list
PORTION_LIST = [0.6, 0.7, 0.8, 0.9]

In [136]:
# Get simple params
params = parameters.get_model_params(MODEL_NAME)
params

{'maxIter': [5, 10, 50, 80],
 'regParam': [0.0, 0.1, 0.2],
 'family': ['gaussian'],
 'link': ['log']}

In [137]:
hyp_res, hyp_params = utilities.autoTuning(df, choosen_features, params, CHOSEN_FEATURES_LABEL, PORTION_LIST, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
hyp_res

Unnamed: 0,Model,Type,Features,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,autotuning,more_rel_features,0.9,"[10, 0.1, gaussian, log]",6618.634809,0.2689,5886.813989,34654580.0,-3.785672,-3.787933,1.039413


## Cross validation

In [138]:
# Get tuned params
params = hyp_params
params

{'maxIter': [10], 'regParam': [0.1], 'family': ['gaussian'], 'link': ['log']}

In [139]:
## Cross Validation Parameter
# Multiple Splits Time Series Cross Validation
mul_cv = {'cv_type':'mulTs',
          'kSplits': 5}

# Blocked Time Series Cross Validation
blk_cv = {'cv_type':'blkTs',
          'kSplits': 10}

In [140]:
mul_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(df, choosen_features, params, mul_cv, CHOSEN_FEATURES_LABEL, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
mul_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,mulTs,more_rel_features,1,"(21182, 21181)","[10, 0.1, gaussian, log]",7114.244848,0.81744,5725.026313,32775930.0,-1.837541,-1.838345,0.748757
1,GeneralizedLinearRegression,mulTs,more_rel_features,2,"(42363, 21181)","[10, 0.1, gaussian, log]",3670.59793,0.376797,2866.853713,8176031.0,-1.542777,-1.543498,2.720743
2,GeneralizedLinearRegression,mulTs,more_rel_features,3,"(63544, 21181)","[10, 0.1, gaussian, log]",4987.430218,0.498555,4726.211257,22337070.0,-8.80125,-8.804027,1.320493
3,GeneralizedLinearRegression,mulTs,more_rel_features,4,"(84725, 21181)","[10, 0.1, gaussian, log]",39971.225855,0.849813,37690.730577,1420591000.0,-8.020795,-8.023351,1.36621
4,GeneralizedLinearRegression,mulTs,more_rel_features,5,"(105906, 21181)","[10, 0.1, gaussian, log]",16222.759878,0.45104,13421.618109,180139800.0,-2.169091,-2.169989,1.747039


In [141]:
blk_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(df, choosen_features, params, blk_cv, CHOSEN_FEATURES_LABEL, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
blk_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,blkTs,more_rel_features,1,"(10166, 2542)","[10, 0.1, gaussian, log]",103.436537,0.171464,101.641513,10331.0,-28.057031,-28.125805,0.864507
1,GeneralizedLinearRegression,blkTs,more_rel_features,2,"(10166, 2542)","[10, 0.1, gaussian, log]",1181.252175,0.508638,1060.662499,1125005.0,-4.161112,-4.173328,1.338354
2,GeneralizedLinearRegression,blkTs,more_rel_features,3,"(10166, 2542)","[10, 0.1, gaussian, log]",4351.150223,0.371107,4055.664571,16448420.0,-6.621349,-6.639388,0.973756
3,GeneralizedLinearRegression,blkTs,more_rel_features,4,"(10166, 2542)","[10, 0.1, gaussian, log]",1415.947617,0.22196,1276.245927,1628804.0,-4.330844,-4.343461,0.796149
4,GeneralizedLinearRegression,blkTs,more_rel_features,5,"(10166, 2542)","[10, 0.1, gaussian, log]",5947.578609,0.536723,5890.853506,34702160.0,-51.67511,-51.799784,0.636373
5,GeneralizedLinearRegression,blkTs,more_rel_features,6,"(10166, 2542)","[10, 0.1, gaussian, log]",1866.977016,0.256604,1676.062024,2556609.0,-2.751987,-2.760867,0.777379
6,GeneralizedLinearRegression,blkTs,more_rel_features,7,"(10166, 2542)","[10, 0.1, gaussian, log]",18401.456436,0.570093,17027.247619,289927200.0,-5.954983,-5.971444,0.97147
7,GeneralizedLinearRegression,blkTs,more_rel_features,8,"(10166, 2542)","[10, 0.1, gaussian, log]",6626.107162,0.094091,4967.852867,14636370.0,-0.500058,-0.503608,0.808528
8,GeneralizedLinearRegression,blkTs,more_rel_features,9,"(10166, 2542)","[10, 0.1, gaussian, log]",21353.69429,0.895851,20849.379567,434696600.0,-20.423104,-20.47381,0.637314
9,GeneralizedLinearRegression,blkTs,more_rel_features,10,"(10166, 2542)","[10, 0.1, gaussian, log]",5137.973112,0.17927,4553.768501,20736810.0,-3.662471,-3.673507,1.059441


## Comparison table

In [142]:
# Define what model_info and evaluators in the Model Comparison Table
model_info = ['Model','Type', 'Features', 'Parameters']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2','Time']

# The the Cross Validation results would like to compare
comparison_lst = [simple_res_all, simple_res_more_rel, simple_res_less_rel, hyp_res, mul_cv_res, blk_cv_res]

In [143]:
# Show the Comparison Table
pd.concat([utilities.modelComparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,GeneralizedLinearRegression,simple,all_features,"[25, 0, gaussian, identity]",4288.585096,0.124091,3461.845227,150196500.0,0.900484,0.90046,0.924207
0,GeneralizedLinearRegression,simple,more_rel_features,"[25, 0, gaussian, identity]",9026.635555,0.219481,6967.905283,36506310.0,0.559123,0.559019,1.026095
0,GeneralizedLinearRegression,simple,less_rel_features,"[25, 0, gaussian, identity]",14186.315186,0.314145,10900.364514,65994100.0,-0.088943,-0.0892,1.429001
0,GeneralizedLinearRegression,autotuning,more_rel_features,"[10, 0.1, gaussian, log]",6618.634809,0.2689,5886.813989,34654580.0,-3.785672,-3.787933,1.039413
0,GeneralizedLinearRegression,mulTs,more_rel_features,"[10, 0.1, gaussian, log]",14393.251746,0.598729,12886.087994,332804000.0,-4.474291,-4.475842,1.580649
0,GeneralizedLinearRegression,blkTs,more_rel_features,"[10, 0.1, gaussian, log]",6638.557318,0.38058,6145.937859,81646830.0,-12.813805,-12.8465,0.886327


## Training the final model

In [144]:
model = utilities.train_final_model(df, more_rel_features, params, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)

In [145]:
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = GDRIVE_MODELS_DIR + "/" + MODEL_NAME

In [146]:
# Save the trained model
model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)