# **Bitcoin price forecasting - Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# Mount point of Google Drive inside the Colab runtime
GDRIVE_DIR = "/content/drive"

# Dataset: full path of the train/validation parquet file
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_EXT_ENG = f"/{GDRIVE_DATASET_NAME}_train_valid.parquet"
GDRIVE_DATASET_NAME_ENG = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_ENG}"

# Features: one JSON file per feature set (name, "/name.json", full path)
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"
GDRIVE_ALL_FEATURES_NAME_EXT = f"/{GDRIVE_ALL_FEATURES_NAME}.json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = f"/{GDRIVE_MORE_REL_FEATURES_NAME}.json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = f"/{GDRIVE_LESS_REL_FEATURES_NAME}.json"
GDRIVE_ALL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_ALL_FEATURES_NAME_EXT}"
GDRIVE_MORE_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_MORE_REL_FEATURES_NAME_EXT}"
GDRIVE_LESS_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_LESS_REL_FEATURES_NAME_EXT}"

# Others
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
# Model identifier handed to the parameters/utilities helpers and used as the saved model's folder name
MODEL_NAME = "LinearRegression"
# When True, the slow dataset inspection cell (dataset_info) is executed
SLOW_OPERATION = True

In [2]:
# Colab helper used to reach Google Drive from this runtime
from google.colab import drive

# Mount Google Drive at GDRIVE_DIR (force_remount=True asks for a fresh mount on every run)
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [3]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=2c297c337ec10f01dd9263f53238d1ae9b8cb331f53c55597125a0ef8f2527d8
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [28]:
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

# Imported explicitly so importlib.reload below does not depend on the star import
import importlib
import sys

# Make the project's helper modules importable; the guard keeps re-runs of
# this cell from appending the same entry to sys.path again
if GDRIVE_UTILITIES_DIR not in sys.path:
  sys.path.append(GDRIVE_UTILITIES_DIR)

# NOTE(review): later cells use SparkConf, pyspark, SparkSession, json and pd
# without importing them, so they presumably come from this star import — confirm in imports.py
from imports import *
import utilities, parameters

# Reload the helper modules so edits made to them on Drive are picked up
# without restarting the kernel
importlib.reload(utilities)
importlib.reload(parameters)

<module 'parameters' from '/content/drive/MyDrive/BDC/project/utilities/parameters.py'>

## Create the pyspark session

In [6]:
# Spark configuration for a single-machine ("local[*]") run
conf = (
    SparkConf()
    .set("spark.ui.port", "4050")
    .set("spark.executor.memory", "4G")
    .set("spark.driver.memory", "45G")
    .set("spark.driver.maxResultSize", "10G")
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the one already running: constructing a second
# SparkContext raises an error when this cell is executed again
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or fetch) the session used by the rest of the notebook
spark = SparkSession.builder.getOrCreate()

## Loading dataset

In [7]:
# Load the train/validation dataset into a pyspark DataFrame.
# Parquet files carry their own schema, so no CSV-style options
# (sep, inferSchema, header) are needed for this read.
df = spark.read.parquet(GDRIVE_DATASET_NAME_ENG)

In [8]:
def dataset_info(dataset):
  """Print a short overview of a DataFrame.

  Shows the first 3 rows, prints the (rows, columns) shape and then the schema.

  Args:
    dataset: object exposing show(n), count(), columns and printSchema()
      (a pyspark DataFrame in this notebook).
  """
  # Preview of the first rows
  dataset.show(3)

  # Shape as (number of rows, number of columns)
  shape = (dataset.count(), len(dataset.columns))
  print("Shape:", shape)

  # Column names and types
  dataset.printSchema()

In [9]:
# dataset_info calls show() and count() on the DataFrame, so it only runs when SLOW_OPERATION is enabled
if SLOW_OPERATION:
  dataset_info(df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|
+-------------------+---+------------------+--------------+---------------

## Loading features

In [10]:
# Label passed to the utilities helpers as the features column name
FEATURES_LABEL = "features"

# Column the model has to predict
TARGET_VAL = "market-price"

In [11]:
# Load the full list of feature names from its JSON file
with open(GDRIVE_ALL_FEATURES, "r") as f:
    all_features = json.load(f)
print(all_features)

['total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [12]:
# Load the "more_rel" subset of feature names from its JSON file
# NOTE(review): presumably the features more relevant to the target according to the correlation matrix — confirm where the file is generated
with open(GDRIVE_MORE_REL_FEATURES, "r") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [13]:
# Load the "less_rel" subset of feature names from its JSON file
# NOTE(review): presumably the features less relevant to the target according to the correlation matrix — confirm where the file is generated
with open(GDRIVE_LESS_REL_FEATURES, "r") as f:
    less_rel_features = json.load(f)
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


## Evaluate simple model

In [14]:
# Get the default parameters for MODEL_NAME (each parameter maps to a single-value list, see the output below)
params = parameters.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [100], 'regParam': [0.0], 'elasticNetParam': [0.0]}

In [15]:
# Validation performance of the default model trained on all the features
simple_res_all, simple_pred_all = utilities.evaluate_simple_model(
    df,
    all_features,
    params,
    GDRIVE_ALL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",4288.585097,0.124091,3461.845228,150196500.0,0.900484,0.90046,6.229855


In [16]:
# Show the predictions of the all-features run
# NOTE(review): show_results lives in utilities; presumably it plots predicted vs. actual values — confirm
utilities.show_results(simple_pred_all, MODEL_NAME, TARGET_VAL)

In [17]:
# Validation performance of the default model trained on the "more_rel" feature subset
simple_res_more_rel, simple_pred_more_rel = utilities.evaluate_simple_model(
    df,
    more_rel_features,
    params,
    GDRIVE_MORE_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",9026.635554,0.219481,6967.905283,36506310.0,0.559123,0.559019,2.424016


In [18]:
# Show the predictions of the "more_rel" features run
# NOTE(review): show_results lives in utilities; presumably it plots predicted vs. actual values — confirm
utilities.show_results(simple_pred_more_rel, MODEL_NAME, TARGET_VAL)

In [19]:
# Validation performance of the default model trained on the "less_rel" feature subset
simple_res_less_rel, simple_pred_less_rel = utilities.evaluate_simple_model(
    df,
    less_rel_features,
    params,
    GDRIVE_LESS_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",14182.539053,0.314191,10899.108275,65829930.0,-0.088363,-0.08862,2.362139


In [20]:
# Show the predictions of the "less_rel" features run
# NOTE(review): show_results lives in utilities; presumably it plots predicted vs. actual values — confirm
utilities.show_results(simple_pred_less_rel, MODEL_NAME, TARGET_VAL)

## Hyperparameter tuning

In [29]:
# Feature set (and its label) used by the tuning and cross validation cells
# NOTE(review): "choosen" is a misspelling of "chosen"; the name is kept because later cells reference it
choosen_features = more_rel_features
CHOSEN_FEATURES_LABEL = GDRIVE_MORE_REL_FEATURES_NAME

In [30]:
# Split proportions handed to utilities.autoTuning
# NOTE(review): presumably the training fraction of each train/validation split — confirm in utilities
PORTION_LIST = [0.6, 0.7, 0.8, 0.9]

In [31]:
# Get the candidate values of each hyperparameter to explore during tuning
params = parameters.get_model_params(MODEL_NAME)
params

{'maxIter': [5, 10, 50, 80, 100],
 'regParam': array([0. , 0.2, 0.4, 0.6, 0.8]),
 'elasticNetParam': array([0. , 0.2, 0.4, 0.6, 0.8])}

In [32]:
# Tune the model on the chosen features: hyp_res is the result table shown below,
# hyp_params the parameters reused by the following cells
hyp_res, hyp_params = utilities.autoTuning(
    df,
    choosen_features,
    params,
    CHOSEN_FEATURES_LABEL,
    PORTION_LIST,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
hyp_res

Unnamed: 0,Model,Type,Features,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,autotuning,more_rel_features,0.9,"[5, 0.2, 0.4]",1942.436008,0.068752,1465.138704,7577615.0,0.587808,0.587613,0.345756


## Cross validation

In [33]:
# Use the parameters returned by the tuning step for cross validation and final training
params = hyp_params
params

{'maxIter': [5], 'regParam': [0.2], 'elasticNetParam': [0.4]}

In [34]:
# Cross validation settings

# Multiple-splits time series cross validation
mul_cv = dict(cv_type="mulTs", kSplits=5)

# Blocked time series cross validation
blk_cv = dict(cv_type="blkTs", kSplits=10)

In [35]:
# Multiple-splits time series cross validation with the tuned parameters
mul_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    mul_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
mul_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,mulTs,more_rel_features,1,"(21182, 21181)","[5, 0.2, 0.4]",6356.530663,0.625849,4880.643357,23858920.0,-1.265295,-1.265937,0.48918
1,LinearRegression,mulTs,more_rel_features,2,"(42363, 21181)","[5, 0.2, 0.4]",1415.177725,0.150214,976.261954,1309760.0,0.622031,0.621923,0.631226
2,LinearRegression,mulTs,more_rel_features,3,"(63544, 21181)","[5, 0.2, 0.4]",2547.189992,0.232624,2258.207756,5536040.0,-1.556528,-1.557252,0.721277
3,LinearRegression,mulTs,more_rel_features,4,"(84725, 21181)","[5, 0.2, 0.4]",36031.852059,0.744761,33605.271682,1129568000.0,-6.330322,-6.332399,0.863575
4,LinearRegression,mulTs,more_rel_features,5,"(105906, 21181)","[5, 0.2, 0.4]",11008.947949,0.296967,8789.106567,95272600.0,-0.459408,-0.459822,0.99688


In [36]:
# Blocked time series cross validation with the tuned parameters.
# The trained models get their own name so they do not overwrite
# trained_models_mul_cv from the multiple-splits run.
blk_cv_res, trained_models_blk_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    blk_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
blk_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,blkTs,more_rel_features,1,"(10166, 2542)","[5, 0.2, 0.4]",43.621182,0.072071,42.610168,2098.182,-4.16771,-4.179941,0.697927
1,LinearRegression,blkTs,more_rel_features,2,"(10166, 2542)","[5, 0.2, 0.4]",504.500436,0.162464,377.406845,182471.1,0.058585,0.056357,0.784031
2,LinearRegression,blkTs,more_rel_features,3,"(10166, 2542)","[5, 0.2, 0.4]",6194.267106,0.597242,5964.114674,44827260.0,-14.445556,-14.482114,1.61152
3,LinearRegression,blkTs,more_rel_features,4,"(10166, 2542)","[5, 0.2, 0.4]",632.770316,0.060747,301.28888,1481414.0,-0.064618,-0.067138,0.725857
4,LinearRegression,blkTs,more_rel_features,5,"(10166, 2542)","[5, 0.2, 0.4]",1820.430846,0.15489,1724.662909,3062706.0,-3.934844,-3.946524,0.881561
5,LinearRegression,blkTs,more_rel_features,6,"(10166, 2542)","[5, 0.2, 0.4]",403.141027,0.044859,307.1291,1472352.0,0.825057,0.824643,0.751813
6,LinearRegression,blkTs,more_rel_features,7,"(10166, 2542)","[5, 0.2, 0.4]",9807.260929,0.262562,8323.548362,72825190.0,-0.975546,-0.980222,0.775474
7,LinearRegression,blkTs,more_rel_features,8,"(10166, 2542)","[5, 0.2, 0.4]",3465.073095,0.053601,2800.565306,20175660.0,0.589781,0.58881,0.625845
8,LinearRegression,blkTs,more_rel_features,9,"(10166, 2542)","[5, 0.2, 0.4]",16086.657208,0.616827,13327.257491,360859100.0,-11.158163,-11.18694,0.715921
9,LinearRegression,blkTs,more_rel_features,10,"(10166, 2542)","[5, 0.2, 0.4]",1408.008146,0.041944,1087.81093,3311204.0,0.649859,0.64903,1.522705


## Comparison table

In [37]:
# Columns that describe each model in the comparison table
model_info = ["Model", "Type", "Features", "Parameters"]

# Metric columns reported in the comparison table
evaluator_lst = ["RMSE", "MAPE", "MAE", "Variance", "R2", "Adjusted_R2", "Time"]

# Result tables to compare: simple models, tuning and both cross validations
comparison_lst = [
    simple_res_all,
    simple_res_more_rel,
    simple_res_less_rel,
    hyp_res,
    mul_cv_res,
    blk_cv_res,
]

In [38]:
# Build the comparison table by stacking the modelComparison output of every result table
pd.concat(
    [
        utilities.modelComparison(result_table, model_info, evaluator_lst)
        for result_table in comparison_lst
    ]
)

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",4288.585097,0.124091,3461.845228,150196500.0,0.900484,0.90046,6.229855
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",9026.635554,0.219481,6967.905283,36506310.0,0.559123,0.559019,2.424016
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",14182.539053,0.314191,10899.108275,65829930.0,-0.088363,-0.08862,2.362139
0,LinearRegression,autotuning,more_rel_features,"[5, 0.2, 0.4]",1942.436008,0.068752,1465.138704,7577615.0,0.587808,0.587613,0.345756
0,LinearRegression,mulTs,more_rel_features,"[5, 0.2, 0.4]",11471.939678,0.410083,10101.898263,251109200.0,-1.797904,-1.798697,0.740428
0,LinearRegression,blkTs,more_rel_features,"[5, 0.2, 0.4]",4036.573029,0.206721,3425.639467,50819950.0,-3.262316,-3.272404,0.909265


## Training the final model

In [39]:
# Train the final model with the tuned parameters in `params`, on the same
# feature set used for tuning and cross validation (choosen_features) so the
# final model always matches the evaluated configuration
model = utilities.train_final_model(df, choosen_features, params, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)

In [40]:
# Folder on Google Drive holding the saved models, and the path of this model inside it
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{MODEL_NAME}"

In [41]:
# Save the trained model to GDRIVE_MODEL_NAME_EXT, overwriting any model already stored at that path
model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)