# **Bitcoin price forecasting - Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# Mount point of Google Drive inside the Colab runtime
GDRIVE_DIR = "/content/drive"

# Dataset: base name, file name with extension, and full path to the parquet file
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
# NOTE(review): no separator is inserted between the base name and "train_valid";
# confirm this matches the file name stored on Drive.
GDRIVE_DATASET_NAME_EXT_ENG = f"/{GDRIVE_DATASET_NAME}train_valid.parquet"
GDRIVE_DATASET_NAME_ENG = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_ENG}"

# Features: one JSON file per feature set (names, file names, full paths)
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"

GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"

GDRIVE_ALL_FEATURES_NAME_EXT = f"/{GDRIVE_ALL_FEATURES_NAME}.json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = f"/{GDRIVE_MORE_REL_FEATURES_NAME}.json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = f"/{GDRIVE_LESS_REL_FEATURES_NAME}.json"

GDRIVE_ALL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_ALL_FEATURES_NAME_EXT}"
GDRIVE_MORE_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_MORE_REL_FEATURES_NAME_EXT}"
GDRIVE_LESS_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_LESS_REL_FEATURES_NAME_EXT}"

# Others
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
MODEL_NAME = "LinearRegression"
# When True, the dataset overview cell (show/count/printSchema) is executed
SLOW_OPERATION = True

In [2]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Mount Google Drive at GDRIVE_DIR (force_remount=True is passed to the mount call)
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [3]:
import warnings

# Install an "ignore" filter for FutureWarning so these messages do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=dd3696ba22a8457b1f2f1a7e1aa122c7fd3d134768e0aeccb8f7fd7d706b1433
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [8]:
# Folder on Drive that holds the project's helper modules (imports, utilities, parameters)
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

import importlib
import sys

# Make the helper modules importable; the membership check keeps sys.path free
# of duplicate entries when this cell is executed more than once.
if GDRIVE_UTILITIES_DIR not in sys.path:
    sys.path.append(GDRIVE_UTILITIES_DIR)

# NOTE(review): the wildcard import is presumably what brings names such as
# SparkConf, pyspark, SparkSession, json and pd into scope for the cells below;
# it is kept so those cells keep working. importlib is now imported explicitly
# above, so the reload calls no longer depend on it.
from imports import *
import utilities, parameters

# Reload so that edits made to the helper modules are picked up without restarting the kernel
importlib.reload(utilities)
importlib.reload(parameters)

<module 'parameters' from '/content/drive/MyDrive/BDC/project/utilities/parameters.py'>

## Create the pyspark session

In [9]:
# Spark configuration for a single-machine run (master "local[*]")
# NOTE(review): spark.driver.memory is set to 45G — confirm the runtime really has that much RAM.
# NOTE(review): spark.serializer is not set here — confirm the Kryo buffer setting is actually used.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the one that is already running.
# Calling the SparkContext constructor directly raises an error when a context
# already exists (e.g. when this cell is executed a second time); getOrCreate
# makes the cell safe to re-run.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Session bound to the context above
spark = SparkSession.builder.getOrCreate()

## Loading dataset

In [10]:
# Load the dataset stored at GDRIVE_DATASET_NAME_ENG into a PySpark DataFrame.
# Parquet files carry their own schema and have no delimiter or header row, so
# the CSV-oriented reader options (sep, inferSchema, header) are not needed.
df = spark.read.parquet(GDRIVE_DATASET_NAME_ENG)

In [11]:
def dataset_info(dataset):
  """Print a quick overview of a DataFrame.

  Shows the first 3 rows, then prints the (rows, columns) shape and finally
  the schema.

  Args:
    dataset: object exposing show(), count(), columns and printSchema()
      (a PySpark DataFrame in this notebook).

  Returns:
    None. Everything is written to standard output.
  """
  # Preview of the first rows
  dataset.show(3)

  # (number of rows, number of columns); count() is evaluated before columns
  shape = (dataset.count(), len(dataset.columns))
  print("Shape:", shape)

  # Column names and types
  dataset.printSchema()

In [12]:
# Print the dataset overview only when the SLOW_OPERATION flag is enabled
if SLOW_OPERATION:
  dataset_info(df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-------------------+---+-----------

## Loading features

In [13]:
# Label for the features column, passed to the utilities helpers
FEATURES_LABEL = "features"

# Target variable, passed to the utilities helpers
# NOTE(review): the dataset also contains a `next-market-price` column; confirm
# which of the two the utilities are expected to forecast.
TARGET_VAL = "market-price"

In [14]:
# Load the complete list of feature names from its JSON file
with open(GDRIVE_ALL_FEATURES, "r") as features_file:
    all_features = json.load(features_file)
print(all_features)

['total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [15]:
# Load the `more_rel_features` subset of feature names from its JSON file
with open(GDRIVE_MORE_REL_FEATURES, "r") as features_file:
    more_rel_features = json.load(features_file)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [16]:
# Load the `less_rel_features` subset of feature names from its JSON file
with open(GDRIVE_LESS_REL_FEATURES, "r") as features_file:
    less_rel_features = json.load(features_file)
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


## Evaluate simple model

In [17]:
# Get the default hyperparameters for MODEL_NAME; they are used by the simple-model evaluations below
# NOTE(review): the name `params` is reassigned in the tuning and cross-validation sections,
# so the cells depend on being executed in order.
params = parameters.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [100], 'regParam': [0.0], 'elasticNetParam': [0.0]}

In [18]:
# Validation performance of the default model using all the features
simple_res_all, simple_pred_all = utilities.evaluate_simple_model(
    df,
    all_features,
    params,
    GDRIVE_ALL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",6328.114969,0.191128,4912.677774,90498580.0,0.474349,0.474229,7.484958


In [19]:
# Display the predictions obtained with all the features (rendering is done by utilities.show_results)
utilities.show_results(simple_pred_all, MODEL_NAME, TARGET_VAL)

Output hidden; open in https://colab.research.google.com to view.

In [20]:
# Validation performance of the default model using the `more_rel_features` subset
simple_res_more_rel, simple_pred_more_rel = utilities.evaluate_simple_model(
    df,
    more_rel_features,
    params,
    GDRIVE_MORE_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",7352.156437,0.260539,5907.680003,53618450.0,0.290458,0.290296,4.432118


In [21]:
# Display the predictions obtained with the `more_rel_features` subset (rendering is done by utilities.show_results)
utilities.show_results(simple_pred_more_rel, MODEL_NAME, TARGET_VAL)

Output hidden; open in https://colab.research.google.com to view.

In [22]:
# Validation performance of the default model using the `less_rel_features` subset
simple_res_less_rel, simple_pred_less_rel = utilities.evaluate_simple_model(
    df,
    less_rel_features,
    params,
    GDRIVE_LESS_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",11109.834435,0.399404,9927.127534,94812550.0,-0.620182,-0.620552,2.252941


In [23]:
# Display the predictions obtained with the `less_rel_features` subset (rendering is done by utilities.show_results)
utilities.show_results(simple_pred_less_rel, MODEL_NAME, TARGET_VAL)

Output hidden; open in https://colab.research.google.com to view.

## Hyperparameter tuning

In [24]:
# Feature set used by the tuning and cross-validation cells, and the label reported for it in the result tables
# NOTE(review): `choosen_features` is misspelled ("chosen") but is referenced by later cells;
# rename it everywhere in one go if it is ever corrected.
choosen_features = more_rel_features
CHOSEN_FEATURES_LABEL = GDRIVE_MORE_REL_FEATURES_NAME

In [25]:
# Split proportion list handed to utilities.autoTuning
# (presumably the training fraction of each split — verify against the helper)
PORTION_LIST = [0.6, 0.7, 0.8, 0.9]

In [26]:
# Get the hyperparameter grid to explore during tuning (several candidate values per parameter)
params = parameters.get_model_params(MODEL_NAME)
params

{'maxIter': [5, 10, 50, 80, 100],
 'regParam': array([0. , 0.2, 0.4, 0.6, 0.8]),
 'elasticNetParam': array([0. , 0.2, 0.4, 0.6, 0.8])}

In [27]:
# Run the tuning helper on the chosen feature set over the parameter grid and split proportions
hyp_res = utilities.autoTuning(
    df,
    choosen_features,
    params,
    CHOSEN_FEATURES_LABEL,
    PORTION_LIST,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
hyp_res

Unnamed: 0,Model,Type,Features,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,autotuning,more_rel_features,0.9,"[5, 0.8, 0.8]",1604.378022,0.057455,1252.913471,21787310.0,0.878357,0.878302,0.325904


## Cross validation

In [28]:
# Get tuned params
# NOTE(review): these values come from the parameters module, not from hyp_res;
# confirm they are kept in sync with the tuning result above.
params = parameters.get_tuned_model_params(MODEL_NAME)
params

{'maxIter': [5], 'regParam': [0.8], 'elasticNetParam': [0.8]}

In [29]:
# Cross-validation settings handed to utilities.tsCrossValidation

# Multiple Splits Time Series Cross Validation
mul_cv = dict(cv_type="mulTs", kSplits=5)

# Blocked Time Series Cross Validation
blk_cv = dict(cv_type="blkTs", kSplits=10)

In [30]:
# Multiple-splits time series cross validation with the tuned parameters
mul_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    mul_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
mul_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,mulTs,more_rel_features,1,"(21904, 21904)","[5, 0.8, 0.8]",6424.133797,0.663416,5097.30339,26010650.0,-1.53182,-1.532513,1.362415
1,LinearRegression,mulTs,more_rel_features,2,"(43808, 21904)","[5, 0.8, 0.8]",1893.799915,0.226327,1492.591611,968495.5,0.429375,0.429219,0.903342
2,LinearRegression,mulTs,more_rel_features,3,"(65712, 21904)","[5, 0.8, 0.8]",5291.41071,0.340122,4023.949427,16562100.0,-0.880495,-0.881011,1.087136
3,LinearRegression,mulTs,more_rel_features,4,"(87616, 21904)","[5, 0.8, 0.8]",35532.696579,0.737162,34416.762023,1184971000.0,-13.799956,-13.804011,0.98431
4,LinearRegression,mulTs,more_rel_features,5,"(109520, 21904)","[5, 0.8, 0.8]",4727.433265,0.145719,3753.996555,37163470.0,0.472838,0.472694,1.165217


In [31]:
# Blocked time series cross validation with the tuned parameters.
# The trained models are stored under their own name: previously they were
# assigned to `trained_models_mul_cv`, which silently overwrote the models
# produced by the multiple-splits run.
blk_cv_res, trained_models_blk_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    blk_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
blk_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,blkTs,more_rel_features,1,"(10513, 2629)","[5, 0.8, 0.8]",41.387775,0.067585,40.151226,1777.808,-6.298358,-6.31506,0.575574
1,LinearRegression,blkTs,more_rel_features,2,"(10513, 2629)","[5, 0.8, 0.8]",789.702713,0.301556,732.235307,541903.3,-3.605597,-3.616136,1.168731
2,LinearRegression,blkTs,more_rel_features,3,"(10513, 2629)","[5, 0.8, 0.8]",2514.09359,0.267695,2441.441951,7716504.0,-3.283181,-3.292982,0.577143
3,LinearRegression,blkTs,more_rel_features,4,"(10513, 2629)","[5, 0.8, 0.8]",2707.762928,0.611596,2314.723272,11234790.0,-6.004338,-6.020367,0.594536
4,LinearRegression,blkTs,more_rel_features,5,"(10513, 2629)","[5, 0.8, 0.8]",620.674916,0.05031,523.611444,376373.7,0.501554,0.500413,0.616366
5,LinearRegression,blkTs,more_rel_features,6,"(10513, 2629)","[5, 0.8, 0.8]",211.034082,0.016889,159.246674,46367.66,0.392908,0.391519,0.611802
6,LinearRegression,blkTs,more_rel_features,7,"(10513, 2629)","[5, 0.8, 0.8]",19288.18157,0.354457,18582.097561,346189100.0,-11.228947,-11.256931,0.630353
7,LinearRegression,blkTs,more_rel_features,8,"(10513, 2629)","[5, 0.8, 0.8]",3031.181616,0.044736,2575.552551,31059920.0,0.770982,0.770458,0.612054
8,LinearRegression,blkTs,more_rel_features,9,"(10513, 2629)","[5, 0.8, 0.8]",2122.135356,0.080682,1749.012856,10777960.0,-0.447593,-0.450906,0.50062
9,LinearRegression,blkTs,more_rel_features,10,"(10513, 2629)","[5, 0.8, 0.8]",1295.713185,0.039223,1104.938939,2013716.0,0.20172,0.199894,0.498347


## Comparison table

In [32]:
# Columns that identify a model and the metrics shown in the Model Comparison Table
model_info = ["Model", "Type", "Features", "Parameters"]
evaluator_lst = ["RMSE", "MAPE", "MAE", "Variance", "R2", "Adjusted_R2", "Time"]

# Result tables to compare: simple models, tuning and both cross validations
comparison_lst = [
    simple_res_all,
    simple_res_more_rel,
    simple_res_less_rel,
    hyp_res,
    mul_cv_res,
    blk_cv_res,
]

In [33]:
# Build one comparison frame per result table, then stack them into the Comparison Table
comparison_frames = [
    utilities.modelComparison(cv_result, model_info, evaluator_lst)
    for cv_result in comparison_lst
]
pd.concat(comparison_frames)

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",6328.114969,0.191128,4912.677774,90498580.0,0.474349,0.474229,7.484958
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",7352.156437,0.260539,5907.680003,53618450.0,0.290458,0.290296,4.432118
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",11109.834435,0.399404,9927.127534,94812550.0,-0.620182,-0.620552,2.252941
0,LinearRegression,autotuning,more_rel_features,"[5, 0.8, 0.8]",1604.378022,0.057455,1252.913471,21787310.0,0.878357,0.878302,0.325904
0,LinearRegression,mulTs,more_rel_features,"[5, 0.8, 0.8]",10773.894853,0.422549,9756.920601,253135200.0,-3.062011,-3.063124,1.100484
0,LinearRegression,blkTs,more_rel_features,"[5, 0.8, 0.8]",3262.186773,0.183473,3022.301178,40995840.0,-2.900085,-2.90901,0.638553


## Training the final model

In [34]:
# Train the final model with the tuned parameters on the feature set selected
# for tuning and cross validation. `choosen_features` is used instead of a
# hard-coded `more_rel_features` so that the final model always follows the
# selection made in the "Hyperparameter tuning" section.
model = utilities.train_final_model(df, choosen_features, params, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)

In [35]:
# Folder for saved models on Drive and the path used for this model (named after MODEL_NAME)
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{MODEL_NAME}"

In [36]:
# Save the trained model to GDRIVE_MODEL_NAME_EXT; overwrite() is requested on the writer before saving
model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)