# **Bitcoin price forecasting - LinearRegression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# Mount point of Google Drive inside the Colab VM
GDRIVE_DIR = "/content/drive"

# Dataset: engineered train/validation parquet
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_EXT_ENG = f"/{GDRIVE_DATASET_NAME}_train_valid.parquet"
# Despite its name, this holds the full path of the parquet dataset
GDRIVE_DATASET_NAME_ENG = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_ENG}"

# Feature lists: one JSON file per feature set
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"
GDRIVE_ALL_FEATURES_NAME_EXT = f"/{GDRIVE_ALL_FEATURES_NAME}.json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = f"/{GDRIVE_MORE_REL_FEATURES_NAME}.json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = f"/{GDRIVE_LESS_REL_FEATURES_NAME}.json"
GDRIVE_ALL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_ALL_FEATURES_NAME_EXT}"
GDRIVE_MORE_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_MORE_REL_FEATURES_NAME_EXT}"
GDRIVE_LESS_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_LESS_REL_FEATURES_NAME_EXT}"

# Miscellaneous settings
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
MODEL_NAME = "LinearRegression"
# Gates the slow dataset preview cell
SLOW_OPERATION = True

In [2]:
# Colab helper that exposes Google Drive inside the notebook VM
from google.colab import drive

# Mount Google Drive at GDRIVE_DIR; force_remount=True remounts even if it is already mounted
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [3]:
import warnings

# Silence every FutureWarning from here on so library deprecation notices do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=6e8d8224154d5837fbf07dbe9febd5a55eb16a99a4e6d4fae7a30d6cb1c2b984
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [5]:
# Folder on Google Drive that holds the project's helper modules
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

# importlib is imported explicitly instead of relying on `from imports import *` to leak it
import importlib
import sys

# Make the helper modules importable; the guard keeps sys.path free of duplicates when the cell is re-run
if GDRIVE_UTILITIES_DIR not in sys.path:
    sys.path.append(GDRIVE_UTILITIES_DIR)

# NOTE(review): the star import hides which names the notebook depends on (e.g. SparkConf, SparkSession, json, pd);
# prefer explicit imports once the contents of imports.py are confirmed
from imports import *
import utilities, parameters

# Reload so edits made to the helper files on Drive are picked up without restarting the kernel
importlib.reload(utilities)
importlib.reload(parameters)

<module 'parameters' from '/content/drive/MyDrive/BDC/project/utilities/parameters.py'>

## Create the pyspark session

In [6]:
# Spark configuration: local mode on every available core, with explicit memory limits
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Build (or reuse) the session from this conf and take the context from it.
# Constructing pyspark.SparkContext(conf=conf) directly raises
# "Cannot run multiple SparkContexts at once" when the cell is executed a second time;
# getOrCreate() makes the cell safe to re-run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

## Loading dataset

In [7]:
# Load the engineered train/validation dataset from its parquet file.
# Parquet stores its own schema, so the CSV-only reader options
# (sep, inferSchema, header) that were passed before had no effect and are dropped.
df = spark.read.parquet(GDRIVE_DATASET_NAME_ENG)

In [8]:
def dataset_info(dataset):
  """Print a quick overview of `dataset`.

  Shows the first three rows, prints the (rows, columns) shape and then
  prints the schema. `dataset` must provide show(), count(), columns and
  printSchema(). Returns None.
  """
  # Preview of the first three rows
  dataset.show(3)

  # Shape as (number of rows, number of columns); count() is evaluated first
  shape = (dataset.count(), len(dataset.columns))
  print("Shape:", shape)

  # Column names and types
  dataset.printSchema()

In [9]:
# Preview the loaded dataset only when SLOW_OPERATION is enabled
# (dataset_info counts every row, which is why it is gated)
if SLOW_OPERATION:
  dataset_info(df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|
+-------------------+---+------------------+--------------+---------------

## [TO DELETE] ADDING NEXT-MARKET-PRICE ❗

In [11]:
# NOTE(review): this whole cell is disabled and marked for deletion, yet TARGET_VAL below is 'next-market-price'.
# The stored output of the loading cell does not list that column and the execution counter skips In [10],
# so confirm the parquet file now contains 'next-market-price' before removing this cell and its stale output.
# # Add the column 'next-market-price' (the 'market-price' of the following row, ordered by id), which will be the target feature on which to make predictions
# df = df.withColumn("next-market-price", F.lag("market-price", offset=-1) \
#         .over(Window.orderBy("id"))) \
#         .dropna()

# if SLOW_OPERATION:
#   dataset_info(df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-------------------+---+-----------

## Loading features

In [12]:
# Label for the features column, passed to every utilities call below
FEATURES_LABEL = "features"

# Target column the models are asked to predict
TARGET_VAL = "next-market-price"

In [13]:
# Load the complete feature list (every candidate input column) from its JSON file.
# The encoding is explicit so the read does not depend on the platform's default.
with open(GDRIVE_ALL_FEATURES, "r", encoding="utf-8") as f:
    all_features = json.load(f)
print(all_features)

['market-price', 'total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [14]:
# Load the "more relevant" feature subset (chosen via the correlation matrix, per the original note).
# The encoding is explicit so the read does not depend on the platform's default.
with open(GDRIVE_MORE_REL_FEATURES, "r", encoding="utf-8") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [15]:
# Load the "less relevant" feature subset (the features left out of the more relevant list).
# The encoding is explicit so the read does not depend on the platform's default.
with open(GDRIVE_LESS_REL_FEATURES, "r", encoding="utf-8") as f:
    less_rel_features = json.load(f)
print(less_rel_features)

['sma-50-days', 'sma-20-days', 'difficulty', 'hash-rate', 'n-unique-addresses', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


## Simple model

In [16]:
# Fetch the default hyperparameters for MODEL_NAME; each value comes wrapped in a one-element list
params = parameters.get_defaults_model_params(MODEL_NAME)
# Display them as the cell output
params

{'maxIter': [100], 'regParam': [0.0], 'elasticNetParam': [0.0]}

In [17]:
# Validation performance of the default model on the full feature set
simple_res_all, simple_pred_all = utilities.evaluate_simple_model(
    df,
    all_features,
    params,
    GDRIVE_ALL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
    False,  # normalization off (assumed from the section titles; confirm in utilities)
)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",32.686623,0.00133,27.448278,9180860.0,0.999883,0.999883,9.082656


In [18]:
# Show the validation predictions of the all-features model (presumably predicted vs. actual values; confirm in utilities.show_results)
utilities.show_results(simple_pred_all, MODEL_NAME, TARGET_VAL)

In [19]:
# Validation performance of the default model on the more relevant features
simple_res_more_rel, simple_pred_more_rel = utilities.evaluate_simple_model(
    df,
    more_rel_features,
    params,
    GDRIVE_MORE_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
    False,  # normalization off (assumed from the section titles; confirm in utilities)
)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",12.593462,0.000425,9.053998,9167186.0,0.999983,0.999983,3.205917


In [20]:
# Show the validation predictions of the more-relevant-features model (presumably predicted vs. actual values; confirm in utilities.show_results)
utilities.show_results(simple_pred_more_rel, MODEL_NAME, TARGET_VAL)

In [21]:
# Validation performance of the default model on the less relevant features
simple_res_less_rel, simple_pred_less_rel = utilities.evaluate_simple_model(
    df,
    less_rel_features,
    params,
    GDRIVE_LESS_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
    False,  # normalization off (assumed from the section titles; confirm in utilities)
)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",52047.675462,2.37995,48443.576012,2732157000.0,-294.94347,-295.059956,3.628412


In [22]:
# Show the validation predictions of the less-relevant-features model (presumably predicted vs. actual values; confirm in utilities.show_results)
utilities.show_results(simple_pred_less_rel, MODEL_NAME, TARGET_VAL)

## Simple model (with data normalization)

In [23]:
# Validation performance on the full feature set, this time with the last flag set to True
simple_norm_res_all, simple_norm_pred_all = utilities.evaluate_simple_model(
    df,
    all_features,
    params,
    GDRIVE_ALL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
    True,  # normalization on (assumed from the section title; confirm in utilities)
)
simple_norm_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple_norm,all_features,"[100, 0.0, 0.0]",5355.713437,0.194071,4326.128557,25039410.0,-2.13358,-2.134813,5.270741


In [24]:
# Show the validation predictions of the normalized all-features model (presumably predicted vs. actual values; confirm in utilities.show_results)
utilities.show_results(simple_norm_pred_all, MODEL_NAME, TARGET_VAL)

In [25]:
# Validation performance on the more relevant features, with the last flag set to True
simple_norm_res_more_rel, simple_norm_pred_more_rel = utilities.evaluate_simple_model(
    df,
    more_rel_features,
    params,
    GDRIVE_MORE_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
    True,  # normalization on (assumed from the section title; confirm in utilities)
)
simple_norm_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple_norm,more_rel_features,"[100, 0.0, 0.0]",8394.715213,0.37133,7443.668618,73116500.0,-6.698711,-6.701741,2.917202


In [26]:
# Show the validation predictions of the normalized more-relevant-features model (presumably predicted vs. actual values; confirm in utilities.show_results)
utilities.show_results(simple_norm_pred_more_rel, MODEL_NAME, TARGET_VAL)

In [27]:
# Validation performance on the less relevant features, with the last flag set to True
simple_norm_res_less_rel, simple_norm_pred_less_rel = utilities.evaluate_simple_model(
    df,
    less_rel_features,
    params,
    GDRIVE_LESS_REL_FEATURES_NAME,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
    True,  # normalization on (assumed from the section title; confirm in utilities)
)
simple_norm_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple_norm,less_rel_features,"[100, 0.0, 0.0]",10404.844191,0.453678,9434.680694,91508770.0,-10.827072,-10.831727,2.990925


In [28]:
# Show the validation predictions of the normalized less-relevant-features model (presumably predicted vs. actual values; confirm in utilities.show_results)
utilities.show_results(simple_norm_pred_less_rel, MODEL_NAME, TARGET_VAL)

## Hyperparameter tuning

In [29]:
# Feature set used from here on (tuning, cross validation and the final model): the full feature list.
# NOTE(review): "choosen_features" is a misspelling of "chosen_features"; the name is kept because later cells reference it.
choosen_features = all_features
# Label under which this feature set is passed to the utilities calls below
CHOSEN_FEATURES_LABEL = GDRIVE_ALL_FEATURES_NAME

In [30]:
# Candidate split proportions handed to utilities.autoTuning
# (presumably the training fraction of each train/validation split; confirm in utilities)
PORTION_LIST = [0.6, 0.7, 0.8, 0.9]

In [31]:
# Get the hyperparameter grid to search for MODEL_NAME (several candidate values per parameter).
# This rebinds `params`, which held the default parameters until here.
params = parameters.get_model_params(MODEL_NAME)
# Display the grid as the cell output
params

{'maxIter': [5, 50, 100, 120, 150],
 'regParam': [0.0, 0.05, 0.1],
 'elasticNetParam': [0.0, 0.5, 1.0]}

In [32]:
# Search the parameter grid over every split proportion; returns the result table and the selected parameters
hyp_res, hyp_params = utilities.autoTuning(
    df,
    choosen_features,
    params,
    CHOSEN_FEATURES_LABEL,
    PORTION_LIST,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
hyp_res

Unnamed: 0,Model,Type,Features,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,autotuning,all_features,0.8,"[5, 0.1, 0.0]",4207.623486,0.123233,3439.298593,143585400.0,0.904206,0.904187,0.431761


## Cross validation

In [33]:
# Continue with the parameters selected by the tuning step (rebinds `params` once more)
params = hyp_params
# Display them as the cell output
params

{'maxIter': [5], 'regParam': [0.1], 'elasticNetParam': [0.0]}

In [34]:
## Cross-validation configurations: one dict per strategy, holding its type and number of splits

# Multiple-splits time series cross validation, 5 splits
mul_cv = dict(cv_type='mulTs', kSplits=5)

# Blocked time series cross validation, 10 splits
blk_cv = dict(cv_type='blkTs', kSplits=10)

In [35]:
# Multiple-splits time series cross validation with the tuned parameters
mul_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(
    df,
    choosen_features,
    params,
    mul_cv,
    CHOSEN_FEATURES_LABEL,
    MODEL_NAME,
    FEATURES_LABEL,
    TARGET_VAL,
)
mul_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,mulTs,all_features,1,"(21181, 21181)","[5, 0.1, 0.0]",5462.486702,0.492958,4092.241771,17317640.0,-0.672882,-0.673277,1.636434
1,LinearRegression,mulTs,all_features,2,"(42362, 21181)","[5, 0.1, 0.0]",3848.687768,0.652545,2947.640754,7766767.0,-1.795504,-1.796164,1.885046
2,LinearRegression,mulTs,all_features,3,"(63543, 21181)","[5, 0.1, 0.0]",2647.768457,0.223275,2211.12694,5366351.0,-1.762408,-1.76306,1.747969
3,LinearRegression,mulTs,all_features,4,"(84724, 21181)","[5, 0.1, 0.0]",26404.377918,0.519462,23937.270142,588157200.0,-2.936423,-2.937352,1.849222
4,LinearRegression,mulTs,all_features,5,"(105905, 21181)","[5, 0.1, 0.0]",4362.706764,0.147741,3551.924275,69424720.0,0.770809,0.770755,3.026283


In [36]:
blk_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(df, choosen_features, params, blk_cv, CHOSEN_FEATURES_LABEL, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
blk_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,blkTs,all_features,1,"(10166, 2542)","[5, 0.1, 0.0]",20.44137,0.031616,18.488656,620.8278,-0.139872,-0.142119,1.266311
1,LinearRegression,blkTs,all_features,2,"(10166, 2542)","[5, 0.1, 0.0]",405.100641,0.133628,307.77717,165658.6,0.392926,0.391729,1.221522
2,LinearRegression,blkTs,all_features,3,"(10166, 2542)","[5, 0.1, 0.0]",628.772096,0.053473,509.75067,1233896.0,0.840425,0.84011,1.252367
3,LinearRegression,blkTs,all_features,4,"(10166, 2542)","[5, 0.1, 0.0]",130.115303,0.016317,87.63434,265659.1,0.95523,0.955142,2.154244
4,LinearRegression,blkTs,all_features,5,"(10166, 2542)","[5, 0.1, 0.0]",991.17648,0.077915,862.54012,1302898.0,-0.462579,-0.465463,1.381215
5,LinearRegression,blkTs,all_features,6,"(10166, 2542)","[5, 0.1, 0.0]",539.644405,0.063293,470.97345,716055.6,0.686835,0.686217,1.198828
6,LinearRegression,blkTs,all_features,7,"(10166, 2542)","[5, 0.1, 0.0]",1960.336828,0.038312,1326.262865,34101850.0,0.921012,0.920856,1.183768
7,LinearRegression,blkTs,all_features,8,"(10166, 2542)","[5, 0.1, 0.0]",1664.146968,0.02765,1400.115603,25889750.0,0.905581,0.905395,1.26131
8,LinearRegression,blkTs,all_features,9,"(10166, 2542)","[5, 0.1, 0.0]",1264.710869,0.04931,1092.851944,28272320.0,0.924852,0.924704,1.787592
9,LinearRegression,blkTs,all_features,10,"(10166, 2542)","[5, 0.1, 0.0]",723.24327,0.018165,480.354191,3380110.0,0.907712,0.90753,1.256508


## Comparison table

In [37]:
# Columns that identify each experiment in the model comparison table
model_info = [
    'Model',
    'Type',
    'Features',
    'Parameters',
]

# Metric columns reported for each experiment
evaluator_lst = [
    'RMSE',
    'MAPE',
    'MAE',
    'Variance',
    'R2',
    'Adjusted_R2',
    'Time',
]

# Result tables to compare: simple models, normalized simple models, tuning and both cross validations
comparison_lst = [
    simple_res_all,
    simple_res_more_rel,
    simple_res_less_rel,
    simple_norm_res_all,
    simple_norm_res_more_rel,
    simple_norm_res_less_rel,
    hyp_res,
    mul_cv_res,
    blk_cv_res,
]

In [38]:
# Build the comparison table: one summary per result table, stacked in the order of comparison_lst
# (judging by the stored output, modelComparison collapses each cross-validation table to a single averaged row)
pd.concat(
    [
        utilities.modelComparison(cv_result, model_info, evaluator_lst)
        for cv_result in comparison_lst
    ]
)

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",32.686623,0.00133,27.448278,9180860.0,0.999883,0.999883,9.082656
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",12.593462,0.000425,9.053998,9167186.0,0.999983,0.999983,3.205917
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",52047.675462,2.37995,48443.576012,2732157000.0,-294.94347,-295.059956,3.628412
0,LinearRegression,simple_norm,all_features,"[100, 0.0, 0.0]",5355.713437,0.194071,4326.128557,25039410.0,-2.13358,-2.134813,5.270741
0,LinearRegression,simple_norm,more_rel_features,"[100, 0.0, 0.0]",8394.715213,0.37133,7443.668618,73116500.0,-6.698711,-6.701741,2.917202
0,LinearRegression,simple_norm,less_rel_features,"[100, 0.0, 0.0]",10404.844191,0.453678,9434.680694,91508770.0,-10.827072,-10.831727,2.990925
0,LinearRegression,autotuning,all_features,"[5, 0.1, 0.0]",4207.623486,0.123233,3439.298593,143585400.0,0.904206,0.904187,0.431761
0,LinearRegression,mulTs,all_features,"[5, 0.1, 0.0]",8545.205522,0.407196,7348.040777,137606500.0,-1.279281,-1.27982,2.028991
0,LinearRegression,blkTs,all_features,"[5, 0.1, 0.0]",832.768823,0.050968,655.674901,9532882.0,0.593212,0.59241,1.396367


## Training the final model

In [39]:
# Train the final model on df with the chosen feature set and the current `params`
# (the tuned values assigned in the cross-validation section)
model = utilities.train_final_model(df, choosen_features, params, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)

In [40]:
# Folder on Google Drive where trained models are stored
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"
# Destination of this model: <models dir>/<MODEL_NAME>
GDRIVE_MODEL_NAME_EXT = f"{GDRIVE_MODELS_DIR}/{MODEL_NAME}"

In [41]:
# Save the trained model to GDRIVE_MODEL_NAME_EXT, overwriting any model already stored there
model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)