# **Bitcoin price forecasting - Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [1]:
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
SLOW_OPERATION = False

In [2]:
#Install some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [3]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=6c17df7b3897ba3e11c2c257918c00a3465c497e2c1050996f20264d48e83728
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [4]:
# Define GDrive paths
GDRIVE_DIR = "/content/drive"

GDRIVE_MODEL_NAME = "linear_regression"
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = GDRIVE_MODELS_DIR + "/" + GDRIVE_MODEL_NAME

GDRIVE_DATASET_RAW_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"

GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1h"
GDRIVE_DATASET_NAME_TRAIN = GDRIVE_DATASET_NAME + "_train"
GDRIVE_DATASET_NAME_TEST = GDRIVE_DATASET_NAME + "_test"

GDRIVE_DATASET_NAME_EXT = "/" + GDRIVE_DATASET_NAME + ".parquet"
GDRIVE_DATASET_NAME_EXT_TRAIN  = "/" + GDRIVE_DATASET_NAME_TRAIN + ".parquet"
GDRIVE_DATASET_NAME_EXT_TEST = "/" + GDRIVE_DATASET_NAME_TEST + ".parquet"

GDRIVE_DATASET = GDRIVE_DATASET_RAW_DIR + GDRIVE_DATASET_NAME_EXT
GDRIVE_DATASET_TRAIN = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TRAIN
GDRIVE_DATASET_TEST = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TEST

In [5]:
# Point Colaboratory to our Google Drive
from google.colab import drive

drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# Training the model ❗

In [6]:
# Load datasets into pyspark dataframe objects
train_df = spark.read.load(GDRIVE_DATASET_TRAIN,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

test_df = spark.read.load(GDRIVE_DATASET_TEST,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [7]:
# Function that create a simple lr model (with no hyperparameter tuning)
def simple_linear_regression_model(train, featureCol, labelCol):
  lr = LinearRegression(featuresCol=featureCol, labelCol=labelCol)
  lr_model = lr.fit(train)
  return lr_model

In [8]:
# Define the evaluation metrics
# Notice that r2_adj metric is included when calculating r2
metrics = ['mse', 'rmse', 'mae', 'r2']

In [9]:
# Function to compute the r2 adjusted metric
# r2 is the r2 metric, n is the number of observations, k is the number of features
def compute_r2adj(r2, n, k):
  return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

In [10]:
# Function to evaluate a model
def evaluate_models(predictions, modelName, typeName, label, prediction, metrics):
  r2 = None
  for metric in metrics:
    evaluator = RegressionEvaluator(labelCol=label, predictionCol=prediction, metricName=metric)
    evaluation = evaluator.evaluate(predictions)
    print(metric.upper()+' for '+modelName+' on '+typeName+' set: '+str(evaluation))
    if metric == 'r2':
      print('R2_adj'+' for '+modelName+' on '+typeName+' set: '+str(compute_r2adj(evaluation, predictions.count(), len(predictions.columns))))

In [11]:
# Function that create simple models (without hyperparameter tuning) and evaluate them
def test_best_features(train_data, test_data, features, featureCol, labelCol, metrics = ['rmse', 'r2']):
  # Train the models
  lr = simple_linear_regression_model(train_data, featureCol, labelCol)

  # Training set evaluation
  lr_training = lr.transform(train_data)
  evaluate_models(lr_training, 'linear regression', 'training', labelCol, 'prediction', metrics)

  # Testing set evaluation
  lr_predictions = lr.transform(test_data)
  evaluate_models(lr_predictions, 'linear regression', 'testing', labelCol, 'prediction', metrics)

  return lr_training, lr_predictions

In [12]:
# Retrieve all / cor_matrix / gb features
GDRIVE_COR_MATRIX_FEATURES_NAME = "cor_matrix_features"
GDRIVE_GB_FEATURES_NAME = "gb_features"

GDRIVE_COR_MATRIX_FEATURES_NAME_EXT = "/" + GDRIVE_COR_MATRIX_FEATURES_NAME + ".json"
GDRIVE_GB_FEATURES_NAME_EXT = "/" + GDRIVE_GB_FEATURES_NAME + ".json"

GDRIVE_COR_MATRIX_FEATURES = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_COR_MATRIX_FEATURES_NAME_EXT
GDRIVE_GB_FEATURES = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_GB_FEATURES_NAME_EXT

cor_matrix_features = spark.read.json(GDRIVE_COR_MATRIX_FEATURES).columns
gb_features = spark.read.json(GDRIVE_GB_FEATURES).columns

all_features = train_df.columns[1:-2]

# Set the depended variable
dep_var = 'market-price'

In [13]:
# Return the dataset with the selected features
def select_features(dataset, features):
  vectorAssembler = VectorAssembler(
    inputCols = features,
    outputCol = 'features')

  dataset = vectorAssembler.transform(dataset)
  dataset = dataset.select(['timestamp','index', 'features', dep_var])
  return dataset

In [14]:
def show_results(train, test, training, predictions):
  trace1 = go.Scatter(
      x = train['timestamp'],
      y = train['market-price'].astype(float),
      mode = 'lines',
      name = 'Train'
  )

  trace2 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = 'Test'
  )

  trace3 = go.Scatter(
      x = training['timestamp'],
      y = training['prediction'].astype(float),
      mode = 'lines',
      name = 'Training'
  )

  trace4 = go.Scatter(
      x = predictions['timestamp'],
      y = predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Prediction'
  )

  layout = dict(
      title='Train, test and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3, trace4]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, test and prediction set with Rangeslider")

In [15]:
# Test performances with the selected features
lr_training, lr_predictions = test_best_features(select_features(train_df, all_features), select_features(test_df, all_features), all_features, 'features', dep_var)

RMSE for linear regression on training set: 221.34073443931288
R2 for linear regression on training set: 0.9990460621540038
R2_adj for linear regression on training set: 0.9990460029605611
RMSE for linear regression on testing set: 743.4447173983854
R2 for linear regression on testing set: 0.9970413223120623
R2_adj for linear regression on testing set: 0.9970405877478615


In [16]:
show_results(train_df.toPandas(), test_df.toPandas(), lr_training.toPandas(), lr_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [17]:
# Test features with the relevant features
lr_training, lr_predictions = test_best_features(select_features(train_df, cor_matrix_features), select_features(test_df, cor_matrix_features), cor_matrix_features, 'features', dep_var)

RMSE for linear regression on training set: 234.50719604872188
R2 for linear regression on training set: 0.9989291966193173
R2_adj for linear regression on training set: 0.9989291301741722
RMSE for linear regression on testing set: 796.6949594536837
R2 for linear regression on testing set: 0.9966023046503147
R2_adj for linear regression on testing set: 0.9966014610892269


In [18]:
show_results(train_df.toPandas(), test_df.toPandas(), lr_training.toPandas(), lr_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [19]:
# Test performances with all features
lr_training, lr_predictions = test_best_features(select_features(train_df, gb_features), select_features(test_df, gb_features), gb_features, 'features', dep_var)

RMSE for linear regression on training set: 238.02121378360758
R2 for linear regression on training set: 0.9988968648657841
R2_adj for linear regression on training set: 0.9988967964143994
RMSE for linear regression on testing set: 824.197238268392
R2 for linear regression on testing set: 0.996363675711917
R2_adj for linear regression on testing set: 0.9963627729053506


In [20]:
show_results(train_df.toPandas(), test_df.toPandas(), lr_training.toPandas(), lr_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

# Hyperparameter tuning ❗

In [21]:
# Hyperparameter tuning for the model
def linear_regression_cross_val(dataset, k_fold=5):
    lr = LinearRegression(featuresCol='features', labelCol=dep_var)
    pipeline = Pipeline(stages=[lr])

    param_grid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.0, 0.05, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=RegressionEvaluator(labelCol=dep_var),
                               numFolds=k_fold,
                               collectSubModels=True
                               )

    # Run cross-validation, and choose the best set of parameters.
    cv_model = cross_val.fit(dataset)

    return cv_model

In [22]:
# Execute cross validation with linear regression
cv_lr_models = linear_regression_cross_val(select_features(train_df, cor_matrix_features))

In [23]:
# Summarizes all the models trained during cross validation
def summarize_lr_models(cv_models):
    for k, models in enumerate(cv_models):
        print("*************** Fold #{:d} ***************\n".format(k+1))
        for i, m in enumerate(models):
            print("--- Model #{:d} out of {:d} ---".format(i+1, len(models)))
            print("\tParameters: lambda=[{:.3f}]; alpha=[{:.3f}] ".format(m.stages[-1]._java_obj.getRegParam(), m.stages[-1]._java_obj.getElasticNetParam()))
            print("\tModel summary: {}\n".format(m.stages[-1]))
        print("***************************************\n")

In [24]:
# Call the function above|
summarize_lr_models(cv_lr_models.subModels)

*************** Fold #1 ***************

--- Model #1 out of 9 ---
	Parameters: lambda=[0.000]; alpha=[0.000] 
	Model summary: LinearRegressionModel: uid=LinearRegression_540bd3fe08d1, numFeatures=7

--- Model #2 out of 9 ---
	Parameters: lambda=[0.000]; alpha=[0.500] 
	Model summary: LinearRegressionModel: uid=LinearRegression_540bd3fe08d1, numFeatures=7

--- Model #3 out of 9 ---
	Parameters: lambda=[0.000]; alpha=[1.000] 
	Model summary: LinearRegressionModel: uid=LinearRegression_540bd3fe08d1, numFeatures=7

--- Model #4 out of 9 ---
	Parameters: lambda=[0.050]; alpha=[0.000] 
	Model summary: LinearRegressionModel: uid=LinearRegression_540bd3fe08d1, numFeatures=7

--- Model #5 out of 9 ---
	Parameters: lambda=[0.050]; alpha=[0.500] 
	Model summary: LinearRegressionModel: uid=LinearRegression_540bd3fe08d1, numFeatures=7

--- Model #6 out of 9 ---
	Parameters: lambda=[0.050]; alpha=[1.000] 
	Model summary: LinearRegressionModel: uid=LinearRegression_540bd3fe08d1, numFeatures=7

--- M

In [25]:
# Summarize average error
for i, avg_rmse in enumerate(cv_lr_models.avgMetrics):
    print("Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:3f}".format(i+1, avg_rmse))

Avg. RMSE computed across k-fold cross validation for model setting #1: 234.507563
Avg. RMSE computed across k-fold cross validation for model setting #2: 234.507563
Avg. RMSE computed across k-fold cross validation for model setting #3: 234.507563
Avg. RMSE computed across k-fold cross validation for model setting #4: 234.509342
Avg. RMSE computed across k-fold cross validation for model setting #5: 235.200427
Avg. RMSE computed across k-fold cross validation for model setting #6: 235.025420
Avg. RMSE computed across k-fold cross validation for model setting #7: 234.514514
Avg. RMSE computed across k-fold cross validation for model setting #8: 235.452818
Avg. RMSE computed across k-fold cross validation for model setting #9: 235.078913


In [26]:
# Get the best model to extract best hyperparameters
best_lr_model_params = cv_lr_models.bestModel.stages[-1].extractParamMap()
print('Best parameters for linear regression:')
for param, value in best_lr_model_params.items():
    print(param.name, "=", value)

Best parameters for linear regression:
aggregationDepth = 2
elasticNetParam = 0.0
epsilon = 1.35
featuresCol = features
fitIntercept = True
labelCol = market-price
loss = squaredError
maxBlockSizeInMB = 0.0
maxIter = 100
predictionCol = prediction
regParam = 0.0
solver = auto
standardization = True
tol = 1e-06


In [27]:
# Fit a model with best parameters
lr = LinearRegression(featuresCol='features', labelCol=dep_var, elasticNetParam=0, regParam=0, maxIter=100)
lr_model = lr.fit(select_features(test_df, cor_matrix_features))

In [28]:
# Training set evaluation
lr_training = lr_model.transform(select_features(train_df, cor_matrix_features))
evaluate_models(lr_training, 'linear regression', 'training', dep_var, 'prediction', metrics)

# Testing set evaluation
lr_predictions = lr_model.transform(select_features(test_df, cor_matrix_features))
evaluate_models(lr_predictions, 'linear regression', 'testing', dep_var, 'prediction', metrics)

MSE for linear regression on training set: 9872941.058456417
RMSE for linear regression on training set: 3142.12365422757
MAE for linear regression on training set: 2931.3043529299257
R2 for linear regression on training set: 0.8077599237559235
R2_adj for linear regression on training set: 0.8077479949368759
MSE for linear regression on testing set: 395805.6134634681
RMSE for linear regression on testing set: 629.1308397014632
MAE for linear regression on testing set: 390.07543884857813
R2 for linear regression on testing set: 0.9978812376545031
R2_adj for linear regression on testing set: 0.9978807116198575


In [29]:
show_results(train_df.toPandas(), test_df.toPandas(), lr_training.toPandas(), lr_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Save the LR best model
# rf_model.write().overwrite().save(GDRIVE_DATA_DIR + '/models/price/RandomForest')