# **Bitcoin price prediction - Final scores**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: this notebook displays the final scores and makes predictions on the test set using the models trained on the whole train / validation set.

# Global constants, dependencies, libraries and tools

In [1]:
# Main constants
# LOCAL_RUNNING selects which root folder the rest of the notebook builds its paths from:
# the local repository folder when True, the Google Drive mount point otherwise.
LOCAL_RUNNING = False
if LOCAL_RUNNING:
    ROOT_DIR = "D:/Documents/Repository/BDC/project"
else:
    ROOT_DIR = "/content/drive"

In [2]:
if not LOCAL_RUNNING:
    # Point Colaboratory to Google Drive
    from google.colab import drive

    # Define GDrive paths
    drive.mount(ROOT_DIR, force_remount=True)

    # Install Spark and related dependencies
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

    # Install "kaleido" engine package to export image
    !pip install -U kaleido

Mounted at /content/drive
Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=de19770032c4229d79531b5734b1c8f3f077a0266bbedd554b26911dadd74319
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fo

## Import my utilities

In [3]:
# Set main dir
MAIN_DIR = ROOT_DIR + "" if LOCAL_RUNNING else ROOT_DIR + "/MyDrive/BDC/project"

# Utilities dir
UTILITIES_DIR = MAIN_DIR + "/utilities"

# Import my utilities
# importlib is imported explicitly so the reload below does not depend on the
# wildcard imports happening to expose it.
import importlib
import sys

# Register the utilities folder only once, so re-running this cell does not
# keep appending the same entry to sys.path.
if UTILITIES_DIR not in sys.path:
    sys.path.append(UTILITIES_DIR)

# The wildcard imports provide the names used unqualified in the rest of the notebook
from imports import *
from config import *
import final_scores_utilities

# Reload the module so that edits made to final_scores_utilities.py are picked up
# without restarting the kernel (kept as last expression: the module is displayed)
importlib.reload(final_scores_utilities)

<module 'final_scores_utilities' from '/content/drive/MyDrive/BDC/project/utilities/final_scores_utilities.py'>

In [4]:
# Set main dir
if LOCAL_RUNNING:
    MAIN_DIR = ROOT_DIR + ""
else:
    MAIN_DIR = ROOT_DIR + "/MyDrive/BDC/project"

###################
# --- DATASET --- #
###################

# Folder holding the generated datasets
DATASET_OUTPUT_DIR = f"{MAIN_DIR}/datasets/output"

# Parquet file of the test set
DATASET_TEST = f"{DATASET_OUTPUT_DIR}/{DATASET_TEST_NAME}.parquet"

####################
# --- FEATURES --- #
####################

# Folder holding the feature lists
FEATURES_DIR = f"{MAIN_DIR}/features"

# JSON files of the feature lists (one per label)
FEATURES_CORRELATION = f"{FEATURES_DIR}/{FEATURES_CORRELATION_LABEL}.json"
BASE_FEATURES = f"{FEATURES_DIR}/{BASE_FEATURES_LABEL}.json"
BASE_AND_MOST_CORR_FEATURES = f"{FEATURES_DIR}/{BASE_AND_MOST_CORR_FEATURES_LABEL}.json"
BASE_AND_LEAST_CORR_FEATURES = f"{FEATURES_DIR}/{BASE_AND_LEAST_CORR_FEATURES_LABEL}.json"

##################
# --- MODELS --- #
##################

# Folder holding the saved models
MODELS_DIR = f"{MAIN_DIR}/models"

# Location of each saved model
LR_MODEL = f"{MODELS_DIR}/{LR_MODEL_NAME}"
GLR_MODEL = f"{MODELS_DIR}/{GLR_MODEL_NAME}"
RF_MODEL = f"{MODELS_DIR}/{RF_MODEL_NAME}"
GBTR_MODEL = f"{MODELS_DIR}/{GBTR_MODEL_NAME}"

###################
# --- RESULTS --- #
###################

# Results folders (general and final)
RESULTS_DIR = f"{MAIN_DIR}/results"
RESULTS_FINAL_DIR = f"{RESULTS_DIR}/final"

In [5]:
# Suppression of warnings for better reading: FutureWarning messages are ignored
import warnings
warnings.simplefilter('ignore', FutureWarning)

if LOCAL_RUNNING:
    # To correctly export the notebook in html format
    pio.renderers.default = 'notebook'

# Create the pyspark session

In [6]:
# Build the Spark configuration (local master using all the available cores)
# NOTE(review): spark.driver.maxResultSize ('109G') is larger than the configured
# driver memory ('12G') - confirm that this value is intended.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '109G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
)

# Create the session from the configuration, or reuse the active one.
# Instantiating SparkContext directly raises an error when a context already
# exists, which made this cell fail when it was run a second time.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Context backing the session
sc = spark.sparkContext

# Loading dataset

In [7]:
# Load the test set into a pyspark DataFrame.
# Parquet files carry their own schema, so the CSV-only reader options
# (sep, inferSchema, header) are not needed.
df = spark.read.parquet(DATASET_TEST)

In [8]:
# Display information about the loaded test DataFrame through the project helper
# (the recorded output starts with a table of its rows)
final_scores_utilities.dataset_info(df)

+-------------------+------+------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+--------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|          timestamp|    id|market-price|     opening-price|     highest-price|      lowest-price|     closing-price|  trade-volume-btc|      total-bitcoins|          market-cap|    trade-volume-usd|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|           hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-u

# Compare train / validation results

In [9]:
# Load all results
# Names of the splitting methods and of the models whose results are collected
splits_list = [
    BLOCK_SPLITS_NAME,
    WALK_FORWARD_SPLITS_NAME,
    SHORT_TERM_SPLITS_NAME,
]
models_list = [
    LR_MODEL_NAME,
    GLR_MODEL_NAME,
    RF_MODEL_NAME,
    GBTR_MODEL_NAME,
]

# Get all results for every (split, model) pair from the results folder
train_valid_all_results_raw = final_scores_utilities.get_all_results(
    splits_list, models_list, RESULTS_DIR
)

In [10]:
# Remove the 'train' dataset rows, working on a copy of the raw table
train_valid_all_results = train_valid_all_results_raw.loc[
    train_valid_all_results_raw['Dataset'].ne('train')
].copy()

# Fine tuning of the dataset
train_valid_all_results = final_scores_utilities.train_valid_dataset_fine_tuning(
    train_valid_all_results, 'results'
)

# Get only the default results computed on the validation set
train_valid_all_results = train_valid_all_results.loc[
    train_valid_all_results['Dataset'].eq('valid')
    & train_valid_all_results['Type'].eq('Default')
]

## RMSE and R2 values compared with the features used in the default models

In [11]:
# Titles of the two charts and prefix of the files they are saved to
rmse_title = 'RMSE per Features type'
r2_title = 'R2 per Features type'
save_path = f"{RESULTS_FINAL_DIR}/default_"

# Plot RMSE and R2 of the default models against the features they use
final_scores_utilities.train_val_rmse_r2_plot(
    train_valid_all_results,
    'Features', 'Model', 'RMSE', 'R2', 'Splitting',
    rmse_title, r2_title, save_path,
)

❗TODO

In [12]:
# Same R2 chart, restricted to the rows whose R2 is not negative
r2_title = 'R2 per Features type (non-negative)'
final_scores_utilities.train_val_r2_plot(
    train_valid_all_results.loc[train_valid_all_results['R2'].ge(0)],
    'Features', 'Model', 'R2', 'Splitting',
    r2_title,
)

❗ TODO

In [13]:
# Load relevant results
# Names of the splitting methods and of the models whose results are collected
splits_list = [
    BLOCK_SPLITS_NAME,
    WALK_FORWARD_SPLITS_NAME,
    SHORT_TERM_SPLITS_NAME,
]
models_list = [
    LR_MODEL_NAME,
    GLR_MODEL_NAME,
    RF_MODEL_NAME,
    GBTR_MODEL_NAME,
]

# Get relevant results: the helper returns a results table and an accuracy table
train_valid_results_raw, train_valid_accuracy_raw = final_scores_utilities.get_rel_results(
    splits_list, models_list, RESULTS_DIR
)

# Fine tuning of both tables, done on copies so the raw ones stay untouched
train_valid_results = final_scores_utilities.train_valid_dataset_fine_tuning(
    train_valid_results_raw.copy(), 'results'
)
train_valid_accuracy = final_scores_utilities.train_valid_dataset_fine_tuning(
    train_valid_accuracy_raw.copy(), 'accuracy'
)

In [14]:
# Display the fine-tuned results table.
# The fine tuning is already applied when train_valid_results is created from
# train_valid_results_raw, so it is not applied a second time here: re-applying it
# to its own output could make the table depend on how many times this cell is run.
train_valid_results

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LR,Default,valid,Block splits,Base + least corr. features (norm.),"[100, 0.0, 0.0]",2227.412278,8171259.0,1870.345923,0.05248,-1.631054,-1.633063,0.564635
1,LR,Tuned,valid,Block splits,Base + least corr. features (norm.),"[5, 0.8, 0.0]",1032.704613,1732495.0,776.557626,0.022809,0.583006,0.582688,0.326523
2,GLR,Default,valid,Block splits,Base + least corr. features (norm.),"[25, 0]",2227.412278,8171259.0,1870.345923,0.05248,-1.631054,-1.633063,0.363326
3,GLR,Tuned,valid,Block splits,Base + least corr. features (norm.),"[5, 0.1, 'gaussian', 'log']",1613.05515,4358777.0,1318.116337,0.036782,0.253175,0.252605,0.241773
4,RF,Default,valid,Block splits,Base features,"[20, 5, 42]",874.913453,876881.1,579.679197,0.022112,0.337898,0.337393,1.108412
5,RF,Tuned,valid,Block splits,Base features,"[30, 10, 42]",772.597736,713592.2,497.460229,0.018699,0.530111,0.529753,2.466774
6,GBTR,Default,valid,Block splits,Base features,"[20, 5, 0.1, 42]",694.84138,641161.3,446.415138,0.016936,0.763292,0.763111,5.69686
7,GBTR,Tuned,valid,Block splits,Base features,"[3, 5, 0.1, 42]",732.282155,728794.3,478.257071,0.018009,0.7354,0.735198,1.069041
8,LR,Default,valid,Walk-forward splits,Base + least corr. features (norm.),"[100, 0.0, 0.0]",1743.625346,5340021.0,1496.804362,0.043785,0.360428,0.359915,0.597084
9,LR,Tuned,valid,Walk-forward splits,base_features_norm,"[5, 0.0, 0.0]",1664.415862,4874436.0,1430.47418,0.041731,0.435987,0.435535,0.404754


In [15]:
# Display the fine-tuned accuracy table.
# The fine tuning is already applied when train_valid_accuracy is created from
# train_valid_accuracy_raw, so it is not applied a second time here: re-applying it
# to its own output could make the table depend on how many times this cell is run.
train_valid_accuracy

Unnamed: 0,Model,Features,Splitting,Accuracy (default),Accuracy (tuned)
0,LR,Base + least corr. features (norm.),Block splits,48.211971,46.164697
1,GLR,Base + least corr. features (norm.),Block splits,48.211971,48.215783
2,RF,Base features,Block splits,53.808616,54.292795
3,GBTR,Base features,Block splits,51.044605,50.171559
4,LR,base_features_norm,Walk-forward splits,48.015455,50.306364
5,GLR,base_features_norm,Walk-forward splits,48.015455,47.985455
6,RF,Base features,Walk-forward splits,50.989091,51.544545
7,GBTR,Base features,Walk-forward splits,49.032727,50.008182
8,LR,Base + most corr. features(norm.),Single split,46.706989,50.268817
9,GLR,Base + most corr. features(norm.),Single split,46.706989,46.673387


## RMSE and R2 values compared between default and tuned models

In [16]:
# Titles of the two charts and prefix of the files they are saved to
rmse_title = 'RMSE per Model type'
r2_title = 'R2 per Model type'
save_path = f"{RESULTS_FINAL_DIR}/final_"

# Plot RMSE and R2 per model, comparing the values of the 'Type' column
final_scores_utilities.train_val_rmse_r2_plot(
    train_valid_results,
    'Type', 'Model', 'RMSE', 'R2', 'Splitting',
    rmse_title, r2_title, save_path,
)

❗ TODO

## Accuracy percentage compared between default and tuned models

In [17]:
# Group the accuracy table by 'Splitting'
train_valid_accuracy_grouped = train_valid_accuracy.groupby('Splitting')

# Chart title and prefix of the file it is saved to
title = 'Percentage of accuracy between default and tuned model'
save_path = f"{RESULTS_FINAL_DIR}/final_"

# Plot default vs tuned accuracy per model for the grouped table
final_scores_utilities.train_val_accuracy_plot(
    train_valid_accuracy_grouped,
    'Model', 'Accuracy (default)', 'Accuracy (tuned)',
    title, save_path,
)

❗TODO

# Test models
For each model, predictions on the various mini-sets are made and the obtained results are compared.

The test set is divided into further mini-sets of **1 week**, **15 days**, **1 month** and **3 months** to see how the models' performance degrades as the prediction horizon gets longer.

In [18]:
# Retrieve the earliest value of the timestamp column.
# An explicit min aggregation is used because first() on a DataFrame without an
# ordering is not guaranteed to return the row with the earliest timestamp.
first_timestamp = df.agg({"timestamp": "min"}).first()[0]
if first_timestamp is None:
    # No timestamp available: the mini-sets below cannot be derived
    raise ValueError("The test set has no timestamp values: cannot build the mini-sets")

# Split the test set into mini-sets of 1 week, 15 days, 1 month, and 3 months.
# Every mini-set keeps the rows up to its own upper bound, so each one contains the
# shorter ones.
one_week_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=3))

# Save datasets
datasets_list = [one_week_df, fifteen_days_df, one_month_df, three_months_df]

In [19]:
# Convert every mini-set to pandas and plot them together in a single chart
final_scores_utilities.show_datasets(
    one_week_df.toPandas(),
    fifteen_days_df.toPandas(),
    one_month_df.toPandas(),
    three_months_df.toPandas(),
    "Test set split",
)

In this graph the splits are overlaid; to view them individually, toggle the corresponding elements in the legend.

In [20]:
# Loading base features
# BASE_FEATURES initially holds the path of the JSON file and is then replaced by the
# list read from it. The guard makes the cell re-runnable: once the list is loaded,
# open() is not called on it again.
if isinstance(BASE_FEATURES, str):
    with open(BASE_FEATURES, "r") as f:
        BASE_FEATURES = json.load(f)
print(BASE_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd']


In [21]:
# Loading currency and additional most correlated features
# BASE_AND_MOST_CORR_FEATURES initially holds the path of the JSON file and is then
# replaced by the list read from it. The guard makes the cell re-runnable: once the
# list is loaded, open() is not called on it again.
if isinstance(BASE_AND_MOST_CORR_FEATURES, str):
    with open(BASE_AND_MOST_CORR_FEATURES, "r") as f:
        BASE_AND_MOST_CORR_FEATURES = json.load(f)
print(BASE_AND_MOST_CORR_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days']


In [22]:
# Loading currency and additional least correlated features
# BASE_AND_LEAST_CORR_FEATURES initially holds the path of the JSON file and is then
# replaced by the list read from it. The guard makes the cell re-runnable: once the
# list is loaded, open() is not called on it again.
if isinstance(BASE_AND_LEAST_CORR_FEATURES, str):
    with open(BASE_AND_LEAST_CORR_FEATURES, "r") as f:
        BASE_AND_LEAST_CORR_FEATURES = json.load(f)
print(BASE_AND_LEAST_CORR_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'sma-100-days', 'transaction-fees-usd', 'n-unique-addresses', 'sma-50-days', 'n-transactions-total', 'blocks-size', 'hash-rate', 'difficulty', 'avg-block-size', 'n-transactions-per-block', 'n-transactions']


In [24]:
# Load models
# Each saved pipeline is read back from its location under MODELS_DIR.
# NOTE(review): the recorded run of this cell ended with a Py4JJavaError and the cells
# after it were not executed - confirm that the four model folders exist at these paths
# and can be read by the installed Spark version.
lr = PipelineModel.load(LR_MODEL)
glr = PipelineModel.load(GLR_MODEL)
rf = PipelineModel.load(RF_MODEL)
gbtr = PipelineModel.load(GBTR_MODEL)

Py4JJavaError: ignored

In [None]:
# Group models and features
features_list = [
    BASE_FEATURES,
    BASE_AND_MOST_CORR_FEATURES,
    BASE_AND_LEAST_CORR_FEATURES,
]
# From here on models_list holds the loaded pipeline models instead of the model names
models_list = [lr, glr, rf, gbtr]

# Get model parameters from the raw train / validation results
model_params_list = final_scores_utilities.get_model_parameters(
    train_valid_results_raw, models_list, features_list
)
print(model_params_list)

In [None]:
# Pass the mini-sets and the model parameters to the testing helper; its two return
# values are kept as the raw test results and the predictions table
final_test_results_raw, predictions_df = final_scores_utilities.models_testing(datasets_list, model_params_list)

# Summary

In [None]:
# Fine tuning of the test results, done on a copy so the raw table stays untouched,
# then display of the resulting table
final_test_results = final_scores_utilities.test_dataset_fine_tuning(final_test_results_raw.copy())
final_test_results

In [None]:
# Values of the 'Dataset' column, titles shown in the charts and model labels.
# The first two lists follow the same order as datasets_list.
datasets_name_raw_list = ["one_week", "fifteen_days", "one_month", "three_months"]
datasets_name_list = ['One week', 'Fifteen days', 'One month', 'Three months']
models_name_list = ['LR', 'GLR', 'RF', 'GBTR']

# For each dataset type, display the predictions of each model
for data, raw_name, shown_name in zip(datasets_list, datasets_name_raw_list, datasets_name_list):
    # Predictions made on the current mini-set
    predictions_to_show = predictions_df[predictions_df['Dataset'] == raw_name]
    model_column = predictions_to_show['Model']

    # One slice of the predictions per model
    lr_predictions = predictions_to_show[model_column == LR_MODEL_NAME]
    glr_predictions = predictions_to_show[model_column == GLR_MODEL_NAME]
    rf_predictions = predictions_to_show[model_column == RF_MODEL_NAME]
    gbtr_predictions = predictions_to_show[model_column == GBTR_MODEL_NAME]

    final_scores_utilities.show_results(
        data.toPandas(),
        models_name_list[0], lr_predictions,
        models_name_list[1], glr_predictions,
        models_name_list[2], rf_predictions,
        models_name_list[3], gbtr_predictions,
        shown_name + " predictions",
        RESULTS_FINAL_DIR,
    )

❗TODO

In [None]:
# Titles of the two charts and prefix of the files they are saved to
rmse_title = 'RMSE per Dataset type'
r2_title = 'R2 per Dataset type'
save_path = f"{RESULTS_FINAL_DIR}/final_"

# Plot RMSE and R2 of each model on the test results, using the 'Dataset' column
final_scores_utilities.test_rmse_r2_plot(
    final_test_results,
    'Model', 'RMSE', 'R2', 'Dataset',
    rmse_title, r2_title, save_path,
)

❗TODO

In [None]:
# Group the test results by 'Dataset'
final_test_results_grouped = final_test_results.groupby('Dataset')

# Chart title and prefix of the file it is saved to
# NOTE(review): the title is the same used for the train / validation accuracy chart,
# while this chart plots a single 'Accuracy' column - confirm the wording.
title = 'Percentage of accuracy between default and tuned model'
save_path = f"{RESULTS_FINAL_DIR}/final_"

final_scores_utilities.test_accuracy_plot(
    final_test_results_grouped,
    'Model', 'Accuracy',
    title, save_path,
)

❗TODO

# Saving results


In [None]:
# Saving test results: the raw table is written as CSV, without the index column,
# in the final results folder
final_test_results_raw.to_csv(f"{RESULTS_FINAL_DIR}/final.csv", index=False)

In [None]:
# Export notebook in html format (remember to save the notebook and change the model name)
# The export runs only locally: nbconvert converts 6-final-scores.ipynb and writes the
# html file into the ./exports folder.
if LOCAL_RUNNING:
    !jupyter nbconvert --to html 6-final-scores.ipynb --output 6-final-scores --output-dir='./exports'