# **Bitcoin price prediction - Final scores**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: testing the final models and comparing the results.

# Global constants, dependencies, libraries and tools

In [None]:
# Execution environment switch: True for a local run, False for Google Colab
LOCAL_RUNNING = False

# Root folder the rest of the notebook builds its paths from
if LOCAL_RUNNING:
    ROOT_DIR = "D:/Documents/Repository/BDC/project"
else:
    ROOT_DIR = "/content/drive"

In [None]:
if not LOCAL_RUNNING:
    # Colab-only setup: mount Google Drive and install the runtime dependencies
    from google.colab import drive

    # Mount Google Drive at ROOT_DIR, forcing a re-mount if it is already mounted
    drive.mount(ROOT_DIR, force_remount=True)

    # Install Spark and related dependencies
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

    # Install the "kaleido" engine package, used to export plots as images
    !pip install -U kaleido

## Import my utilities

In [None]:
# Project root: ROOT_DIR itself on a local run, the project folder inside Google Drive otherwise
MAIN_DIR = ROOT_DIR if LOCAL_RUNNING else f"{ROOT_DIR}/MyDrive/BDC/project"

# Folder that holds the project's helper modules
UTILITIES_DIR = f"{MAIN_DIR}/utilities"

# Make the helper modules importable
import sys
sys.path.append(UTILITIES_DIR)

from imports import *
from config import *
import final_scores_utilities

# Reload the helper module so edits made after the first import are picked up
importlib.reload(final_scores_utilities)

In [None]:
# Project root: ROOT_DIR itself on a local run, the project folder inside Google Drive otherwise
MAIN_DIR = ROOT_DIR if LOCAL_RUNNING else f"{ROOT_DIR}/MyDrive/BDC/project"

# ----------------- #
# ---- DATASET ---- #
# ----------------- #

# Folder with the generated datasets and the path of the test set
DATASET_OUTPUT_DIR = f"{MAIN_DIR}/datasets/output"
DATASET_TEST = f"{DATASET_OUTPUT_DIR}/{DATASET_TEST_NAME}.parquet"

# ------------------ #
# ---- FEATURES ---- #
# ------------------ #

# Folder with the feature definitions and the path of each JSON file in it
FEATURES_DIR = f"{MAIN_DIR}/features"
FEATURES_CORRELATION = f"{FEATURES_DIR}/{FEATURES_CORRELATION_LABEL}.json"
BASE_FEATURES = f"{FEATURES_DIR}/{BASE_FEATURES_LABEL}.json"
BASE_AND_MOST_CORR_FEATURES = f"{FEATURES_DIR}/{BASE_AND_MOST_CORR_FEATURES_LABEL}.json"
BASE_AND_LEAST_CORR_FEATURES = f"{FEATURES_DIR}/{BASE_AND_LEAST_CORR_FEATURES_LABEL}.json"

# ---------------- #
# ---- MODELS ---- #
# ---------------- #

# Folder with the trained models and the path of each one
MODELS_DIR = f"{MAIN_DIR}/models"
LR_MODEL = f"{MODELS_DIR}/{LR_MODEL_NAME}"
GLR_MODEL = f"{MODELS_DIR}/{GLR_MODEL_NAME}"
RF_MODEL = f"{MODELS_DIR}/{RF_MODEL_NAME}"
GBTR_MODEL = f"{MODELS_DIR}/{GBTR_MODEL_NAME}"

# ----------------- #
# ---- RESULTS ---- #
# ----------------- #

# Folder with the results of the previous stages and the one for the final scores
RESULTS_DIR = f"{MAIN_DIR}/results"
RESULTS_FINAL_DIR = f"{RESULTS_DIR}/final"

In [None]:
# Silence FutureWarning messages so the cell outputs stay readable
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Default plotly renderer, set so that the plots are rendered correctly
pio.renderers.default = 'vscode+colab'

# Create the pyspark session

In [None]:
# Spark configuration: a single-machine session that uses every local core
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    # NOTE(review): 109G is far above the 12G of driver memory set above - confirm this limit is intended
    .set('spark.driver.maxResultSize', '109G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
)

# Reuse the running context when this cell is executed again: building a second
# SparkContext in the same process raises "Cannot run multiple SparkContexts at once"
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [None]:
# Load the test set into a Spark DataFrame.
# Parquet files embed their own schema, so the CSV-only reader options
# (sep, inferSchema, header) are dropped.
df = spark.read.parquet(DATASET_TEST)

In [None]:
# Inspect the loaded test set with the project's dataset_info helper
final_scores_utilities.dataset_info(df)

# Compare train / validation results

In [None]:
# Splitting methods whose results are compared
splits_list = [
    BLOCK_SPLITS_NAME,
    WALK_FORWARD_SPLITS_NAME,
    SHORT_TERM_SPLITS_NAME,
]

# Names of the models whose results are compared
models_list = [
    LR_MODEL_NAME,
    GLR_MODEL_NAME,
    RF_MODEL_NAME,
    GBTR_MODEL_NAME,
]

In [None]:
# Collect every stored result for the given splitting methods and models
train_valid_all_results_raw = final_scores_utilities.get_all_results(
    splits_list,
    models_list,
    RESULTS_DIR,
)

In [None]:
# Drop the rows computed on the 'train' dataset, working on a copy of the raw results
train_valid_all_results = train_valid_all_results_raw.loc[
    train_valid_all_results_raw['Dataset'] != 'train'
].copy()

# Fine tuning of the dataset
train_valid_all_results = final_scores_utilities.train_valid_dataset_fine_tuning(
    train_valid_all_results, 'results'
)

# Keep only the validation rows of the default models
train_valid_all_results = train_valid_all_results[
    (train_valid_all_results['Dataset'] == 'valid')
    & (train_valid_all_results['Type'] == 'Default')
]

## RMSE and R2 values compared with the features used in the default models

In [None]:
# Plot titles and the path prefix passed to the plotting helper
rmse_title = 'RMSE per Features type'
r2_title = 'R2 per Features type'
save_path = RESULTS_FINAL_DIR + "/plots/default_"

# RMSE and R2 of the default models
final_scores_utilities.train_val_rmse_r2_plot(
    train_valid_all_results,
    'Features', 'Model', 'RMSE', 'R2', 'Splitting',
    rmse_title, r2_title,
    save_path,
)

❗TODO

In [None]:
# Exclude negative R2 values (work on a copy so the full results stay untouched)
train_valid_all_results_non_negative = train_valid_all_results[train_valid_all_results['R2'] >= 0].copy()

# Convert the columns to ordered categoricals so that sorting follows the custom order
train_valid_all_results_non_negative['Features'] = pd.Categorical(train_valid_all_results_non_negative['Features'], categories=final_scores_utilities.features_order, ordered=True)
train_valid_all_results_non_negative['Splitting'] = pd.Categorical(train_valid_all_results_non_negative['Splitting'], categories=final_scores_utilities.splitting_order, ordered=True)

# Sort the DataFrame by the columns (reassigned instead of inplace=True)
train_valid_all_results_non_negative = train_valid_all_results_non_negative.sort_values(by=['Features', 'Splitting'])

# The title names the 'Features' column passed to the plot; it previously said "Model type"
r2_title = 'R2 per Features type (non-negative)'
final_scores_utilities.train_val_r2_plot(train_valid_all_results_non_negative, 'Features', 'Model', 'R2', 'Splitting', r2_title)

❗ TODO

In [None]:
# Retrieve the relevant results together with the matching accuracy table
train_valid_results_raw, train_valid_accuracy_raw = final_scores_utilities.get_rel_results(
    splits_list,
    models_list,
    RESULTS_DIR,
)

# Fine tuning of both tables, applied to copies so the raw versions are preserved
train_valid_results = final_scores_utilities.train_valid_dataset_fine_tuning(
    train_valid_results_raw.copy(), 'results'
)
train_valid_accuracy = final_scores_utilities.train_valid_dataset_fine_tuning(
    train_valid_accuracy_raw.copy(), 'accuracy'
)

In [None]:
# Display the validation results. They were already fine-tuned when they were loaded,
# so the fine-tuning helper is not applied a second time; this also keeps the cell
# safe to re-run.
train_valid_results

In [None]:
# Display the validation accuracy. It was already fine-tuned when it was loaded,
# so the fine-tuning helper is not applied a second time; this also keeps the cell
# safe to re-run.
train_valid_accuracy

## RMSE and R2 values compared between default and tuned models

In [None]:
# Plot titles and the path prefix passed to the plotting helper
rmse_title = 'RMSE per Model type'
r2_title = 'R2 per Model type'
save_path = RESULTS_FINAL_DIR + "/plots/final_"

# RMSE and R2 of the default and tuned models
final_scores_utilities.train_val_rmse_r2_plot(
    train_valid_results,
    'Type', 'Model', 'RMSE', 'R2', 'Splitting',
    rmse_title, r2_title,
    save_path,
)

❗ TODO

In [None]:
# Keep the rows with a non-negative R2, turn the grouping columns into ordered
# categoricals (so that sorting follows the custom order) and sort by them
train_valid_results_non_negative = (
    train_valid_results
    .loc[lambda frame: frame['R2'] >= 0]
    .assign(
        Type=lambda frame: pd.Categorical(frame['Type'], categories=final_scores_utilities.type_order, ordered=True),
        Model=lambda frame: pd.Categorical(frame['Model'], categories=final_scores_utilities.model_order, ordered=True),
        Splitting=lambda frame: pd.Categorical(frame['Splitting'], categories=final_scores_utilities.splitting_order, ordered=True),
    )
    .sort_values(by=['Splitting', 'Type', 'Model'])
)

r2_title = 'R2 per Model type (non-negative)'
final_scores_utilities.train_val_r2_plot(
    train_valid_results_non_negative,
    'Type', 'Model', 'R2', 'Splitting',
    r2_title,
)

❗ TODO

## Accuracy percentage compared between default and tuned models

In [None]:
# One group of accuracy rows per splitting method
train_valid_accuracy_grouped = train_valid_accuracy.groupby('Splitting')

# Plot title and the path prefix passed to the plotting helper
title = 'Percentage of accuracy between default and tuned model'
save_path = RESULTS_FINAL_DIR + "/plots/final_"

final_scores_utilities.train_val_accuracy_plot(
    train_valid_accuracy_grouped,
    'Model', 'Accuracy (default)', 'Accuracy (tuned)',
    title,
    save_path,
)

❗TODO

# Test models
After loading the trained models, the test set is divided into further mini-sets of `1 week`, `15 days`, `1 month` and `3 months` to see how the models' performance degrades as time increases. Final results are collected and compared to draw conclusions (see final results).

In [None]:
# Start of the test period: the earliest value of the timestamp column.
# Aggregating with min() does not depend on the order in which Spark returns
# the rows, whereas first() on an unsorted DataFrame does.
first_timestamp = df.agg({"timestamp": "min"}).first()[0]

# Split the test set into mini-sets of 1 week, 15 days, 1 month, and 3 months,
# each one starting at the beginning of the test period
one_week_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=3))

# Collect the mini-sets, ordered from the shortest to the longest period
datasets_list = [one_week_df, fifteen_days_df, one_month_df, three_months_df]

In [None]:
# Convert each mini-set to pandas and show them all in a single plot
final_scores_utilities.show_datasets(
    *[subset.toPandas() for subset in (one_week_df, fifteen_days_df, one_month_df, three_months_df)],
    "Test set split",
)

In this graph each split is overlaid on the others. To view them individually, turn the elements in the legend on or off.

In [None]:
# Loading base features.
# The path is rebuilt from its parts instead of being read from BASE_FEATURES:
# that name is overwritten below with the loaded content, so opening it
# would fail when this cell is run a second time.
with open(FEATURES_DIR + "/" + BASE_FEATURES_LABEL + ".json", "r") as f:
    BASE_FEATURES = json.load(f)
print(BASE_FEATURES)

In [None]:
# Loading base and additional most correlated features.
# The path is rebuilt from its parts instead of being read from
# BASE_AND_MOST_CORR_FEATURES: that name is overwritten below with the loaded
# content, so opening it would fail when this cell is run a second time.
with open(FEATURES_DIR + "/" + BASE_AND_MOST_CORR_FEATURES_LABEL + ".json", "r") as f:
    BASE_AND_MOST_CORR_FEATURES = json.load(f)
print(BASE_AND_MOST_CORR_FEATURES)

In [None]:
# Loading base and additional least correlated features.
# The path is rebuilt from its parts instead of being read from
# BASE_AND_LEAST_CORR_FEATURES: that name is overwritten below with the loaded
# content, so opening it would fail when this cell is run a second time.
with open(FEATURES_DIR + "/" + BASE_AND_LEAST_CORR_FEATURES_LABEL + ".json", "r") as f:
    BASE_AND_LEAST_CORR_FEATURES = json.load(f)
print(BASE_AND_LEAST_CORR_FEATURES)

In [None]:
# Load the four saved pipeline models, in the order LR, GLR, RF, GBTR
lr, glr, rf, gbtr = [
    PipelineModel.load(model_path)
    for model_path in (LR_MODEL, GLR_MODEL, RF_MODEL, GBTR_MODEL)
]

In [None]:
# Group models and features.
# The loaded models get their own name: `models_list` already holds the model
# names used to load the results, and overwriting it with model objects would
# break those earlier cells when they are re-run.
features_list = [BASE_FEATURES, BASE_AND_MOST_CORR_FEATURES, BASE_AND_LEAST_CORR_FEATURES]
loaded_models_list = [lr, glr, rf, gbtr]

# Get model parameters
model_params_list = final_scores_utilities.get_model_parameters(train_valid_results_raw, loaded_models_list, features_list)
print(model_params_list)

In [None]:
# Test the models on every mini-set, collecting the raw scores and the predictions
final_test_results_raw, predictions_df = final_scores_utilities.models_testing(
    datasets_list,
    model_params_list,
)

# Final results

In [None]:
# Fine tuning of the test results, applied to a copy so the raw version is preserved
final_test_results = final_scores_utilities.test_dataset_fine_tuning(
    final_test_results_raw.copy()
)
final_test_results

## Prediction

In [None]:
datasets_name_raw_list = ["one_week", "fifteen_days", "one_month", "three_months"]

# For each dataset type, display the predictions of each model
for i, data in enumerate(datasets_list):
    # Predictions made on the current mini-set
    in_current_dataset = predictions_df['Dataset'] == datasets_name_raw_list[i]
    predictions_to_show = predictions_df[in_current_dataset]

    # Split them by model
    model_column = predictions_to_show['Model']
    lr_predictions = predictions_to_show[model_column == LR_MODEL_NAME]
    glr_predictions = predictions_to_show[model_column == GLR_MODEL_NAME]
    rf_predictions = predictions_to_show[model_column == RF_MODEL_NAME]
    gbtr_predictions = predictions_to_show[model_column == GBTR_MODEL_NAME]

    # Display labels for the four models and the plot title for this mini-set
    lr_label, glr_label, rf_label, gbtr_label = final_scores_utilities.model_order[:4]
    plot_title = final_scores_utilities.dataset_order[i] + " predictions"

    final_scores_utilities.show_results(
        data.toPandas(),
        lr_label, lr_predictions,
        glr_label, glr_predictions,
        rf_label, rf_predictions,
        gbtr_label, gbtr_predictions,
        plot_title)

❗TODO

## RMSE and R2 values of each model for each dataset split

In [None]:
# Plot titles and the path prefix passed to the plotting helper
rmse_title = 'RMSE per Dataset type'
r2_title = 'R2 per Dataset type'
save_path = RESULTS_FINAL_DIR + "/plots/final_"

# RMSE and R2 of each model on each test mini-set
final_scores_utilities.test_rmse_r2_plot(
    final_test_results,
    'Model', 'RMSE', 'R2', 'Dataset',
    rmse_title, r2_title,
    save_path,
)

❗TODO

In [None]:
# Keep the rows with a non-negative R2, turn the grouping columns into ordered
# categoricals (so that sorting follows the custom order) and sort by them
final_test_results_non_negative = (
    final_test_results
    .loc[lambda frame: frame['R2'] >= 0]
    .assign(
        Dataset=lambda frame: pd.Categorical(frame['Dataset'], categories=final_scores_utilities.dataset_order, ordered=True),
        Model=lambda frame: pd.Categorical(frame['Model'], categories=final_scores_utilities.model_order, ordered=True),
    )
    .sort_values(by=['Dataset', 'Model'])
)

r2_title = 'R2 per Dataset type (non-negative)'
final_scores_utilities.test_r2_plot(
    final_test_results_non_negative,
    'Model', 'R2', 'Dataset',
    r2_title,
)

❗TODO

## Accuracy percentage of each model for each dataset split

In [None]:
# Group by 'Dataset'
final_test_results_grouped = final_test_results.groupby('Dataset')

# The previous title was copied from the default-vs-tuned comparison; this plot
# is built from the test results grouped by dataset, so the title says so
title = 'Percentage of accuracy per Dataset type'
save_path = RESULTS_FINAL_DIR + "/plots/final_"
final_scores_utilities.test_accuracy_plot(final_test_results_grouped, 'Model', 'Accuracy', title, save_path)

❗TODO

# Saving final results

In [None]:
# Write the raw test results to a CSV file in the final results folder, without the index column
final_results_path = RESULTS_FINAL_DIR + "/final.csv"
final_test_results_raw.to_csv(final_results_path, index=False)

In [None]:
# Export the notebook to HTML, only on a local run.
# Save the notebook first; the notebook file name is hard-coded in the command below.
if LOCAL_RUNNING:
    !jupyter nbconvert --to html 6-final-scores.ipynb --output 6-final-scores --output-dir='./exports'