# **Bitcoin price prediction - Final predictions**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: making predictions on the test set with the final models trained on the whole train / validation set



# Global constants, dependencies, libraries and tools

In [None]:
# Main constants
# Path where Google Drive is mounted; every other Drive path is built from it
GDRIVE_DIR = "/content/drive"
# When True, cells guarded by this flag (e.g. the dataset overview) are executed
SLOW_OPERATIONS = True

In [None]:
###################
# --- DATASET --- #
###################

# Directory containing the processed datasets
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Name of the test dataset (file name without extension)
DATASET_TEST_NAME = "bitcoin_blockchain_data_30min_test"

# Path of the test dataset
GDRIVE_DATASET_TEST = f"{GDRIVE_DATASET_OUTPUT_DIR}/{DATASET_TEST_NAME}.parquet"

####################
# --- FEATURES --- #
####################

# Directory containing the saved feature lists
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"

# Column labels and feature settings
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"
CHOSEN_FEATURES_LABEL = "most_corr_features"
FEATURES_NORMALIZATION = True

# Path of the JSON file listing the chosen features
GDRIVE_CHOSEN_FEATURES = f"{GDRIVE_FEATURES_DIR}/{CHOSEN_FEATURES_LABEL}.json"

##################
# --- MODELS --- #
##################

# Names of the four models
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBT_MODEL_NAME = "GBTRegressor"

# Directory containing the saved models
GDRIVE_MODELS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/models"

# Path of each saved model
GDRIVE_LR_MODEL = f"{GDRIVE_MODELS_DIR}/{LR_MODEL_NAME}"
GDRIVE_GLR_MODEL = f"{GDRIVE_MODELS_DIR}/{GLR_MODEL_NAME}"
GDRIVE_RF_MODEL = f"{GDRIVE_MODELS_DIR}/{RF_MODEL_NAME}"
GDRIVE_GBT_MODEL = f"{GDRIVE_MODELS_DIR}/{GBT_MODEL_NAME}"

#####################
# --- UTILITIES --- #
#####################

# Directory containing the project's helper modules
GDRIVE_UTILITIES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/utilities"

###################
# --- RESULTS --- #
###################

# Directory containing the result files
GDRIVE_RESULTS_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/results"

# Path of each model's results file
GDRIVE_LR_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{LR_MODEL_NAME}.csv"
GDRIVE_GLR_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{GLR_MODEL_NAME}.csv"
GDRIVE_RF_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{RF_MODEL_NAME}.csv"
GDRIVE_GBT_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{GBT_MODEL_NAME}.csv"

# Path of the final results file written at the end of the notebook
GDRIVE_FINAL_RESULTS = f"{GDRIVE_RESULTS_DIR}/final.csv"

In [None]:
# Point Colaboratory to Google Drive
from google.colab import drive

# Mount Google Drive at GDRIVE_DIR (force_remount=True requests a fresh mount
# even when the drive is already mounted)
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [None]:
# Suppression of warnings for better reading
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=35d7eab7d2fe57d9e90c4ab293b36c7f67581e457e6927fcd4005e050da5b1b3
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Import files

In [None]:
# Import my files
import sys
sys.path.append(GDRIVE_UTILITIES_DIR)

from imports import *
import utilities

importlib.reload(utilities)

<module 'utilities' from '/content/drive/MyDrive/BDC/project/utilities/utilities.py'>

# Create the pyspark session

In [None]:
# Build the Spark configuration (local master using all available cores)
conf = (
    SparkConf()
    .set("spark.ui.port", "4050")
    .set("spark.executor.memory", "4G")
    .set("spark.driver.memory", "45G")
    .set("spark.driver.maxResultSize", "10G")
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
)

# Create the context, or reuse the running one when this cell is executed
# again: constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once"
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [None]:
# Load the test set into a Spark DataFrame.
# Parquet files embed their own schema, so the CSV-only reader options
# (sep / inferSchema / header) that were passed before are not needed.
df = spark.read.parquet(GDRIVE_DATASET_TEST)

In [None]:
def dataset_info(dataset):
  """Print a short overview of a dataset.

  Shows the first 3 rows, prints the (rows, columns) shape and then the schema.

  Args:
      dataset: Object exposing show(), count(), columns and printSchema()
               (a Spark DataFrame in this notebook).
  """
  # Preview of the first rows
  dataset.show(3)

  # Shape as (number of rows, number of columns)
  shape = (dataset.count(), len(dataset.columns))
  print("Shape:", shape)

  # Column names and types
  dataset.printSchema()

In [None]:
# Print the dataset overview only when slow operations are enabled
if SLOW_OPERATIONS:
  dataset_info(df)

+-------------------+------+------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp|    id|      market-price|      total-bitcoins|          market-cap|        trade-volume|    blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-----

# Split test set
The test set is divided into nested mini-sets that all start at its first timestamp and cover 1 week, 15 days, 1 month and 3 months respectively, to see how the models' performance degrades as the prediction horizon grows.

In [None]:
# Retrieve the earliest value of the timestamp column.
# An explicit min is used because the row order of a DataFrame is not
# guaranteed, so first() alone may not return the oldest row.
first_timestamp = df.agg({"timestamp": "min"}).first()[0]

# Split the test set into nested mini-sets of 1 week, 15 days, 1 month and
# 3 months, all starting from the earliest timestamp
one_week_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=3))

# Load models

In [None]:
# Restore the four fitted pipelines saved in Google Drive
lr, glr, rf, gbt = [
    PipelineModel.load(model_path)
    for model_path in (GDRIVE_LR_MODEL, GDRIVE_GLR_MODEL, GDRIVE_RF_MODEL, GDRIVE_GBT_MODEL)
]

In [None]:
# Read the results file of each model into a pandas DataFrame
lr_results, glr_results, rf_results, gbt_results = [
    pd.read_csv(results_path)
    for results_path in (
        GDRIVE_LR_MODEL_RESULTS,
        GDRIVE_GLR_MODEL_RESULTS,
        GDRIVE_RF_MODEL_RESULTS,
        GDRIVE_GBT_MODEL_RESULTS,
    )
]

In [None]:
def scatter_plot(dataset, x_axis, y_axis, title):
  """Show a scatter plot whose points are coloured by the y-axis column.

  Args:
      dataset: Data passed to px.scatter.
      x_axis: Column plotted on the x axis.
      y_axis: Column plotted on the y axis (also used as colour).
      title: Title of the figure.
  """
  fig = px.scatter(dataset, x=x_axis, y=y_axis, color=y_axis)
  fig.update_layout(title=title)
  # Hide y tick labels and the y axis title
  fig.update_yaxes(showticklabels=False, title='')
  fig.show()

In [None]:
# Concatenate the per-model results into a single pandas DataFrame
# (pd.concat already returns a DataFrame, no extra constructor call is needed)
model_results_df = pd.concat(
    [lr_results, glr_results, rf_results, gbt_results],
    ignore_index=True,
)
model_results_df

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",32.686907,1068.434,27.449421,0.00133,0.999883,0.999883,5.258529
1,LinearRegression,simple,most_corr_features,"[100, 0.0, 0.0]",12.592733,158.5769,9.053149,0.000425,0.999983,0.999983,2.231876
2,LinearRegression,simple,least_corr_features,"[100, 0.0, 0.0]",52051.055206,2709312000.0,48447.034042,2.380044,-294.829546,-294.8994,1.748487
3,LinearRegression,simple_norm,all_features,"[100, 0.0, 0.0]",5356.983033,28697270.0,4326.972011,0.1941,-2.133452,-2.134192,2.550066
4,LinearRegression,simple_norm,most_corr_features,"[100, 0.0, 0.0]",8393.218911,70446120.0,7442.156115,0.371253,-6.692005,-6.693822,1.862474
5,LinearRegression,simple_norm,least_corr_features,"[100, 0.0, 0.0]",10404.100087,108245300.0,9434.085066,0.453631,-10.819293,-10.822084,2.83674
6,LinearRegression,hyp_tuning,most_corr_features,"[5, 0.2, 0.2]",2372.522157,5628861.0,1956.793771,0.10006,0.385385,0.38524,0.513033
7,LinearRegression,multi_splits,most_corr_features,"[5, 0.2, 0.2]",11178.436608,284819100.0,9686.957189,0.394824,-1.593558,-1.593926,1.01712
8,LinearRegression,block_splits,most_corr_features,"[5, 0.2, 0.2]",4175.030323,44297450.0,3567.460701,0.215112,-3.678685,-3.684215,0.691858
9,LinearRegression,final_validated,most_corr_features,"[5, 0.2, 0.2]",2372.522157,5628861.0,1956.793771,0.10006,0.385385,0.38524,1.177968


In [None]:
# RMSE of every row of the loaded model results, grouped by model
scatter_plot(
    model_results_df,
    "RMSE",
    "Model",
    "RMSE value for each model",
)

# Load features

In [None]:
# Load the chosen features (list of column names stored as JSON)
with open(GDRIVE_CHOSEN_FEATURES) as f:
    CHOSEN_FEATURES = json.loads(f.read())
print(CHOSEN_FEATURES)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


# Test models
For each model, predictions on the various mini-sets are made and the obtained results are compared.


In [None]:
def evaluate_final_model(dataset, dataset_name, model, model_name, features_normalization, features, features_label, target_label):
    '''
    Description: Evaluate an already fitted final model by making predictions on a test (mini-)set
    Args:
        dataset: The dataset on which predictions are made
        dataset_name: Name of selected dataset [one_week | fifteen_days | one_month | three_months]
        model: Fitted model (any object exposing transform(), e.g. a loaded PipelineModel)
        model_name: Model name selected
        features_normalization: Indicates whether features should be normalized (True) or not (False)
        features: Features to be used to make predictions
        features_label: The column name of features
        target_label: The column name of target variable
    Return:
        results_pd: One-row pandas DataFrame with the metrics obtained from the evaluation
        predictions: Predictions obtained from the model (target, prediction and timestamp columns)
    '''
    # Select the type of features to be used
    dataset = utilities.select_features(dataset, features_normalization, features, features_label, target_label)

    # The model is already fitted, so it is applied directly. It must NOT be
    # wrapped in a new Pipeline and fitted again here: that would be a no-op for
    # a fitted model and would train on the test set for an unfitted estimator.
    start = time.time()

    # Make predictions
    predictions = model.transform(dataset).select(target_label, "prediction", 'timestamp')

    # Compute test error by several evaluators
    eval_res = utilities.model_evaluation(target_label, predictions)
    end = time.time()

    # Use dict to store each result ("Time" is the wall-clock time, in seconds,
    # spent producing and evaluating the predictions)
    results = {
        "Model": model_name,
        "Dataset": dataset_name,
        "RMSE": eval_res['rmse'],
        "MSE": eval_res['mse'],
        "MAE": eval_res['mae'],
        "MAPE": eval_res['mape'],
        "R2": eval_res['r2'],
        "Adjusted_R2": eval_res['adj_r2'],
        "Time": end - start,
    }

    # Transform dict to pandas dataset
    results_pd = pd.DataFrame(results, index=[0])

    return results_pd, predictions

In [None]:
# Linear regression: evaluate the final model on each test mini-set
lr_res_one_week, lr_pred_one_week = evaluate_final_model(
    one_week_df, "one_week", lr, LR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
lr_res_fifteen_days, lr_pred_fifteen_days = evaluate_final_model(
    fifteen_days_df, "fifteen_days", lr, LR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
lr_res_one_month, lr_pred_one_month = evaluate_final_model(
    one_month_df, "one_month", lr, LR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
lr_res_three_months, lr_pred_three_months = evaluate_final_model(
    three_months_df, "three_months", lr, LR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)

In [None]:
# Generalized linear regression: evaluate the final model on each test mini-set
glr_res_one_week, glr_pred_one_week = evaluate_final_model(
    one_week_df, "one_week", glr, GLR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
glr_res_fifteen_days, glr_pred_fifteen_days = evaluate_final_model(
    fifteen_days_df, "fifteen_days", glr, GLR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
glr_res_one_month, glr_pred_one_month = evaluate_final_model(
    one_month_df, "one_month", glr, GLR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
glr_res_three_months, glr_pred_three_months = evaluate_final_model(
    three_months_df, "three_months", glr, GLR_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)

In [None]:
# Random forest regressor: evaluate the final model on each test mini-set
rf_res_one_week, rf_pred_one_week = evaluate_final_model(
    one_week_df, "one_week", rf, RF_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
rf_res_fifteen_days, rf_pred_fifteen_days = evaluate_final_model(
    fifteen_days_df, "fifteen_days", rf, RF_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
rf_res_one_month, rf_pred_one_month = evaluate_final_model(
    one_month_df, "one_month", rf, RF_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
rf_res_three_months, rf_pred_three_months = evaluate_final_model(
    three_months_df, "three_months", rf, RF_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)

In [None]:
# Gradient boosting tree regressor: evaluate the final model on each test mini-set
gbt_res_one_week, gbt_pred_one_week = evaluate_final_model(
    one_week_df, "one_week", gbt, GBT_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
gbt_res_fifteen_days, gbt_pred_fifteen_days = evaluate_final_model(
    fifteen_days_df, "fifteen_days", gbt, GBT_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
gbt_res_one_month, gbt_pred_one_month = evaluate_final_model(
    one_month_df, "one_month", gbt, GBT_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)
gbt_res_three_months, gbt_pred_three_months = evaluate_final_model(
    three_months_df, "three_months", gbt, GBT_MODEL_NAME,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL,
)

Py4JJavaError: ignored

# Models comparison

In [None]:
def show_results(df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions, title):
  """Plot the actual next market price together with the predictions of the four models.

  Args:
      df: pandas DataFrame with the 'timestamp' and 'next-market-price' columns.
      lr_predictions: pandas DataFrame with the 'timestamp' and 'prediction' columns (Linear Regression).
      glr_predictions: Same layout, Generalized Linear Regression predictions.
      rf_predictions: Same layout, Random Forest Regressor predictions.
      gbt_predictions: Same layout, GBTRegressor predictions.
      title: Prefix of the figure title / file name (" predictions" is appended).
  """
  # Ground-truth line
  data = [
      go.Scatter(
          x = df['timestamp'],
          y = df['next-market-price'].astype(float),
          mode = 'lines',
          name = 'Next Market price (usd)'
      )
  ]

  # One line per model, in the order in which they appear in the legend
  model_predictions = [
      (lr_predictions, 'Linear Regression predictions'),
      (glr_predictions, 'Generalized Linear Regression predictions'),
      (rf_predictions, 'Random Forest Regressor predictions'),
      (gbt_predictions, 'GBTRegressor predictions'),
  ]
  for predictions, trace_name in model_predictions:
    data.append(
        go.Scatter(
            x = predictions['timestamp'],
            y = predictions['prediction'].astype(float),
            mode = 'lines',
            name = trace_name
        )
    )

  layout = dict(
      title=title + " predictions",
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  # Change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  fig = dict(data=data, layout=layout)
  iplot(fig, filename = title + " predictions")

In [None]:
# Displaying results for one week
# (pd.concat already returns a DataFrame, no extra constructor call is needed)
one_week_comparison_lst_df = pd.concat(
    [lr_res_one_week, glr_res_one_week, rf_res_one_week, gbt_res_one_week],
    ignore_index=True,
)
one_week_comparison_lst_df

In [None]:
# Plot actual vs. predicted prices for the one-week mini-set
show_results(
    one_week_df.toPandas(),
    lr_pred_one_week.toPandas(),
    glr_pred_one_week.toPandas(),
    rf_pred_one_week.toPandas(),
    gbt_pred_one_week.toPandas(),
    "One week",
)

In [None]:
# Displaying results for fifteen days
# (pd.concat already returns a DataFrame, no extra constructor call is needed)
fifteen_days_comparison_lst_df = pd.concat(
    [lr_res_fifteen_days, glr_res_fifteen_days, rf_res_fifteen_days, gbt_res_fifteen_days],
    ignore_index=True,
)
fifteen_days_comparison_lst_df

In [None]:
# Plot actual vs. predicted prices for the fifteen-day mini-set
show_results(
    fifteen_days_df.toPandas(),
    lr_pred_fifteen_days.toPandas(),
    glr_pred_fifteen_days.toPandas(),
    rf_pred_fifteen_days.toPandas(),
    gbt_pred_fifteen_days.toPandas(),
    "Fifteen days",
)

In [None]:
# Displaying results for one month
# (pd.concat already returns a DataFrame, no extra constructor call is needed)
one_month_comparison_lst_df = pd.concat(
    [lr_res_one_month, glr_res_one_month, rf_res_one_month, gbt_res_one_month],
    ignore_index=True,
)
one_month_comparison_lst_df

In [None]:
# Plot actual vs. predicted prices for the one-month mini-set
show_results(
    one_month_df.toPandas(),
    lr_pred_one_month.toPandas(),
    glr_pred_one_month.toPandas(),
    rf_pred_one_month.toPandas(),
    gbt_pred_one_month.toPandas(),
    "One month",
)

In [None]:
# Displaying results for three months
# (pd.concat already returns a DataFrame, no extra constructor call is needed)
three_months_comparison_lst_df = pd.concat(
    [lr_res_three_months, glr_res_three_months, rf_res_three_months, gbt_res_three_months],
    ignore_index=True,
)
three_months_comparison_lst_df

In [None]:
# Plot actual vs. predicted prices for the three-month mini-set
show_results(
    three_months_df.toPandas(),
    lr_pred_three_months.toPandas(),
    glr_pred_three_months.toPandas(),
    rf_pred_three_months.toPandas(),
    gbt_pred_three_months.toPandas(),
    "Three months",
)

# Summary

In [None]:
# Stack the comparison tables of all mini-sets into the final results table
final_model_results = pd.concat(
    [
        one_week_comparison_lst_df,
        fifteen_days_comparison_lst_df,
        one_month_comparison_lst_df,
        three_months_comparison_lst_df,
    ],
    ignore_index=True,
)
final_model_results

In [None]:
# RMSE of every final result, grouped by model
scatter_plot(
    final_model_results,
    "RMSE",
    "Model",
    "RMSE value for each final model",
)

In [None]:
def scatter_plot(dataset, x_axis, y_axis, legend, title=None):
  """Show a scatter plot coloured by a legend column.

  This definition replaces the 4-argument scatter_plot defined earlier in the
  notebook. To keep the earlier call cells working when they are re-run after
  this cell, the old call form scatter_plot(dataset, x_axis, y_axis, title) is
  still accepted: when `title` is omitted, the 4th argument is used as the
  title and the plot is drawn as the earlier definition did (coloured by
  `y_axis`, y tick labels and y axis title hidden).

  Args:
      dataset: Data passed to px.scatter.
      x_axis: Column plotted on the x axis.
      y_axis: Column plotted on the y axis.
      legend: Column used to colour the points (or the title, in the old call form).
      title: Title of the figure; omit it to use the old 4-argument call form.
  """
  if title is None:
    # Old call form: the 4th positional argument is the title
    fig = px.scatter(dataset, x=x_axis, y=y_axis, color=y_axis)
    fig.update_layout(title=legend)
    fig.update_yaxes(showticklabels=False)
    fig.update_yaxes(title='')
  else:
    fig = px.scatter(dataset, x=x_axis, y=y_axis, color=legend)
    fig.update_layout(title=title)
  fig.show()

In [None]:
# RMSE per dataset window, coloured by model
scatter_plot(
    final_model_results,
    "RMSE",
    "Dataset",
    "Model",
    "RMSE value for each model (and dataset)",
)

In [None]:
# Write the final results table to Google Drive as CSV, without the index column
final_model_results.to_csv(path_or_buf=GDRIVE_FINAL_RESULTS, index=False)