# **Bitcoin price prediction - Final scores**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: displays the final scores and makes predictions on the test set with the models trained on the whole train / validation set.

# Global constants, dependencies, libraries and tools

In [1]:
# Execution switches
LOCAL_RUNNING = True  # True: run on the local machine, False: run on Google Colab
# Decide whether or not to use operations that might slow down notebook execution
SLOW_OPERATIONS = True

# Root directory: the local repository, or the Google Drive mount point on Colab
if LOCAL_RUNNING:
    MAIN_DIR = "D:/Documents/Repository/BDC/project"
else:
    MAIN_DIR = "/content/drive"

In [2]:
if not LOCAL_RUNNING: 
    # Running on Colaboratory: import the Google Drive helper
    from google.colab import drive

    # Mount Google Drive at MAIN_DIR, forcing a remount
    drive.mount(MAIN_DIR, force_remount=True)

In [13]:
# Set main dir: on Colab the project lives in a sub-folder of the mounted drive,
# while locally MAIN_DIR already points to the project.
# The suffix check keeps this cell idempotent, so running it more than once
# does not append the sub-folder a second time.
PROJECT_SUBDIR = "/MyDrive/BDC/project"
if not LOCAL_RUNNING and not MAIN_DIR.endswith(PROJECT_SUBDIR):
    MAIN_DIR = MAIN_DIR + PROJECT_SUBDIR

###################
# --- DATASET --- #
###################

# Directory containing the output datasets
DATASET_OUTPUT_DIR = f"{MAIN_DIR}/datasets/output"

# Name of the test set (file name without extension)
DATASET_TEST_NAME = "bitcoin_blockchain_data_15min_test"

# Path of the test set parquet file
DATASET_TEST = f"{DATASET_OUTPUT_DIR}/{DATASET_TEST_NAME}.parquet"

####################
# --- FEATURES --- #
####################

# Directory containing the features files
FEATURES_DIR = f"{MAIN_DIR}/features"

# Labels and settings related to the features
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"
CHOSEN_FEATURES_LABEL = "most_corr_features"
FEATURES_NORMALIZATION = True

# Path of the JSON file with the chosen features
CHOSEN_FEATURES = f"{FEATURES_DIR}/{CHOSEN_FEATURES_LABEL}.json"

##################
# --- MODELS --- #
##################

# Names of the models
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBT_MODEL_NAME = "GradientBoostingTreeRegressor"

# Directory containing the models
MODELS_DIR = f"{MAIN_DIR}/models"

# Path of each model
LR_MODEL = f"{MODELS_DIR}/{LR_MODEL_NAME}"
GLR_MODEL = f"{MODELS_DIR}/{GLR_MODEL_NAME}"
RF_MODEL = f"{MODELS_DIR}/{RF_MODEL_NAME}"
GBT_MODEL = f"{MODELS_DIR}/{GBT_MODEL_NAME}"

###################
# --- RESULTS --- #
###################

# Names of the splitting methods (also used as results sub-folders)
BLOCK_SPLITS_NAME = "block_splits"
WALK_FORWARD_SPLITS_NAME = "walk_forward_splits"
SHORT_TERM_SPLITS_NAME = "single_split"

# Directory containing the results
RESULTS_DIR = f"{MAIN_DIR}/results"

#####################
# --- UTILITIES --- #
#####################

# Directory containing the utility modules
UTILITIES_DIR = f"{MAIN_DIR}/utilities"

In [4]:
# Suppression of warnings for better reading
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
if not LOCAL_RUNNING:
    # Install Spark and related dependencies.
    # The lines starting with "!" are shell commands run through the notebook,
    # so this cell is only valid inside IPython / Colab, not as plain Python.
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    # Headless OpenJDK 8 - presumably the Java runtime needed by Spark
    !apt install openjdk-8-jdk-headless -qq

# Import files

In [6]:
# Import my files
import sys
sys.path.append(UTILITIES_DIR)

from imports import *
import utilities

importlib.reload(utilities)

<module 'utilities' from 'D:\\Documents/Repository/BDC/project/utilities\\utilities.py'>

# Create the pyspark session

In [7]:
# Spark configuration: local master using all the available cores
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '109G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
)

# Create the context, reusing the active one when this cell is run again:
# instantiating a second SparkContext in the same process raises an error.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or retrieve) the session bound to that context
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [8]:
# Load the test set into a PySpark DataFrame.
# Parquet files carry their own schema, so no CSV-style reader options
# (sep, inferSchema, header) are needed.
df = spark.read.parquet(DATASET_TEST)

In [9]:
def dataset_info(dataset):
    """Print a preview, the shape and the schema of a dataset.

    Args:
        dataset: Dataset to inspect; it must provide show(), count(),
            columns and printSchema().
    """
    # Preview of the first three rows
    dataset.show(3)

    # Shape as (number of rows, number of columns)
    shape = (dataset.count(), len(dataset.columns))
    print("Shape:", shape)

    # Columns and their types
    dataset.printSchema()

In [10]:
if SLOW_OPERATIONS:
  dataset_info(df)

+-------------------+------+------------------+------------------+--------------------+-------------------+-----------------+------------------+--------------------+------------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp|    id|      market-price|    total-bitcoins|          market-cap|       trade-volume|      blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|           hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|n-transactions|estimated-transaction-volume-usd|     rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-------------

# Split test set
The test set is divided into further mini-sets of 1 week, 15 days, 1 month and 3 months to see how the models' performance degrades as time increases.

In [11]:
# Retrieve the earliest timestamp of the test set.
# Sorting explicitly avoids relying on the physical order of the rows,
# which is what first() alone would return.
first_timestamp = df.orderBy(col("timestamp")).select(col("timestamp")).first()[0]

# Split the test set into cumulative mini-sets covering the first
# 1 week, 15 days, 1 month and 3 months (each one contains the previous ones)
one_week_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=3))

# ❗Load previous results

In [14]:
# Load models results
splits_list = [BLOCK_SPLITS_NAME, WALK_FORWARD_SPLITS_NAME, SHORT_TERM_SPLITS_NAME]
models_list = [LR_MODEL_NAME, GLR_MODEL_NAME, RF_MODEL_NAME, GBT_MODEL_NAME]

# Empty frames fixing the expected columns: they seed the concatenations
results_frames = [pd.DataFrame(columns=['Model', 'Type', 'Splitting', 'Features', 'Parameters', 'RMSE', 'MSE', 'MAE', 'MAPE', 'R2', 'Adjusted_R2', 'Time'])]
accuracy_frames = [pd.DataFrame(columns=['Model', 'Features', 'Splitting', 'Accuracy'])]

# Every splitting method uses the same file layout:
#   <RESULTS_DIR>/<split>/<model>.csv and <RESULTS_DIR>/<split>/<model>_accuracy.csv
# so a single code path handles all of them.
for split in splits_list:
    for model in models_list:
        base_path = RESULTS_DIR + "/" + split + "/" + model
        results_frames.append(pd.read_csv(base_path + ".csv"))
        accuracy_frames.append(pd.read_csv(base_path + "_accuracy.csv"))

# Concatenate once at the end instead of growing the frames inside the loop
final_results = pd.concat(results_frames, ignore_index=True)
final_accuracy = pd.concat(accuracy_frames, ignore_index=True)


In [15]:
def scatter_plot(dataset, x_axis, y_axis, title):
    """Show a scatter plot of y_axis against x_axis, colored by y_axis.

    Args:
        dataset: Data to plot
        x_axis: Name of the column shown on the x axis
        y_axis: Name of the column shown on the y axis (also used for the colors)
        title: Title of the figure
    """
    figure = px.scatter(dataset, x=x_axis, y=y_axis, color=y_axis)
    figure.update_layout(title=title)
    # The y axis is drawn without tick labels and without title
    figure.update_yaxes(showticklabels=False, title='')
    figure.show()

In [16]:
def adv_scatter_plot(dataset, x_axis, y_axis, legend, title):
    """Show a scatter plot of y_axis against x_axis, colored by a third column.

    Args:
        dataset: Data to plot
        x_axis: Name of the column shown on the x axis
        y_axis: Name of the column shown on the y axis
        legend: Name of the column used to color the points
        title: Title of the figure
    """
    figure = px.scatter(dataset, x=x_axis, y=y_axis, color=legend)
    figure.update_layout(title=title).show()

In [17]:
scatter_plot(final_results, "RMSE", "Model", "RMSE value for each model")

In [18]:
adv_scatter_plot(final_results, "RMSE", "Model", "Splitting", "RMSE value for each model (and splitting method)")

In [21]:
scatter_plot(final_accuracy, "Accuracy", "Model", "Accuracy for each model")

In [20]:
adv_scatter_plot(final_accuracy, "Accuracy", "Model", "Splitting", "Accuracy for each model (and splitting method)")

# ❗Test models
For each model, predictions on the various mini-sets are made and the obtained results are compared.


In [None]:
def evaluate_final_model(dataset, dataset_name, model, model_name, features_normalization, features, features_label, target_label):
    '''
    Description: Evaluate final model by making predictions on the test set
    Args:
        dataset: The dataset on which predictions are made
        dataset_name: Name of selected dataset [one_week | fifteen_days | one_month | three_months]
        model: Trained model
        model_name: Model name selected
        features_normalization: Indicates whether features should be normalized (True) or not (False)
        features: Features to be used to make predictions
        features_label: The column name of features
        target_label: The column name of target variable
    Return:
        results_pd: Single-row pandas DataFrame with the evaluation metrics and the fit time
        predictions: Predictions obtained from the model (target, "market-price", "prediction" and "timestamp" columns)
    '''
    # Select the type of features to be used
    dataset = utilities.select_features(dataset, features_normalization, features, features_label, target_label)

    # Wrap the model in a Pipeline (it is the only stage)
    pipeline = Pipeline(stages=[model])

    # Fit the pipeline and measure how long the fit takes.
    # NOTE(review): if `model` is an already fitted transformer, Pipeline.fit
    # should not retrain it, so "Time" would not be a training time - confirm.
    start = time.time()
    pipeline_model = pipeline.fit(dataset)
    end = time.time()

    # Make predictions
    predictions = pipeline_model.transform(dataset).select(target_label, "market-price", "prediction", 'timestamp')

    # Compute the error on the predictions by several evaluators
    eval_res = utilities.model_evaluation(target_label, predictions)

    # Use dict to store each result
    results = {
        "Model": model_name,
        "Dataset": dataset_name,
        "RMSE": eval_res['rmse'],
        "MSE": eval_res['mse'],
        "MAE": eval_res['mae'],
        "MAPE": eval_res['mape'],
        "R2": eval_res['r2'],
        "Adjusted_R2": eval_res['adj_r2'],
        "Time": end - start,
    }

    # Transform dict to a single-row pandas DataFrame
    results_pd = pd.DataFrame(results, index=[0])

    return results_pd, predictions

In [None]:
def model_accuracy(dataset):
    '''
    Description: How good the models are at predicting whether the price will go up or down
    Args:
        dataset: Predictions dataset with the columns "market-price", "next-market-price" and "prediction"
    Return:
        accuracy: Percentage of correct predictions (0.0 when the dataset is empty)
    '''
    # Compute the number of total rows in the DataFrame
    total_rows = dataset.count()

    # Nothing to evaluate: avoid a division by zero
    if total_rows == 0:
        return 0.0

    market_price = col("market-price")
    next_market_price = col("next-market-price")
    prediction = col("prediction")

    # A prediction is correct when it moves in the same direction as the actual
    # next price. Rows where a price is exactly equal to the current one satisfy
    # neither condition, so they are never counted as correct.
    both_up = (market_price < next_market_price) & (market_price < prediction)
    both_down = (market_price > next_market_price) & (market_price > prediction)

    # Count the number of correct predictions
    correct_predictions = dataset.filter(both_up | both_down).count()

    # Compute percentage of correct predictions
    accuracy = (correct_predictions / total_rows) * 100

    return accuracy

In [None]:
# Load chosen features.
# The path is rebuilt here rather than read from CHOSEN_FEATURES, because that
# name is rebound just below to the loaded content: this keeps the cell
# re-runnable (a second run would otherwise pass the loaded content to open()).
chosen_features_path = FEATURES_DIR + "/" + CHOSEN_FEATURES_LABEL + ".json"
with open(chosen_features_path, "r") as f:
    CHOSEN_FEATURES = json.load(f)
print(CHOSEN_FEATURES)

In [None]:
# # Compute model accuracy
# accuracy = model_accuracy(final_valid_pred_spark)

# # Saving accuracy data into dataframe
# accuracy_data = {
#     'Model': MODEL_NAME,
#     'Features': CHOSEN_FEATURES_LABEL,
#     'Splitting': SPLITTING_METHOD,
#     'Accuracy': accuracy
# }
# accuracy_data_df = pd.DataFrame(accuracy_data, index=['Model'])

# print(f"Percentage of correct predictions for {MODEL_NAME} with {CHOSEN_FEATURES_LABEL} and {SPLITTING_METHOD}: {accuracy:.2f}%")

In [None]:
# # Load models
# lr = PipelineModel.load(LR_MODEL)
# glr = PipelineModel.load(GLR_MODEL)
# rf = PipelineModel.load(RF_MODEL)
# gbt = PipelineModel.load(GBT_MODEL)

# # Test the final model
# model_name_list = [LR_MODEL_NAME, GLR_MODEL_NAME, RF_MODEL_NAME, GBT_MODEL_NAME]
# model_list = [lr, glr, rf, gbt]
# dataset_list = [one_week_df, fifteen_days_df, one_month_df, three_months_df]
# dataset_name_list = ["one_week", "fifteen_days", "one_month", "three_months"]
# final_test_results
# final_test_accuracy

# for i, model_name in enumerate(model_name_list):
#     for j, dataset_name in enumerate(dataset_name_list):
#         if model_name == LR_MODEL_NAME:
#             results, predictions = evaluate_final_model(dataset_list[j], dataset_name, models_list[i], model_name, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
#             final_test_results = pd.concat([final_test_results, results], ignore_index=True)
#             final_test_accuracy
#             final_results = pd.concat([final_results, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + ".csv")], ignore_index=True)

#         elif model == GLR_MODEL_NAME:
#         elif model == RF_MODEL_NAME:
#         elif model == GBT_MODEL_NAME:

In [None]:
# Linear regression
# Load the saved model first: `lr` is not defined by any executed cell.
# NOTE(review): assumes the model was saved as a PipelineModel - confirm against the notebook that saved it
from pyspark.ml import PipelineModel
lr = PipelineModel.load(LR_MODEL)

# Evaluate the model on each mini-set of the test set
lr_res_one_week, lr_pred_one_week = evaluate_final_model(one_week_df, "one_week", lr, LR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
lr_res_fifteen_days, lr_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", lr, LR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
lr_res_one_month, lr_pred_one_month = evaluate_final_model(one_month_df, "one_month", lr, LR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
lr_res_three_months, lr_pred_three_months = evaluate_final_model(three_months_df, "three_months", lr, LR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [None]:
# Generalized linear regression
# Load the saved model first: `glr` is not defined by any executed cell.
# NOTE(review): assumes the model was saved as a PipelineModel - confirm against the notebook that saved it
from pyspark.ml import PipelineModel
glr = PipelineModel.load(GLR_MODEL)

# Evaluate the model on each mini-set of the test set
glr_res_one_week, glr_pred_one_week = evaluate_final_model(one_week_df, "one_week", glr, GLR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
glr_res_fifteen_days, glr_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", glr, GLR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
glr_res_one_month, glr_pred_one_month = evaluate_final_model(one_month_df, "one_month", glr, GLR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
glr_res_three_months, glr_pred_three_months = evaluate_final_model(three_months_df, "three_months", glr, GLR_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [None]:
# Random forest regressor
# Load the saved model first: `rf` is not defined by any executed cell.
# NOTE(review): assumes the model was saved as a PipelineModel - confirm against the notebook that saved it
from pyspark.ml import PipelineModel
rf = PipelineModel.load(RF_MODEL)

# Evaluate the model on each mini-set of the test set
rf_res_one_week, rf_pred_one_week = evaluate_final_model(one_week_df, "one_week", rf, RF_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
rf_res_fifteen_days, rf_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", rf, RF_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
rf_res_one_month, rf_pred_one_month = evaluate_final_model(one_month_df, "one_month", rf, RF_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
rf_res_three_months, rf_pred_three_months = evaluate_final_model(three_months_df, "three_months", rf, RF_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [None]:
# Gradient boosting tree regressor
# Load the saved model first: `gbt` is not defined by any executed cell.
# NOTE(review): assumes the model was saved as a PipelineModel - confirm against the notebook that saved it
from pyspark.ml import PipelineModel
gbt = PipelineModel.load(GBT_MODEL)

# Evaluate the model on each mini-set of the test set
gbt_res_one_week, gbt_pred_one_week = evaluate_final_model(one_week_df, "one_week", gbt, GBT_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
gbt_res_fifteen_days, gbt_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", gbt, GBT_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
gbt_res_one_month, gbt_pred_one_month = evaluate_final_model(one_month_df, "one_month", gbt, GBT_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
gbt_res_three_months, gbt_pred_three_months = evaluate_final_model(three_months_df, "three_months", gbt, GBT_MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

# ❗Models comparison

In [None]:
def show_results(df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions, title):
    '''
    Description: Plot the actual next market price together with the predictions of each model
    Args:
        df: Dataset with the columns "timestamp" and "next-market-price"
        lr_predictions: Linear regression predictions ("timestamp" and "prediction" columns)
        glr_predictions: Generalized linear regression predictions ("timestamp" and "prediction" columns)
        rf_predictions: Random forest regressor predictions ("timestamp" and "prediction" columns)
        gbt_predictions: Gradient boosting tree regressor predictions ("timestamp" and "prediction" columns)
        title: Prefix of the figure title (" predictions" is appended)
    Return:
        None: The figure is displayed through iplot
    '''
    def _line_trace(frame, value_column, name):
        # Build one line trace: timestamps on x, the given column (as float) on y
        return go.Scatter(
            x=frame['timestamp'],
            y=frame[value_column].astype(float),
            mode='lines',
            name=name
        )

    # One trace for the actual values, then one per model (each model exactly once)
    data = [
        _line_trace(df, 'next-market-price', 'Next Market price (usd)'),
        _line_trace(lr_predictions, 'prediction', 'Linear Regression predictions'),
        _line_trace(glr_predictions, 'prediction', 'Generalized Linear Regression predictions'),
        _line_trace(rf_predictions, 'prediction', 'Random Forest Regressor predictions'),
        _line_trace(gbt_predictions, 'prediction', 'GBTRegressor predictions'),
    ]

    # Range selector buttons: (number of months, label)
    # Change the counts to the desired amount of months.
    month_ranges = [(1, '1m'), (6, '6m'), (12, '1y'), (36, '3y')]
    buttons = [
        dict(count=count, label=label, step='month', stepmode='backward')
        for count, label in month_ranges
    ]
    buttons.append(dict(step='all'))

    layout = dict(
        title=title + " predictions",
        xaxis=dict(
            rangeselector=dict(buttons=buttons),
            rangeslider=dict(visible=True),
            type='date'
        )
    )

    fig = dict(data=data, layout=layout)
    iplot(fig, filename=title + " predictions")

In [None]:
# Displaying results for one week
one_week_comparison_lst_df = pd.DataFrame(pd.concat([lr_res_one_week, glr_res_one_week, rf_res_one_week, gbt_res_one_week], ignore_index=True))
one_week_comparison_lst_df

In [None]:
show_results(one_week_df.toPandas(), lr_pred_one_week.toPandas(), glr_pred_one_week.toPandas(), rf_pred_one_week.toPandas(), gbt_pred_one_week.toPandas(), "One week")

In [None]:
# Displaying results for fifteen days
fifteen_days_comparison_lst_df = pd.DataFrame(pd.concat([lr_res_fifteen_days, glr_res_fifteen_days, rf_res_fifteen_days, gbt_res_fifteen_days], ignore_index=True))
fifteen_days_comparison_lst_df

In [None]:
show_results(fifteen_days_df.toPandas(), lr_pred_fifteen_days.toPandas(), glr_pred_fifteen_days.toPandas(), rf_pred_fifteen_days.toPandas(), gbt_pred_fifteen_days.toPandas(), "Fifteen days")

In [None]:
# Displaying results for one month
one_month_comparison_lst_df = pd.DataFrame(pd.concat([lr_res_one_month, glr_res_one_month, rf_res_one_month, gbt_res_one_month], ignore_index=True))
one_month_comparison_lst_df

In [None]:
show_results(one_month_df.toPandas(), lr_pred_one_month.toPandas(), glr_pred_one_month.toPandas(), rf_pred_one_month.toPandas(), gbt_pred_one_month.toPandas(), "One month")

In [None]:
# Displaying results for three months
three_months_comparison_lst_df = pd.DataFrame(pd.concat([lr_res_three_months, glr_res_three_months, rf_res_three_months, gbt_res_three_months], ignore_index=True))
three_months_comparison_lst_df

In [None]:
show_results(three_months_df.toPandas(), lr_pred_three_months.toPandas(), glr_pred_three_months.toPandas(), rf_pred_three_months.toPandas(), gbt_pred_three_months.toPandas(), "Three months")

# ❗Summary

In [None]:
# Displaying final results
final_model_results = pd.concat([one_week_comparison_lst_df, fifteen_days_comparison_lst_df, one_month_comparison_lst_df, three_months_comparison_lst_df], ignore_index=True)
final_model_results

In [None]:
scatter_plot(final_model_results, "RMSE", "Model", "RMSE value for each final model")

In [None]:
adv_scatter_plot(final_model_results, "RMSE", "Dataset", "Model", "RMSE value for each model (and dataset)")

In [None]:
# Saving final model results
# NOTE(review): FINAL_RESULTS is not defined among the notebook constants, so it
# is defined here; the file name is a proposed default - adjust it if needed.
FINAL_RESULTS = RESULTS_DIR + "/final_model_results.csv"
final_model_results.to_csv(FINAL_RESULTS, index=False)