# **Bitcoin price prediction - Final scores**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: this notebook displays the final scores and makes predictions on the test set using the models trained on the whole train / validation set.

# Global constants, dependencies, libraries and tools

In [None]:
# Main constants
# Execution environment switch: True = local machine, False = Google Colab
LOCAL_RUNNING = False

# When True, also run the optional operations that might slow down notebook execution
SLOW_OPERATIONS = True

# Project root when running locally, Google Drive mount point otherwise
if LOCAL_RUNNING:
    ROOT_DIR = "D:/Documents/Repository/BDC/project"
else:
    ROOT_DIR = "/content/drive"

In [None]:
if not LOCAL_RUNNING:
    # Running on Colaboratory: the Google Drive helper is only importable there
    from google.colab import drive

    # Mount Google Drive at ROOT_DIR, forcing a remount on every run of this cell
    drive.mount(ROOT_DIR, force_remount=True)

Mounted at /content/drive


In [None]:
# Set main dir
# Locally the root is used as is; on Colab the project lives inside the mounted drive
MAIN_DIR = ROOT_DIR if LOCAL_RUNNING else f"{ROOT_DIR}/MyDrive/BDC/project"

###################
# --- DATASET --- #
###################

# Datasets dirs
DATASET_OUTPUT_DIR = f"{MAIN_DIR}/datasets/output"

# Datasets names
DATASET_TEST_NAME = "bitcoin_blockchain_data_15min_test"

# Datasets paths
DATASET_TEST = f"{DATASET_OUTPUT_DIR}/{DATASET_TEST_NAME}.parquet"

####################
# --- FEATURES --- #
####################

# Features dir
FEATURES_DIR = f"{MAIN_DIR}/features"

# Features labels
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"
CHOSEN_FEATURES_LABEL = "most_rel_features"
FEATURES_NORMALIZATION = True

# Features paths (JSON file holding the chosen feature names)
CHOSEN_FEATURES = f"{FEATURES_DIR}/{CHOSEN_FEATURES_LABEL}.json"

##################
# --- MODELS --- #
##################

# Model names
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBT_MODEL_NAME = "GradientBoostingTreeRegressor"

# Model dir
MODELS_DIR = f"{MAIN_DIR}/models"

# Model path (one directory per model, named after the model)
LR_MODEL = f"{MODELS_DIR}/{LR_MODEL_NAME}"
GLR_MODEL = f"{MODELS_DIR}/{GLR_MODEL_NAME}"
RF_MODEL = f"{MODELS_DIR}/{RF_MODEL_NAME}"
GBT_MODEL = f"{MODELS_DIR}/{GBT_MODEL_NAME}"

###################
# --- RESULTS --- #
###################

# splits names
BLOCK_SPLITS_NAME = "block_splits"
WALK_FORWARD_SPLITS_NAME = "walk_forward_splits"
SHORT_TERM_SPLITS_NAME = "single_split"

# Results dir
RESULTS_DIR = f"{MAIN_DIR}/results"
RESULTS_FINAL_DIR = f"{RESULTS_DIR}/final"

#####################
# --- UTILITIES --- #
#####################

# Utilities dir
UTILITIES_DIR = f"{MAIN_DIR}/utilities"

In [None]:
# Suppression of warnings for better reading
# NOTE(review): this filter is global — every FutureWarning raised by any library
# used in the rest of the notebook is silenced, not only the noisy ones.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
if not LOCAL_RUNNING:
    # Install Spark and related dependencies
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=288bc3178e4bc8433087e2c479d2dbe1a5511481b4b4a497e9d2bbdc5d2656d3
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  fonts-wqy-zenhei fonts-indi

# Import files

In [None]:
# Import my files
import sys
# Make the project's utilities directory importable
sys.path.append(UTILITIES_DIR)

# NOTE(review): the star import is presumably what brings importlib, pyspark, SparkConf,
# SparkSession, PipelineModel, pd, px, go, iplot, col, lit, json, relativedelta, ...
# into scope for the cells below — verify against utilities/imports.py
from imports import *
import utilities

# Reload the module so that edits to utilities.py are picked up without restarting the kernel
importlib.reload(utilities)

<module 'utilities' from '/content/drive/MyDrive/BDC/project/utilities/utilities.py'>

# Create the pyspark session

In [None]:
# Create the session configuration
# NOTE(review): spark.driver.maxResultSize (109G) is far above the 12G driver memory —
# confirm this value is intended.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '109G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
)

# Create the context, or reuse the running one: calling the SparkContext constructor
# a second time (e.g. when this cell is re-run) raises an error, getOrCreate does not.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [None]:
# Load the test set into a pyspark DataFrame.
# Parquet files carry their own schema, so the CSV-oriented options
# (sep / inferSchema / header) are not needed here.
df = spark.read.parquet(DATASET_TEST)

In [None]:
def dataset_info(dataset):
    """Print a quick overview of a dataset: preview, shape and schema.

    Args:
        dataset: object exposing show(), count(), columns and printSchema()
                 (a pyspark DataFrame in this notebook)
    """
    # Preview of the first 3 rows
    dataset.show(3)

    # Shape as (number of rows, number of columns)
    shape = (dataset.count(), len(dataset.columns))
    print("Shape:", shape)

    # Column names and types
    dataset.printSchema()

In [None]:
# Optional overview of the test set (dataset_info counts every row); skipped when SLOW_OPERATIONS is False
if SLOW_OPERATIONS:
  dataset_info(df)

+-------------------+------+------------------+------------------+--------------------+-------------------+-----------------+------------------+--------------------+------------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp|    id|      market-price|    total-bitcoins|          market-cap|       trade-volume|      blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|           hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|n-transactions|estimated-transaction-volume-usd|     rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-------------

# Split test set
The test set is divided into further mini-sets of 1 week, 15 days, 1 month and 3 months to see how the models' performance degrades as time increases.

In [None]:
# Retrieve the earliest timestamp of the test set.
# An explicit min() is used instead of first(): without an ordering, the first row
# returned by Spark is not guaranteed to be the oldest one.
first_timestamp = df.agg({"timestamp": "min"}).first()[0]

# Split the test set into mini-sets of 1 week, 15 days, 1 month, and 3 months.
# Every mini-set starts at the beginning of the test set and includes its upper bound,
# so each one is contained in the next.
one_week_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=3))

# Load previous results

In [None]:
# Load models results
splits_list = [BLOCK_SPLITS_NAME, WALK_FORWARD_SPLITS_NAME, SHORT_TERM_SPLITS_NAME]
models_list = [LR_MODEL_NAME, GLR_MODEL_NAME, RF_MODEL_NAME, GBT_MODEL_NAME]

# Empty frames that fix the column order of the concatenated results
train_valid_results = pd.DataFrame(columns=['Model', 'Type', 'Splitting', 'Features', 'Parameters', 'RMSE', 'MSE', 'MAE', 'MAPE', 'R2', 'Adjusted_R2', 'Time'])
train_valid_accuracy = pd.DataFrame(columns=['Model', 'Features', 'Splitting', 'Accuracy'])

# Every splitting method uses the same file layout:
#   <RESULTS_DIR>/<split>/<model>.csv and <RESULTS_DIR>/<split>/<model>_accuracy.csv
# The frames are collected first and concatenated once, instead of growing
# the DataFrames on every iteration.
results_frames = []
accuracy_frames = []
for split in splits_list:
    for model in models_list:
        results_frames.append(pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + ".csv"))
        accuracy_frames.append(pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_accuracy.csv"))

train_valid_results = pd.concat([train_valid_results] + results_frames, ignore_index=True)
train_valid_accuracy = pd.concat([train_valid_accuracy] + accuracy_frames, ignore_index=True)


In [None]:
def scatter_plot(dataset, x_axis, y_axis, title):
    """Show a scatter plot whose points are coloured by the y-axis column.

    Args:
        dataset: data to plot
        x_axis: name of the column used for the x axis
        y_axis: name of the column used for the y axis and for the colours
        title: title of the figure
    """
    fig = px.scatter(dataset, x=x_axis, y=y_axis, color=y_axis)
    fig.update_layout(title=title)
    # The y axis carries no tick labels and no title
    fig.update_yaxes(showticklabels=False, title='')
    fig.show()

In [None]:
def adv_scatter_plot(dataset, x_axis, y_axis, legend, title):
    """Show a scatter plot whose points are coloured by a separate legend column.

    Args:
        dataset: data to plot
        x_axis: name of the column used for the x axis
        y_axis: name of the column used for the y axis
        legend: name of the column used to colour the points
        title: title of the figure
    """
    figure = px.scatter(dataset, x=x_axis, y=y_axis, color=legend)
    figure.update_layout(title=title).show()

In [None]:
scatter_plot(train_valid_results, "RMSE", "Model", "RMSE value for each model")

In [None]:
adv_scatter_plot(train_valid_results, "RMSE", "Model", "Splitting", "RMSE value for each model (and splitting method)")

In [None]:
scatter_plot(train_valid_accuracy, "Accuracy", "Model", "Accuracy for each model")

In [None]:
adv_scatter_plot(train_valid_accuracy, "Accuracy", "Model", "Splitting", "Accuracy for each model (and splitting method)")

# Test models
For each model, predictions on the various mini-sets are made and the obtained results are compared.


In [None]:
def evaluate_final_model(dataset, dataset_name, model, model_name, features_normalization, features, features_name, features_label, target_label):
    '''
    Description: Evaluate final model by making predictions on the test set
    Args:
        dataset: The dataset on which predictions are made
        dataset_name: Name of selected dataset [one_week | fifteen_days | one_month | three_months]
        model: Trained model
        model_name: Model name selected
        features_normalization: Indicates whether features should be normalized (True) or not (False)
        features: Features to be used to make predictions
        features_name: Name of features used
        features_label: The column name of features
        target_label: The column name of target variable
    Return:
        results_pd: One-row pandas DataFrame with the metrics obtained from the evaluation
        predictions: Predictions obtained from the model (target, "market-price", "prediction" and "timestamp" columns)
    '''
    # Select the type of features to be used
    # NOTE(review): the "market-price" and "timestamp" columns selected below must
    # survive this step — confirm against utilities.select_features
    dataset = utilities.select_features(dataset, features_normalization, features, features_label, target_label)

    # Make predictions, keeping only the columns needed for evaluation and plotting
    predictions = model.transform(dataset).select(target_label, "market-price", "prediction", 'timestamp')

    # Compute the test error by several evaluators
    eval_res = utilities.model_evaluation(target_label, predictions)

    # Use dict to store each result
    results = {
        "Model": model_name,
        "Dataset": dataset_name,
        "Features": features_name,
        "RMSE": eval_res['rmse'],
        "MSE": eval_res['mse'],
        "MAE": eval_res['mae'],
        "MAPE": eval_res['mape'],
        "R2": eval_res['r2'],
        "Adjusted_R2": eval_res['adj_r2'],
    }

    # Transform dict to a one-row pandas DataFrame
    results_pd = pd.DataFrame(results, index=[0])

    return results_pd, predictions

In [None]:
def model_accuracy(dataset):
    '''
    Description: How good the models are at predicting whether the price will go up or down
    Args:
        dataset: Predictions with the "market-price", "next-market-price" and "prediction" columns
    Return:
        accuracy: Percentage of rows where the predicted direction matches the real one
                  (NaN when the dataset is empty)
    '''
    # Compute the number of total rows in the DataFrame.
    total_rows = dataset.count()

    # No rows means no meaningful percentage (and would otherwise divide by zero)
    if total_rows == 0:
        return float("nan")

    # A prediction is correct when the predicted and the real next price lie on the same
    # side of the current price. The comparisons are strict, so a row whose real or
    # predicted price equals the current one is counted as incorrect.
    price_up = (col("market-price") < col("next-market-price")) & (col("market-price") < col("prediction"))
    price_down = (col("market-price") > col("next-market-price")) & (col("market-price") > col("prediction"))

    # Count the number of correct predictions
    correct_predictions = dataset.filter(price_up | price_down).count()

    # Compute percentage of correct predictions
    accuracy = (correct_predictions / total_rows) * 100

    return accuracy

In [None]:
# Load chosen features
# CHOSEN_FEATURES first holds the path of the JSON file and is then replaced by the
# loaded list of feature names. The guard keeps this cell re-runnable: on a second
# run the name already holds the list and must not be passed to open() again.
if isinstance(CHOSEN_FEATURES, str):
    with open(CHOSEN_FEATURES, "r") as f:
        CHOSEN_FEATURES = json.load(f)
print(CHOSEN_FEATURES)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days', 'trade-volume']


In [None]:
# Load models
# One saved pipeline per model, read from MODELS_DIR/<model name>
# (presumably written by the training notebooks — verify)
lr = PipelineModel.load(LR_MODEL)
glr = PipelineModel.load(GLR_MODEL)
rf = PipelineModel.load(RF_MODEL)
gbt = PipelineModel.load(GBT_MODEL)

In [None]:
# Test models
model_name_list = [LR_MODEL_NAME, GLR_MODEL_NAME, RF_MODEL_NAME, GBT_MODEL_NAME]
model_list = [lr, glr, rf, gbt]
dataset_list = [one_week_df, fifteen_days_df, one_month_df, three_months_df]
dataset_name_list = ["one_week", "fifteen_days", "one_month", "three_months"]

# Empty frames that fix the column order of the final tables
predictions_df = pd.DataFrame(columns=[TARGET_LABEL, "market-price", "prediction", 'timestamp'])
test_results = pd.DataFrame(columns=['Model', 'Dataset', 'Features', 'RMSE', 'MSE', 'MAE', 'MAPE', 'R2', 'Adjusted_R2'])
test_accuracy = pd.DataFrame(columns=['Model', 'Features', 'Dataset', 'Accuracy'])

# Partial outputs are collected in lists and concatenated once after the loop,
# instead of growing the DataFrames on every iteration
results_frames = []
predictions_frames = []
accuracy_records = []

# For each model makes predictions based on the dataset type
for model_name, model in zip(model_name_list, model_list):
    for dataset_name, dataset in zip(dataset_name_list, dataset_list):
        results, predictions = evaluate_final_model(dataset, dataset_name, model, model_name, FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL, FEATURES_LABEL, TARGET_LABEL)
        results_frames.append(results)

        # Tag every prediction with the model and mini-set it belongs to
        predictions = predictions.withColumn("Model", lit(model_name)).withColumn("Dataset", lit(dataset_name))
        predictions_frames.append(predictions.toPandas())

        accuracy_records.append({
            'Model': model_name,
            'Features': CHOSEN_FEATURES_LABEL,
            'Dataset': dataset_name,
            'Accuracy': model_accuracy(predictions)
        })

test_results = pd.concat([test_results] + results_frames, ignore_index=True)
predictions_df = pd.concat([predictions_df] + predictions_frames, ignore_index=True)
test_accuracy = pd.concat([test_accuracy, pd.DataFrame(accuracy_records)], ignore_index=True)

# Merge results and accuracy (joined on their shared columns: Model, Dataset, Features)
merged_results = pd.merge(test_results, test_accuracy)
merged_results


Unnamed: 0,Model,Dataset,Features,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Accuracy
0,LinearRegression,one_week,most_rel_features,7224.519196,52193680.0,7191.684962,0.238099,-199.911196,-201.114257,42.793462
1,LinearRegression,fifteen_days,most_rel_features,6662.604694,44390300.0,6544.559957,0.215236,-230.621517,-231.266703,53.296322
2,LinearRegression,one_month,most_rel_features,6522.814281,42547110.0,6435.91995,0.212271,-341.75755,-342.234264,40.020826
3,LinearRegression,three_months,most_rel_features,8003.762358,64060210.0,7839.112199,0.27876,-18.009225,-18.017839,42.391304
4,GeneralizedLinearRegression,one_week,most_rel_features,7090.29853,50272330.0,7057.326343,0.233713,-192.515288,-193.674063,42.793462
5,GeneralizedLinearRegression,fifteen_days,most_rel_features,6466.683734,41818000.0,6346.799498,0.208782,-217.199651,-217.80745,53.296322
6,GeneralizedLinearRegression,one_month,most_rel_features,6333.579997,40114240.0,6246.682369,0.206058,-322.158456,-322.607911,40.020826
7,GeneralizedLinearRegression,three_months,most_rel_features,8216.937423,67518060.0,8005.312536,0.285273,-19.035307,-19.044386,42.391304
8,RandomForestRegressor,one_week,most_rel_features,2146.957116,4609425.0,1495.256221,0.049842,-16.743242,-16.849489,78.306092
9,RandomForestRegressor,fifteen_days,most_rel_features,1845.643907,3406401.0,1344.750192,0.044442,-16.77406,-16.82357,64.26093


# Models comparison

In [None]:
def show_results(predictions, model0_name, model0_predictions, model1_name, model1_predictions, model2_name, model2_predictions, model3_name, model3_predictions, title):
    """Plot the real next market price together with the predictions of four models.

    Args:
        predictions: pandas DataFrame with the 'timestamp' and 'next-market-price' columns;
                     it may contain the rows of several models
        model0_name .. model3_name: names shown in the legend for each model
        model0_predictions .. model3_predictions: pandas DataFrames with the 'timestamp'
                     and 'prediction' columns of each model
        title: prefix of the figure title and of the plot filename
    """
    # `predictions` can hold one copy of the real series per model. Keeping one row per
    # timestamp avoids drawing the real price several times, with a line segment jumping
    # back to the first timestamp between consecutive copies.
    actual = predictions.drop_duplicates(subset='timestamp')

    traces = [
        go.Scatter(
            x = actual['timestamp'],
            y = actual['next-market-price'].astype(float),
            mode = 'lines',
            name = 'Next Market price (usd)'
        )
    ]

    # One line per model, in the order the models are passed in
    models = (
        (model0_name, model0_predictions),
        (model1_name, model1_predictions),
        (model2_name, model2_predictions),
        (model3_name, model3_predictions),
    )
    for model_name, model_predictions in models:
        traces.append(
            go.Scatter(
                x = model_predictions['timestamp'],
                y = model_predictions['prediction'].astype(float),
                mode = 'lines',
                name = model_name + ' predictions'
            )
        )

    # Range-selector buttons: (number of months, label)
    range_buttons = [
        dict(count=months, label=label, step='month', stepmode='backward')
        for months, label in ((1, '1m'), (6, '6m'), (12, '1y'), (36, '3y'))
    ]
    range_buttons.append(dict(step='all'))

    layout = dict(
        title=title + " predictions",
        xaxis=dict(
            rangeselector=dict(buttons=range_buttons),
            rangeslider=dict(visible = True),
            type='date'
        )
    )

    fig = dict(data=traces, layout=layout)
    iplot(fig, filename = title + " predictions")

In [None]:
# For each dataset type, it displays the predicitons of each model
# One figure per mini-set, each comparing the predictions of the four models
for dataset_name in dataset_name_list:
    # Rows of every model for the current mini-set
    predictions_to_show = predictions_df[predictions_df['Dataset'] == dataset_name]

    # Flat (name, predictions, name, predictions, ...) arguments for the first four models
    model_args = []
    for name in model_name_list[:4]:
        model_args.append(name)
        model_args.append(predictions_to_show[predictions_to_show['Model'] == name])

    show_results(predictions_to_show, *model_args, dataset_name)

# Summary

In [None]:
scatter_plot(test_results, "RMSE", "Model", "RMSE value for each model")

In [None]:
adv_scatter_plot(test_results, "RMSE", "Model", "Dataset", "RMSE value for each model (and dataset type)")

In [None]:
scatter_plot(test_accuracy, "Accuracy", "Model", "Accuracy for each model")

In [None]:
adv_scatter_plot(test_accuracy, "Accuracy", "Model", "Dataset", "Accuracy for each model (and dataset type)")

In [None]:
# Saving final test results
# NOTE(review): RESULTS_FINAL_DIR is presumably expected to exist already — to_csv is not
# asked to create it here
test_results.to_csv(RESULTS_FINAL_DIR + "/final.csv", index=False)
test_accuracy.to_csv(RESULTS_FINAL_DIR + "/final_accuracy.csv", index=False)