# **Bitcoin price prediction - Block Split**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: train and validate the model, with hyperparameter tuning and cross validation, using different methods of splitting the dataset.

# Global constants, dependencies, libraries and tools

In [1]:
import os

# Main constants
LOCAL_RUNNING = True # True: run on the local machine, False: run on Google Colaboratory
SLOW_OPERATIONS = False # Decide whether or not to use operations that might slow down notebook execution

# Project root on the local machine. The historical default is kept, but it can be
# overridden through the BDC_ROOT_DIR environment variable so the notebook also runs
# on machines where the repository is checked out somewhere else.
_LOCAL_ROOT_DIR = os.environ.get("BDC_ROOT_DIR", "D:/Documents/Repository/BDC/project")

# On Colaboratory the root is the Google Drive mount point
ROOT_DIR = _LOCAL_ROOT_DIR if LOCAL_RUNNING else "/content/drive"

In [2]:
if not LOCAL_RUNNING:
    # Running on Colaboratory: the project files are reached through Google Drive
    from google.colab import drive

    # Mount Google Drive at ROOT_DIR ("/content/drive" when not running locally)
    drive.mount(ROOT_DIR, force_remount=True)

    # Install Spark and related dependencies (IPython shell escapes, run in order)
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

## Import my utilities

In [3]:
# Set main dir (on Colaboratory the project lives inside the mounted Drive folder)
MAIN_DIR = ROOT_DIR if LOCAL_RUNNING else ROOT_DIR + "/MyDrive/BDC/project"

# Utilities dir
UTILITIES_DIR = MAIN_DIR + "/utilities"

# Import my utilities
import importlib
import sys

# Only register the utilities folder once, so re-running this cell does not keep
# appending duplicate entries to sys.path
if UTILITIES_DIR not in sys.path:
    sys.path.append(UTILITIES_DIR)

from imports import *
from config import *
import train_validation_utilities

# Reload so that edits made to the utilities module are picked up without restarting the kernel
importlib.reload(train_validation_utilities)

<module 'train_validation_utilities' from 'D:\\Documents/Repository/BDC/project/utilities\\train_validation_utilities.py'>

## Core variables

In [4]:
# Splitting method used by this notebook (drives the results folder and the split parameters).
# Available constants (presumably provided by the star imports above — TODO confirm):
# BS = Block Split
# WFS = Walk Forward Split
# SS = Single Split
SPLITTING_METHOD = BS

# Model to train / validate (drives the model path and the results file names).
# LR = LinearRegression 
# GLR = GeneralizedLinearRegression 
# RF = RandomForestRegressor 
# GBTR = GradientBoostingTreeRegressor
MODEL_NAME = RF

In [5]:
# ------------------------------------------------------------------
# Dataset: folder holding the processed datasets and the parquet file
# containing the train / validation set
# ------------------------------------------------------------------
DATASET_OUTPUT_DIR = f"{MAIN_DIR}/datasets/output"
DATASET_TRAIN_VALID = f"{DATASET_OUTPUT_DIR}/{DATASET_TRAIN_VALID_NAME}.parquet"

# ------------------------------------------------------------------
# Features: folder and JSON files describing each feature set
# ------------------------------------------------------------------
FEATURES_DIR = f"{MAIN_DIR}/features"
FEATURES_CORRELATION = f"{FEATURES_DIR}/{FEATURES_CORRELATION_LABEL}.json"
BASE_FEATURES = f"{FEATURES_DIR}/{BASE_FEATURES_LABEL}.json"
BASE_AND_MOST_CORR_FEATURES = f"{FEATURES_DIR}/{BASE_AND_MOST_CORR_FEATURES_LABEL}.json"
BASE_AND_LEAST_CORR_FEATURES = f"{FEATURES_DIR}/{BASE_AND_LEAST_CORR_FEATURES_LABEL}.json"

# ------------------------------------------------------------------
# Models: folder and path of the model selected in MODEL_NAME
# ------------------------------------------------------------------
MODELS_DIR = f"{MAIN_DIR}/models"
MODEL = f"{MODELS_DIR}/{MODEL_NAME}"

# ------------------------------------------------------------------
# Results: one folder per splitting method, one group of CSV files per model
# ------------------------------------------------------------------
RESULTS_DIR = f"{MAIN_DIR}/results/{SPLITTING_METHOD}"
ALL_MODEL_RESULTS = f"{RESULTS_DIR}/{MODEL_NAME}_all.csv"
REL_MODEL_RESULTS = f"{RESULTS_DIR}/{MODEL_NAME}_rel.csv"
MODEL_ACCURACY_RESULTS = f"{RESULTS_DIR}/{MODEL_NAME}_accuracy.csv"

In [6]:
# Silence FutureWarning messages so they do not clutter the cell outputs
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# `pio` is not imported in this cell (presumably plotly.io from the star imports — TODO confirm)
pio.renderers.default = 'vscode+colab' # To correctly render plotly plots

# Create the pyspark session

In [7]:
# Build the Spark configuration: local master using all available cores, with
# explicit memory limits for the driver and the executor.
# NOTE(review): spark.driver.maxResultSize (109G) is far larger than
# spark.driver.memory (12G) — confirm this value is intended.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("BitcoinPricePrediction")
    .set("spark.ui.port", "4050")
    .set("spark.executor.memory", "12G")
    .set("spark.driver.memory", "12G")
    .set("spark.driver.maxResultSize", "109G")
    .set("spark.kryoserializer.buffer.max", "1G")
)

# Create the context, or reuse the running one: constructing a second SparkContext
# in the same kernel raises an error, which made this cell fail when re-run.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or fetch) the session bound to the context above
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [8]:
# Load the train / validation set into a PySpark DataFrame.
# Parquet files carry their own schema, so the CSV-only options that were passed
# before (sep, inferSchema, header) had no effect and have been dropped.
df = spark.read.parquet(DATASET_TRAIN_VALID)

In [9]:
# Print an overview of the loaded train / validation dataset
train_validation_utilities.dataset_info(df)

+-------------------+---+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|          timestamp| id|     market-price|    opening-price|    highest-price|     lowest-price|    closing-price|  trade-volume-btc|   total-bitcoins|          market-cap|    trade-volume-usd|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|n-transactions|estimated-transaction-volume-usd|       sma-5-days|   

# Loading features

In [10]:
# Loading base features.
# BASE_FEATURES is rebound here from "path of the JSON file" to "list of feature names",
# so the path is rebuilt from FEATURES_DIR / BASE_FEATURES_LABEL instead of being read
# from BASE_FEATURES itself: this keeps the cell working when it is run more than once.
with open(FEATURES_DIR + "/" + BASE_FEATURES_LABEL + ".json", "r") as f:
    BASE_FEATURES = json.load(f)
print(BASE_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd']


In [11]:
# Loading base and additional most correlated features.
# BASE_AND_MOST_CORR_FEATURES is rebound here from "path of the JSON file" to "list of
# feature names", so the path is rebuilt from FEATURES_DIR / BASE_AND_MOST_CORR_FEATURES_LABEL:
# this keeps the cell working when it is run more than once.
with open(FEATURES_DIR + "/" + BASE_AND_MOST_CORR_FEATURES_LABEL + ".json", "r") as f:
    BASE_AND_MOST_CORR_FEATURES = json.load(f)
print(BASE_AND_MOST_CORR_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days']


In [12]:
# Loading base and additional least correlated features.
# BASE_AND_LEAST_CORR_FEATURES is rebound here from "path of the JSON file" to "list of
# feature names", so the path is rebuilt from FEATURES_DIR / BASE_AND_LEAST_CORR_FEATURES_LABEL:
# this keeps the cell working when it is run more than once.
with open(FEATURES_DIR + "/" + BASE_AND_LEAST_CORR_FEATURES_LABEL + ".json", "r") as f:
    BASE_AND_LEAST_CORR_FEATURES = json.load(f)
print(BASE_AND_LEAST_CORR_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'sma-100-days', 'transaction-fees-usd', 'n-unique-addresses', 'sma-50-days', 'n-transactions-total', 'blocks-size', 'hash-rate', 'difficulty', 'avg-block-size', 'n-transactions-per-block', 'n-transactions']


# Model train / validation
In order to train and validate the model I'll try several approaches:
- `Default without normalization:` make predictions using the base model
- `Default with normalization:` like the previous one but features are normalized

Then, for each model, the feature set that gave the most satisfactory results on average is selected, and the process continues with:
- `Hyperparameter tuning:` finding the best parameters to use.
- `Cross Validation:` validating the performance of the model with the chosen parameters (again using the Block split / Walk forward split method)

If the final results are satisfactory, the model will be trained on the whole train / validation set and saved in order to make predictions on the test set.

For each approach, the train / validation set will be split according to the chosen splitting method (in order to figure out which one works best for our problem). In this case the `Block time series splits` method will be used: it involves dividing the time series into blocks of equal length, and then using each block as a separate fold for cross-validation.

<img src="https://github.com/CorsiDanilo/big-data-computing-project/blob/main/notebooks/images/block-splits.png?raw=1">

In [13]:
# Get splitting parameters based on the chosen splitting method
# (the output shows a dict with the split type and the number of splits)
splitting_info = train_validation_utilities.get_splitting_params(SPLITTING_METHOD)
splitting_info

{'split_type': 'block_splits', 'splits': 5}

## Default
The train / validation set will be split according to the chosen splitting method, so that the performance of the model can be observed without any tuning, using different feature sets (both normalized and non-normalized).

In [14]:
# Get default parameters for the selected model
# (the output shows a dict mapping each parameter name to a single-value list)
params = train_validation_utilities.get_defaults_model_params(MODEL_NAME)
params

{'numTrees': [20], 'maxDepth': [5], 'seed': [42]}

### Without normalization

In [15]:
# Define model and features type
# MODEL_TYPE is passed to multiple_splits and shows up in the "Type" column of the results
MODEL_TYPE = "default"
# In this section the features are used without normalization
FEATURES_NORMALIZATION = False

In [16]:
# Select the base feature set, together with its label, for the next run
CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL = BASE_FEATURES, BASE_FEATURES_LABEL

In [17]:
# Train / validate the default model on every split using the base features.
# The call returns, in order: train results, validation results, train predictions
# and validation predictions.
(
    default_train_results_base_features,
    default_valid_results_base_features,
    default_train_pred_base_features,
    default_valid_pred_base_features,
) = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:05<00:00,  5.32s/it]
100%|██████████| 1/1 [00:03<00:00,  3.10s/it]
100%|██████████| 1/1 [00:02<00:00,  2.67s/it]
100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


In [18]:
# Per-split metrics on the training portion (default model, base features)
default_train_results_base_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,train,block_splits,base_features,1,"(20981, 5246)","[20, 5, 42]",56.798694,3226.091696,42.680217,0.00552,0.997909,0.997908,2.81184
1,RandomForestRegressor,default,train,block_splits,base_features,2,"(20981, 5246)","[20, 5, 42]",444.836546,197879.552484,282.983265,0.010942,0.999174,0.999174,1.308453
2,RandomForestRegressor,default,train,block_splits,base_features,3,"(20981, 5246)","[20, 5, 42]",381.586005,145607.878975,278.227982,0.006105,0.998543,0.998543,1.001828
3,RandomForestRegressor,default,train,block_splits,base_features,4,"(20981, 5246)","[20, 5, 42]",247.495065,61253.807029,175.485572,0.005938,0.999274,0.999274,1.046074
4,RandomForestRegressor,default,train,block_splits,base_features,5,"(20981, 5246)","[20, 5, 42]",134.854588,18185.759814,96.117975,0.004111,0.999148,0.999148,0.912767


In [19]:
# Per-split metrics on the validation portion (default model, base features)
default_valid_results_base_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_features,1,"(20981, 5246)","[20, 5, 42]",746.675071,557523.7,449.059297,0.039495,0.49572,0.495335,2.81184
1,RandomForestRegressor,default,valid,block_splits,base_features,2,"(20981, 5246)","[20, 5, 42]",1434.702079,2058370.0,989.0665,0.017306,0.810546,0.810401,1.308453
2,RandomForestRegressor,default,valid,block_splits,base_features,3,"(20981, 5246)","[20, 5, 42]",442.675526,195961.6,355.606572,0.0086,0.989268,0.98926,1.001828
3,RandomForestRegressor,default,valid,block_splits,base_features,4,"(20981, 5246)","[20, 5, 42]",733.131336,537481.6,395.083412,0.021731,0.380347,0.379874,1.046074
4,RandomForestRegressor,default,valid,block_splits,base_features,5,"(20981, 5246)","[20, 5, 42]",1017.383255,1035069.0,709.580203,0.023429,-0.986389,-0.987905,0.912767


In [20]:
# Select the base + most correlated additional features, together with their label
CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL = BASE_AND_MOST_CORR_FEATURES, BASE_AND_MOST_CORR_FEATURES_LABEL

In [21]:
# Train / validate the default model on every split using the base and most
# correlated additional features.
# The call returns, in order: train results, validation results, train predictions
# and validation predictions.
(
    default_train_results_base_and_most_corr_features,
    default_valid_results_base_and_most_corr_features,
    default_train_pred_base_and_most_corr_features,
    default_valid_pred_base_and_most_corr_features,
) = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:02<00:00,  2.45s/it]
100%|██████████| 1/1 [00:02<00:00,  2.50s/it]
100%|██████████| 1/1 [00:02<00:00,  2.56s/it]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


In [22]:
# Per-split metrics on the training portion (default model, base + most correlated features)
default_train_results_base_and_most_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,train,block_splits,base_and_most_corr_features,1,"(20981, 5246)","[20, 5, 42]",53.253924,2835.980453,39.306824,0.005094,0.998162,0.998161,1.012324
1,RandomForestRegressor,default,train,block_splits,base_and_most_corr_features,2,"(20981, 5246)","[20, 5, 42]",438.950071,192677.164786,283.118993,0.010852,0.999196,0.999195,0.982254
2,RandomForestRegressor,default,train,block_splits,base_and_most_corr_features,3,"(20981, 5246)","[20, 5, 42]",365.556343,133631.440011,264.751478,0.00582,0.998663,0.998663,1.00288
3,RandomForestRegressor,default,train,block_splits,base_and_most_corr_features,4,"(20981, 5246)","[20, 5, 42]",222.854147,49663.970829,162.897234,0.005466,0.999411,0.999411,1.094569
4,RandomForestRegressor,default,train,block_splits,base_and_most_corr_features,5,"(20981, 5246)","[20, 5, 42]",122.368593,14974.072514,87.543821,0.003758,0.999299,0.999299,1.102809


In [23]:
# Per-split metrics on the validation portion (default model, base + most correlated features)
default_valid_results_base_and_most_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,1,"(20981, 5246)","[20, 5, 42]",810.340989,656652.5,493.103595,0.043481,0.406058,0.405604,1.012324
1,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,2,"(20981, 5246)","[20, 5, 42]",1481.951917,2196181.0,972.018188,0.016809,0.797861,0.797707,0.982254
2,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,3,"(20981, 5246)","[20, 5, 42]",584.099599,341172.3,458.967158,0.011371,0.981316,0.981302,1.00288
3,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,4,"(20981, 5246)","[20, 5, 42]",735.693024,541244.2,392.743587,0.021639,0.376009,0.375533,1.094569
4,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,5,"(20981, 5246)","[20, 5, 42]",1134.726692,1287605.0,893.249176,0.029588,-1.471027,-1.472913,1.102809


In [24]:
# Select the base + least correlated additional features, together with their label
CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL = BASE_AND_LEAST_CORR_FEATURES, BASE_AND_LEAST_CORR_FEATURES_LABEL

In [25]:
# Train / validate the default model on every split using the base and least
# correlated additional features.
# The call returns, in order: train results, validation results, train predictions
# and validation predictions.
(
    default_train_results_base_and_least_corr_features,
    default_valid_results_base_and_least_corr_features,
    default_train_pred_base_and_least_corr_features,
    default_valid_pred_base_and_least_corr_features,
) = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:02<00:00,  2.50s/it]
100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
100%|██████████| 1/1 [00:02<00:00,  2.45s/it]


In [26]:
# Per-split metrics on the training portion (default model, base + least correlated features)
default_train_results_base_and_least_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,train,block_splits,base_and_least_corr_features,1,"(20981, 5246)","[20, 5, 42]",53.914153,2906.735852,40.446159,0.005224,0.998116,0.998115,1.005751
1,RandomForestRegressor,default,train,block_splits,base_and_least_corr_features,2,"(20981, 5246)","[20, 5, 42]",426.793271,182152.496074,276.86063,0.010701,0.999239,0.999239,1.14688
2,RandomForestRegressor,default,train,block_splits,base_and_least_corr_features,3,"(20981, 5246)","[20, 5, 42]",355.273527,126219.278978,262.629305,0.005783,0.998737,0.998737,1.079366
3,RandomForestRegressor,default,train,block_splits,base_and_least_corr_features,4,"(20981, 5246)","[20, 5, 42]",239.012529,57126.98913,179.842303,0.006235,0.999323,0.999323,1.094401
4,RandomForestRegressor,default,train,block_splits,base_and_least_corr_features,5,"(20981, 5246)","[20, 5, 42]",118.512364,14045.180537,84.717004,0.003629,0.999342,0.999342,0.975617


In [27]:
# Per-split metrics on the validation portion (default model, base + least correlated features)
default_valid_results_base_and_least_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,1,"(20981, 5246)","[20, 5, 42]",817.668145,668581.2,498.749824,0.043977,0.395268,0.394807,1.005751
1,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,2,"(20981, 5246)","[20, 5, 42]",1497.624753,2242880.0,994.206426,0.017217,0.793563,0.793406,1.14688
2,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,3,"(20981, 5246)","[20, 5, 42]",1032.172015,1065379.0,805.721721,0.020276,0.941656,0.941611,1.079366
3,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,4,"(20981, 5246)","[20, 5, 42]",811.821902,659054.8,480.999726,0.02629,0.240188,0.239608,1.094401
4,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,5,"(20981, 5246)","[20, 5, 42]",895.576583,802057.4,643.706793,0.021232,-0.539219,-0.540394,0.975617


### With normalization

In [28]:
# Define model and features type
# MODEL_TYPE is passed to multiple_splits and shows up in the "Type" column of the results
MODEL_TYPE = "default_norm"
# From here on the features are normalized before training
FEATURES_NORMALIZATION = True

In [29]:
# Select the base feature set, together with its label, for the normalized run
CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL = BASE_FEATURES, BASE_FEATURES_LABEL

In [30]:
# Train / validate the default model on every split using the normalized base features.
# The call returns, in order: train results, validation results, train predictions
# and validation predictions.
(
    default_norm_train_results_base_features,
    default_norm_valid_results_base_features,
    default_norm_train_pred_base_features,
    default_norm_valid_pred_base_features,
) = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:02<00:00,  2.06s/it]
100%|██████████| 1/1 [00:02<00:00,  2.33s/it]
100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


In [31]:
# Per-split metrics on the training portion (default model, normalized base features)
default_norm_train_results_base_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default_norm,train,block_splits,base_features_norm,1,"(20981, 5246)","[20, 5, 42]",264.214962,69809.55,204.261649,0.025488,0.954749,0.954741,0.807943
1,RandomForestRegressor,default_norm,train,block_splits,base_features_norm,2,"(20981, 5246)","[20, 5, 42]",1923.187495,3698650.0,1472.679126,0.071165,0.984558,0.984555,0.980905
2,RandomForestRegressor,default_norm,train,block_splits,base_features_norm,3,"(20981, 5246)","[20, 5, 42]",1790.430958,3205643.0,1305.481129,0.029951,0.967921,0.967915,0.854278
3,RandomForestRegressor,default_norm,train,block_splits,base_features_norm,4,"(20981, 5246)","[20, 5, 42]",976.815427,954168.4,770.377063,0.026158,0.988687,0.988685,0.818925
4,RandomForestRegressor,default_norm,train,block_splits,base_features_norm,5,"(20981, 5246)","[20, 5, 42]",789.677475,623590.5,569.861189,0.025133,0.970798,0.970793,0.900004


In [32]:
# Per-split metrics on the validation portion (default model, normalized base features)
default_norm_valid_results_base_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,1,"(20981, 5246)","[20, 5, 42]",1166.995931,1361880.0,751.26008,0.066985,-0.23182,-0.232761,0.807943
1,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,2,"(20981, 5246)","[20, 5, 42]",6139.624475,37694990.0,5447.276889,0.095824,-2.469484,-2.472132,0.980905
2,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,3,"(20981, 5246)","[20, 5, 42]",1840.855103,3388748.0,1506.675083,0.035417,0.81442,0.814278,0.854278
3,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,4,"(20981, 5246)","[20, 5, 42]",1902.49985,3619506.0,1606.802981,0.085161,-3.172862,-3.176047,0.818925
4,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,5,"(20981, 5246)","[20, 5, 42]",2143.126904,4592993.0,2048.516904,0.068202,-7.81436,-7.821088,0.900004


In [33]:
# Select the base + most correlated additional features, together with their label
CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL = BASE_AND_MOST_CORR_FEATURES, BASE_AND_MOST_CORR_FEATURES_LABEL

In [34]:
# Train / validate the default model on every split using the normalized base and
# most correlated additional features.
# The call returns, in order: train results, validation results, train predictions
# and validation predictions.
(
    default_norm_train_results_base_and_most_corr_features,
    default_norm_valid_results_base_and_most_corr_features,
    default_norm_train_pred_base_and_most_corr_features,
    default_norm_valid_pred_base_and_most_corr_features,
) = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
100%|██████████| 1/1 [00:02<00:00,  2.35s/it]
100%|██████████| 1/1 [00:02<00:00,  2.43s/it]
100%|██████████| 1/1 [00:02<00:00,  2.31s/it]


In [35]:
# Per-split metrics on the training portion (default model, normalized base + most correlated features)
default_norm_train_results_base_and_most_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default_norm,train,block_splits,base_and_most_corr_features_norm,1,"(20981, 5246)","[20, 5, 42]",130.589656,17053.66,102.517335,0.012905,0.988946,0.988944,0.851076
1,RandomForestRegressor,default_norm,train,block_splits,base_and_most_corr_features_norm,2,"(20981, 5246)","[20, 5, 42]",1062.718473,1129371.0,655.098304,0.021438,0.995285,0.995284,0.971022
2,RandomForestRegressor,default_norm,train,block_splits,base_and_most_corr_features_norm,3,"(20981, 5246)","[20, 5, 42]",841.185235,707592.6,642.565651,0.014234,0.992919,0.992918,0.961791
3,RandomForestRegressor,default_norm,train,block_splits,base_and_most_corr_features_norm,4,"(20981, 5246)","[20, 5, 42]",544.458524,296435.1,416.632347,0.015041,0.996485,0.996485,1.05898
4,RandomForestRegressor,default_norm,train,block_splits,base_and_most_corr_features_norm,5,"(20981, 5246)","[20, 5, 42]",254.673768,64858.73,190.700888,0.007937,0.996963,0.996962,0.95664


In [36]:
# Per-split metrics on the validation portion (default model, normalized base + most correlated features)
default_norm_valid_results_base_and_most_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,1,"(20981, 5246)","[20, 5, 42]",922.472585,850955.7,634.508538,0.057478,0.23031,0.229723,0.851076
1,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,2,"(20981, 5246)","[20, 5, 42]",20856.371152,434988200.0,19451.837844,0.348146,-39.036742,-39.067299,0.971022
2,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,3,"(20981, 5246)","[20, 5, 42]",3037.309801,9225251.0,2610.123488,0.063217,0.494791,0.494406,0.961791
3,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,4,"(20981, 5246)","[20, 5, 42]",3907.142234,15265760.0,3066.472873,0.156325,-16.599616,-16.613048,1.05898
4,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,5,"(20981, 5246)","[20, 5, 42]",1650.333142,2723599.0,1605.042714,0.053542,-4.226829,-4.230818,0.95664


In [37]:
# Select the base + least correlated additional features, together with their label
CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL = BASE_AND_LEAST_CORR_FEATURES, BASE_AND_LEAST_CORR_FEATURES_LABEL

In [38]:
# Train / validate the default model on every split using the normalized base and
# least correlated additional features.
# The call returns, in order: train results, validation results, train predictions
# and validation predictions.
(
    default_norm_train_results_base_and_least_corr_features,
    default_norm_valid_results_base_and_least_corr_features,
    default_norm_train_pred_base_and_least_corr_features,
    default_norm_valid_pred_base_and_least_corr_features,
) = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:02<00:00,  2.24s/it]
100%|██████████| 1/1 [00:02<00:00,  2.40s/it]
100%|██████████| 1/1 [00:02<00:00,  2.62s/it]
100%|██████████| 1/1 [00:02<00:00,  2.76s/it]
100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


In [39]:
# Per-split metrics on the training portion (default model, normalized base + least correlated features)
default_norm_train_results_base_and_least_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default_norm,train,block_splits,base_and_least_corr_features_norm,1,"(20981, 5246)","[20, 5, 42]",145.645879,21212.721984,110.637451,0.013535,0.98625,0.986247,0.937394
1,RandomForestRegressor,default_norm,train,block_splits,base_and_least_corr_features_norm,2,"(20981, 5246)","[20, 5, 42]",552.948393,305751.92562,392.524512,0.017655,0.998723,0.998723,0.947473
2,RandomForestRegressor,default_norm,train,block_splits,base_and_least_corr_features_norm,3,"(20981, 5246)","[20, 5, 42]",978.465893,957395.50467,756.58522,0.016936,0.990419,0.990418,1.160025
3,RandomForestRegressor,default_norm,train,block_splits,base_and_least_corr_features_norm,4,"(20981, 5246)","[20, 5, 42]",424.33531,180060.455632,326.738392,0.011374,0.997865,0.997865,1.155857
4,RandomForestRegressor,default_norm,train,block_splits,base_and_least_corr_features_norm,5,"(20981, 5246)","[20, 5, 42]",327.988266,107576.302417,229.454665,0.009626,0.994962,0.994961,1.041255


In [40]:
# Per-split metrics on the validation portion (default model, normalized base + least correlated features)
default_norm_valid_results_base_and_least_corr_features

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,1,"(20981, 5246)","[20, 5, 42]",1000.232907,1000466.0,771.796801,0.071833,0.095078,0.094388,0.937394
1,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,2,"(20981, 5246)","[20, 5, 42]",3682.33624,13559600.0,3271.340612,0.058407,-0.248039,-0.248991,0.947473
2,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,3,"(20981, 5246)","[20, 5, 42]",1568.090304,2458907.0,1305.430944,0.030585,0.865341,0.865238,1.160025
3,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,4,"(20981, 5246)","[20, 5, 42]",1001.969636,1003943.0,769.401423,0.040745,-0.157428,-0.158311,1.155857
4,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,5,"(20981, 5246)","[20, 5, 42]",2487.843051,6189363.0,2427.123699,0.081097,-10.877936,-10.887002,1.041255


In [41]:
# Columns that identify a run in the comparison tables
model_info = [
    'Model',
    'Type',
    'Dataset',
    'Splitting',
    'Features',
    'Parameters',
]

# Metric columns reported for each run
evaluator_lst = [
    'RMSE',
    'MSE',
    'MAE',
    'MAPE',
    'R2',
    'Adjusted_R2',
    'Time',
]

In [42]:
# Validation results of every default run (three feature sets, without and with normalization)
default_comparison_lst = [
    default_valid_results_base_features,
    default_valid_results_base_and_most_corr_features,
    default_valid_results_base_and_least_corr_features,
    default_norm_valid_results_base_features,
    default_norm_valid_results_base_and_most_corr_features,
    default_norm_valid_results_base_and_least_corr_features,
]

# Summarise each run with model_comparison and stack the summaries into one table
default_comparison_lst_df = pd.concat(
    [
        train_validation_utilities.model_comparison(results, model_info, evaluator_lst)
        for results in default_comparison_lst
    ]
)
default_comparison_lst_df

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_features,"[20, 5, 42]",874.913453,876881.1,579.679197,0.022112,0.337898,0.337393,1.416192
0,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,"[20, 5, 42]",949.362444,1004571.0,642.016341,0.024578,0.218043,0.217447,1.038967
0,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,"[20, 5, 42]",1010.972679,1087590.0,684.676898,0.025798,0.366291,0.365807,1.060403
0,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,"[20, 5, 42]",2638.620453,10131620.0,2272.106388,0.070318,-2.574821,-2.57755,0.872411
0,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,"[20, 5, 42]",6074.725783,92610760.0,5473.597091,0.135742,-11.827617,-11.837407,0.959902
0,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,"[20, 5, 42]",1948.094427,4842456.0,1709.018696,0.056533,-2.064597,-2.066936,1.048401


In [43]:
# Save the best default model results and predictions.
# NOTE(review): the configuration kept here (base + least correlated features, not
# normalized) has the highest R2 in the comparison table, but not the lowest RMSE —
# confirm that R2 is the intended selection criterion.
best_default_results = pd.concat(
    [
        train_validation_utilities.model_comparison(
            default_valid_results_base_and_least_corr_features, model_info, evaluator_lst
        )
    ]
)
best_default_predictions = default_valid_pred_base_and_least_corr_features
best_default_results

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,"[20, 5, 42]",1010.972679,1087590.0,684.676898,0.025798,0.366291,0.365807,1.060403


In [44]:
# Define the chosen features: the configuration selected above
# (base + least correlated features, without normalization) is used for the tuning stages
FEATURES_NORMALIZATION = False
CHOSEN_FEATURES = BASE_AND_LEAST_CORR_FEATURES
CHOSEN_FEATURES_LABEL = BASE_AND_LEAST_CORR_FEATURES_LABEL

## Tuned
Once the features and execution method are selected, the model will undergo hyperparameter tuning and cross validation to find the best configuration. 

In [45]:
# Get model grid parameters: the candidate values explored during hyperparameter tuning
# (this rebinds `params`, which previously held the default parameters)
params = train_validation_utilities.get_model_grid_params(MODEL_NAME)
params

{'numTrees': [3, 5, 10, 20, 30], 'maxDepth': [3, 5, 10], 'seed': [42]}

### Hyperparameter tuning

In [46]:
# Label this stage as hyperparameter tuning
# (MODEL_TYPE is passed to multiple_splits and shows up in the "Type" column of the results)
MODEL_TYPE = "hyp_tuning"

In [47]:
# Perform hyperparameter tuning on every split.
# With MODEL_TYPE = "hyp_tuning" the call result is kept as a single table
# holding the best parameters found for each split.
hyp_res = train_validation_utilities.multiple_splits(
    df, params, splitting_info,
    MODEL_NAME, MODEL_TYPE,
    FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL, TARGET_LABEL,
    SLOW_OPERATIONS,
)
hyp_res

100%|██████████| 15/15 [00:33<00:00,  2.26s/it]


Best parameters chosen for split [1/5]: [3, 5, 42]


100%|██████████| 15/15 [00:33<00:00,  2.27s/it]


Best parameters chosen for split [2/5]: [10, 5, 42]


100%|██████████| 15/15 [00:33<00:00,  2.20s/it]


Best parameters chosen for split [3/5]: [3, 10, 42]


100%|██████████| 15/15 [00:33<00:00,  2.21s/it]


Best parameters chosen for split [4/5]: [10, 10, 42]


100%|██████████| 15/15 [00:32<00:00,  2.15s/it]

Best parameters chosen for split [5/5]: [20, 10, 42]





Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,hyp_tuning,valid,block_splits,base_and_least_corr_features,1,"(20981, 5246)","[3, 5, 42]",747.021974,558041.8,460.088554,0.040708,0.495251,0.494866,0.456042
1,RandomForestRegressor,hyp_tuning,valid,block_splits,base_and_least_corr_features,2,"(20981, 5246)","[10, 5, 42]",1425.850345,2033049.0,935.465315,0.016207,0.812876,0.812733,0.556829
2,RandomForestRegressor,hyp_tuning,valid,block_splits,base_and_least_corr_features,3,"(20981, 5246)","[3, 10, 42]",911.106429,830114.9,630.830416,0.016022,0.95454,0.954505,0.95542
3,RandomForestRegressor,hyp_tuning,valid,block_splits,base_and_least_corr_features,4,"(20981, 5246)","[10, 10, 42]",593.498239,352240.2,264.274214,0.014724,0.593909,0.593599,2.216413
4,RandomForestRegressor,hyp_tuning,valid,block_splits,base_and_least_corr_features,5,"(20981, 5246)","[20, 10, 42]",870.259516,757351.6,629.965671,0.02078,-0.453425,-0.454534,2.304052


---
Since the Block split / Walk forward split method is used during this stage, I compute a score for each set of parameters chosen in each split, assigning weights based on:
   * Their `frequency` for each split (if the same parameters are chosen from several splits, these will have greater weight) 
   * The `split` they belong to (the closer the split is to today's date the more weight they will have)
   * Their `RMSE value` for each split (the lower this is, the more weight they will have)
   
   Then, the overall score will be calculated by putting together these three weights for each parameter and the one with the best score will be the chosen parameter.

In [48]:
# Score the parameters chosen by each split during hyperparameter tuning.
# choose_best_params returns the per-parameter score table and the winning parameters;
# the table is displayed as the cell output.
grouped_scores, best_params = train_validation_utilities.choose_best_params(hyp_res)
grouped_scores

Unnamed: 0_level_0,Split weight,RMSE weight,Frequency weight,Final score
Parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(10, 10, 42)",0.8,0.583758,1.0,0.467007
"(20, 10, 42)",1.0,0.389656,1.0,0.389656
"(3, 10, 42)",0.6,0.361008,1.0,0.216605
"(3, 5, 42)",0.2,0.476087,1.0,0.095217
"(10, 5, 42)",0.4,0.0,1.0,0.0


In [49]:
# Print the parameters returned as best by choose_best_params
print(f"Best parameters: {best_params}")

Best parameters: (10, 10, 42)


### Cross validation

In [50]:
# Run type passed to multiple_splits from here on; it shows up in the "Type" column of the result tables
MODEL_TYPE = "cross_val"

In [51]:
# Build the model parameters from the best parameters found during tuning
# (see the output: each parameter name is mapped to a single-value list)
params = train_validation_utilities.get_best_model_params(best_params, MODEL_NAME)
params

{'numTrees': [10], 'maxDepth': [10], 'seed': [42]}

In [52]:
# Run cross validation on every split using the tuned parameters;
# returns train / validation metrics and train / validation predictions
(
    cv_train_result,
    cv_valid_result,
    cv_train_pred,
    cv_valid_pred,
) = train_validation_utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    CHOSEN_FEATURES,
    CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL,
    TARGET_LABEL,
    SLOW_OPERATIONS,
)

100%|██████████| 1/1 [00:03<00:00,  3.47s/it]
100%|██████████| 1/1 [00:03<00:00,  3.31s/it]
100%|██████████| 1/1 [00:03<00:00,  3.19s/it]
100%|██████████| 1/1 [00:03<00:00,  3.74s/it]
100%|██████████| 1/1 [00:03<00:00,  3.14s/it]


In [53]:
# Cross validation metrics computed on the train portion of each split (one row per split)
cv_train_result

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,cross_val,train,block_splits,base_and_least_corr_features,1,"(20981, 5246)","[10, 10, 42]",12.990403,168.750578,9.324248,0.001161,0.999891,0.999891,2.020778
1,RandomForestRegressor,cross_val,train,block_splits,base_and_least_corr_features,2,"(20981, 5246)","[10, 10, 42]",76.427711,5841.194935,42.855654,0.00156,0.999976,0.999976,1.846109
2,RandomForestRegressor,cross_val,train,block_splits,base_and_least_corr_features,3,"(20981, 5246)","[10, 10, 42]",86.77629,7530.124439,62.846167,0.001409,0.999925,0.999925,1.910848
3,RandomForestRegressor,cross_val,train,block_splits,base_and_least_corr_features,4,"(20981, 5246)","[10, 10, 42]",57.063393,3256.230789,38.542103,0.0013,0.999961,0.999961,2.432612
4,RandomForestRegressor,cross_val,train,block_splits,base_and_least_corr_features,5,"(20981, 5246)","[10, 10, 42]",27.738624,769.431261,18.99852,0.000791,0.999964,0.999964,1.919582


In [54]:
# Cross validation metrics computed on the validation portion of each split (one row per split)
cv_valid_result

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Splits,Train / Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,1,"(20981, 5246)","[10, 10, 42]",932.683927,869899.3,585.3503,0.0519,0.213176,0.212575,2.020778
1,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,2,"(20981, 5246)","[10, 10, 42]",1518.840579,2306877.0,1025.855451,0.017757,0.787673,0.787511,1.846109
2,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,3,"(20981, 5246)","[10, 10, 42]",1311.767545,1720734.0,1037.554556,0.026127,0.905766,0.905694,1.910848
3,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,4,"(20981, 5246)","[10, 10, 42]",593.498239,352240.2,264.274214,0.014724,0.593909,0.593599,2.432612
4,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,5,"(20981, 5246)","[10, 10, 42]",1063.919028,1131924.0,726.731177,0.023971,-1.172262,-1.17392,1.919582


In [55]:
# Result tables to summarise (only the cross validation results on the validation set)
tuned_comparison_lst = [cv_valid_result]

# Build one comparison table per result set and stack them into a single table
tuned_comparison_lst_df = pd.concat(
    [
        train_validation_utilities.model_comparison(split_results, model_info, evaluator_lst)
        for split_results in tuned_comparison_lst
    ]
)
tuned_comparison_lst_df

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,"[10, 10, 42]",1084.141864,1276335.0,727.95314,0.026896,0.265652,0.265092,2.025986


# Comparison table
Visualization of model performance at various stages of train / validation

In [56]:
# Stack the default and tuned comparison tables into one table with a fresh 0..n-1 index.
# pd.concat already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
final_comparison_lst_df = pd.concat(
    [default_comparison_lst_df, tuned_comparison_lst_df],
    ignore_index=True,
)
final_comparison_lst_df

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_features,"[20, 5, 42]",874.913453,876881.1,579.679197,0.022112,0.337898,0.337393,1.416192
1,RandomForestRegressor,default,valid,block_splits,base_and_most_corr_features,"[20, 5, 42]",949.362444,1004571.0,642.016341,0.024578,0.218043,0.217447,1.038967
2,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,"[20, 5, 42]",1010.972679,1087590.0,684.676898,0.025798,0.366291,0.365807,1.060403
3,RandomForestRegressor,default_norm,valid,block_splits,base_features_norm,"[20, 5, 42]",2638.620453,10131620.0,2272.106388,0.070318,-2.574821,-2.57755,0.872411
4,RandomForestRegressor,default_norm,valid,block_splits,base_and_most_corr_features_norm,"[20, 5, 42]",6074.725783,92610760.0,5473.597091,0.135742,-11.827617,-11.837407,0.959902
5,RandomForestRegressor,default_norm,valid,block_splits,base_and_least_corr_features_norm,"[20, 5, 42]",1948.094427,4842456.0,1709.018696,0.056533,-2.064597,-2.066936,1.048401
6,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,"[10, 10, 42]",1084.141864,1276335.0,727.95314,0.026896,0.265652,0.265092,2.025986


# Model accuracy

Since predicting the exact price is very difficult, I also measure how good the models are at predicting whether the price will go up or down, as follows:

For each prediction, consider the current market-price, the actual next-market-price and the predicted next-market-price (prediction).
I then mark the prediction as correct (1) or not (0):

$$ 
prediction\_is\_correct
= 
\begin{cases}
0 \text{ if [(market-price > next-market-price) and (market-price < prediction)] or [(market-price < next-market-price) and (market-price > prediction)]} \\
1 \text{ if [(market-price > next-market-price) and (market-price > prediction)] or [(market-price < next-market-price) and (market-price < prediction)]}
\end{cases}
$$

After that I count the number of correct predictions:
$$ 
correct\_predictions
= 
\sum_{i=1}^{total\_rows} prediction\_is\_correct_i
$$

Finally I compute the accuracy of the model as a percentage:
$$
accuracy 
= 
\frac{correct\_predictions}{total\_rows}
\times 100
$$

In [57]:
# Convert the pandas prediction frames to PySpark DataFrames
best_default_pred_spark = spark.createDataFrame(best_default_predictions)
validated_pred_spark = spark.createDataFrame(cv_valid_pred)

# Compute model accuracy (percentage of correct predictions) for the default and the tuned model
default_accuracy = train_validation_utilities.model_accuracy(best_default_pred_spark)
validated_accuracy = train_validation_utilities.model_accuracy(validated_pred_spark)

# Mark the features label as normalised when normalisation is enabled.
# The suffix is appended only if it is not already there, so re-running this
# cell does not turn the label into "..._norm_norm".
if FEATURES_NORMALIZATION and not CHOSEN_FEATURES_LABEL.endswith("_norm"):
    NEW_CHOSEN_FEATURES_LABEL = CHOSEN_FEATURES_LABEL + "_norm"
    CHOSEN_FEATURES_LABEL = NEW_CHOSEN_FEATURES_LABEL
    
# Collect the accuracy data into a single-row dataframe
accuracy_data = {
    'Model': MODEL_NAME,
    'Features': CHOSEN_FEATURES_LABEL,
    'Splitting': SPLITTING_METHOD,
    'Accuracy (default)': default_accuracy,
    'Accuracy (tuned)': validated_accuracy
}
# The values are scalars, so an explicit one-element index is needed to build the row
accuracy_data_df = pd.DataFrame(accuracy_data, index=['Model'])

print(f"Percentage of correct predictions for {MODEL_NAME} with {CHOSEN_FEATURES_LABEL} and {SPLITTING_METHOD} (default): {default_accuracy:.2f}%")
print(f"Percentage of correct predictions for {MODEL_NAME} with {CHOSEN_FEATURES_LABEL} and {SPLITTING_METHOD} (tuned): {validated_accuracy:.2f}%")

Percentage of correct predictions for RandomForestRegressor with base_and_least_corr_features and block_splits (default): 52.09%
Percentage of correct predictions for RandomForestRegressor with base_and_least_corr_features and block_splits (tuned): 52.31%


In [58]:
# Result sets to compare: the best default run and the cross validated (tuned) run
default_tuned_results = [best_default_results, cv_valid_result]

# Build one comparison table per result set and stack them
default_tuned_results_df = pd.concat(
    [
        train_validation_utilities.model_comparison(run_results, model_info, evaluator_lst)
        for run_results in default_tuned_results
    ]
)
default_tuned_results_df

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,RandomForestRegressor,default,valid,block_splits,base_and_least_corr_features,"[20, 5, 42]",1010.972679,1087590.0,684.676898,0.025798,0.366291,0.365807,1.060403
0,RandomForestRegressor,cross_val,valid,block_splits,base_and_least_corr_features,"[10, 10, 42]",1084.141864,1276335.0,727.95314,0.026896,0.265652,0.265092,2.025986


# Saving final results

In [59]:
# Save the full comparison table (default, default_norm and cross_val rows) as CSV
# at ALL_MODEL_RESULTS, without writing the dataframe index
final_comparison_lst_df.to_csv(ALL_MODEL_RESULTS, index=False)

In [60]:
# Save only the relevant rows (best default run and tuned run) as CSV
# at REL_MODEL_RESULTS, without writing the dataframe index
default_tuned_results_df.to_csv(REL_MODEL_RESULTS, index=False)

In [61]:
# Save the accuracy results (default vs tuned) as CSV at MODEL_ACCURACY_RESULTS,
# without writing the dataframe index
accuracy_data_df.to_csv(MODEL_ACCURACY_RESULTS, index=False)

In [62]:
# Export this notebook to HTML in ./exports (local runs only).
# Remember to save the notebook before running this cell. MODEL_NAME is interpolated
# into the notebook file name, so the file on disk presumably has to be named
# after the current model (3-block-split_<MODEL_NAME>.ipynb) - verify before exporting.
if LOCAL_RUNNING:
    !jupyter nbconvert --to html 3-block-split_{MODEL_NAME}.ipynb --output 3-block-split_{MODEL_NAME} --output-dir='./exports'

  warn(
[NbConvertApp] Converting notebook 3-block-split_RandomForestRegressor.ipynb to html
[NbConvertApp] Writing 435467 bytes to exports\3-block-split_RandomForestRegressor.html
