# **Bitcoin price prediction - Gradient Boosting Tree Regressor**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: run the chosen model first with its default parameters, then with the best parameters found through hyperparameter tuning and cross validation, evaluating the performance at each step.

# Global constants, dependencies, libraries and tools

In [1]:
# Main constants
import os

LOCAL_RUNNING = True
SLOW_OPERATIONS = True  # Decide whether or not to use operations that might slow down notebook execution
SPLITTING_METHOD = "walk_forward_splits"
MODEL_NAME = "GradientBoostingTreeRegressor"

# Project root. For local runs the location can be overridden with the
# BDC_PROJECT_DIR environment variable so the notebook is not tied to a single
# machine's absolute path; the original path is kept as the default.
ROOT_DIR = (
    os.environ.get("BDC_PROJECT_DIR", "D:/Documents/Repository/BDC/project")
    if LOCAL_RUNNING
    else "/content/drive"
)

In [2]:
if not LOCAL_RUNNING: 
    # Colab only: import the Google Drive helper
    from google.colab import drive

    # Mount Google Drive at ROOT_DIR (force_remount=True remounts it if it is already mounted)
    drive.mount(ROOT_DIR, force_remount=True)

In [3]:
# Project directory: ROOT_DIR itself when running locally, otherwise the
# project folder inside the mounted Google Drive
MAIN_DIR = ROOT_DIR if LOCAL_RUNNING else f"{ROOT_DIR}/MyDrive/BDC/project"

# ---------------------------------------------------------------- Dataset
# Directory, file name (without extension) and full path of the
# train / validation dataset
DATASET_OUTPUT_DIR = f"{MAIN_DIR}/datasets/output"
DATASET_TRAIN_VALID_NAME = "bitcoin_blockchain_data_15min_train_valid"
DATASET_TRAIN_VALID = f"{DATASET_OUTPUT_DIR}/{DATASET_TRAIN_VALID_NAME}.parquet"

# --------------------------------------------------------------- Features
# Directory holding the JSON files that list the feature names
FEATURES_DIR = f"{MAIN_DIR}/features"

# Column labels for the feature vector and for the target
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"

# Names of the three feature sets
ALL_FEATURES_NAME = "all_features"
MOST_REL_FEATURES_NAME = "most_rel_features"
LEAST_REL_FEATURES_NAME = "least_rel_features"

# Paths of the JSON files of the three feature sets
ALL_FEATURES = f"{FEATURES_DIR}/{ALL_FEATURES_NAME}.json"
MOST_REL_FEATURES = f"{FEATURES_DIR}/{MOST_REL_FEATURES_NAME}.json"
LEAST_REL_FEATURES = f"{FEATURES_DIR}/{LEAST_REL_FEATURES_NAME}.json"

# ----------------------------------------------------------------- Models
# Directory of the models and path of the current model
MODELS_DIR = f"{MAIN_DIR}/models"
MODEL = f"{MODELS_DIR}/{MODEL_NAME}"

# -------------------------------------------------------------- Utilities
# Directory of the project's helper modules
UTILITIES_DIR = f"{MAIN_DIR}/utilities"

# ---------------------------------------------------------------- Results
# Results are grouped by splitting method, one pair of CSV files per model
RESULTS_DIR = f"{MAIN_DIR}/results/{SPLITTING_METHOD}"
MODEL_RESULTS = f"{RESULTS_DIR}/{MODEL_NAME}.csv"
MODEL_ACCURACY_RESULTS = f"{RESULTS_DIR}/{MODEL_NAME}_accuracy.csv"

In [4]:
# Suppression of warnings for better reading
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
if not LOCAL_RUNNING:
    # Colab only: install Spark and related dependencies into the hosted runtime.
    # NOTE(review): the packages are installed without pinned versions, so results
    # may change between runs as new releases come out — consider pinning.
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    # A Java runtime is installed alongside PySpark (presumably required by Spark — confirm)
    !apt install openjdk-8-jdk-headless -qq

# Import files

In [6]:
# Import my files
# Make the project's utilities directory importable
import sys
sys.path.append(UTILITIES_DIR)

# Star import from the project's `imports` module: names used later in the
# notebook without an explicit import (importlib, json, pd, pyspark, SparkConf,
# SparkSession) presumably come from here — TODO confirm
from imports import *
import utilities, parameters

# Reload the project modules so that edits made to them are picked up
# without restarting the kernel
importlib.reload(utilities)
importlib.reload(parameters)

<module 'parameters' from 'D:\\Documents/Repository/BDC/project/utilities\\parameters.py'>

# Create the pyspark session

In [7]:
# Spark configuration for a local session that uses every available core
conf = (
    SparkConf()
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
    .set("spark.ui.port", "4050")
    .set("spark.executor.memory", "12G")
    .set("spark.driver.memory", "12G")
    .set("spark.driver.maxResultSize", "109G")
    .set("spark.kryoserializer.buffer.max", "1G")
)

# Create the context, or reuse the live one when this cell is run again:
# calling the SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# The session is built on top of the active context
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [8]:
# Load the train / validation set into a PySpark DataFrame.
# Parquet files store their own schema, so no CSV-style reader options
# (sep / header / inferSchema) are needed.
df = spark.read.parquet(DATASET_TRAIN_VALID)

In [9]:
def dataset_info(dataset):
    """Print a short overview of a dataset.

    Shows the first 3 rows, prints the (rows, columns) shape and then the
    schema. `dataset` must provide `show`, `count`, `columns` and
    `printSchema` (as used below).
    """
    # Preview of the first rows
    dataset.show(3)

    # Shape as (number of rows, number of columns)
    shape = (dataset.count(), len(dataset.columns))
    print("Shape:", shape)

    # Column names and types
    dataset.printSchema()

In [10]:
# Printing the overview requires a full row count, so it is only done when
# slow operations are enabled
if SLOW_OPERATIONS:
  dataset_info(df)

+-------------------+---+------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|      total-bitcoins|          market-cap|        trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-----

# Loading features

In [11]:
# Loading all the features
# ALL_FEATURES first holds the path of the JSON file and is then rebound to the
# loaded list of feature names. The guard keeps the cell re-runnable: once the
# name holds the list, calling open() on it again would fail.
if isinstance(ALL_FEATURES, str):
    with open(ALL_FEATURES, "r") as f:
        ALL_FEATURES = json.load(f)
print(ALL_FEATURES)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days', 'trade-volume', 'sma-100-days', 'n-unique-addresses', 'transaction-fees-usd', 'total-bitcoins', 'n-transactions-total', 'blocks-size', 'sma-50-days', 'hash-rate', 'difficulty', 'avg-block-size', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


In [12]:
# Loading the most relevant features
# MOST_REL_FEATURES first holds the path of the JSON file and is then rebound to
# the loaded list of feature names. The guard keeps the cell re-runnable: once
# the name holds the list, calling open() on it again would fail.
if isinstance(MOST_REL_FEATURES, str):
    with open(MOST_REL_FEATURES, "r") as f:
        MOST_REL_FEATURES = json.load(f)
print(MOST_REL_FEATURES)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days', 'trade-volume']


In [13]:
# Loading least relevant features
# LEAST_REL_FEATURES first holds the path of the JSON file and is then rebound
# to the loaded list of feature names. The guard keeps the cell re-runnable:
# once the name holds the list, calling open() on it again would fail.
if isinstance(LEAST_REL_FEATURES, str):
    with open(LEAST_REL_FEATURES, "r") as f:
        LEAST_REL_FEATURES = json.load(f)
print(LEAST_REL_FEATURES)

['sma-100-days', 'n-unique-addresses', 'transaction-fees-usd', 'total-bitcoins', 'n-transactions-total', 'blocks-size', 'sma-50-days', 'hash-rate', 'difficulty', 'avg-block-size', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


# Model train / validation
In order to train and validate the model I'll try several approaches:
- **Default without normalization:** Make predictions using the chosen base model
- **Default with normalization:** Like the previous one but features are normalized

At this point, the features that gave, on average, the most satisfactory results (for each model) are chosen, and the process continues with:

- **Hyperparameter tuning:** Researching the best parameters to use
- **Cross Validation:** Validate the performance of the model with the chosen parameters

If the final results are satisfactory, the model will be trained on the whole train / validation set and saved to later make predictions on the test set.

---

For each approach the train / validation set will be split according to the chosen splitting method (in order to figure out which one works best for our problem):

- **Block time series splits:** Involves dividing the time series into blocks of equal length, and then using each block as a separate fold for cross-validation.

    ![block-splits.png](https://drive.google.com/uc?id=1SPT133HO1VdWYZZv6GeknFY3xX3T2tvL)

- **Walk forward time series splits:** Involves using a sliding window approach to create the training and validation sets for each fold. The model is trained on a fixed window of historical data, and then validated on the next observation in the time series. This process is repeated for each subsequent observation, with the window sliding forward one step at a time. 

    ![walk-forward-splits.png](https://drive.google.com/uc?id=1SNdq-kjbv4MXtdBj3EOJ2dmQpbbPStJi)

- **Single time series split:** Involves dividing the time series considering as validation set a narrow period of time and as train set everything that happened before this period, in such a way as to best benefit from the trend in the short term.

    ![single-split.png](https://drive.google.com/uc?id=1SODyQLolK4zn9lFGnNaqnMBZrHn3OsVn)

In [14]:
# Get the splitting parameters for the chosen splitting method
# (looked up in the project's `parameters` module from SPLITTING_METHOD)
splitting_info = parameters.get_splitting_params(SPLITTING_METHOD)
splitting_info

{'split_type': 'walk_forward_splits',
 'min_obser': 20000,
 'sliding_window': 5000}

## Default
The train / validation set will be split according to the chosen splitting method, so that the performance of the untuned model can be observed with the different feature sets, both normalized and non-normalized.

### Without normalization

In [15]:
# Define model and features type:
# label of this group of runs and whether features are normalized (not here);
# both values are passed to utilities.multiple_splits by the cells below
MODEL_TYPE = "default"
FEATURES_NORMALIZATION = False

In [16]:
# Get the default parameters for MODEL_NAME from the project's `parameters` module
# (`params` is rebound later in the notebook for the tuning grid)
params = parameters.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [20], 'maxDepth': [5], 'stepSize': [0.1], 'seed': [42]}

In [17]:
# Run the model on every split using all the features, with the current
# MODEL_TYPE / FEATURES_NORMALIZATION settings; keep results and predictions
default_res_all, default_pred_all = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    ALL_FEATURES,
    ALL_FEATURES_NAME,
    FEATURES_LABEL,
    TARGET_LABEL,
)
default_res_all

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 58.77328394359638, 'MSE': 3454.2989055146045, 'MAE': 46.26294211108162, 'MAPE': 0.005275295467832301, 'R2': 0.9954573618014019, 'Adjusted_R2': 0.9954537240530947, 'Time': 15.805410861968994}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 259.09763438787365, 'MSE': 67131.58414539225, 'MAE': 117.20513361804501, 'MAPE': 0.011412318038050637, 'R2': 0.7248809973811321, 'Adjusted_R2': 0.7246606818635194, 'Time': 9.454195737838745}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 3, 'Train&Validation': (20000, 5000), 'Parameters': [2

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,1,"(20000, 5000)","[20, 5, 0.1, 42]",58.773284,3454.299,46.262942,0.005275,0.995457,0.995454,15.805411
1,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,2,"(20000, 5000)","[20, 5, 0.1, 42]",259.097634,67131.58,117.205134,0.011412,0.724881,0.724661,9.454196
2,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,3,"(20000, 5000)","[20, 5, 0.1, 42]",524.858587,275476.5,457.111494,0.040408,0.187572,0.186921,9.0021
3,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,4,"(20000, 5000)","[20, 5, 0.1, 42]",1475.956374,2178447.0,910.571596,0.064066,0.239006,0.238397,8.624105
4,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,5,"(20000, 5000)","[20, 5, 0.1, 42]",6362.784772,40485030.0,4882.943492,0.20887,-1.52777,-1.529794,8.205129
5,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,6,"(20000, 5000)","[20, 5, 0.1, 42]",16750.331387,280573600.0,14641.556596,0.343569,-4.081914,-4.085984,8.480216
6,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,7,"(20000, 5000)","[20, 5, 0.1, 42]",3499.655232,12247590.0,2679.468659,0.048997,0.430082,0.429626,7.785031
7,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,8,"(20000, 5000)","[20, 5, 0.1, 42]",2820.849571,7957192.0,1700.562523,0.041726,0.895713,0.89563,8.423825
8,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,9,"(20000, 5000)","[20, 5, 0.1, 42]",871.404734,759346.2,712.54834,0.020927,0.895839,0.895756,9.348041
9,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,10,"(20000, 5000)","[20, 5, 0.1, 42]",1243.117225,1545340.0,1076.058474,0.023301,0.830455,0.830319,8.562052


In [18]:
# Run the model on every split using only the most relevant features, with the
# current MODEL_TYPE / FEATURES_NORMALIZATION settings
default_res_most_rel, default_pred_most_rel = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    MOST_REL_FEATURES,
    MOST_REL_FEATURES_NAME,
    FEATURES_LABEL,
    TARGET_LABEL,
)
default_res_most_rel

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'most_rel_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 54.30059729088267, 'MSE': 2948.554866146614, 'MAE': 43.3311169768787, 'MAPE': 0.0049432044930157095, 'R2': 0.9961224496397122, 'Adjusted_R2': 0.9961193444942785, 'Time': 7.832159042358398}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'most_rel_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 267.6221140990773, 'MSE': 71621.59595485954, 'MAE': 131.50262288155645, 'MAPE': 0.012930710205075201, 'R2': 0.7064800079438471, 'Adjusted_R2': 0.7062449568991576, 'Time': 7.496967315673828}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'most_rel_features', 'Splits': 3, 'Train&Validation': (20000, 5000), 'Par

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,1,"(20000, 5000)","[20, 5, 0.1, 42]",54.300597,2948.555,43.331117,0.004943,0.996122,0.996119,7.832159
1,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,2,"(20000, 5000)","[20, 5, 0.1, 42]",267.622114,71621.6,131.502623,0.012931,0.70648,0.706245,7.496967
2,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,3,"(20000, 5000)","[20, 5, 0.1, 42]",509.080419,259162.9,440.396311,0.038864,0.235684,0.235072,7.104683
3,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,4,"(20000, 5000)","[20, 5, 0.1, 42]",1427.543988,2037882.0,877.695194,0.061686,0.28811,0.287539,7.242951
4,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,5,"(20000, 5000)","[20, 5, 0.1, 42]",6267.404759,39280360.0,4830.215711,0.206985,-1.452554,-1.454518,7.152605
5,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,6,"(20000, 5000)","[20, 5, 0.1, 42]",11003.622012,121079700.0,8456.704662,0.188307,-1.193067,-1.194823,7.511324
6,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,7,"(20000, 5000)","[20, 5, 0.1, 42]",2777.306414,7713431.0,2024.361905,0.036218,0.64107,0.640783,7.116197
7,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,8,"(20000, 5000)","[20, 5, 0.1, 42]",1300.232596,1690605.0,847.175581,0.020037,0.977843,0.977825,7.145599
8,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,9,"(20000, 5000)","[20, 5, 0.1, 42]",694.304756,482059.1,553.130117,0.016247,0.933875,0.933822,7.240507
9,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,10,"(20000, 5000)","[20, 5, 0.1, 42]",929.01056,863060.6,719.872877,0.015722,0.90531,0.905234,7.232843


In [19]:
# Run the model on every split using only the least relevant features, with the
# current MODEL_TYPE / FEATURES_NORMALIZATION settings
default_res_least_rel, default_pred_least_rel = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    LEAST_REL_FEATURES,
    LEAST_REL_FEATURES_NAME,
    FEATURES_LABEL,
    TARGET_LABEL,
)
default_res_least_rel

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'least_rel_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 2130.7463426647087, 'MSE': 4540079.9767790325, 'MAE': 1896.428222502027, 'MAPE': 0.20496460110970846, 'R2': -4.970514217597636, 'Adjusted_R2': -4.97529541016428, 'Time': 7.833675861358643}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'least_rel_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 2026.0369021522413, 'MSE': 4104825.5288826507, 'MAE': 1541.105989120387, 'MAPE': 0.1605305609027836, 'R2': -15.822417045675735, 'Adjusted_R2': -15.835888450717317, 'Time': 8.361173629760742}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default', 'Splitting': 'walk_forward_splits', 'Features': 'least_rel_features', 'Splits': 3, 'Train&Validation': (20000, 5000), 

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,1,"(20000, 5000)","[20, 5, 0.1, 42]",2130.746343,4540080.0,1896.428223,0.204965,-4.970514,-4.975295,7.833676
1,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,2,"(20000, 5000)","[20, 5, 0.1, 42]",2026.036902,4104826.0,1541.105989,0.160531,-15.822417,-15.835888,8.361174
2,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,3,"(20000, 5000)","[20, 5, 0.1, 42]",660.707566,436534.5,549.459067,0.049477,-0.287416,-0.288447,8.71361
3,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,4,"(20000, 5000)","[20, 5, 0.1, 42]",2214.945322,4905983.0,1583.036982,0.11498,-0.7138,-0.715173,8.816306
4,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,5,"(20000, 5000)","[20, 5, 0.1, 42]",8292.529576,68766050.0,7265.752313,0.327631,-3.293556,-3.296994,7.550016
5,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,6,"(20000, 5000)","[20, 5, 0.1, 42]",18398.999812,338523200.0,16285.233009,0.384676,-5.131531,-5.136442,7.462765
6,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,7,"(20000, 5000)","[20, 5, 0.1, 42]",5579.38253,31129510.0,4469.724451,0.081977,-0.448552,-0.449712,7.488427
7,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,8,"(20000, 5000)","[20, 5, 0.1, 42]",17374.449691,301871500.0,14801.704144,0.365583,-2.956314,-2.959482,7.542968
8,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,9,"(20000, 5000)","[20, 5, 0.1, 42]",4110.618536,16897180.0,3559.464307,0.10488,-1.317813,-1.319669,7.533884
9,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,10,"(20000, 5000)","[20, 5, 0.1, 42]",8073.080854,65174630.0,7422.062507,0.156709,-6.15056,-6.156286,7.43719


### With normalization

In [20]:
# Define model and features type:
# same default parameters as before, but the runs below are labelled
# "default_norm" and feature normalization is switched on
MODEL_TYPE = "default_norm"
FEATURES_NORMALIZATION = True

In [21]:
# Run the model on every split using all the features, with the current
# MODEL_TYPE / FEATURES_NORMALIZATION settings; keep results and predictions
default_norm_res_all, default_norm_pred_all = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    ALL_FEATURES,
    ALL_FEATURES_NAME,
    FEATURES_LABEL,
    TARGET_LABEL,
)
default_norm_res_all

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 737.5475748799098, 'MSE': 543976.4252112362, 'MAE': 598.7321797225753, 'MAPE': 0.0669417979550973, 'R2': 0.2846339717861549, 'Adjusted_R2': 0.2840611060978955, 'Time': 7.7946696281433105}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 516.6998304829401, 'MSE': 266978.71482109907, 'MAE': 349.0012616968465, 'MAPE': 0.03537344845428763, 'R2': -0.0941335390353577, 'Adjusted_R2': -0.09500972204960023, 'Time': 7.593148469924927}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 3, 'Train&Validation': (20000, 5000), 'Pa

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,1,"(20000, 5000)","[20, 5, 0.1, 42]",737.547575,543976.4,598.73218,0.066942,0.284634,0.284061,7.79467
1,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,2,"(20000, 5000)","[20, 5, 0.1, 42]",516.69983,266978.7,349.001262,0.035373,-0.094134,-0.09501,7.593148
2,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,3,"(20000, 5000)","[20, 5, 0.1, 42]",968.085082,937188.7,761.029938,0.069753,-1.763932,-1.766145,7.524353
3,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,4,"(20000, 5000)","[20, 5, 0.1, 42]",2419.84529,5855651.0,1549.159924,0.110795,-1.045547,-1.047185,7.361097
4,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,5,"(20000, 5000)","[20, 5, 0.1, 42]",7598.480272,57736900.0,6733.699401,0.307036,-2.604928,-2.607815,7.775281
5,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,6,"(20000, 5000)","[20, 5, 0.1, 42]",17637.744325,311090000.0,15554.490963,0.366378,-4.634646,-4.639158,7.790218
6,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,7,"(20000, 5000)","[20, 5, 0.1, 42]",3196.960833,10220560.0,2494.475773,0.045358,0.524406,0.524025,7.768218
7,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,8,"(20000, 5000)","[20, 5, 0.1, 42]",4131.909198,17072670.0,3339.857468,0.07043,0.776246,0.776067,8.254898
8,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,9,"(20000, 5000)","[20, 5, 0.1, 42]",13685.456161,187291700.0,10827.603549,0.317623,-24.691093,-24.711666,7.675854
9,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,10,"(20000, 5000)","[20, 5, 0.1, 42]",8628.160403,74445150.0,7696.401862,0.163278,-7.167664,-7.174204,7.520558


In [22]:
# Run the model on every split using only the most relevant features, with the
# current MODEL_TYPE / FEATURES_NORMALIZATION settings
default_norm_res_most_rel, default_norm_pred_most_rel = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    MOST_REL_FEATURES,
    MOST_REL_FEATURES_NAME,
    FEATURES_LABEL,
    TARGET_LABEL,
)
default_norm_res_most_rel

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'most_rel_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 320.8069937934846, 'MSE': 102917.12726681285, 'MAE': 268.4925043620952, 'MAPE': 0.03008553053431714, 'R2': 0.8646569719644571, 'Adjusted_R2': 0.8645485891592235, 'Time': 7.444110155105591}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'most_rel_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 460.6235582177385, 'MSE': 212174.06238517034, 'MAE': 367.02824795679544, 'MAPE': 0.0378008879291805, 'R2': 0.13046716879824893, 'Adjusted_R2': 0.12977084621070012, 'Time': 7.781440258026123}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'most_rel_features', 'Splits': 3, 'Train&Validation': (20

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,1,"(20000, 5000)","[20, 5, 0.1, 42]",320.806994,102917.1,268.492504,0.030086,0.864657,0.864549,7.44411
1,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,2,"(20000, 5000)","[20, 5, 0.1, 42]",460.623558,212174.1,367.028248,0.037801,0.130467,0.129771,7.78144
2,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,3,"(20000, 5000)","[20, 5, 0.1, 42]",937.833782,879532.2,814.32982,0.07323,-1.593893,-1.59597,7.202684
3,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,4,"(20000, 5000)","[20, 5, 0.1, 42]",1700.281176,2890956.0,1186.137556,0.086148,-0.009894,-0.010702,7.061909
4,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,5,"(20000, 5000)","[20, 5, 0.1, 42]",6945.001155,48233040.0,5675.70802,0.249149,-2.011534,-2.013945,7.633361
5,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,6,"(20000, 5000)","[20, 5, 0.1, 42]",13973.784877,195266700.0,11742.617089,0.273224,-2.536785,-2.539617,7.038519
6,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,7,"(20000, 5000)","[20, 5, 0.1, 42]",19313.17563,372998800.0,18223.166445,0.322595,-16.356782,-16.370681,7.64309
7,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,8,"(20000, 5000)","[20, 5, 0.1, 42]",24920.285756,621020600.0,24194.89498,0.544137,-7.139067,-7.145585,7.181315
8,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,9,"(20000, 5000)","[20, 5, 0.1, 42]",4895.211432,23963090.0,3440.670663,0.098701,-2.287055,-2.289687,6.921617
9,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,10,"(20000, 5000)","[20, 5, 0.1, 42]",5343.925824,28557540.0,4780.359287,0.100484,-2.133158,-2.135667,7.331


In [23]:
# Run the model on every split using only the least relevant features, with the
# current MODEL_TYPE / FEATURES_NORMALIZATION settings
default_norm_res_least_rel, default_norm_pred_least_rel = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    LEAST_REL_FEATURES,
    LEAST_REL_FEATURES_NAME,
    FEATURES_LABEL,
    TARGET_LABEL,
)
default_norm_res_least_rel

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'least_rel_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 2130.30575841533, 'MSE': 4538202.624337513, 'MAE': 1849.5051129429519, 'MAPE': 0.20452115610312954, 'R2': -4.968045371343634, 'Adjusted_R2': -4.9728245868562215, 'Time': 7.815220832824707}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'least_rel_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [20, 5, 0.1, 42], 'RMSE': 760.780978043656, 'MSE': 578787.6965530617, 'MAE': 545.9316971579532, 'MAPE': 0.056150145995214826, 'R2': -1.3719907079636497, 'Adjusted_R2': -1.3738902000220792, 'Time': 7.445554733276367}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'default_norm', 'Splitting': 'walk_forward_splits', 'Features': 'least_rel_features', 'Splits': 3, 'Train&Validation': (

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,1,"(20000, 5000)","[20, 5, 0.1, 42]",2130.305758,4538203.0,1849.505113,0.204521,-4.968045,-4.972825,7.815221
1,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,2,"(20000, 5000)","[20, 5, 0.1, 42]",760.780978,578787.7,545.931697,0.05615,-1.371991,-1.37389,7.445555
2,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,3,"(20000, 5000)","[20, 5, 0.1, 42]",2094.832218,4388322.0,1940.158778,0.170722,-11.941922,-11.952286,7.268003
3,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,4,"(20000, 5000)","[20, 5, 0.1, 42]",3005.0637,9030408.0,2086.969846,0.153508,-2.15458,-2.157106,6.709549
4,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,5,"(20000, 5000)","[20, 5, 0.1, 42]",8585.233377,73706230.0,7583.451361,0.34421,-3.602007,-3.605692,6.599265
5,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,6,"(20000, 5000)","[20, 5, 0.1, 42]",16661.731635,277613300.0,14506.217743,0.340175,-4.028296,-4.032322,6.67216
6,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,7,"(20000, 5000)","[20, 5, 0.1, 42]",10926.663716,119392000.0,8036.16877,0.145151,-4.555677,-4.560126,6.903439
7,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,8,"(20000, 5000)","[20, 5, 0.1, 42]",13837.81767,191485200.0,11819.607328,0.2582,-1.509596,-1.511606,6.696115
8,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,9,"(20000, 5000)","[20, 5, 0.1, 42]",2897.619646,8396200.0,2195.961609,0.061133,-0.15172,-0.152642,6.782755
9,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,10,"(20000, 5000)","[20, 5, 0.1, 42]",12787.343385,163516200.0,12169.243172,0.259935,-16.939985,-16.954352,6.845652


In [24]:
# Columns that identify a run in the comparison table
model_info = [
    "Model",
    "Type",
    "Splitting",
    "Features",
    "Parameters",
]

# Evaluation columns reported for each run
evaluator_lst = [
    "RMSE",
    "MSE",
    "MAE",
    "MAPE",
    "R2",
    "Adjusted_R2",
    "Time",
]

In [25]:
# Results of the six default runs: three feature sets, without and with normalization
default_comparison_lst = [
    default_res_all,
    default_res_most_rel,
    default_res_least_rel,
    default_norm_res_all,
    default_norm_res_most_rel,
    default_norm_res_least_rel,
]

# Build one comparison frame per run and stack them into a single table
default_comparison_lst_df = pd.concat(
    [
        utilities.model_comparison(results, model_info, evaluator_lst)
        for results in default_comparison_lst
    ]
)
default_comparison_lst_df

Unnamed: 0,Model,Type,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,"[20, 5, 0.1, 42]",2638.954201,20466440.0,2108.261655,0.070707,-0.41019,-0.411319,8.633248
0,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,"[20, 5, 0.1, 42]",2164.790804,12461190.0,1671.421663,0.058803,-0.031602,-0.032428,7.327423
0,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,"[20, 5, 0.1, 42]",6448.888877,71216770.0,5654.88026,0.185159,-6.666626,-6.672765,7.687908
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,"[20, 5, 0.1, 42]",5300.901669,49467480.0,4517.726197,0.144694,-3.509128,-3.512739,7.810047
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,"[20, 5, 0.1, 42]",6317.167819,75600850.0,5519.420013,0.161337,-12.291512,-12.302156,7.359104
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,"[20, 5, 0.1, 42]",7888.247848,90830620.0,6821.110115,0.224324,-16.478387,-16.492384,6.808287


In [26]:
# Save the best default model results and predictions
# NOTE(review): in the comparison table displayed above, the "default" run on
# most_rel_features has the lowest RMSE/MAE and the highest R2, while the
# "default_norm" run on all_features selected here scores worse on those
# columns — confirm that this choice is intentional.
best_default_results = default_norm_res_all
best_default_predictions = default_norm_pred_all

## Tuned
Once the features and execution method are selected, the model will undergo hyperparameter tuning and cross validation to find the best configuration. 

### Hyperparameter tuning

In [27]:
# Select the type of feature to be used
# The tuning runs are labelled "hyp_tuning" and use all the features, normalized
MODEL_TYPE = "hyp_tuning"
CHOSEN_FEATURES = ALL_FEATURES
CHOSEN_FEATURES_LABEL = ALL_FEATURES_NAME
FEATURES_NORMALIZATION = True

In [28]:
# Get model grid parameters
# Rebinds `params` (previously the default parameters) to the grid of candidate
# values for MODEL_NAME from the project's `parameters` module
params = parameters.get_model_grid_params(MODEL_NAME)
params

{'maxIter': [3, 5, 10, 20, 30],
 'maxDepth': [3, 5, 10],
 'stepSize': [0.1, 0.3, 0.5, 0.7],
 'seed': [42]}

In [29]:
# Hyperparameter tuning over the parameter grid, on the chosen features.
# The return value is bound to a single name here, whereas the default runs
# unpack two values from the same function.
hyp_res = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    CHOSEN_FEATURES,
    CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL,
    TARGET_LABEL,
)
hyp_res

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'hyp_tuning', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [30, 3, 0.7, 42], 'RMSE': 521.772115453058, 'MSE': 272246.14046435925, 'MAE': 466.6165517497868, 'MAPE': 0.05184669169961726, 'R2': 0.6419777932014058, 'Adjusted_R2': 0.641691088731497, 'Time': 4.854459285736084}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'hyp_tuning', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [3, 5, 0.1, 42], 'RMSE': 479.6356690546991, 'MSE': 230050.3750295489, 'MAE': 330.5802393590523, 'MAPE': 0.033588925205633134, 'R2': 0.05720637258982153, 'Adjusted_R2': 0.05645138269800165, 'Time': 1.1469156742095947}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'hyp_tuning', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 3, 'Train&Validation': (20000, 5000), 'Parameters

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,1,"(20000, 5000)","[30, 3, 0.7, 42]",521.772115,272246.1,466.616552,0.051847,0.641978,0.641691,4.854459
1,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,2,"(20000, 5000)","[3, 5, 0.1, 42]",479.635669,230050.4,330.580239,0.033589,0.057206,0.056451,1.146916
2,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,3,"(20000, 5000)","[10, 3, 0.7, 42]",913.360253,834227.0,748.164556,0.068291,-1.46028,-1.46225,2.108407
3,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,4,"(20000, 5000)","[5, 3, 0.7, 42]",2219.007587,4923995.0,1430.553176,0.104113,-0.720092,-0.72147,0.865277
4,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,5,"(20000, 5000)","[30, 3, 0.5, 42]",6485.103018,42056560.0,5439.537703,0.241577,-1.625892,-1.627995,4.086955
5,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,6,"(20000, 5000)","[30, 3, 0.1, 42]",14688.524399,215752700.0,12613.011891,0.293268,-2.907841,-2.91097,4.020801
6,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,7,"(20000, 5000)","[20, 3, 0.5, 42]",2652.0919,7033591.0,2060.48227,0.036904,0.672705,0.672443,2.810099
7,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,8,"(20000, 5000)","[10, 3, 0.1, 42]",3310.522426,10959560.0,2682.740535,0.059122,0.856365,0.85625,1.564261
8,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,9,"(20000, 5000)","[30, 3, 0.3, 42]",11257.192871,126724400.0,9562.494247,0.278795,-16.38298,-16.396901,4.99751
9,GradientBoostingTreeRegressor,hyp_tuning,walk_forward_splits,all_features,10,"(20000, 5000)","[30, 3, 0.5, 42]",6212.110595,38590320.0,5379.746665,0.113137,-3.233892,-3.237283,4.774027


---
To select the best parameters to be used in the final model I assign a score to each value in the "Parameters" column based on the following criteria:
* Calculate the frequency of each unique value in the "Parameters" column.
* Normalize the frequencies to a scale of 0 to 1, where 1 represents the highest frequency.
* Calculate the split weight for each value in the "Parameters" column, where a higher split number corresponds to a higher weight.
* Normalize the split weights to a scale of 0 to 1, where 1 represents the highest split weight.
* Calculate the RMSE weight for each value in the "Parameters" column, where a lower RMSE value corresponds to a higher weight.
* Normalize the RMSE weights to a scale of 0 to 1, where 1 represents the highest RMSE weight.

Then calculate the final score for each value in the "Parameters" column by multiplying its normalized frequency, split weight, and RMSE weight; the parameters with the highest final score are selected.

In [30]:
# Score each parameter combination produced by the tuning step and get the
# selected (best) combination; the last line displays the score table
grouped_scores, best_params = parameters.choose_best_params(hyp_res)
grouped_scores

Unnamed: 0_level_0,Split weight,RMSE weight,Frequency weight,Final score
Parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(5, 5, 0.7, 42)",0.818182,0.954554,0.666667,0.520666
"(5, 3, 0.7, 42)",0.545455,0.742385,1.0,0.404937
"(20, 3, 0.5, 42)",0.659091,0.870752,0.666667,0.382603
"(30, 3, 0.3, 42)",0.515152,0.558671,1.0,0.2878
"(30, 5, 0.1, 42)",0.818182,0.939849,0.333333,0.256322
"(30, 3, 0.5, 42)",0.454545,0.562812,1.0,0.255824
"(30, 5, 0.3, 42)",0.954545,0.722852,0.333333,0.229998
"(20, 3, 0.7, 42)",0.590909,0.749644,0.333333,0.147657
"(10, 3, 0.1, 42)",0.363636,0.774618,0.333333,0.093893
"(10, 3, 0.5, 42)",0.727273,0.233866,0.333333,0.056695


In [31]:
# Print the best parameters returned by parameters.choose_best_params
print(f"Best parameters: {best_params}")

Best parameters: (5, 5, 0.7, 42)


### Cross validation

In [32]:
# Switch the model type tag to cross validation for the following run
MODEL_TYPE = "cross_val"

In [33]:
# Get tuned parameters (each hyperparameter wrapped in a single-value list)
# NOTE: this rebinds `params`, which until now held the full tuning grid;
# re-run the grid cell before re-running the hyperparameter tuning cell.
params = parameters.get_best_model_params(best_params, MODEL_NAME)
params

{'maxIter': [5], 'maxDepth': [5], 'stepSize': [0.7], 'seed': [42]}

In [34]:
# Perform cross validation with the tuned parameters; in this mode the call
# yields both the per-split results and the predictions
# (arguments are positional: one per line to keep their order easy to check)
cv_res, cv_pred = utilities.multiple_splits(
    df,
    params,
    splitting_info,
    MODEL_NAME,
    MODEL_TYPE,
    FEATURES_NORMALIZATION,
    CHOSEN_FEATURES,
    CHOSEN_FEATURES_LABEL,
    FEATURES_LABEL,
    TARGET_LABEL,
)
cv_res

{'Model': 'GradientBoostingTreeRegressor', 'Type': 'cross_val', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 1, 'Train&Validation': (20000, 5000), 'Parameters': [5, 5, 0.7, 42], 'RMSE': 852.9938046515718, 'MSE': 727598.4307739639, 'MAE': 696.5479537492247, 'MAPE': 0.0773836517369525, 'R2': 0.04315853512350787, 'Adjusted_R2': 0.04239229571219538, 'Time': 1.7221431732177734}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'cross_val', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 2, 'Train&Validation': (20000, 5000), 'Parameters': [5, 5, 0.7, 42], 'RMSE': 499.43956432494934, 'MSE': 249439.8784130952, 'MAE': 343.93100260289066, 'MAPE': 0.03491087631123144, 'R2': -0.022255789670542203, 'Adjusted_R2': -0.023074412925533805, 'Time': 2.004049301147461}
{'Model': 'GradientBoostingTreeRegressor', 'Type': 'cross_val', 'Splitting': 'walk_forward_splits', 'Features': 'all_features', 'Splits': 3, 'Train&Validation': (20000, 5000), 'Paramet

Unnamed: 0,Model,Type,Splitting,Features,Splits,Train&Validation,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,1,"(20000, 5000)","[5, 5, 0.7, 42]",852.993805,727598.4,696.547954,0.077384,0.043159,0.042392,1.722143
1,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,2,"(20000, 5000)","[5, 5, 0.7, 42]",499.439564,249439.9,343.931003,0.034911,-0.022256,-0.023074,2.004049
2,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,3,"(20000, 5000)","[5, 5, 0.7, 42]",989.711241,979528.3,744.07588,0.068582,-1.888799,-1.891112,2.088585
3,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,4,"(20000, 5000)","[5, 5, 0.7, 42]",2534.829643,6425361.0,1626.078543,0.116018,-1.244563,-1.24636,1.913442
4,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,5,"(20000, 5000)","[5, 5, 0.7, 42]",7636.996986,58323720.0,6760.488895,0.308304,-2.641567,-2.644483,1.706354
5,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,6,"(20000, 5000)","[5, 5, 0.7, 42]",17512.934485,306702900.0,15371.342292,0.361489,-4.555183,-4.559632,1.562643
6,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,7,"(20000, 5000)","[5, 5, 0.7, 42]",3365.084505,11323790.0,2765.355611,0.049758,0.473069,0.472647,1.704081
7,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,8,"(20000, 5000)","[5, 5, 0.7, 42]",3451.549782,11913200.0,2597.19164,0.057748,0.843866,0.843741,1.609475
8,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,9,"(20000, 5000)","[5, 5, 0.7, 42]",12593.934741,158607200.0,10245.014283,0.300134,-20.756393,-20.773816,1.590965
9,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,10,"(20000, 5000)","[5, 5, 0.7, 42]",8262.724756,68272620.0,7141.115758,0.151642,-6.490452,-6.49645,1.642624


In [35]:
# Result sets to include in the tuned comparison
tuned_comparison_lst = [cv_res]

# Build one comparison table per result set, then stack them for display
tuned_comparison_frames = [
    utilities.model_comparison(tuned_results, model_info, evaluator_lst)
    for tuned_results in tuned_comparison_lst
]
tuned_comparison_lst_df = pd.concat(tuned_comparison_frames)
tuned_comparison_lst_df

Unnamed: 0,Model,Type,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,"[5, 5, 0.7, 42]",5126.273302,46201980.0,4342.102519,0.141886,-3.203794,-3.207161,1.661259


# Comparison table
Visualization of model performance at the various stages of training / validation

In [36]:
# Stack the default and tuned comparison tables into a single pandas DataFrame.
# pd.concat already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is
# needed; ignore_index=True renumbers the rows 0..n-1 across both tables.
final_comparison_lst_df = pd.concat(
    [default_comparison_lst_df, tuned_comparison_lst_df],
    ignore_index=True,
)
final_comparison_lst_df

Unnamed: 0,Model,Type,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default,walk_forward_splits,all_features,"[20, 5, 0.1, 42]",2638.954201,20466440.0,2108.261655,0.070707,-0.41019,-0.411319,8.633248
1,GradientBoostingTreeRegressor,default,walk_forward_splits,most_rel_features,"[20, 5, 0.1, 42]",2164.790804,12461190.0,1671.421663,0.058803,-0.031602,-0.032428,7.327423
2,GradientBoostingTreeRegressor,default,walk_forward_splits,least_rel_features,"[20, 5, 0.1, 42]",6448.888877,71216770.0,5654.88026,0.185159,-6.666626,-6.672765,7.687908
3,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,"[20, 5, 0.1, 42]",5300.901669,49467480.0,4517.726197,0.144694,-3.509128,-3.512739,7.810047
4,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,most_rel_features,"[20, 5, 0.1, 42]",6317.167819,75600850.0,5519.420013,0.161337,-12.291512,-12.302156,7.359104
5,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,least_rel_features,"[20, 5, 0.1, 42]",7888.247848,90830620.0,6821.110115,0.224324,-16.478387,-16.492384,6.808287
6,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,"[5, 5, 0.7, 42]",5126.273302,46201980.0,4342.102519,0.141886,-3.203794,-3.207161,1.661259


# Model accuracy

Since predicting the price accurately is very difficult, let's see how good the model is at predicting whether the price will go up or down. 

For each row in our predictions, let's consider the current market-price, the actual next-market-price and our predicted next-market-price (prediction).
We then determine whether each prediction is correct (1) or not (0):

$$ 
prediction\_is\_correct
= 
\begin{cases}
0 \text{ if [(market-price > next-market-price) and (market-price < prediction)] or [(market-price < next-market-price) and (market-price > prediction)]} \\
1 \text{ if [(market-price > next-market-price) and (market-price > prediction)] or [(market-price < next-market-price) and (market-price < prediction)]}
\end{cases}
$$

After that we count the number of correct predictions:
$$ 
correct\_predictions
= 
\sum_{i=1}^{total\_rows} prediction\_is\_correct_i
$$

Finally we compute the accuracy of the model as a percentage:
$$
accuracy 
= 
(correct\_predictions / total\_rows) 
* 100
$$

In [37]:
# Best default model: convert its pandas predictions to a PySpark DataFrame
# and compute the share of correct up/down predictions
best_default_pred_spark = spark.createDataFrame(best_default_predictions)
default_accuracy = utilities.model_accuracy(best_default_pred_spark)

# Cross-validated model: same conversion and accuracy computation
validated_pred_spark = spark.createDataFrame(cv_pred)
validated_accuracy = utilities.model_accuracy(validated_pred_spark)

# Collect the accuracy figures in a one-row dataframe
# (all values are scalars, so an explicit single index label is required)
accuracy_data = dict([
    ('Model', MODEL_NAME),
    ('Features', CHOSEN_FEATURES_LABEL),
    ('Splitting', SPLITTING_METHOD),
    ('Accuracy (default)', default_accuracy),
    ('Accuracy (validated)', validated_accuracy),
])
accuracy_data_df = pd.DataFrame(accuracy_data, index=['Model'])

# Report both accuracies
print(f"Percentage of correct predictions for {MODEL_NAME} with {CHOSEN_FEATURES_LABEL} and {SPLITTING_METHOD} (default): {default_accuracy:.2f}%")
print(f"Percentage of correct predictions for {MODEL_NAME} with {CHOSEN_FEATURES_LABEL} and {SPLITTING_METHOD} (validated): {validated_accuracy:.2f}%")

Percentage of correct predictions for GradientBoostingTreeRegressor with all_features and walk_forward_splits (default): 46.47%
Percentage of correct predictions for GradientBoostingTreeRegressor with all_features and walk_forward_splits (validated): 46.35%


# Saving final results


In [38]:
# Concatenate default and tuned results.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every per-model table keeps its own label and both rows end up labelled 0.
default_tuned_results = [best_default_results, cv_res]
default_tuned_results_df = pd.concat(
    [utilities.model_comparison(results, model_info, evaluator_lst) for results in default_tuned_results],
    ignore_index=True,
)
default_tuned_results_df

Unnamed: 0,Model,Type,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,GradientBoostingTreeRegressor,default_norm,walk_forward_splits,all_features,"[20, 5, 0.1, 42]",5300.901669,49467480.0,4517.726197,0.144694,-3.509128,-3.512739,7.810047
0,GradientBoostingTreeRegressor,cross_val,walk_forward_splits,all_features,"[5, 5, 0.7, 42]",5126.273302,46201980.0,4342.102519,0.141886,-3.203794,-3.207161,1.661259


In [39]:
# Save default and tuned results as CSV at the MODEL_RESULTS path
# (index=False: the dataframe index is not written to the file)
default_tuned_results_df.to_csv(MODEL_RESULTS, index=False)

In [40]:
# Save accuracy results as CSV at the MODEL_ACCURACY_RESULTS path
# (index=False: the dataframe index is not written to the file)
accuracy_data_df.to_csv(MODEL_ACCURACY_RESULTS, index=False)