# **Bitcoin price prediction - Final scores**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: display of final scores and making predictions on the test set with the models trained on the whole train / validation set.

# Global constants, dependencies, libraries and tools

In [1]:
# Main constants
LOCAL_RUNNING = False
ROOT_DIR = "D:/Documents/Repository/BDC/project" if LOCAL_RUNNING else "/content/drive"

In [2]:
if not LOCAL_RUNNING:
    # Point Colaboratory to Google Drive
    from google.colab import drive

    # Define GDrive paths
    drive.mount(ROOT_DIR, force_remount=True)

    # Install Spark and related dependencies
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

Mounted at /content/drive
openjdk-8-jdk-headless is already the newest version (8u382-ga-1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


## Import my utilities

In [3]:
# Set main dir
MAIN_DIR = ROOT_DIR + "" if LOCAL_RUNNING else ROOT_DIR + "/MyDrive/BDC/project"

# Utilities dir
UTILITIES_DIR = MAIN_DIR + "/utilities"

# Import my utilities
import sys
sys.path.append(UTILITIES_DIR)

from imports import *
import final_scores_utilities

importlib.reload(final_scores_utilities)

<module 'final_scores_utilities' from '/content/drive/MyDrive/BDC/project/utilities/final_scores_utilities.py'>

In [4]:
# Set main dir
MAIN_DIR = ROOT_DIR + "" if LOCAL_RUNNING else ROOT_DIR + "/MyDrive/BDC/project"

###################
# --- DATASET --- #
###################

# Datasets dirs
DATASET_OUTPUT_DIR = MAIN_DIR + "/datasets/output"

# Datasets names
DATASET_TEST_NAME = "bitcoin_blockchain_data_15min_test"

# Datasets paths
DATASET_TEST = DATASET_OUTPUT_DIR + "/" + DATASET_TEST_NAME + ".parquet"

####################
# --- FEATURES --- #
####################

# Features dir
FEATURES_DIR = MAIN_DIR + "/features"

# Features labels
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"

# Features names
FEATURES_CORRELATION_LABEL = "features_correlation"
BASE_FEATURES_LABEL = "base_features"
BASE_AND_MOST_CORR_FEATURES_LABEL = "base_and_most_corr_features"
BASE_AND_LEAST_CORR_FEATURES_LABEL = "base_and_least_corr_features"

# Features paths
FEATURES_CORRELATION = FEATURES_DIR + "/" + FEATURES_CORRELATION_LABEL + ".json"
BASE_FEATURES = FEATURES_DIR + "/" + BASE_FEATURES_LABEL + ".json"
BASE_AND_MOST_CORR_FEATURES = FEATURES_DIR + "/" + BASE_AND_MOST_CORR_FEATURES_LABEL + ".json"
BASE_AND_LEAST_CORR_FEATURES = FEATURES_DIR + "/" + BASE_AND_LEAST_CORR_FEATURES_LABEL + ".json"

##################
# --- MODELS --- #
##################

# Model names
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBTR_MODEL_NAME = "GradientBoostingTreeRegressor"

# Model dir
MODELS_DIR = MAIN_DIR + "/models"

# Model path
LR_MODEL = MODELS_DIR + "/" + LR_MODEL_NAME
GLR_MODEL = MODELS_DIR + "/" + GLR_MODEL_NAME
RF_MODEL = MODELS_DIR + "/" + RF_MODEL_NAME
GBTR_MODEL = MODELS_DIR + "/" + GBTR_MODEL_NAME

###################
# --- RESULTS --- #
###################

# splits names
BLOCK_SPLITS_NAME = "block_splits"
WALK_FORWARD_SPLITS_NAME = "walk_forward_splits"
SHORT_TERM_SPLITS_NAME = "single_split"

# Results dir
RESULTS_DIR = MAIN_DIR + "/results"
RESULTS_FINAL_DIR = RESULTS_DIR + "/final"

In [5]:
# Suppression of warnings for better reading
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

if LOCAL_RUNNING: pio.renderers.default='notebook' # To correctly export the notebook in html format

# Create the pyspark session

In [6]:
# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '109G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPricePrediction").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [7]:
# Load datasets into pyspark dataset objects
df = spark.read.load(DATASET_TEST,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [8]:
final_scores_utilities.dataset_info(df)

+-------------------+------+------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+--------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|          timestamp|    id|market-price|     opening-price|     highest-price|      lowest-price|     closing-price|  trade-volume-btc|      total-bitcoins|          market-cap|    trade-volume-usd|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|           hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-u

# Load train / validation data

In [9]:
# Load models results
splits_list = [BLOCK_SPLITS_NAME, WALK_FORWARD_SPLITS_NAME, SHORT_TERM_SPLITS_NAME]
models_list = [LR_MODEL_NAME, GLR_MODEL_NAME, RF_MODEL_NAME, GBTR_MODEL_NAME]
train_valid_results = pd.DataFrame(columns=['Model', 'Type', 'Dataset', 'Splitting', 'Features', 'Parameters', 'RMSE', 'MSE', 'MAE', 'MAPE', 'R2', 'Adjusted_R2', 'Time'])
train_valid_accuracy = pd.DataFrame(columns=['Model', 'Features', 'Splitting', 'Accuracy (default)', 'Accuracy (tuned)'])
for split in splits_list:
    for model in models_list:
        if split == BLOCK_SPLITS_NAME:
            train_valid_results = pd.concat([train_valid_results, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_rel.csv")], ignore_index=True)
            train_valid_accuracy = pd.concat([train_valid_accuracy, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_accuracy.csv")], ignore_index=True)
        elif split == WALK_FORWARD_SPLITS_NAME:
            train_valid_results = pd.concat([train_valid_results, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_rel.csv")], ignore_index=True)
            train_valid_accuracy = pd.concat([train_valid_accuracy, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_accuracy.csv")], ignore_index=True)
        elif split == SHORT_TERM_SPLITS_NAME:
            train_valid_results = pd.concat([train_valid_results, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_rel.csv")], ignore_index=True)
            train_valid_accuracy = pd.concat([train_valid_accuracy, pd.read_csv(RESULTS_DIR + "/" + split + "/" + model + "_accuracy.csv")], ignore_index=True)

In [10]:
#Plotting parameters
colors = ['red', 'green']
legend = ["Default", "Tuned"]

# Define the order for 'Splitting' and 'Model' columns
splitting_order = ['Block splits', 'Walk-forward splits', 'Single split']
model_order = ['LR', 'GLR', 'RF', 'GBTR']

# Mapping for models names
model_mapping = {
    "LinearRegression": "LR",
    "GeneralizedLinearRegression": "GLR",
    "RandomForestRegressor": "RF",
    "GradientBoostingTreeRegressor": "GBTR",
}

# Mapping for type names
type_mapping = {
    "default_norm": "Default",
    "default": "Default",
    "cross_val": "Tuned",
    "tuned": "Tuned",
}

# Mapping for splits names
splitting_mapping = {
    "block_splits": "Block splits",
    "walk_forward_splits": "Walk-forward splits",
    "single_split": "Single split"
}

features_mapping = {
    "base_features": "Base features",
    "base_and_most_corr_features": "Base + most corr. features",
    "base_and_least_corr_features": "Base + least corr. features",
    "base__features_norm": "Base features(norm.)",
    "base_and_most_corr_features_norm": "Base + most corr. features(norm.)",
    "base_and_least_corr_features_norm": "Base + least corr. features (norm.)"
}

In [47]:
# Make a copy of the original data
train_valid_results_copy = train_valid_results.copy()
train_valid_accuracy_copy = train_valid_accuracy.copy()

# Replace results labels
train_valid_results_copy['Model'] = train_valid_results_copy['Model'].replace(model_mapping)
train_valid_results_copy['Type'] = train_valid_results_copy['Type'].replace(type_mapping)
train_valid_results_copy['Splitting'] = train_valid_results_copy['Splitting'].replace(splitting_mapping)
train_valid_results_copy['Features'] = train_valid_results_copy['Features'].replace(features_mapping)

# Replace accuracy labels
train_valid_accuracy_copy['Model'] = train_valid_accuracy_copy['Model'].replace(model_mapping)
train_valid_accuracy_copy['Splitting'] = train_valid_accuracy_copy['Splitting'].replace(splitting_mapping)
train_valid_accuracy_copy['Features'] = train_valid_accuracy_copy['Features'].replace(features_mapping)

# Convert the 'Splitting' and 'Model' columns to category type with defined order
train_valid_results_copy['Splitting'] = pd.Categorical(train_valid_results_copy['Splitting'], categories=splitting_order, ordered=True)
train_valid_results_copy['Model'] = pd.Categorical(train_valid_results_copy['Model'], categories=model_order, ordered=True)
train_valid_accuracy_copy['Splitting'] = pd.Categorical(train_valid_accuracy_copy['Splitting'], categories=splitting_order, ordered=True)
train_valid_accuracy_copy['Model'] = pd.Categorical(train_valid_accuracy_copy['Model'], categories=model_order, ordered=True)

# Group by 'Splitting' and 'Model' columns
train_valid_results_grouped = train_valid_results_copy.groupby(['Splitting', 'Model'])
train_valid_accuracy_grouped = train_valid_accuracy_copy.groupby('Splitting')

In [42]:
train_valid_results_copy

Unnamed: 0,Model,Type,Dataset,Splitting,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LR,Default,valid,Block splits,Base + least corr. features (norm.),"[100, 0.0, 0.0]",2227.412278,8171259.0,1870.345923,0.05248,-1.631054,-1.633063,0.461474
1,LR,Tuned,valid,Block splits,Base + least corr. features (norm.),"[5, 0.8, 0.0]",1032.704613,1732495.0,776.557626,0.022809,0.583006,0.582688,0.301061
2,GLR,Default,valid,Block splits,Base + least corr. features (norm.),"[25, 0]",2227.412278,8171259.0,1870.345923,0.05248,-1.631054,-1.633063,0.363326
3,GLR,Tuned,valid,Block splits,Base + least corr. features (norm.),"[5, 0.1, 'gaussian', 'log']",1613.05515,4358777.0,1318.116337,0.036782,0.253175,0.252605,0.241773
4,RF,Default,valid,Block splits,Base features,"[20, 5, 42]",874.913453,876881.1,579.679197,0.022112,0.337898,0.337393,1.108412
5,RF,Tuned,valid,Block splits,Base features,"[30, 10, 42]",772.597736,713592.2,497.460229,0.018699,0.530111,0.529753,2.466774
6,GBTR,Default,valid,Block splits,Base features,"[20, 5, 0.1, 42]",694.84138,641161.3,446.415138,0.016936,0.763292,0.763111,5.69686
7,GBTR,Tuned,valid,Block splits,Base features,"[3, 5, 0.1, 42]",732.282155,728794.3,478.257071,0.018009,0.7354,0.735198,1.069041
8,LR,Default,valid,Walk-forward splits,Base + least corr. features (norm.),"[100, 0.0, 0.0]",1743.625346,5340021.0,1496.804362,0.043785,0.360428,0.359915,0.394546
9,LR,Tuned,valid,Walk-forward splits,base_features_norm,"[5, 0.0, 0.0]",1664.415862,4874436.0,1430.47418,0.041731,0.435987,0.435535,0.317339


In [43]:
train_valid_accuracy_copy

Unnamed: 0,Model,Features,Splitting,Accuracy (default),Accuracy (tuned)
0,LR,Base + least corr. features (norm.),Block splits,48.211971,46.164697
1,GLR,Base + least corr. features (norm.),Block splits,48.211971,48.215783
2,RF,Base features,Block splits,53.808616,54.292795
3,GBTR,Base features,Block splits,51.044605,50.171559
4,LR,base_features_norm,Walk-forward splits,48.015455,50.306364
5,GLR,base_features_norm,Walk-forward splits,48.015455,47.985455
6,RF,Base features,Walk-forward splits,50.989091,51.544545
7,GBTR,Base features,Walk-forward splits,49.032727,50.008182
8,LR,Base + most corr. features(norm.),Single split,46.706989,50.268817
9,GLR,Base + most corr. features(norm.),Single split,46.706989,46.673387


In [44]:
title = 'RMSE value for each model and splitting method'
final_scores_utilities.train_val_bar_plot_results(train_valid_results_grouped, colors, 'Type', 'RMSE', title)

❗TODO

In [45]:
title = 'R2 value for each model and splitting method'
final_scores_utilities.train_val_bar_plot_results(train_valid_results_grouped, colors, 'Type', 'R2', title)

❗TODO

In [48]:
title = 'Percentage of accuracy between default and tuned model'
final_scores_utilities.train_val_bar_plot_accuracy(train_valid_accuracy_grouped, colors, legend, 'Model', 'Accuracy (default)', 'Accuracy (tuned)', title)

❗TODO

# Test models
For each model, predictions on the various mini-sets are made and the obtained results are compared.

The test set is divided into further mini-sets of **1 week**, **15 days**, **1 month** and **3 months** to see how the models' performance degrades as the time taken into account increases.

❗TOFIX

In [17]:
# Retrieve the last value of the timestamp column
first_timestamp = df.select(col("timestamp")).first()[0]

# Split the test set into mini-sets of 1 week, 15 days, 1 month, and 3 months
one_week_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col("timestamp") <= first_timestamp + relativedelta(months=3))

# Save
datasets_list = [one_week_df, fifteen_days_df, one_month_df, three_months_df]

In [18]:
final_scores_utilities.show_datasets(one_week_df.toPandas(), fifteen_days_df.toPandas(), one_month_df.toPandas(), three_months_df.toPandas(), "Test set split")

In [19]:
# Loading base features
with open(BASE_FEATURES, "r") as f:
    BASE_FEATURES = json.load(f)
print(BASE_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd']


In [20]:
# Loading currency and additional most correlated features
with open(BASE_AND_MOST_CORR_FEATURES, "r") as f:
    BASE_AND_MOST_CORR_FEATURES = json.load(f)
print(BASE_AND_MOST_CORR_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days']


In [21]:
# Loading currency and additional least correlated features
with open(BASE_AND_LEAST_CORR_FEATURES, "r") as f:
    BASE_AND_LEAST_CORR_FEATURES = json.load(f)
print(BASE_AND_LEAST_CORR_FEATURES)

['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'sma-100-days', 'transaction-fees-usd', 'n-unique-addresses', 'sma-50-days', 'n-transactions-total', 'blocks-size', 'hash-rate', 'difficulty', 'avg-block-size', 'n-transactions-per-block', 'n-transactions']


In [22]:
# Load models
lr = PipelineModel.load(LR_MODEL)
glr = PipelineModel.load(GLR_MODEL)
rf = PipelineModel.load(RF_MODEL)
gbtr = PipelineModel.load(GBTR_MODEL)

In [23]:
# Filter train_valid_results based on Type column
filtered_results = train_valid_results[
    (train_valid_results['Type'].isin(['cross_val', 'tuned'])) &
    (train_valid_results['Splitting'] == 'single_split')
]

model_params_list = []
for index, row in filtered_results.iterrows():
  # Select model
  if row['Model'] == LR_MODEL_NAME:
    model = lr
  elif row['Model'] == GLR_MODEL_NAME:
    model = glr
  elif row['Model'] == RF_MODEL_NAME:
    model = rf
  elif row['Model'] == GBTR_MODEL_NAME:
    model = gbtr

  model_name = row['Model']
  features_label = row['Features']

  if features_label.endswith('_norm'):
    features_normalization = True
    features_label = features_label.replace("_norm", "")
  else:
    features_normalization = False

  # Select feature
  if features_label == BASE_FEATURES:
    features = BASE_FEATURES
  elif features_label == BASE_AND_MOST_CORR_FEATURES_LABEL:
    features = BASE_AND_MOST_CORR_FEATURES
  elif features_label == BASE_AND_LEAST_CORR_FEATURES_LABEL:
    features = BASE_AND_LEAST_CORR_FEATURES

  model_params = {
      "Model_name": model_name,
      "Model": model,
      "Features_label": features_label,
      "Features": features,
      "Normalization": features_normalization
  }

  model_params_list.append(model_params)

print(model_params_list)

[{'Model_name': 'LinearRegression', 'Model': PipelineModel_402ec2b178b9, 'Features_label': 'base_and_most_corr_features', 'Features': ['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days'], 'Normalization': True}, {'Model_name': 'GeneralizedLinearRegression', 'Model': PipelineModel_c128882cd8e7, 'Features_label': 'base_and_most_corr_features', 'Features': ['opening-price', 'highest-price', 'lowest-price', 'closing-price', 'trade-volume-btc', 'market-price', 'market-cap', 'total-bitcoins', 'trade-volume-usd', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'sma-20-days'], 'Normalization': True}, {'Model_name': 'RandomForestRegressor', 'Model': PipelineModel_2a89489a9eb0, 'Features_label': 'base_features', 'Features': ['opening-pr

In [24]:
# Test models
dataset_list = [one_week_df, fifteen_days_df, one_month_df, three_months_df]
dataset_name_list = ["one_week", "fifteen_days", "one_month", "three_months"]
predictions_df = pd.DataFrame(columns=[TARGET_LABEL, "market-price", "prediction", 'timestamp'])
test_results = pd.DataFrame(columns=['Model', 'Dataset', 'Features', 'RMSE', 'MSE', 'MAE', 'MAPE', 'R2', 'Adjusted_R2'])
test_accuracy = pd.DataFrame(columns=['Model', 'Features', 'Dataset', 'Accuracy'])

# For each model makes predictions based on the dataset type
for model_params in model_params_list:
    for j, dataset in enumerate(dataset_list):
      MODEL_NAME = model_params['Model_name']
      MODEL = model_params['Model']
      CHOSEN_FEATURES_LABEL = model_params['Features_label']
      CHOSEN_FEATURES = model_params['Features']
      FEATURES_NORMALIZATION = model_params['Normalization']

      results, predictions = final_scores_utilities.evaluate_final_model(dataset, dataset_name_list[j], MODEL, MODEL_NAME, FEATURES_NORMALIZATION, CHOSEN_FEATURES, CHOSEN_FEATURES_LABEL, FEATURES_LABEL, TARGET_LABEL)
      test_results = pd.concat([test_results, results], ignore_index=True)

      predictions = predictions.withColumn("Model", lit(MODEL_NAME)).withColumn("Dataset", lit(dataset_name_list[j]))
      predictions_df = pd.concat([predictions_df, predictions.toPandas()], ignore_index=True)

      accuracy = final_scores_utilities.model_accuracy(predictions)
      accuracy_data = {
          'Model': MODEL_NAME,
          'Features': CHOSEN_FEATURES_LABEL,
          'Dataset': dataset_name_list[j],
          'Accuracy': accuracy
      }

      accuracy_data_df = pd.DataFrame(accuracy_data, index=['Model'])
      test_accuracy = pd.concat([test_accuracy, accuracy_data_df], ignore_index=True)

# Merge results and accuracy
final_test_results = pd.merge(test_results, test_accuracy)
final_test_results

Unnamed: 0,Model,Dataset,Features,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Accuracy
0,LinearRegression,one_week,base_and_most_corr_features,2609.393648,6808935.0,2440.4972,0.08833,-2.663772,-2.685711,71.322437
1,LinearRegression,fifteen_days,base_and_most_corr_features,2946.197227,8680078.0,2846.512298,0.106555,-3.630695,-3.643594,66.620402
2,LinearRegression,one_month,base_and_most_corr_features,3088.128452,9536537.0,2995.746203,0.113862,-6.276874,-6.286668,64.494458
3,LinearRegression,three_months,base_and_most_corr_features,2282.902299,5211643.0,2027.938757,0.072343,0.580316,0.580126,57.835145
4,GeneralizedLinearRegression,one_week,base_and_most_corr_features,2987.628423,8925924.0,2804.72844,0.101492,-3.802888,-3.831648,71.322437
5,GeneralizedLinearRegression,fifteen_days,base_and_most_corr_features,3423.588091,11720960.0,3312.875054,0.124036,-5.252958,-5.270376,66.620402
6,GeneralizedLinearRegression,one_month,base_and_most_corr_features,3564.472652,12705470.0,3476.982746,0.132117,-8.694931,-8.707979,64.494458
7,GeneralizedLinearRegression,three_months,base_and_most_corr_features,2579.296794,6652772.0,2262.833504,0.080586,0.464265,0.464022,56.98596
8,RandomForestRegressor,one_week,base_features,513.33711,263515.0,365.313231,0.013612,0.858207,0.857358,53.640416
9,RandomForestRegressor,fifteen_days,base_features,800.702651,641124.7,690.581401,0.026253,0.657969,0.657017,45.038168


# Models comparison

In [25]:
# For each dataset type, it displays the predicitons of each model
for i, data in enumerate(datasets_list):
    predictions_to_show = predictions_df[predictions_df['Dataset'] == dataset_name_list[i]]

    lr_predictions = predictions_to_show[predictions_to_show['Model'] == LR_MODEL_NAME]
    glr_predictions = predictions_to_show[predictions_to_show['Model'] == GLR_MODEL_NAME]
    rf_predictions = predictions_to_show[predictions_to_show['Model'] == RF_MODEL_NAME]
    gbtr_predictions = predictions_to_show[predictions_to_show['Model'] == GBTR_MODEL_NAME]

    final_scores_utilities.show_results(data.toPandas(), LR_MODEL_NAME, lr_predictions, GLR_MODEL_NAME, glr_predictions, RF_MODEL_NAME, rf_predictions, GBTR_MODEL_NAME, gbtr_predictions, dataset_name_list[i] + " predictions")

# Summary

In [55]:
# Manually set colors
colors = ['red', 'green', 'blue', 'orange']

# Define the order for 'Dataset' and 'Model' columns
dataset_order = ['One week', 'Fifteen days', 'One month', 'Three months']
model_order = ['LR', 'GLR', 'RF', 'GBTR']

# Mapping for models names
model_mapping = {
    "LinearRegression": "LR",
    "GeneralizedLinearRegression": "GLR",
    "RandomForestRegressor": "RF",
    "GradientBoostingTreeRegressor": "GBTR",
}

# Mapping for datasets names
dataset_mapping = {
    "one_week": "One week",
    "fifteen_days": "Fifteen days",
    "one_month": "One month",
    "three_months": "Three months"
}

features_mapping = {
    "base_features": "Base features",
    "base_and_most_corr_features": "Base + most corr. features",
    "base_and_least_corr_features": "Base + least corr. features",
    "base__features_norm": "Base features(norm.)",
    "base_and_most_corr_features_norm": "Base + most corr. features(norm.)",
    "base_and_least_corr_features_norm": "Base + least corr. features (norm.)"
}

In [56]:
# Make a copy of the original data
final_test_results_copy = final_test_results.copy()

# Replace results labels
final_test_results_copy['Model'] = final_test_results_copy['Model'].replace(model_mapping)
final_test_results_copy['Dataset'] = final_test_results_copy['Dataset'].replace(dataset_mapping)
final_test_results_copy['Features'] = final_test_results_copy['Features'].replace(features_mapping)

# Convert the 'Dataset' and 'Model' columns to category type with defined order
final_test_results_copy['Dataset'] = pd.Categorical(final_test_results_copy['Dataset'], categories=dataset_order, ordered=True)
final_test_results_copy['Model'] = pd.Categorical(final_test_results_copy['Model'], categories=model_order, ordered=True)

# Group by 'Dataset' and 'Model' columns
final_test_results_grouped = final_test_results_copy.groupby('Dataset')

In [57]:
final_test_results_copy

Unnamed: 0,Model,Dataset,Features,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Accuracy
0,LR,One week,Base + most corr. features,2609.393648,6808935.0,2440.4972,0.08833,-2.663772,-2.685711,71.322437
1,LR,Fifteen days,Base + most corr. features,2946.197227,8680078.0,2846.512298,0.106555,-3.630695,-3.643594,66.620402
2,LR,One month,Base + most corr. features,3088.128452,9536537.0,2995.746203,0.113862,-6.276874,-6.286668,64.494458
3,LR,Three months,Base + most corr. features,2282.902299,5211643.0,2027.938757,0.072343,0.580316,0.580126,57.835145
4,GLR,One week,Base + most corr. features,2987.628423,8925924.0,2804.72844,0.101492,-3.802888,-3.831648,71.322437
5,GLR,Fifteen days,Base + most corr. features,3423.588091,11720960.0,3312.875054,0.124036,-5.252958,-5.270376,66.620402
6,GLR,One month,Base + most corr. features,3564.472652,12705470.0,3476.982746,0.132117,-8.694931,-8.707979,64.494458
7,GLR,Three months,Base + most corr. features,2579.296794,6652772.0,2262.833504,0.080586,0.464265,0.464022,56.98596
8,RF,One week,Base features,513.33711,263515.0,365.313231,0.013612,0.858207,0.857358,53.640416
9,RF,Fifteen days,Base features,800.702651,641124.7,690.581401,0.026253,0.657969,0.657017,45.038168


In [58]:
title = 'RMSE value for each model and dataset split'
final_scores_utilities.test_bar_plot(final_test_results_grouped, colors, 'Model', 'RMSE', title)

❗TOFIX

In [59]:
title = 'R2 value for each model and dataset split'
final_scores_utilities.test_bar_plot(final_test_results_grouped, colors, 'Model', 'R2', title)

❗TOFIX

In [60]:
title = 'Percentage of accuracy between default and tuned model'
final_scores_utilities.test_bar_plot(final_test_results_grouped, colors, 'Model', "Accuracy", title)

❗TOFIX

# Saving final results


In [32]:
# Saving final test results
final_test_results.to_csv(RESULTS_FINAL_DIR + "/final.csv", index=False)

In [33]:
# Export notebook in html format (remember to save the notebook and change the model name)
if LOCAL_RUNNING:
    !jupyter nbconvert --to html 6-final-scores.ipynb --output 6-final-scores --output-dir='./exports'