# **Bitcoin price forecasting - Conclusions**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# Main constants
# Mount point passed to drive.mount() and used as the prefix of every Drive path below
GDRIVE_DIR = "/content/drive"
# Gate for optional, slower inspection steps (checked before calling dataset_info)
SLOW_OPERATIONS = True

In [2]:
# Every project artefact lives under this folder of the mounted Drive
_GDRIVE_PROJECT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project"

# ---- Datasets ------------------------------------------------- #

# Directory, base name and full path of the test split (Parquet)
GDRIVE_DATASET_OUTPUT_DIR = f"{_GDRIVE_PROJECT_DIR}/datasets/output"
DATASET_TEST_NAME = "bitcoin_blockchain_data_30min_test"
GDRIVE_DATASET_TEST = f"{GDRIVE_DATASET_OUTPUT_DIR}/{DATASET_TEST_NAME}.parquet"

# ---- Features ------------------------------------------------- #

GDRIVE_FEATURES_DIR = f"{_GDRIVE_PROJECT_DIR}/features"

# Column names used for the assembled feature vector and the regression target,
# plus the name of the feature subset stored as JSON
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"
CHOSEN_FEATURES_LABEL = "more_rel_features"

GDRIVE_CHOSEN_FEATURES = f"{GDRIVE_FEATURES_DIR}/{CHOSEN_FEATURES_LABEL}.json"

# ---- Models --------------------------------------------------- #

# Model names double as folder names (models) and file stems (results)
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBT_MODEL_NAME = "GBTRegressor"

GDRIVE_MODELS_DIR = f"{_GDRIVE_PROJECT_DIR}/models"

GDRIVE_LR_MODEL = f"{GDRIVE_MODELS_DIR}/{LR_MODEL_NAME}"
GDRIVE_GLR_MODEL = f"{GDRIVE_MODELS_DIR}/{GLR_MODEL_NAME}"
GDRIVE_RF_MODEL = f"{GDRIVE_MODELS_DIR}/{RF_MODEL_NAME}"
GDRIVE_GBT_MODEL = f"{GDRIVE_MODELS_DIR}/{GBT_MODEL_NAME}"

# ---- Utilities ------------------------------------------------ #

GDRIVE_UTILITIES_DIR = f"{_GDRIVE_PROJECT_DIR}/utilities"

# ---- Results -------------------------------------------------- #

GDRIVE_RESULTS_DIR = f"{_GDRIVE_PROJECT_DIR}/results"

# One CSV of results per model
GDRIVE_LR_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{LR_MODEL_NAME}.csv"
GDRIVE_GLR_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{GLR_MODEL_NAME}.csv"
GDRIVE_RF_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{RF_MODEL_NAME}.csv"
GDRIVE_GBT_MODEL_RESULTS = f"{GDRIVE_RESULTS_DIR}/{GBT_MODEL_NAME}.csv"

# Destination of the summary table written at the end of the notebook
GDRIVE_FINAL_RESULTS = f"{GDRIVE_RESULTS_DIR}/final.csv"

In [3]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Mount Google Drive at GDRIVE_DIR; force_remount=True requests a fresh mount on every run
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [4]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=f1a6f534e9de18d8bafda1ceeeaefabf9d5c18825e0d9603f750f6ee36575912
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [6]:
# Same value as the GDRIVE_UTILITIES_DIR assigned in the constants cell; repeated here
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

import sys
# Make the modules stored in the utilities folder importable
sys.path.append(GDRIVE_UTILITIES_DIR)

# NOTE(review): this star import is the only visible source of names used further
# down without an explicit import (e.g. SparkConf, SparkSession, pyspark, col, pd,
# px, go, iplot, json, time, Pipeline, PipelineModel, VectorAssembler, Normalizer,
# RegressionEvaluator, mean_absolute_percentage_error, plt, sns) - confirm against
# the utilities `imports` module.
from imports import *

from dateutil.relativedelta import relativedelta

## Create the pyspark session

In [7]:
# Spark configuration: local master using all cores, fixed UI port and the
# memory / serializer limits used by this project
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the one already running: constructing a second
# SparkContext in the same kernel raises, which made this cell fail on re-run
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

## Loading dataset

In [8]:
# Load the test split into a Spark DataFrame.
# The file is Parquet, which carries its own schema, so the CSV-only reader
# options (sep / inferSchema / header) that used to be passed here were dropped.
df = spark.read.parquet(GDRIVE_DATASET_TEST)

In [9]:
def dataset_info(dataset):
  """Print a quick overview of `dataset`: a 3-row preview, its shape and its schema.

  Args:
    dataset: object exposing `show`, `count`, `columns` and `printSchema`
      (a Spark DataFrame in this notebook).

  Returns:
    None. Everything is written to standard output.
  """
  # Preview the first three rows
  dataset.show(3)

  # Shape as (number of rows, number of columns)
  shape = (dataset.count(), len(dataset.columns))
  print("Shape:", shape)

  # Column names and types
  dataset.printSchema()

In [10]:
# Inspect the loaded test set only when slow operations are enabled
# (dataset_info calls count(), which scans the data)
if SLOW_OPERATIONS:
  dataset_info(df)

+-------------------+------+------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+------------------+--------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+--------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+
|          timestamp|    id|market-price|      total-bitcoins|          market-cap|        trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|         hash-rate|          difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|      sma-20-days|       sma-50-days|      sma-100-days|next-market-price|
+-------------

## Cut test dataset

In [11]:
# Define the date column in your dataset
date_column = "timestamp"

# Retrieve the earliest timestamp of the test set.
# A min aggregation is used instead of `.first()` on the unsorted frame so the
# result does not depend on the physical order in which rows are read.
first_timestamp = df.agg({date_column: "min"}).first()[0]

# Split the test set into mini-sets covering the first 1 week, 15 days, 1 month
# and 3 months (upper bounds are inclusive)
one_week_df = df.filter(col(date_column) <= first_timestamp + relativedelta(weeks=1))
fifteen_days_df = df.filter(col(date_column) <= first_timestamp + relativedelta(days=15))
one_month_df = df.filter(col(date_column) <= first_timestamp + relativedelta(months=1))
three_months_df = df.filter(col(date_column) <= first_timestamp + relativedelta(months=3))

## Load models

In [12]:
# Load the four pipeline models saved in Google Drive
# (Linear Regression, Generalized Linear Regression, Random Forest, Gradient-Boosted Trees)
lr = PipelineModel.load(GDRIVE_LR_MODEL)
glr = PipelineModel.load(GDRIVE_GLR_MODEL)
rf = PipelineModel.load(GDRIVE_RF_MODEL)
gbt = PipelineModel.load(GDRIVE_GBT_MODEL)

In [13]:
def violin_plot(df, value_colname, facet_colname, title, figscale=1, mpl_palette_name='Dark2', **kwargs):
  """Draw a horizontal seaborn violin plot with one violin per facet value.

  NOTE: a later cell defines another `violin_plot` with a different signature;
  once that cell runs, it replaces this definition.

  Args:
    df: pandas DataFrame holding the data.
    value_colname: column plotted on the x axis.
    facet_colname: column whose distinct values each get a violin (y axis).
    title: text drawn in the top-left area of the axes.
    figscale: multiplier applied to the figure width and height.
    mpl_palette_name: palette name forwarded to seaborn.
    **kwargs: forwarded unchanged to `sns.violinplot`.

  Returns:
    The object produced by `autoviz.MplChart.from_current_mpl_state()`.
  """
  # Figure height grows with the number of distinct facet values
  facet_count = len(df[facet_colname].unique())
  fig_width = 12 * figscale
  fig_height = 1.2 * figscale * facet_count
  plt.figure(figsize=(fig_width, fig_height))

  sns.violinplot(df, x=value_colname, y=facet_colname, palette=mpl_palette_name, **kwargs)
  sns.despine(top=True, right=True, bottom=True, left=True)

  # Place the title in axes coordinates
  current_axes = plt.gca()
  plt.text(0, 1, title, fontsize=16, transform=current_axes.transAxes, ha='center', va='top')

  return autoviz.MplChart.from_current_mpl_state()

In [14]:
# Read the per-model result tables, in the order LR, GLR, RF, GBT
lr_results, glr_results, rf_results, gbt_results = (
    pd.read_csv(results_path)
    for results_path in (
        GDRIVE_LR_MODEL_RESULTS,
        GDRIVE_GLR_MODEL_RESULTS,
        GDRIVE_RF_MODEL_RESULTS,
        GDRIVE_GBT_MODEL_RESULTS,
    )
)

# Stack them into one table with a fresh 0..n-1 index
model_results = pd.concat(
    [lr_results, glr_results, rf_results, gbt_results],
    ignore_index=True,
)
model_results

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",30.324492,919.5748,26.687451,0.001174,0.99995,0.99995,9.48691
1,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",16.736956,280.1257,12.552692,0.000539,0.999985,0.999985,3.867935
2,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",60293.582613,3635316000.0,57204.519888,2.621113,-198.51822,-198.56442,1.972919
3,LinearRegression,simple_norm,all_features,"[100, 0.0, 0.0]",9344.755084,87324450.0,7512.500099,0.313865,-3.792656,-3.793765,2.897192
4,LinearRegression,simple_norm,more_rel_features,"[100, 0.0, 0.0]",5336.759798,28481010.0,4464.383178,0.22433,-0.563132,-0.563494,3.638081
5,LinearRegression,simple_norm,less_rel_features,"[100, 0.0, 0.0]",10232.878751,104711800.0,9195.00555,0.409027,-4.746932,-4.748262,2.113339
6,LinearRegression,autotuning,more_rel_features,"[5, 0.2, 0.2]",2496.479009,6232407.0,2124.046305,0.103889,0.657945,0.657866,0.343026
7,LinearRegression,multi_splits,more_rel_features,"[5, 0.2, 0.2]",10992.948425,283002500.0,9654.008222,0.394375,-2.254079,-2.254531,2.012786
8,LinearRegression,block_splits,more_rel_features,"[5, 0.2, 0.2]",3900.04104,36124340.0,3594.753415,0.204565,-4.892798,-4.899629,0.946438
9,LinearRegression,final_validated,more_rel_features,"[5, 0.2, 0.2]",2496.479009,6232407.0,2124.046305,0.103889,0.657945,0.657866,3.811129


In [15]:
def violin_plot(dataset, x_axis, y_axis, title):
  """Show an interactive Plotly violin plot of `x_axis`, split and coloured by `y_axis`.

  Args:
    dataset: data passed to `px.violin` (a pandas DataFrame in this notebook).
    x_axis: column holding the plotted values.
    y_axis: column used both for the y position and for the colour of each violin.
    title: figure title.

  Returns:
    None. The figure is displayed with `fig.show()`.
  """
  fig = px.violin(dataset, x=x_axis, y=y_axis, color=y_axis)
  fig.update_layout(title=title)
  # Hide the y tick labels and the y axis title
  fig.update_yaxes(showticklabels=False, title='')
  fig.show()

In [16]:
# RMSE distribution of the loaded model results, one violin per feature set
violin_plot(model_results, "RMSE", "Features", "RMSE value for each features")

In [17]:
# RMSE distribution of the loaded model results, one violin per model
violin_plot(model_results, "RMSE", "Model", "RMSE value for each model")

## Load features

In [18]:
# Read the list of chosen feature names from its JSON file and display it
with open(GDRIVE_CHOSEN_FEATURES, "r") as features_file:
    CHOSEN_FEATURES = json.load(features_file)
print(CHOSEN_FEATURES)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins', 'sma-20-days']


## Test models

In [19]:
# Normalized features selection
def select_normalized_features(dataset, features, features_label, target_label):
    """Assemble `features` into one vector column and normalize it.

    Args:
        dataset: Spark DataFrame containing "timestamp", "id", the feature
            columns and the target column.
        features: list of column names to assemble.
        features_label: name of the output (normalized) vector column.
        target_label: name of the target column, carried through unchanged.

    Returns:
        DataFrame with the columns "timestamp", "id", `features_label` and
        `target_label`.
    """
    passthrough_columns = ["timestamp", "id"]

    # Pack the selected columns into a single vector column
    assembled = (
        VectorAssembler(inputCols=features, outputCol="raw_features")
        .transform(dataset)
        .select(*passthrough_columns, "raw_features", target_label)
    )

    # Normalizer is a plain transformer (no fitting step); it is used here with
    # its default settings
    return (
        Normalizer(inputCol="raw_features", outputCol=features_label)
        .transform(assembled)
        .select(*passthrough_columns, features_label, target_label)
    )

In [20]:
def model_evaluation(target_label, predictions, num_predictors=None):
    """Compute regression metrics for a frame of predictions.

    Args:
        target_label: name of the column holding the true values.
        predictions: Spark DataFrame with `target_label` and a "prediction" column.
        num_predictors: number of predictors `p` used in the adjusted R-squared.
            When omitted, the number of columns of `predictions` is used, which
            is what this function has always done.
            NOTE(review): that default counts the columns of the prediction
            frame, not the model's input features - pass the real feature count
            if a textbook adjusted R-squared is wanted.

    Returns:
        dict with the keys 'rmse', 'mse', 'mae', 'mape', 'r2' and 'adj_r2'.
        'adj_r2' is NaN when there are too few rows for it to be defined
        (n - p - 1 <= 0).
    """
    # Collect to pandas once (the frame used to be collected twice)
    collected = predictions.toPandas()
    mape = mean_absolute_percentage_error(collected[target_label], collected["prediction"])

    # Same evaluator configuration for every Spark-computed metric
    spark_metrics = {
        metric: RegressionEvaluator(
            labelCol=target_label, predictionCol="prediction", metricName=metric
        ).evaluate(predictions)
        for metric in ('mse', 'rmse', 'mae', 'r2')
    }
    r2 = spark_metrics['r2']

    # Adjusted R-squared
    n = len(collected)  # row count, taken from the collected frame to avoid another Spark job
    p = len(predictions.columns) if num_predictors is None else num_predictors
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom > 0:
        adj_r2 = 1 - (1 - r2) * (n - 1) / degrees_of_freedom
    else:
        adj_r2 = float('nan')

    results = {
        'rmse': spark_metrics['rmse'],
        'mse': spark_metrics['mse'],
        'mae': spark_metrics['mae'],
        'mape': mape,
        'r2': r2,
        'adj_r2': adj_r2,
    }

    return results

In [21]:
def evaluate_final_model(dataset, dataset_name, model, model_name, choosen_features, features_label, target_label):
    """Run `model` on `dataset` and summarise its prediction quality.

    Args:
        dataset: Spark DataFrame with the raw columns.
        dataset_name: label stored in the "Dataset" column of the result.
        model: pipeline stage wrapped in a single-stage `Pipeline`.
        model_name: label stored in the "Model" column of the result.
        choosen_features: feature column names to assemble and normalize.
        features_label: name of the assembled feature vector column.
        target_label: name of the target column.

    Returns:
        (results_pd, predictions): a one-row pandas DataFrame with the metrics
        and the fit time, and the Spark DataFrame of predictions with the
        columns `target_label`, "prediction" and "timestamp".
    """
    prepared = select_normalized_features(dataset, choosen_features, features_label, target_label)

    # Wrap the model in a single-stage Pipeline and time the call to fit().
    # NOTE(review): the callers in this notebook pass models loaded with
    # PipelineModel.load; if those are already fitted, fit() does no training
    # and "Time" reflects neither training nor inference cost - confirm intent.
    single_stage_pipeline = Pipeline(stages=[model])
    fit_started = time.time()
    fitted_pipeline = single_stage_pipeline.fit(prepared)
    fit_finished = time.time()

    # Make predictions, keeping only the columns needed downstream
    predictions = fitted_pipeline.transform(prepared).select(target_label, "prediction", 'timestamp')

    # Score the predictions with the shared evaluator helper
    metrics = model_evaluation(target_label, predictions)

    # One-row summary: identification first, then metrics, then timing
    summary_row = {"Model": model_name, "Dataset": dataset_name}
    for column_name, metric_key in (
        ("RMSE", 'rmse'),
        ("MSE", 'mse'),
        ("MAE", 'mae'),
        ("MAPE", 'mape'),
        ("R2", 'r2'),
        ("Adjusted_R2", 'adj_r2'),
    ):
        summary_row[column_name] = metrics[metric_key]
    summary_row["Time"] = fit_finished - fit_started

    return pd.DataFrame(summary_row, index=[0]), predictions

In [22]:
# Evaluate the Linear Regression model on each test window (1 week, 15 days, 1 month, 3 months)
lr_res_one_week, lr_pred_one_week = evaluate_final_model(one_week_df, "one_week", lr, LR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
lr_res_fifteen_days, lr_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", lr, LR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
lr_res_one_month, lr_pred_one_month = evaluate_final_model(one_month_df, "one_month", lr, LR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
lr_res_three_months, lr_pred_three_months = evaluate_final_model(three_months_df, "three_months", lr, LR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [23]:
# Evaluate the Generalized Linear Regression model on each test window (1 week, 15 days, 1 month, 3 months)
glr_res_one_week, glr_pred_one_week = evaluate_final_model(one_week_df, "one_week", glr, GLR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
glr_res_fifteen_days, glr_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", glr, GLR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
glr_res_one_month, glr_pred_one_month = evaluate_final_model(one_month_df, "one_month", glr, GLR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
glr_res_three_months, glr_pred_three_months = evaluate_final_model(three_months_df, "three_months", glr, GLR_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [24]:
# Evaluate the Random Forest model on each test window (1 week, 15 days, 1 month, 3 months)
rf_res_one_week, rf_pred_one_week = evaluate_final_model(one_week_df, "one_week", rf, RF_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
rf_res_fifteen_days, rf_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", rf, RF_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
rf_res_one_month, rf_pred_one_month = evaluate_final_model(one_month_df, "one_month", rf, RF_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
rf_res_three_months, rf_pred_three_months = evaluate_final_model(three_months_df, "three_months", rf, RF_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [25]:
# Evaluate the Gradient-Boosted Trees model on each test window (1 week, 15 days, 1 month, 3 months)
gbt_res_one_week, gbt_pred_one_week = evaluate_final_model(one_week_df, "one_week", gbt, GBT_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
gbt_res_fifteen_days, gbt_pred_fifteen_days = evaluate_final_model(fifteen_days_df, "fifteen_days", gbt, GBT_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
gbt_res_one_month, gbt_pred_one_month = evaluate_final_model(one_month_df, "one_month", gbt, GBT_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)
gbt_res_three_months, gbt_pred_three_months = evaluate_final_model(three_months_df, "three_months", gbt, GBT_MODEL_NAME, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

## Models comparison

In [26]:
def show_results(df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions, title):
  """Plot the actual next market price together with every model's predictions.

  Args:
    df: pandas DataFrame with 'timestamp' and 'next-market-price' columns.
    lr_predictions: pandas DataFrame with 'timestamp' and 'prediction' columns
      (Linear Regression).
    glr_predictions: same layout, Generalized Linear Regression.
    rf_predictions: same layout, Random Forest Regressor.
    gbt_predictions: same layout, GBTRegressor.
    title: prefix of the figure title and of the plot filename.

  Returns:
    None. The figure is rendered with `iplot`.
  """
  def _line_trace(frame, value_column, name):
    # One line series of `value_column` over time
    return go.Scatter(
        x = frame['timestamp'],
        y = frame[value_column].astype(float),
        mode = 'lines',
        name = name
    )

  # Actual values first, then one trace per model.
  # (The Linear Regression trace used to be built twice with identical code.)
  data = [
      _line_trace(df, 'next-market-price', 'Next Market price (usd)'),
      _line_trace(lr_predictions, 'prediction', 'Linear Regression predictions'),
      _line_trace(glr_predictions, 'prediction', 'Generalized Linear Regression predictions'),
      _line_trace(rf_predictions, 'prediction', 'Random Forest Regressor predictions'),
      _line_trace(gbt_predictions, 'prediction', 'GBTRegressor predictions'),
  ]

  # Range-selector buttons: (number of months, label); change the counts to the
  # desired amount of months. The last button shows the full range.
  range_buttons = [
      dict(count=months, label=label, step='month', stepmode='backward')
      for months, label in ((1, '1m'), (6, '6m'), (12, '1y'), (36, '3y'))
  ]
  range_buttons.append(dict(step='all'))

  layout = dict(
      title=title + " predictions",
      xaxis=dict(
          rangeselector=dict(buttons=range_buttons),
          rangeslider=dict(visible = True),
          type='date'
      )
  )

  fig = dict(data=data, layout=layout)
  iplot(fig, filename = title + " predictions")

In [37]:
# Columns of the model comparison table: identifying columns and metric columns
model_info = ['Model', "Dataset"]
evaluator_lst = ['RMSE', 'MSE', 'MAE', 'MAPE', 'R2', 'Adjusted_R2', 'Time']

# Result frames to compare, grouped by test window (order: LR, GLR, RF, GBT)
one_week_comparison_lst = [lr_res_one_week, glr_res_one_week, rf_res_one_week, gbt_res_one_week]
fifteen_days_comparison_lst = [lr_res_fifteen_days, glr_res_fifteen_days, rf_res_fifteen_days, gbt_res_fifteen_days]
one_month_comparison_lst = [lr_res_one_month, glr_res_one_month, rf_res_one_month, gbt_res_one_month]
three_months_comparison_lst = [lr_res_three_months, glr_res_three_months, rf_res_three_months, gbt_res_three_months]

In [38]:
def model_comparison(cv_result, model_info, evaluator_lst):
    """Collapse a results frame into a single comparison row.

    Args:
        cv_result: pandas DataFrame with one row per run/split, containing the
            `model_info` and `evaluator_lst` columns.
        model_info: list of identifying column names (taken from the first row).
        evaluator_lst: list of numeric metric column names (averaged over rows).

    Returns:
        One-row pandas DataFrame (index 0) with the `model_info` columns
        followed by the mean of each `evaluator_lst` column.
    """
    # Mean of every metric over all rows, as a one-row frame indexed 0
    metric_means = cv_result[evaluator_lst].mean().to_frame().T

    # Identifying columns of the first row. Its index is reset so it lines up
    # with `metric_means`: concat(axis=1) aligns on index labels and would
    # otherwise yield extra NaN-filled rows when the first label is not 0.
    info_row = cv_result[model_info].iloc[:1].reset_index(drop=True)

    # Concatenate side by side
    comparison_df = pd.concat([info_row, metric_means], axis=1)

    return comparison_df

In [39]:
# Show the comparison table for the one-week window (one row per model)
pd.concat([model_comparison(cv_result, model_info, evaluator_lst) for cv_result in one_week_comparison_lst])

Unnamed: 0,Model,Dataset,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,one_week,3103.336886,9630700.0,2736.802453,0.093504,-10.46364,-10.566916,0.001028
0,GeneralizedLinearRegression,one_week,2735.590791,7483457.0,2213.622695,0.075558,-7.907729,-7.987978,0.000182
0,RandomForestRegressor,one_week,291.992031,85259.35,201.006618,0.006955,0.898514,0.8976,0.000161
0,GBTRegressor,one_week,603.438887,364138.5,401.285036,0.014171,0.566558,0.562653,0.000118


In [40]:
# Plot actual prices against every model's predictions for the one-week window
# (each Spark DataFrame is converted to pandas for plotting)
show_results(one_week_df.toPandas(), lr_pred_one_week.toPandas(), glr_pred_one_week.toPandas(), rf_pred_one_week.toPandas(), gbt_pred_one_week.toPandas(), "One week")

In [41]:
# Show the comparison table for the fifteen-day window (one row per model)
pd.concat([model_comparison(cv_result, model_info, evaluator_lst) for cv_result in fifteen_days_comparison_lst])

Unnamed: 0,Model,Dataset,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,fifteen_days,3743.377033,14012870.0,3490.62171,0.117864,-14.231418,-14.295148,0.000178
0,GeneralizedLinearRegression,fifteen_days,2845.532378,8097055.0,2431.556702,0.082404,-7.801167,-7.837992,0.00017
0,RandomForestRegressor,fifteen_days,350.335303,122734.8,282.945356,0.009561,0.866592,0.866034,0.000191
0,GBTRegressor,fifteen_days,566.443658,320858.4,398.735374,0.01384,0.65124,0.649781,0.000178


In [42]:
# Plot actual prices against every model's predictions for the fifteen-day window
# (each Spark DataFrame is converted to pandas for plotting)
show_results(fifteen_days_df.toPandas(), lr_pred_fifteen_days.toPandas(), glr_pred_fifteen_days.toPandas(), rf_pred_fifteen_days.toPandas(), gbt_pred_fifteen_days.toPandas(), "Fifteen days")

In [43]:
# Show the comparison table for the one-month window (one row per model)
pd.concat([model_comparison(cv_result, model_info, evaluator_lst) for cv_result in one_month_comparison_lst])

Unnamed: 0,Model,Dataset,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,one_month,3330.651595,11093240.0,3080.417365,0.105379,-11.945275,-11.972301,0.000166
0,GeneralizedLinearRegression,one_month,2427.883716,5894619.0,1961.502835,0.067113,-5.878736,-5.893097,0.000167
0,RandomForestRegressor,one_month,318.430907,101398.2,255.228821,0.008762,0.881673,0.881426,0.00021
0,GBTRegressor,one_month,489.226721,239342.8,353.640644,0.012387,0.720699,0.720115,0.00019


In [44]:
# Plot actual prices against every model's predictions for the one-month window
# (each Spark DataFrame is converted to pandas for plotting)
show_results(one_month_df.toPandas(), lr_pred_one_month.toPandas(), glr_pred_one_month.toPandas(), rf_pred_one_month.toPandas(), gbt_pred_one_month.toPandas(), "One month")

In [45]:
# Show the comparison table for the three-month window (one row per model)
pd.concat([model_comparison(cv_result, model_info, evaluator_lst) for cv_result in three_months_comparison_lst])

Unnamed: 0,Model,Dataset,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,three_months,3186.058742,10150970.0,2865.438293,0.099885,-3.158085,-3.160943,0.000191
0,GeneralizedLinearRegression,three_months,3137.674629,9845002.0,2677.216031,0.094457,-3.032753,-3.035524,0.000175
0,RandomForestRegressor,three_months,1008.122161,1016310.0,670.615743,0.023306,0.583695,0.583408,0.000186
0,GBTRegressor,three_months,824.815304,680320.3,627.407807,0.022141,0.721324,0.721133,0.000194


In [46]:
# Plot actual prices against every model's predictions for the three-month window
# (each Spark DataFrame is converted to pandas for plotting)
show_results(three_months_df.toPandas(), lr_pred_three_months.toPandas(), glr_pred_three_months.toPandas(), rf_pred_three_months.toPandas(), gbt_pred_three_months.toPandas(), "Three months")

## Summary

In [47]:
# All result frames, window by window (one week, fifteen days, one month,
# three months) and, within each window, in the order LR, GLR, RF, GBT.
# Built from the per-window lists instead of repeating all sixteen names.
comparison_lst = (
    one_week_comparison_lst
    + fifteen_days_comparison_lst
    + one_month_comparison_lst
    + three_months_comparison_lst
)

# Show the full comparison table. ignore_index=True gives the rows a 0..n-1
# index instead of the label 0 repeated on every row.
comparison_lst_df = pd.concat(
    [model_comparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst],
    ignore_index=True,
)
comparison_lst_df

Unnamed: 0,Model,Dataset,RMSE,MSE,MAE,MAPE,R2,Adjusted_R2,Time
0,LinearRegression,one_week,3103.336886,9630700.0,2736.802453,0.093504,-10.46364,-10.566916,0.001028
0,GeneralizedLinearRegression,one_week,2735.590791,7483457.0,2213.622695,0.075558,-7.907729,-7.987978,0.000182
0,RandomForestRegressor,one_week,291.992031,85259.35,201.006618,0.006955,0.898514,0.8976,0.000161
0,GBTRegressor,one_week,603.438887,364138.5,401.285036,0.014171,0.566558,0.562653,0.000118
0,LinearRegression,fifteen_days,3743.377033,14012870.0,3490.62171,0.117864,-14.231418,-14.295148,0.000178
0,GeneralizedLinearRegression,fifteen_days,2845.532378,8097055.0,2431.556702,0.082404,-7.801167,-7.837992,0.00017
0,RandomForestRegressor,fifteen_days,350.335303,122734.8,282.945356,0.009561,0.866592,0.866034,0.000191
0,GBTRegressor,fifteen_days,566.443658,320858.4,398.735374,0.01384,0.65124,0.649781,0.000178
0,LinearRegression,one_month,3330.651595,11093240.0,3080.417365,0.105379,-11.945275,-11.972301,0.000166
0,GeneralizedLinearRegression,one_month,2427.883716,5894619.0,1961.502835,0.067113,-5.878736,-5.893097,0.000167


In [48]:
# NOTE(review): this plots `model_results` (the per-model result CSVs loaded earlier),
# exactly as an earlier cell already does, not the `comparison_lst_df` summary built
# in this section - confirm which frame the summary is meant to show.
violin_plot(model_results, "RMSE", "Model", "RMSE value for each model")

In [49]:
# Write the summary table to GDRIVE_FINAL_RESULTS, leaving out the pandas index column
comparison_lst_df.to_csv(GDRIVE_FINAL_RESULTS, index=False)