# **Bitcoin price forecasting - Random Forest**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# GDrive root
GDRIVE_DIR = "/content/drive"

# Dataset
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_TEST = GDRIVE_DATASET_NAME + "_test"
GDRIVE_DATASET_NAME_EXT_TEST = "/" + GDRIVE_DATASET_NAME_TEST + ".parquet"
GDRIVE_DATASET_TEST = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TEST

# Features
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"
GDRIVE_ALL_FEATURES_NAME_EXT = "/" + GDRIVE_ALL_FEATURES_NAME + ".json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = "/" + GDRIVE_MORE_REL_FEATURES_NAME + ".json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = "/" + GDRIVE_LESS_REL_FEATURES_NAME + ".json"
GDRIVE_ALL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_ALL_FEATURES_NAME_EXT
GDRIVE_MORE_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_MORE_REL_FEATURES_NAME_EXT
GDRIVE_LESS_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_LESS_REL_FEATURES_NAME_EXT

# Models
PATH_MODELS = "/content/drive/MyDrive/BDC/project/models/"
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBT_MODEL_NAME = "GBTRegressor"

# Others
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
MODEL_NAME = "LinearRegression"
SLOW_OPERATION = True

In [2]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Define GDrive paths
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [3]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=a940996fb0553cc9f54a9ab75ca38f6358b7052655d6f40f386d9046f018b8ec
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [5]:
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

import sys
sys.path.append(GDRIVE_UTILITIES_DIR)

from imports import *

## Create the pyspark session

In [6]:
# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

## Loading dataset

In [7]:
# Load datasets into pyspark dataframe objects
df = spark.read.load(GDRIVE_DATASET_TEST,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [8]:
def dataset_info(dataset):
  dataset.show(3)

  # Get the number of rows
  num_rows = dataset.count()

  # Get the number of columns
  num_columns = len(dataset.columns)

  # Print the shape of the DataFrame
  print("Shape:", (num_rows, num_columns))

  # Print the schema of the DataFrame
  dataset.printSchema()

In [9]:
if SLOW_OPERATION:
  dataset_info(df)

+-------------------+------+------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp|    id|      market-price|      total-bitcoins|          market-cap|        trade-volume|    blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|
+-------------------+------+----------------

## Load features

In [10]:
# Set the depended variable
TARGET_VAL = 'market-price'

# Set the features label
FEATURES_LABEL = "features"

In [11]:
# # Loading correlation matrix features
# with open(GDRIVE_ALL_FEATURES, "r") as f:
#     all_features = json.load(f)
# print(all_features)

In [12]:
# Loading correlation matrix features
with open(GDRIVE_MORE_REL_FEATURES, "r") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [13]:
# # Loading correlation matrix features
# with open(GDRIVE_LESS_REL_FEATURES, "r") as f:
#     less_rel_features = json.load(f)
# print(less_rel_features)

## Load models

In [14]:
# Upload the 3 templates saved in Google Drive
lr = PipelineModel.load(PATH_MODELS + LR_MODEL_NAME)
glr = PipelineModel.load(PATH_MODELS + GLR_MODEL_NAME)
rf = PipelineModel.load(PATH_MODELS + RF_MODEL_NAME)
gbt = PipelineModel.load(PATH_MODELS + GBT_MODEL_NAME)

## Test models

In [15]:
# Raw features selection❗
# # Return the dataset with the selected features
# def select_features(dataset, features, featureCol, labelCol):
#   vectorAssembler = VectorAssembler(
#     inputCols = features,
#     outputCol = featureCol)

#   dataset = vectorAssembler.transform(dataset)
#   dataset = dataset.select(['timestamp','id', featureCol, labelCol])
#   return dataset

# Normalized / standardized features selection❗
def select_features(dataset, features, featureCol, labelCol):
    # Assemble the columns into a vector column
    assembler = VectorAssembler(inputCols=features, outputCol="raw_features")
    df_vector  = assembler.transform(dataset).select("timestamp", "id", "raw_features", labelCol)

    # Normalized
    # Create a Normalizer instance
    normalizer = Normalizer(inputCol="raw_features", outputCol=featureCol)

    # Fit and transform the data
    normalized_data = normalizer.transform(df_vector)

    # Show the normalized data
    # normalized_data.show()

    return normalized_data


In [16]:
def test_final_model(dataframe, trained_model, features, features_label, target_val, ml_model):
  dataframe = select_features(dataframe, features, features_label, target_val)

  # Make predictions
  predictions = trained_model.transform(dataframe)

  results = []

  # Compute validation error by several evaluator
  rmse_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='rmse')
  mae_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='mae')
  r2_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='r2')
  var_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='var')

  predictions_pd = predictions.select(target_val, "prediction").toPandas()
  mape = mean_absolute_percentage_error(predictions_pd[target_val], predictions_pd["prediction"])

  rmse = rmse_evaluator.evaluate(predictions)
  mae = mae_evaluator.evaluate(predictions)
  var = var_evaluator.evaluate(predictions)
  r2 = r2_evaluator.evaluate(predictions)
  # Adjusted R-squared
  n = predictions.count()
  p = len(predictions.columns)
  adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)

  # Use dict to store each result
  result = {
      "Model": ml_model,
      "Type": "trained",
      "RMSE": rmse,
      "MAPE":mape,
      "MAE": mae,
      "Variance": var,
      "R2": r2,
      "Adjusted_R2": adj_r2,
  }

  # Transform dict to pandas dataframe
  result_df = pd.DataFrame(result, index=[0])

  return result_df, predictions.toPandas()

In [17]:
lr_results, lr_predictions = test_final_model(df, lr, more_rel_features, FEATURES_LABEL, TARGET_VAL, LR_MODEL_NAME)
glr_results, glr_predictions = test_final_model(df, glr, more_rel_features, FEATURES_LABEL, TARGET_VAL, GLR_MODEL_NAME)
rf_results, rf_predictions = test_final_model(df, rf, more_rel_features, FEATURES_LABEL, TARGET_VAL, RF_MODEL_NAME)
gbt_results, gbt_predictions = test_final_model(df, gbt, more_rel_features, FEATURES_LABEL, TARGET_VAL, GBT_MODEL_NAME)

## Summary

In [18]:
# Define what model_info and evaluators in the Model Comparison Table
model_info = ['Model','Type']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2']

# The the Cross Validation results would like to compare
comparison_lst = [lr_results, glr_results, rf_results, gbt_results]

In [19]:
def modelComparison(cv_result, model_info, evaluator_lst):
    # Calculate mean of all splits on chosen evaluator
    col_mean_df = cv_result[evaluator_lst].mean().to_frame().T
    # Extract model info
    model_info_df = cv_result[model_info][:1]
    # Concatenate by row
    comparison_df = pd.concat([model_info_df,col_mean_df],axis=1)
    return comparison_df

In [20]:
def show_results(test_df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions):
  trace1 = go.Scatter(
      x = test_df['timestamp'],
      y = test_df['market-price'].astype(float),
      mode = 'lines',
      name = 'Test set'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace3 = go.Scatter(
      x = glr_predictions['timestamp'],
      y = glr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Generalized Linear Regression predictions'
  )

  trace4 = go.Scatter(
      x = rf_predictions['timestamp'],
      y = rf_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Random Forest Regressor predictions'
  )

  trace5 = go.Scatter(
      x = gbt_predictions['timestamp'],
      y = gbt_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'GBTRegressor predictions'
  )

  layout = dict(
      title='Test and predictions with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1,trace2,trace3, trace4, trace5]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Test and predictions with Rangeslider")

In [21]:
# Show the Comparison Table
pd.concat([modelComparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,Type,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2
0,LinearRegression,trained,4428.598548,0.147706,4261.262069,19046190.0,-8.016113,-8.024209
0,GeneralizedLinearRegression,trained,13252.843967,0.45905,13170.521722,173462600.0,-79.743,-79.815502
0,RandomForestRegressor,trained,1759.661056,0.055297,1574.897879,5284704.0,-0.423458,-0.424736
0,GBTRegressor,trained,1172.965446,0.033211,952.018523,2240165.0,0.367505,0.366937


In [22]:
show_results(df.toPandas(), lr_predictions, glr_predictions, rf_predictions, gbt_predictions)

## Making future forecasting ❗

In [23]:
# Ri-allena il modello migliore con tutto il dataset
# Crea un dataset che va da oggi ad un anno ad intervalli di mezz'ora❓
# Crea una colonna isFuture(True: dataframe nuovo | False: dataframe originale)
# Concatena il dataset esistente con quello futuro

In [24]:
df_pd = df.toPandas()

In [25]:
# Add one year to the current date
last_timestamp = df_pd['timestamp'].iloc[-1].date()
last_id = df_pd['id'].iloc[-1]

# Print the new date
print(last_timestamp)
print(last_id)

2023-08-18
133775


In [26]:
# Add one year to the current date
new_date = (last_timestamp + pd.DateOffset(years=1)).date()

# Print the new date
print(new_date)

2024-08-18


In [27]:
# Create future dataframe
future = pd.date_range(last_timestamp,new_date, freq='30min')
future_df = pd.DataFrame(index=future)

# Generate the IDs based on the length of the DataFrame
future_df['id'] = range(last_id, last_id + len(future_df))

future_df

Unnamed: 0,id
2023-08-18 00:00:00,133775
2023-08-18 00:30:00,133776
2023-08-18 01:00:00,133777
2023-08-18 01:30:00,133778
2023-08-18 02:00:00,133779
...,...
2024-08-17 22:00:00,151339
2024-08-17 22:30:00,151340
2024-08-17 23:00:00,151341
2024-08-17 23:30:00,151342


In [28]:
df_pd = df_pd.set_index('timestamp')
df_pd

Unnamed: 0_level_0,id,market-price,total-bitcoins,market-cap,trade-volume,blocks-size,avg-block-size,n-transactions-total,n-transactions-per-block,hash-rate,...,n-unique-addresses,n-transactions,estimated-transaction-volume-usd,rate-of-change,sma-5-days,sma-7-days,sma-10-days,sma-20-days,sma-50-days,sma-100-days
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-04-01 15:30:00,127087,28470.249792,1.933499e+07,5.431791e+11,1.369586e+08,469794.565513,1.853800,8.196096e+08,2289.372401,3.395387e+08,...,674757.520833,333838.833333,1.093688e+09,-0.001076,20616.695105,20362.472876,20953.711646,33273.821989,24160.079006,15422.119868
2023-04-01 16:00:00,127088,28469.943333,1.933502e+07,5.430432e+11,1.344060e+08,469799.849629,1.855111,8.196163e+08,2289.571972,3.399753e+08,...,673866.666667,334296.666667,1.085623e+09,-0.001076,20617.834515,20363.325507,20953.613527,33273.230874,24160.419053,15422.222533
2023-04-01 16:30:00,127089,28469.636875,1.933504e+07,5.429073e+11,1.318535e+08,469805.133745,1.856422,8.196229e+08,2289.771542,3.404119e+08,...,672975.812500,334754.500000,1.077558e+09,-0.001076,20618.974843,20364.178352,20953.515306,33272.637927,24160.759096,15422.325193
2023-04-01 17:00:00,127090,28469.330417,1.933506e+07,5.427714e+11,1.293009e+08,469810.417861,1.857733,8.196296e+08,2289.971113,3.408485e+08,...,672084.958333,335212.333333,1.069493e+09,-0.001076,20620.116091,20365.031409,20953.416983,33272.043148,24161.099135,15422.427849
2023-04-01 17:30:00,127091,28469.023958,1.933509e+07,5.426355e+11,1.267483e+08,469815.701978,1.859045,8.196362e+08,2290.170684,3.412852e+08,...,671194.104167,335670.166667,1.061427e+09,-0.001076,20621.258257,20365.884681,20953.318557,33271.446538,24161.439170,15422.530502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-18 21:30:00,133771,26111.112500,1.946211e+07,5.072502e+11,1.830140e+08,504581.499297,1.691828,8.811480e+08,3643.124743,3.643419e+08,...,644542.166667,509720.312500,1.476158e+09,-0.049069,28544.973430,27069.836244,24309.023198,27671.972352,26200.579500,16080.595994
2023-08-18 22:00:00,133772,26098.300000,1.946213e+07,5.073891e+11,1.799027e+08,504586.224709,1.693243,8.811569e+08,3657.042652,3.643961e+08,...,641302.333333,511732.250000,1.418873e+09,-0.049093,28544.685601,27070.185996,24309.501142,27671.122143,26200.774834,16080.670784
2023-08-18 22:30:00,133773,26085.487500,1.946215e+07,5.075279e+11,1.767913e+08,504590.950120,1.694658,8.811657e+08,3670.960560,3.644504e+08,...,638062.500000,513744.187500,1.361587e+09,-0.049117,28544.394834,27070.531158,24309.978148,27670.271409,26200.970169,16080.745477
2023-08-18 23:00:00,133774,26072.675000,1.946217e+07,5.076668e+11,1.736800e+08,504595.675532,1.696074,8.811745e+08,3684.878469,3.645046e+08,...,634822.666667,515756.125000,1.304301e+09,-0.049141,28544.101128,27070.871731,24310.454214,27669.420149,26201.165504,16080.820074


In [29]:
# future_df['isFuture'] = True
# test_df_pd['isFuture'] = False
test_and_future = pd.concat([df_pd, future_df])
test_and_future

Unnamed: 0,id,market-price,total-bitcoins,market-cap,trade-volume,blocks-size,avg-block-size,n-transactions-total,n-transactions-per-block,hash-rate,...,n-unique-addresses,n-transactions,estimated-transaction-volume-usd,rate-of-change,sma-5-days,sma-7-days,sma-10-days,sma-20-days,sma-50-days,sma-100-days
2023-04-01 15:30:00,127087,28470.249792,1.933499e+07,5.431791e+11,1.369586e+08,469794.565513,1.853800,8.196096e+08,2289.372401,3.395387e+08,...,674757.520833,333838.833333,1.093688e+09,-0.001076,20616.695105,20362.472876,20953.711646,33273.821989,24160.079006,15422.119868
2023-04-01 16:00:00,127088,28469.943333,1.933502e+07,5.430432e+11,1.344060e+08,469799.849629,1.855111,8.196163e+08,2289.571972,3.399753e+08,...,673866.666667,334296.666667,1.085623e+09,-0.001076,20617.834515,20363.325507,20953.613527,33273.230874,24160.419053,15422.222533
2023-04-01 16:30:00,127089,28469.636875,1.933504e+07,5.429073e+11,1.318535e+08,469805.133745,1.856422,8.196229e+08,2289.771542,3.404119e+08,...,672975.812500,334754.500000,1.077558e+09,-0.001076,20618.974843,20364.178352,20953.515306,33272.637927,24160.759096,15422.325193
2023-04-01 17:00:00,127090,28469.330417,1.933506e+07,5.427714e+11,1.293009e+08,469810.417861,1.857733,8.196296e+08,2289.971113,3.408485e+08,...,672084.958333,335212.333333,1.069493e+09,-0.001076,20620.116091,20365.031409,20953.416983,33272.043148,24161.099135,15422.427849
2023-04-01 17:30:00,127091,28469.023958,1.933509e+07,5.426355e+11,1.267483e+08,469815.701978,1.859045,8.196362e+08,2290.170684,3.412852e+08,...,671194.104167,335670.166667,1.061427e+09,-0.001076,20621.258257,20365.884681,20953.318557,33271.446538,24161.439170,15422.530502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-17 22:00:00,151339,,,,,,,,,,...,,,,,,,,,,
2024-08-17 22:30:00,151340,,,,,,,,,,...,,,,,,,,,,
2024-08-17 23:00:00,151341,,,,,,,,,,...,,,,,,,,,,
2024-08-17 23:30:00,151342,,,,,,,,,,...,,,,,,,,,,


In [30]:
# Convert pandas DataFrame to PySpark DataFrame
test_and_future_df = spark.createDataFrame(test_and_future.reset_index()).withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))-1).withColumnRenamed("index","timestamp")

In [31]:
# Fill all columns with zeros
test_and_future_df = test_and_future_df.fillna(0)

In [32]:
def dataset_info(dataset):
  dataset.show(3)

  # Get the number of rows
  num_rows = dataset.count()

  # Get the number of columns
  num_columns = len(dataset.columns)

  # Print the shape of the DataFrame
  print("Shape:", (num_rows, num_columns))

  # Print the schema of the DataFrame
  dataset.printSchema()

In [33]:
dataset_info(test_and_future_df)

+-------------------+---+------------------+--------------------+--------------------+--------------------+---------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|      total-bitcoins|          market-cap|        trade-volume|    blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|
+-------------------+---+------------------+------

### Test models ❗

In [34]:
# Raw features selection❗
# # Return the dataset with the selected features
# def select_features(dataset, features, featureCol, labelCol):
#   vectorAssembler = VectorAssembler(
#     inputCols = features,
#     outputCol = featureCol)

#   dataset = vectorAssembler.transform(dataset)
#   dataset = dataset.select(['timestamp','id', featureCol, labelCol])
#   return dataset

# Normalized / standardized features selection❗
def select_features(dataset, features, featureCol, labelCol):
    # Assemble the columns into a vector column
    assembler = VectorAssembler(inputCols=features, outputCol="raw_features")
    df_vector  = assembler.transform(dataset).select("timestamp", "id", "raw_features", labelCol)

    # Normalized
    # Create a Normalizer instance
    normalizer = Normalizer(inputCol="raw_features", outputCol=featureCol)

    # Fit and transform the data
    normalized_data = normalizer.transform(df_vector)

    # Show the normalized data
    # normalized_data.show()

    return normalized_data

In [35]:
def test_final_model(dataframe, trained_model, features, features_label, target_val, ml_model):
  dataframe = select_features(dataframe, features, features_label, target_val)

  # Make predictions
  predictions = trained_model.transform(dataframe)

  results = []

  # Compute validation error by several evaluator
  rmse_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='rmse')
  mae_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='mae')
  r2_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='r2')
  var_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='var')

  predictions_pd = predictions.select(target_val, "prediction").toPandas()
  mape = mean_absolute_percentage_error(predictions_pd[target_val], predictions_pd["prediction"])

  rmse = rmse_evaluator.evaluate(predictions)
  mae = mae_evaluator.evaluate(predictions)
  var = var_evaluator.evaluate(predictions)
  r2 = r2_evaluator.evaluate(predictions)
  # Adjusted R-squared
  n = predictions.count()
  p = len(predictions.columns)
  adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)

  # Use dict to store each result
  result = {
      "Model": ml_model,
      "Type": "trained",
      "RMSE": rmse,
      "MAPE":mape,
      "MAE": mae,
      "Variance": var,
      "R2": r2,
      "Adjusted_R2": adj_r2,
  }

  # Transform dict to pandas dataframe
  result_df = pd.DataFrame(result, index=[0])

  return result_df, predictions.toPandas()

In [36]:
lr_results, lr_predictions = test_final_model(test_and_future_df, lr, more_rel_features, FEATURES_LABEL, TARGET_VAL, LR_MODEL_NAME)
glr_results, glr_predictions = test_final_model(test_and_future_df, glr, more_rel_features, FEATURES_LABEL, TARGET_VAL, GLR_MODEL_NAME)
rf_results, rf_predictions = test_final_model(test_and_future_df, rf, more_rel_features, FEATURES_LABEL, TARGET_VAL, RF_MODEL_NAME)
gbt_results, gbt_predictions = test_final_model(test_and_future_df, gbt, more_rel_features, FEATURES_LABEL, TARGET_VAL, GBT_MODEL_NAME)

### Summary ❗

In [37]:
# Define what model_info and evaluators in the Model Comparison Table
model_info = ['Model','Type']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2']

# The the Cross Validation results would like to compare
comparison_lst = [lr_results, glr_results, rf_results, gbt_results]

In [38]:
def modelComparison(cv_result, model_info, evaluator_lst):
    # Calculate mean of all splits on chosen evaluator
    col_mean_df = cv_result[evaluator_lst].mean().to_frame().T
    # Extract model info
    model_info_df = cv_result[model_info][:1]
    # Concatenate by row
    comparison_df = pd.concat([model_info_df,col_mean_df],axis=1)
    return comparison_df

In [39]:
def show_results(test_df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions):
  trace1 = go.Scatter(
      x = test_df['timestamp'],
      y = test_df['market-price'].astype(float),
      mode = 'lines',
      name = 'Test set'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace3 = go.Scatter(
      x = glr_predictions['timestamp'],
      y = glr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Generalized Linear Regression predictions'
  )

  trace4 = go.Scatter(
      x = rf_predictions['timestamp'],
      y = rf_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Random Forest Regressor predictions'
  )

  trace5 = go.Scatter(
      x = gbt_predictions['timestamp'],
      y = gbt_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'GBTRegressor predictions'
  )

  layout = dict(
      title='Test and predictions with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1,trace2,trace3, trace4, trace5]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Test and predictions with Rangeslider")

In [40]:
# Show the Comparison Table
pd.concat([modelComparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,Type,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2
0,LinearRegression,trained,3520707.0,1.349384e+22,2997410.0,12442750000000.0,-75604.227613,-75622.93329
0,GeneralizedLinearRegression,trained,14856.11,5.030538e+19,14801.73,56826640.0,-0.346178,-0.346511
0,RandomForestRegressor,trained,50493.24,1.934935e+20,43398.45,2018221000.0,-14.551006,-14.554854
0,GBTRegressor,trained,52887.9,2.026901e+20,45268.77,2247891000.0,-16.061007,-16.065228


In [41]:
show_results(test_and_future_df.toPandas(), lr_predictions, glr_predictions, rf_predictions, gbt_predictions)

Output hidden; open in https://colab.research.google.com to view.