# **Bitcoin price forecasting - Random Forest**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# GDrive root
GDRIVE_DIR = "/content/drive"

# Dataset
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"
GDRIVE_DATASET_NAME_TEST = GDRIVE_DATASET_NAME + "_test"
GDRIVE_DATASET_NAME_EXT_TEST = "/" + GDRIVE_DATASET_NAME_TEST + ".parquet"
GDRIVE_DATASET_TEST = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TEST

# Features
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"
GDRIVE_ALL_FEATURES_NAME_EXT = "/" + GDRIVE_ALL_FEATURES_NAME + ".json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = "/" + GDRIVE_MORE_REL_FEATURES_NAME + ".json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = "/" + GDRIVE_LESS_REL_FEATURES_NAME + ".json"
GDRIVE_ALL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_ALL_FEATURES_NAME_EXT
GDRIVE_MORE_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_MORE_REL_FEATURES_NAME_EXT
GDRIVE_LESS_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_LESS_REL_FEATURES_NAME_EXT

# Models
PATH_MODELS = "/content/drive/MyDrive/BDC/project/models/"
LR_MODEL_NAME = "LinearRegression"
GLR_MODEL_NAME = "GeneralizedLinearRegression"
RF_MODEL_NAME = "RandomForestRegressor"
GBT_MODEL_NAME = "GBTRegressor"

# Others
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
MODEL_NAME = "LinearRegression"
SLOW_OPERATION = True

In [46]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Define GDrive paths
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [3]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=20fa52b9c03391529374baa39a1a15dddf694818facfe3bfdb8b58f14e2e98c2
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [5]:
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

import sys
sys.path.append(GDRIVE_UTILITIES_DIR)

from imports import *

## Create the pyspark session

In [6]:
# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

## Loading dataset

In [142]:
# Load datasets into pyspark dataframe objects
test_df = spark.read.load(GDRIVE_DATASET_TEST,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

## Load features

In [86]:
# Set the depended variable
TARGET_VAL = 'market-price'

# Set the features label
FEATURES_LABEL = "features"

In [87]:
# # Loading correlation matrix features
# with open(GDRIVE_ALL_FEATURES, "r") as f:
#     all_features = json.load(f)
# print(all_features)

In [88]:
# Loading correlation matrix features
with open(GDRIVE_MORE_REL_FEATURES, "r") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [89]:
# # Loading correlation matrix features
# with open(GDRIVE_LESS_REL_FEATURES, "r") as f:
#     less_rel_features = json.load(f)
# print(less_rel_features)

## Load models

In [90]:
# Upload the 3 templates saved in Google Drive
lr = PipelineModel.load(PATH_MODELS + LR_MODEL_NAME)
glr = PipelineModel.load(PATH_MODELS + GLR_MODEL_NAME)
rf = PipelineModel.load(PATH_MODELS + RF_MODEL_NAME)
gbt = PipelineModel.load(PATH_MODELS + GBT_MODEL_NAME)

## Test models

In [91]:
# Raw features selection❗
# # Return the dataset with the selected features
# def select_features(dataset, features, featureCol, labelCol):
#   vectorAssembler = VectorAssembler(
#     inputCols = features,
#     outputCol = featureCol)

#   dataset = vectorAssembler.transform(dataset)
#   dataset = dataset.select(['timestamp','id', featureCol, labelCol])
#   return dataset

# Normalized / standardized features selection❗
def select_features(dataset, features, featureCol, labelCol):
    # Assemble the columns into a vector column
    assembler = VectorAssembler(inputCols=features, outputCol="raw_features")
    df_vector  = assembler.transform(dataset).select("timestamp", "id", "raw_features", labelCol)

    # Normalized
    # Create a Normalizer instance
    normalizer = Normalizer(inputCol="raw_features", outputCol=featureCol)

    # Fit and transform the data
    normalized_data = normalizer.transform(df_vector)

    # Show the normalized data
    # normalized_data.show()

    return normalized_data


In [92]:
def test_final_model(dataframe, trained_model, features, features_label, target_val, ml_model):
  dataframe = select_features(dataframe, features, features_label, target_val)

  # Make predictions
  predictions = trained_model.transform(dataframe)

  results = []

  # Compute validation error by several evaluator
  rmse_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='rmse')
  mae_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='mae')
  r2_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='r2')
  var_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='var')

  predictions_pd = predictions.select(target_val, "prediction").toPandas()
  mape = mean_absolute_percentage_error(predictions_pd[target_val], predictions_pd["prediction"])

  rmse = rmse_evaluator.evaluate(predictions)
  mae = mae_evaluator.evaluate(predictions)
  var = var_evaluator.evaluate(predictions)
  r2 = r2_evaluator.evaluate(predictions)
  # Adjusted R-squared
  n = predictions.count()
  p = len(predictions.columns)
  adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)

  # Use dict to store each result
  result = {
      "Model": ml_model,
      "Type": "trained",
      "RMSE": rmse,
      "MAPE":mape,
      "MAE": mae,
      "Variance": var,
      "R2": r2,
      "Adjusted_R2": adj_r2,
  }

  # Transform dict to pandas dataframe
  result_df = pd.DataFrame(result, index=[0])

  return result_df, predictions.toPandas()

In [93]:
lr_results, lr_predictions = test_final_model(test_df, lr, more_rel_features, FEATURES_LABEL, TARGET_VAL, LR_MODEL_NAME)
glr_results, glr_predictions = test_final_model(test_df, glr, more_rel_features, FEATURES_LABEL, TARGET_VAL, GLR_MODEL_NAME)
rf_results, rf_predictions = test_final_model(test_df, rf, more_rel_features, FEATURES_LABEL, TARGET_VAL, RF_MODEL_NAME)
gbt_results, gbt_predictions = test_final_model(test_df, gbt, more_rel_features, FEATURES_LABEL, TARGET_VAL, GBT_MODEL_NAME)

## Summary

In [94]:
# Define what model_info and evaluators in the Model Comparison Table
model_info = ['Model','Type']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2']

# The the Cross Validation results would like to compare
comparison_lst = [lr_results, glr_results, rf_results, gbt_results]

In [95]:
def modelComparison(cv_result, model_info, evaluator_lst):
    # Calculate mean of all splits on chosen evaluator
    col_mean_df = cv_result[evaluator_lst].mean().to_frame().T
    # Extract model info
    model_info_df = cv_result[model_info][:1]
    # Concatenate by row
    comparison_df = pd.concat([model_info_df,col_mean_df],axis=1)
    return comparison_df

In [96]:
# Show the Comparison Table
pd.concat([modelComparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,Type,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2
0,LinearRegression,trained,861.008394,0.023773,708.972561,717223.3,-1.390455,-1.396832
0,GeneralizedLinearRegression,trained,2898.282152,0.092366,2743.440032,9126036.0,-26.086135,-26.158397
0,RandomForestRegressor,trained,1169.71903,0.032692,966.380315,2111243.0,-3.411932,-3.423703
0,GBTRegressor,trained,601.314753,0.017162,508.541918,842802.7,-0.165922,-0.169032


In [97]:
def show_results(test_df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions):
  trace1 = go.Scatter(
      x = test_df['timestamp'],
      y = test_df['market-price'].astype(float),
      mode = 'lines',
      name = 'Test set'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace3 = go.Scatter(
      x = glr_predictions['timestamp'],
      y = glr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Generalized Linear Regression predictions'
  )

  trace4 = go.Scatter(
      x = rf_predictions['timestamp'],
      y = rf_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Random Forest Regressor predictions'
  )

  trace5 = go.Scatter(
      x = gbt_predictions['timestamp'],
      y = gbt_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'GBTRegressor predictions'
  )

  layout = dict(
      title='Test and predictions with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1,trace2,trace3, trace4, trace5]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Test and predictions with Rangeslider")

In [98]:
show_results(test_df.toPandas(), lr_predictions, glr_predictions, rf_predictions, gbt_predictions)

## Making future forecasting ❗

In [217]:
# Ri-allena il modello migliore con tutto il dataset
# Crea un dataset che va da oggi ad un anno ad intervalli di mezz'ora❓
# Crea una colonna isFuture(True: dataframe nuovo | False: dataframe originale)
# Concatena il dataset esistente con quello futuro

In [218]:
test_df_pd = test_df.toPandas()

In [219]:
# Add one year to the current date
last_timestamp = test_df_pd['timestamp'].iloc[-1].date()
last_id = test_df_pd['id'].iloc[-1]

# Print the new date
print(last_timestamp)
print(last_id)

2023-08-16
133679


In [220]:
# Add one year to the current date
new_date = (last_timestamp + pd.DateOffset(years=1)).date()

# Print the new date
print(new_date)

2024-08-16


In [221]:
# Create future dataframe
future = pd.date_range(last_timestamp,new_date, freq='30min')
future_df = pd.DataFrame(index=future)

# Generate the IDs based on the length of the DataFrame
future_df['id'] = range(last_id, last_id + len(future_df))

future_df

Unnamed: 0,id
2023-08-16 00:00:00,133679
2023-08-16 00:30:00,133680
2023-08-16 01:00:00,133681
2023-08-16 01:30:00,133682
2023-08-16 02:00:00,133683
...,...
2024-08-15 22:00:00,151243
2024-08-15 22:30:00,151244
2024-08-15 23:00:00,151245
2024-08-15 23:30:00,151246


In [222]:
test_df_pd = test_df_pd.set_index('timestamp')
test_df_pd

Unnamed: 0_level_0,id,market-price,total-bitcoins,market-cap,trade-volume,blocks-size,avg-block-size,n-transactions-total,n-transactions-per-block,hash-rate,...,n-transactions,estimated-transaction-volume-usd,rate-of-change,sma-5-days,sma-7-days,sma-10-days,sma-20-days,sma-50-days,sma-100-days,next-market-price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-01 00:00:00,131424,30471.500000,1.941713e+07,5.948244e+11,3.743253e+08,492534.687752,1.656755,8.585794e+08,2438.150376,3.348457e+08,...,324274.000000,3.141739e+09,0.008206,26522.418219,24258.046628,22668.340685,29721.555588,25545.010028,15837.752202,30474.000625
2023-07-01 00:30:00,131425,30474.000625,1.941715e+07,5.948174e+11,3.674813e+08,492539.282416,1.660088,8.585862e+08,2425.113861,3.368388e+08,...,323974.958333,3.116832e+09,0.008206,26523.436180,24259.374807,22669.079400,29720.413071,25545.327338,15837.863586,30476.501250
2023-07-01 01:00:00,131426,30476.501250,1.941718e+07,5.948104e+11,3.606373e+08,492543.877079,1.663420,8.585930e+08,2412.077346,3.388319e+08,...,323675.916667,3.091926e+09,0.008205,26524.452787,24260.703642,22669.818057,29719.267552,25545.644628,15837.974988,30479.001875
2023-07-01 01:30:00,131427,30479.001875,1.941720e+07,5.948034e+11,3.537933e+08,492548.471743,1.666752,8.585997e+08,2399.040831,3.408251e+08,...,323376.875000,3.067019e+09,0.008204,26525.468040,24262.033131,22670.556657,29718.119032,25545.961899,15838.086406,30481.502500
2023-07-01 02:00:00,131428,30481.502500,1.941722e+07,5.947964e+11,3.469492e+08,492553.066407,1.670085,8.586065e+08,2386.004316,3.428182e+08,...,323077.833333,3.042112e+09,0.008204,26526.481939,24263.363276,22671.295200,29716.967510,25546.279149,15838.197842,30484.003125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-16 21:30:00,133675,28750.189375,1.946029e+07,5.452773e+11,9.993861e+07,504093.807246,1.648569,8.802099e+08,3246.587838,4.081822e+08,...,509003.583333,4.299148e+09,-0.033885,28556.062063,27014.417805,24255.872276,27750.231758,26180.819034,16072.671774,28740.447500
2023-08-16 22:00:00,133676,28740.447500,1.946032e+07,5.447720e+11,1.005588e+08,504099.540082,1.647714,8.802218e+08,3241.211296,4.078024e+08,...,507657.666667,4.336937e+09,-0.033896,28556.169558,27015.213218,24256.544264,27749.468135,26181.051601,16072.766466,28730.705625
2023-08-16 22:30:00,133677,28730.705625,1.946034e+07,5.442667e+11,1.011790e+08,504105.272917,1.646859,8.802336e+08,3235.834754,4.074226e+08,...,506311.750000,4.374726e+09,-0.033908,28556.272603,27016.008641,24257.215691,27748.703383,26181.283872,16072.861083,28720.963750
2023-08-16 23:00:00,133678,28720.963750,1.946036e+07,5.437613e+11,1.017991e+08,504111.005753,1.646004,8.802455e+08,3230.458212,4.070428e+08,...,504965.833333,4.412515e+09,-0.033919,28556.371198,27016.804074,24257.886554,27747.937502,26181.515848,16072.955625,28711.221875


In [223]:
# future_df['isFuture'] = True
# test_df_pd['isFuture'] = False
test_and_future = pd.concat([test_df_pd, future_df])
test_and_future

Unnamed: 0,id,market-price,total-bitcoins,market-cap,trade-volume,blocks-size,avg-block-size,n-transactions-total,n-transactions-per-block,hash-rate,...,n-transactions,estimated-transaction-volume-usd,rate-of-change,sma-5-days,sma-7-days,sma-10-days,sma-20-days,sma-50-days,sma-100-days,next-market-price
2023-07-01 00:00:00,131424,30471.500000,1.941713e+07,5.948244e+11,3.743253e+08,492534.687752,1.656755,8.585794e+08,2438.150376,3.348457e+08,...,324274.000000,3.141739e+09,0.008206,26522.418219,24258.046628,22668.340685,29721.555588,25545.010028,15837.752202,30474.000625
2023-07-01 00:30:00,131425,30474.000625,1.941715e+07,5.948174e+11,3.674813e+08,492539.282416,1.660088,8.585862e+08,2425.113861,3.368388e+08,...,323974.958333,3.116832e+09,0.008206,26523.436180,24259.374807,22669.079400,29720.413071,25545.327338,15837.863586,30476.501250
2023-07-01 01:00:00,131426,30476.501250,1.941718e+07,5.948104e+11,3.606373e+08,492543.877079,1.663420,8.585930e+08,2412.077346,3.388319e+08,...,323675.916667,3.091926e+09,0.008205,26524.452787,24260.703642,22669.818057,29719.267552,25545.644628,15837.974988,30479.001875
2023-07-01 01:30:00,131427,30479.001875,1.941720e+07,5.948034e+11,3.537933e+08,492548.471743,1.666752,8.585997e+08,2399.040831,3.408251e+08,...,323376.875000,3.067019e+09,0.008204,26525.468040,24262.033131,22670.556657,29718.119032,25545.961899,15838.086406,30481.502500
2023-07-01 02:00:00,131428,30481.502500,1.941722e+07,5.947964e+11,3.469492e+08,492553.066407,1.670085,8.586065e+08,2386.004316,3.428182e+08,...,323077.833333,3.042112e+09,0.008204,26526.481939,24263.363276,22671.295200,29716.967510,25546.279149,15838.197842,30484.003125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-15 22:00:00,151243,,,,,,,,,,...,,,,,,,,,,
2024-08-15 22:30:00,151244,,,,,,,,,,...,,,,,,,,,,
2024-08-15 23:00:00,151245,,,,,,,,,,...,,,,,,,,,,
2024-08-15 23:30:00,151246,,,,,,,,,,...,,,,,,,,,,


In [243]:
# Convert pandas DataFrame to PySpark DataFrame
test_and_future_df = spark.createDataFrame(test_and_future.reset_index()).withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))-1).withColumnRenamed("index","timestamp")

In [244]:
# Fill all columns with zeros
test_and_future_df = test_and_future_df.fillna(0)

In [257]:
def dataset_info(dataset):
  dataset.show(3)

  # Get the number of rows
  num_rows = dataset.count()

  # Get the number of columns
  num_columns = len(dataset.columns)

  # Print the shape of the DataFrame
  print("Shape:", (num_rows, num_columns))

  # Print the schema of the DataFrame
  dataset.printSchema()

In [259]:
dataset_info(test_and_future_df)
test_and_future_df.tail(3)


+-------------------+---+------------+--------------------+--------------------+--------------------+----------------+------------------+--------------------+------------------------+--------------------+------------------+--------------------+--------------------+------------------+-----------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|          timestamp| id|market-price|      total-bitcoins|          market-cap|        trade-volume|     blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|           hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|next-market-price|
+-------------------+-

[Row(timestamp=datetime.datetime(2024, 8, 15, 23, 0), id=19822, market-price=0.0, total-bitcoins=0.0, market-cap=0.0, trade-volume=0.0, blocks-size=0.0, avg-block-size=0.0, n-transactions-total=0.0, n-transactions-per-block=0.0, hash-rate=0.0, difficulty=0.0, miners-revenue=0.0, transaction-fees-usd=0.0, n-unique-addresses=0.0, n-transactions=0.0, estimated-transaction-volume-usd=0.0, rate-of-change=0.0, sma-5-days=0.0, sma-7-days=0.0, sma-10-days=0.0, sma-20-days=0.0, sma-50-days=0.0, sma-100-days=0.0, next-market-price=0.0),
 Row(timestamp=datetime.datetime(2024, 8, 15, 23, 30), id=19823, market-price=0.0, total-bitcoins=0.0, market-cap=0.0, trade-volume=0.0, blocks-size=0.0, avg-block-size=0.0, n-transactions-total=0.0, n-transactions-per-block=0.0, hash-rate=0.0, difficulty=0.0, miners-revenue=0.0, transaction-fees-usd=0.0, n-unique-addresses=0.0, n-transactions=0.0, estimated-transaction-volume-usd=0.0, rate-of-change=0.0, sma-5-days=0.0, sma-7-days=0.0, sma-10-days=0.0, sma-20-da

## Test models

In [247]:
# Raw features selection❗
# # Return the dataset with the selected features
# def select_features(dataset, features, featureCol, labelCol):
#   vectorAssembler = VectorAssembler(
#     inputCols = features,
#     outputCol = featureCol)

#   dataset = vectorAssembler.transform(dataset)
#   dataset = dataset.select(['timestamp','id', featureCol, labelCol])
#   return dataset

# Normalized / standardized features selection❗
def select_features(dataset, features, featureCol, labelCol):
    # Assemble the columns into a vector column
    assembler = VectorAssembler(inputCols=features, outputCol="raw_features")
    df_vector  = assembler.transform(dataset).select("timestamp", "id", "raw_features", labelCol)

    # Normalized
    # Create a Normalizer instance
    normalizer = Normalizer(inputCol="raw_features", outputCol=featureCol)

    # Fit and transform the data
    normalized_data = normalizer.transform(df_vector)

    # Show the normalized data
    # normalized_data.show()

    return normalized_data


In [248]:
def test_final_model(dataframe, trained_model, features, features_label, target_val, ml_model):
  dataframe = select_features(dataframe, features, features_label, target_val)

  # Make predictions
  predictions = trained_model.transform(dataframe)

  results = []

  # Compute validation error by several evaluator
  rmse_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='rmse')
  mae_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='mae')
  r2_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='r2')
  var_evaluator = RegressionEvaluator(labelCol=target_val, predictionCol="prediction", metricName='var')

  predictions_pd = predictions.select(target_val, "prediction").toPandas()
  mape = mean_absolute_percentage_error(predictions_pd[target_val], predictions_pd["prediction"])

  rmse = rmse_evaluator.evaluate(predictions)
  mae = mae_evaluator.evaluate(predictions)
  var = var_evaluator.evaluate(predictions)
  r2 = r2_evaluator.evaluate(predictions)
  # Adjusted R-squared
  n = predictions.count()
  p = len(predictions.columns)
  adj_r2 = 1-(1-r2)*(n-1)/(n-p-1)

  # Use dict to store each result
  result = {
      "Model": ml_model,
      "Type": "trained",
      "RMSE": rmse,
      "MAPE":mape,
      "MAE": mae,
      "Variance": var,
      "R2": r2,
      "Adjusted_R2": adj_r2,
  }

  # Transform dict to pandas dataframe
  result_df = pd.DataFrame(result, index=[0])

  return result_df, predictions.toPandas()

In [249]:
lr_results, lr_predictions = test_final_model(test_and_future_df, lr, more_rel_features, FEATURES_LABEL, TARGET_VAL, LR_MODEL_NAME)
glr_results, glr_predictions = test_final_model(test_and_future_df, glr, more_rel_features, FEATURES_LABEL, TARGET_VAL, GLR_MODEL_NAME)
rf_results, rf_predictions = test_final_model(test_and_future_df, rf, more_rel_features, FEATURES_LABEL, TARGET_VAL, RF_MODEL_NAME)
gbt_results, gbt_predictions = test_final_model(test_and_future_df, gbt, more_rel_features, FEATURES_LABEL, TARGET_VAL, GBT_MODEL_NAME)

## Summary

In [250]:
# Define what model_info and evaluators in the Model Comparison Table
model_info = ['Model','Type']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2']

# The the Cross Validation results would like to compare
comparison_lst = [lr_results, glr_results, rf_results, gbt_results]

In [251]:
def modelComparison(cv_result, model_info, evaluator_lst):
    # Calculate mean of all splits on chosen evaluator
    col_mean_df = cv_result[evaluator_lst].mean().to_frame().T
    # Extract model info
    model_info_df = cv_result[model_info][:1]
    # Concatenate by row
    comparison_df = pd.concat([model_info_df,col_mean_df],axis=1)
    return comparison_df

In [252]:
# Show the Comparison Table
pd.concat([modelComparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,Type,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2
0,LinearRegression,trained,52482.9,2.22504e+20,49486.49,2507671000.0,-29.770274,-29.77959
0,GeneralizedLinearRegression,trained,inf,4.64296e+292,1.030944e+277,inf,-inf,-inf
0,RandomForestRegressor,trained,55880.22,2.369049e+20,52713.42,2849833000.0,-33.882854,-33.893415
0,GBTRegressor,trained,57673.65,2.445127e+20,54350.6,3045851000.0,-36.157858,-36.169108


In [253]:
def show_results(test_df, lr_predictions, glr_predictions, rf_predictions, gbt_predictions):
  trace1 = go.Scatter(
      x = test_df['timestamp'],
      y = test_df['market-price'].astype(float),
      mode = 'lines',
      name = 'Test set'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace2 = go.Scatter(
      x = lr_predictions['timestamp'],
      y = lr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Linear Regression predictions'
  )

  trace3 = go.Scatter(
      x = glr_predictions['timestamp'],
      y = glr_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Generalized Linear Regression predictions'
  )

  trace4 = go.Scatter(
      x = rf_predictions['timestamp'],
      y = rf_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'Random Forest Regressor predictions'
  )

  trace5 = go.Scatter(
      x = gbt_predictions['timestamp'],
      y = gbt_predictions['prediction'].astype(float),
      mode = 'lines',
      name = 'GBTRegressor predictions'
  )

  layout = dict(
      title='Test and predictions with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1,trace2,trace3, trace4, trace5]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Test and predictions with Rangeslider")

In [254]:
show_results(test_df.toPandas(), lr_predictions, glr_predictions, rf_predictions, gbt_predictions)

Output hidden; open in https://colab.research.google.com to view.