# **Bitcoin price forecasting - Random Forest**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [1]:
# Global Constants
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
SLOW_OPERATION = False

In [2]:
#Install some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [3]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=911c5d36319b89fc9e4218123781cac4e69dd00c633117471a3ab8577fa9da7e
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [4]:
# Define GDrive paths
GDRIVE_DIR = "/content/drive"

GDRIVE_MODEL_NAME = "random_forest"
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"
GDRIVE_MODEL_NAME_EXT = GDRIVE_MODELS_DIR + "/" + GDRIVE_MODEL_NAME

GDRIVE_DATASET_RAW_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"

GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1h"
GDRIVE_DATASET_NAME_TRAIN = GDRIVE_DATASET_NAME + "_train"
GDRIVE_DATASET_NAME_TEST = GDRIVE_DATASET_NAME + "_test"

GDRIVE_DATASET_NAME_EXT = "/" + GDRIVE_DATASET_NAME + ".parquet"
GDRIVE_DATASET_NAME_EXT_TRAIN  = "/" + GDRIVE_DATASET_NAME_TRAIN + ".parquet"
GDRIVE_DATASET_NAME_EXT_TEST = "/" + GDRIVE_DATASET_NAME_TEST + ".parquet"

GDRIVE_DATASET = GDRIVE_DATASET_RAW_DIR + GDRIVE_DATASET_NAME_EXT
GDRIVE_DATASET_TRAIN = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TRAIN
GDRIVE_DATASET_TEST = GDRIVE_DATASET_OUTPUT_DIR + GDRIVE_DATASET_NAME_EXT_TEST

In [5]:
# Point Colaboratory to our Google Drive
from google.colab import drive

drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# Training the model ❗

In [6]:
# Load datasets into pyspark dataframe objects
train_df = spark.read.load(GDRIVE_DATASET_TRAIN,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

test_df = spark.read.load(GDRIVE_DATASET_TEST,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [7]:
# Function that create a simple rf model (with no hyperparameter tuning)
def simple_random_forest_model(train, featureCol, labelCol):
  rf = RandomForestRegressor(featuresCol=featureCol, labelCol=labelCol)
  rf_model = rf.fit(train)
  return rf_model

In [8]:
# Define the evaluation metrics
# Notice that r2_adj metric is included when calculating r2
metrics = ['mse', 'rmse', 'mae', 'r2']

In [9]:
# Function to compute the r2 adjusted metric
# r2 is the r2 metric, n is the number of observations, k is the number of features
def compute_r2adj(r2, n, k):
  return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

In [10]:
# Function to evaluate a model
def evaluate_models(predictions, modelName, typeName, label, prediction, metrics):
  r2 = None
  for metric in metrics:
    evaluator = RegressionEvaluator(labelCol=label, predictionCol=prediction, metricName=metric)
    evaluation = evaluator.evaluate(predictions)
    print(metric.upper()+' for '+modelName+' on '+typeName+' set: '+str(evaluation))
    if metric == 'r2':
      print('R2_adj'+' for '+modelName+' on '+typeName+' set: '+str(compute_r2adj(evaluation, predictions.count(), len(predictions.columns))))

In [11]:
# Function that create simple models (without hyperparameter tuning) and evaluate them
def test_best_features(train_data, test_data, features, featureCol, labelCol, metrics = ['rmse', 'r2']):
  # Train the models
  rf = simple_random_forest_model(train_data, featureCol, labelCol)

  # Training set evaluation
  rf_training = rf.transform(test_data)
  evaluate_models(rf_training, 'random forest regression', 'training', labelCol, 'prediction', metrics)

  # Testing set evaluation
  rf_predictions = rf.transform(test_data)
  evaluate_models(rf_predictions, 'random forest regression', 'testing', labelCol, 'prediction', metrics)

  return rf_predictions

In [12]:
# TODO: import feature selected form JSON ❗
# Consider all / relevant / selected features
all_features = ['market-cap', 'total-bitcoins', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd']
rel_features = ['market-cap', 'estimated-transaction-volume-usd', 'blocks-size', 'n-unique-addresses']
sel_features = ['total-bitcoins', 'blocks-size', 'avg-block-size', 'n-transactions-per-block', 'miners-revenue', 'n-unique-addresses', 'n-transactions']

dep_var = 'market-price'

In [13]:
# Return the dataset with the selected features
def select_features(dataset, features):
  vectorAssembler = VectorAssembler(
    inputCols = features,
    outputCol = 'features')

  dataset = vectorAssembler.transform(dataset)
  dataset = dataset.select(['timestamp','index', 'features', dep_var])
  return dataset

In [14]:
def show_results(train, test, pred):
  trace1 = go.Scatter(
      x = train['timestamp'],
      y = train['market-price'].astype(float),
      mode = 'lines',
      name = 'Train'
  )

  trace2 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = 'Test'
  )

  trace3 = go.Scatter(
      x = pred['timestamp'],
      y = pred['prediction'].astype(float),
      mode = 'lines',
      name = 'Prediction'
  )

  layout = dict(
      title='Train, test and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, test and prediction set with Rangeslider")

In [15]:
# Test performances with the selected features
rf_predictions = test_best_features(select_features(train_df, sel_features), select_features(test_df, sel_features), sel_features, 'features', dep_var)

RMSE for random forest regression on training set: 8024.046831921485
R2 for random forest regression on training set: 0.655342918346346
R2_adj for random forest regression on training set: 0.6552573487843881
RMSE for random forest regression on testing set: 8024.046831921485
R2 for random forest regression on testing set: 0.655342918346346
R2_adj for random forest regression on testing set: 0.6552573487843881


In [16]:
show_results(train_df.toPandas(), test_df.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [17]:
# Test features with the relevant features
rf_predictions = test_best_features(select_features(train_df, rel_features), select_features(test_df, rel_features), rel_features, 'features', dep_var)

RMSE for random forest regression on training set: 8831.395752321698
R2 for random forest regression on training set: 0.5824975864672218
R2_adj for random forest regression on training set: 0.5823939312674768
RMSE for random forest regression on testing set: 8831.395752321698
R2 for random forest regression on testing set: 0.5824975864672218
R2_adj for random forest regression on testing set: 0.5823939312674768


In [18]:
show_results(train_df.toPandas(), test_df.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [19]:
# Test performances with all features
rf_predictions = test_best_features(select_features(train_df, all_features), select_features(test_df, all_features), all_features, 'features', dep_var)

RMSE for random forest regression on training set: 6542.465488926179
R2 for random forest regression on training set: 0.7708693549559315
R2_adj for random forest regression on training set: 0.7708124676613677
RMSE for random forest regression on testing set: 6542.465488926179
R2 for random forest regression on testing set: 0.7708693549559315
R2_adj for random forest regression on testing set: 0.7708124676613677


In [20]:
show_results(train_df.toPandas(), test_df.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

❓❓❓

# Hyperparameter tuning ❗

In [21]:
df = spark.read.load(GDRIVE_DATASET,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

In [23]:
def select_features(dataset, features):
  vectorAssembler = VectorAssembler(
    inputCols = features,
    outputCol = 'features')

  dataset = vectorAssembler.transform(dataset)
  dataset = dataset.select(['timestamp','index', 'features', dep_var])
  return dataset

In [24]:
def split_train_test_validation(dataset):
  from pyspark.sql.window import Window
  window = Window.orderBy("timestamp")

  dataset = dataset.withColumn("index", row_number().over(window) - 1)

  # Calculates the total number of rows in the DataFrame
  total_rows = dataset.count()

  # Calculates the index corresponding to 80% of the rows
  train_percent = 0.8
  train_index = int(total_rows * train_percent)

  # Calculates the index corresponding to 15% of the rows
  test_percent = 0.15
  test_index = train_index + int(total_rows * test_percent)

  train_df = dataset.filter(col("index") <= train_index)
  test_df = dataset.filter((col("index") > train_index) & (col("index") <= test_index))
  valid_df = dataset.filter(col("index") > test_index)

  return train_df, test_df, valid_df

In [25]:
# Let's again split the data (here we also use validation data to find the best hyperparameters)
train_data, test_data, validation_data = split_train_test_validation(df)
assembler = VectorAssembler(inputCols=all_features, outputCol='features')

train_data = assembler.transform(train_data).select(['timestamp', 'index', 'features', dep_var])
test_data = assembler.transform(test_data).select(['timestamp', 'index', 'features', dep_var])
validation_data = assembler.transform(validation_data).select(['timestamp', 'index', 'features', dep_var])

In [26]:
def show_results(train, test, pred):
  trace1 = go.Scatter(
      x = train['timestamp'],
      y = train['market-price'].astype(float),
      mode = 'lines',
      name = 'Train'
  )

  trace2 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = 'Test'
  )

  trace3 = go.Scatter(
      x = pred['timestamp'],
      y = pred['market-price'].astype(float),
      mode = 'lines',
      name = 'Prediction'
  )

  layout = dict(
      title='Train, test and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, test and prediction set with Rangeslider")

In [27]:
show_results(train_data.toPandas(), test_data.toPandas(), validation_data.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [28]:
# Hyperparameter tuning for the model
def random_forest_cross_val(train, k_fold=5):
    rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var)
    pipeline = Pipeline(stages=[rf])

    param_grid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [5, 6, 7]) \
    .addGrid(rf.numTrees, [30, 50, 70]) \
    .build()

    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=RegressionEvaluator(labelCol=dep_var),
                               numFolds=k_fold,
                               collectSubModels=True
                               )

    # Run cross-validation, and choose the best set of parameters.
    cv_model = cross_val.fit(train)

    return cv_model

In [29]:
# Execute cross validation with the model (high RAM usage!)
cv_rf_models = random_forest_cross_val(validation_data)

In [30]:
# Summarizes all the models trained during cross validation
def summarize_rf_models(cv_models):
    for k, models in enumerate(cv_models):
        print("*************** Fold #{:d} ***************\n".format(k+1))
        for i, m in enumerate(models):
            print("--- Model #{:d} out of {:d} ---".format(i+1, len(models)))
            print("\tParameters: maxDepth=[{:.3f}]; numTrees=[{:.3f}] ".format(m.stages[-1]._java_obj.getMaxDepth(), m.stages[-1]._java_obj.getNumTrees()))
            print("\tModel summary: {}\n".format(m.stages[-1]))
        print("***************************************\n")

In [31]:
# Call the function above|
summarize_rf_models(cv_rf_models.subModels)

*************** Fold #1 ***************

--- Model #1 out of 9 ---
	Parameters: maxDepth=[5.000]; numTrees=[30.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_dd7409a9bb6b, numTrees=30, numFeatures=14

--- Model #2 out of 9 ---
	Parameters: maxDepth=[5.000]; numTrees=[50.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_dd7409a9bb6b, numTrees=50, numFeatures=14

--- Model #3 out of 9 ---
	Parameters: maxDepth=[5.000]; numTrees=[70.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_dd7409a9bb6b, numTrees=70, numFeatures=14

--- Model #4 out of 9 ---
	Parameters: maxDepth=[6.000]; numTrees=[30.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_dd7409a9bb6b, numTrees=30, numFeatures=14

--- Model #5 out of 9 ---
	Parameters: maxDepth=[6.000]; numTrees=[50.000] 
	Model summary: RandomForestRegressionModel: uid=RandomForestRegressor_dd7409a9bb6b, numTrees=50, numFeatures=14

--- Model

In [32]:
# Summarize average error
for i, avg_rmse in enumerate(cv_rf_models.avgMetrics):
    print("Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:3f}".format(i+1, avg_rmse))

Avg. RMSE computed across k-fold cross validation for model setting #1: 286.088121
Avg. RMSE computed across k-fold cross validation for model setting #2: 274.388786
Avg. RMSE computed across k-fold cross validation for model setting #3: 276.648275
Avg. RMSE computed across k-fold cross validation for model setting #4: 202.517021
Avg. RMSE computed across k-fold cross validation for model setting #5: 196.188401
Avg. RMSE computed across k-fold cross validation for model setting #6: 199.018909
Avg. RMSE computed across k-fold cross validation for model setting #7: 149.418197
Avg. RMSE computed across k-fold cross validation for model setting #8: 144.333162
Avg. RMSE computed across k-fold cross validation for model setting #9: 145.725451


In [33]:
# Get the best model to extract best hyperparameters
best_rf_model_params = cv_rf_models.bestModel.stages[-1].extractParamMap()
print('Best parameters for random forest regressor:')
for param, value in best_rf_model_params.items():
    print(param.name, "=", value)

Best parameters for random forest regressor:
bootstrap = True
cacheNodeIds = False
checkpointInterval = 10
featureSubsetStrategy = auto
featuresCol = features
impurity = variance
labelCol = market-price
leafCol = 
maxBins = 32
maxDepth = 7
maxMemoryInMB = 256
minInfoGain = 0.0
minInstancesPerNode = 1
minWeightFractionPerNode = 0.0
numTrees = 50
predictionCol = prediction
seed = 5377348841675092911
subsamplingRate = 1.0


In [34]:
# Fit a model with best parameters
rf = RandomForestRegressor(featuresCol='features', labelCol=dep_var, maxDepth=7, numTrees=50)
rf_model = rf.fit(validation_data)

In [35]:
# Training set evaluation
rf_training = rf_model.transform(train_data)
evaluate_models(rf_training, 'random forest regressor', 'training', dep_var, 'prediction', metrics)

# Testing set evaluation
rf_predictions = rf_model.transform(test_data)
evaluate_models(rf_predictions, 'random forest regressor', 'testing', dep_var, 'prediction', metrics)

MSE for random forest regressor on training set: 209490272.355465
RMSE for random forest regressor on training set: 14473.778786324772
MAE for random forest regressor on training set: 13837.839559197944
R2 for random forest regressor on training set: -3.0790708352819145
R2_adj for random forest regressor on training set: -3.079323948466362
MSE for random forest regressor on testing set: 448185462.5288945
RMSE for random forest regressor on testing set: 21170.39117562296
MAE for random forest regressor on testing set: 17111.342070425137
R2 for random forest regressor on testing set: -1.3381623687914828
R2_adj for random forest regressor on testing set: -1.3389364409522426


In [36]:
def show_results(train, test, pred):
  trace1 = go.Scatter(
      x = train['timestamp'],
      y = train['market-price'].astype(float),
      mode = 'lines',
      name = 'Train'
  )

  trace2 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = 'Test'
  )

  trace3 = go.Scatter(
      x = pred['timestamp'],
      y = pred['prediction'].astype(float),
      mode = 'lines',
      name = 'Prediction'
  )

  layout = dict(
      title='Train, test and prediction set with Rangeslider',
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2, trace3]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, test and prediction set with Rangeslider")

In [37]:
show_results(train_data.toPandas(), test_data.toPandas(), rf_predictions.toPandas())

Output hidden; open in https://colab.research.google.com to view.

In [29]:
# Save the best model
# rf_model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)

## Evaluation ❗

In [None]:
# Function to evaluate a model
def evaluation(pred, n_features):
  from pyspark.ml.evaluation import RegressionEvaluator

  evaluator = RegressionEvaluator(
      predictionCol="prediction",  # Colonna delle previsioni
      labelCol="market-price",  # Colonna delle etichette di output
  )

  mse = evaluator.evaluate(pred, {evaluator.metricName: "mse"})
  rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
  r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})
  mae = evaluator.evaluate(pred, {evaluator.metricName: "mae"})

  from pyspark.sql.functions import abs, col
  from pyspark.sql import functions as F
  from pyspark.ml.evaluation import RegressionEvaluator

  # Calcola il MAPE
  mape = pred.withColumn("error", abs(col("market-price") - col("prediction")) / col("market-price")) \
          .selectExpr("avg(error) * 100 as mape") \
          .collect()[0]["mape"]

  # adj_r2
  n = pred.count()  # Numero di osservazioni
  p = n_features # Numero di predittori nel modello
  adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

  print("MSE = %s" % (mse)) # deve essere un valore non negativo, dove un valore di 0 indica una perfetta corrispondenza tra i valori predetti e quelli di riferimento
  print("RMSE = %s" % (rmse)) # dovresti considerare il valore di RMSE in relazione al range dei valori target nel tuo problema specifico
  print("R2 = %s" % (r2)) # piú é vicino ad 1 meglio é
  print("MAE = %s" % (mae)) # può essere utile confrontare il valore di MAE con quello di altri modelli o con il range dei valori target per valutare la sua precisione
  print("MAPE = %s" % (mape)) # di solito viene utilizzato come misura relativa per confrontare la precisione di modelli diversi
  print("ADJ R2 = %s" % (adj_r2)) # piú é vicino ad 1 meglio é

In [None]:
evaluation(all_predictions_df, len(all_columns))
# evaluation(rel_predictions_df)
# evaluation(sel_predictions_df)

MSE = 77058739.66501749
RMSE = 8778.310752361042
R2 = 0.5875016663735204
MAE = 7484.627081452254
MAPE = 27.13897711289007
ADJ R2 = 0.5872147822865472
