# **Bitcoin price forecasting - Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Dependencies, Libraries and Tools

In [1]:
# Path to a Java 8 JDK install.
# NOTE(review): not referenced again in the visible notebook — confirm it is still needed.
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
# Model identifier passed to the `utilities` helpers and used to build the saved-model path.
MODEL_NAME = "LinearRegression"
# NOTE(review): this flag is never read in the visible notebook; presumably meant to gate expensive cells — confirm.
SLOW_OPERATION = False

In [2]:
import warnings

# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#Install some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Configuration of the single-machine (local[*]) Spark session
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create (or reuse) the context. getOrCreate keeps this cell re-runnable:
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=5a2fa6397edd1d365ed9a4f1a2f0c4640012b1ab8fabebac7fa8a7d2294553cb
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [5]:
# Google Drive locations of the engineered dataset
GDRIVE_DIR = "/content/drive"

GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"

# File name of the engineered dataset (leading slash included)
GDRIVE_DATASET_NAME_EXT_ENG = f"/{GDRIVE_DATASET_NAME}_eng.parquet"

# Full path of the engineered dataset
GDRIVE_DATASET_NAME_ENG = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT_ENG}"


In [7]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Mount the Drive under GDRIVE_DIR; force_remount=True is passed so the mount is redone on every run
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [8]:
# Load the engineered dataset into a Spark DataFrame.
# Parquet files carry their own schema and column names, so the CSV-only
# reader options (`sep`, `inferSchema`, `header`) are not needed here.
dataset = spark.read.load(GDRIVE_DATASET_NAME_ENG, format="parquet")

# Import my utilities

In [None]:
import sys
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"
# Make the project's helper module importable (no duplicate entries when the cell is re-run)
if GDRIVE_UTILITIES_DIR not in sys.path:
    sys.path.append(GDRIVE_UTILITIES_DIR)

import shutil
# Remove the cached bytecode before importing the helpers.
# ignore_errors=True: the cache folder does not exist on a fresh Drive / first run,
# and a missing folder must not abort the cell.
shutil.rmtree(GDRIVE_UTILITIES_DIR + '/__pycache__', ignore_errors=True)

import utilities

import importlib
# Reload so edits made to utilities.py are picked up without restarting the kernel
importlib.reload(utilities)

# Loading features

In [9]:
# Folder holding the JSON feature lists
GDRIVE_FEATURES_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/features"

# Base names of the three feature sets
GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"

# File names: leading slash + base name + ".json"
GDRIVE_ALL_FEATURES_NAME_EXT = f"/{GDRIVE_ALL_FEATURES_NAME}.json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = f"/{GDRIVE_MORE_REL_FEATURES_NAME}.json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = f"/{GDRIVE_LESS_REL_FEATURES_NAME}.json"

# Full paths of the three JSON files
GDRIVE_ALL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_ALL_FEATURES_NAME_EXT}"
GDRIVE_MORE_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_MORE_REL_FEATURES_NAME_EXT}"
GDRIVE_LESS_REL_FEATURES = f"{GDRIVE_FEATURES_DIR}{GDRIVE_LESS_REL_FEATURES_NAME_EXT}"

In [10]:
# Set the target variable (name of the column to forecast)
TARGET_VAL = 'market-price'

# Set the features label (passed to the `utilities` helpers together with TARGET_VAL)
FEATURES_LABEL = "features"

In [11]:
# Load the complete feature list from its JSON file
with open(GDRIVE_ALL_FEATURES, "r") as f:
    all_features = json.load(f)
print(all_features)

['total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [12]:
# Load the "more relevant" feature list from its JSON file
with open(GDRIVE_MORE_REL_FEATURES, "r") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [13]:
# Load the "less relevant" feature list from its JSON file
with open(GDRIVE_LESS_REL_FEATURES, "r") as f:
    less_rel_features = json.load(f)
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


# Train/Test Data

In [16]:
# Apache Spark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Python
import numpy as np
import pandas as pd
from itertools import product
import time
import io
from contextlib import redirect_stdout

# Graph packages
# https://plotly.com/python/getting-started/#jupyterlab-support
# https://plotly.com/python/time-series/
import plotly.express as px

!pip install pmdarima

# ARIMA
# https://alkaline-ml.com/pmdarima/index.html
import pmdarima as pm
from pmdarima.arima import ndiffs

#Vector Autoregressions
# https://www.statsmodels.org/dev/vector_ar.html
from statsmodels.tsa.api import VAR

# Scikit-learn
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, r2_score

# Load the customized Time Series Cross Validation
# from tsCrossValidation import mulTsCrossValidation, blockedTsCrossValidation, wfTsCrossValidation, modelComparison



In [20]:
def trainSplit(dataSet, proportion):
    '''
    Description: Cut the dataSet in two at a given "id" while keeping the
                 original time-series order (no shuffling)
    Args:
        dataSet: The dataSet to split; it is filtered on its "id" column
        proportion: Fraction of the records that goes to the train side

    Return:
        train_data: Rows whose "id" is below the cut point
        test_data: Rows whose "id" is at or above the cut point
    '''
    # The cut point is the row count scaled by the proportion, truncated to an int
    cutoff = int(dataSet.count() * proportion)
    id_col = F.col("id")

    return (dataSet.filter(id_col < cutoff), dataSet.filter(id_col >= cutoff))

In [18]:
# Have a look at the data: first 5 rows of the id, timestamp and target columns
dataset.select("id","timestamp","market-price").show(5)

+---+-------------------+------------------+
| id|          timestamp|      market-price|
+---+-------------------+------------------+
|  0|2016-01-01 00:00:00|            430.89|
|  1|2016-01-01 00:30:00|430.97041666666667|
|  2|2016-01-01 01:00:00|431.05083333333334|
|  3|2016-01-01 01:30:00|431.13124999999997|
|  4|2016-01-01 02:00:00|431.21166666666664|
+---+-------------------+------------------+
only showing top 5 rows



In [21]:
# Split the dataSet
# `proportion` is the train share handed to trainSplit; it is also read as a
# global by evaluationAssemble, which stores it in the results dict.
proportion = 0.9990
#proportion = 0.7
train_data,test_data = trainSplit(dataset, proportion)

# Cache it (both subsets are reused several times below)
train_data.cache()
test_data.cache()

# Number of train and test dataSets
print(f"Training data: {train_data.count()}\nTest data: {test_data.count()}")

Training data: 131292
Test data: 132


In [22]:
# Save column name (partitionToPandas reads this global to build its frames)
column_names = dataset.columns
# Columns that are identifiers or the label, not model features
non_feature_cols  = ['id',"market-price",'timestamp']
# Everything else is a feature. Built with a filtering comprehension instead of
# calling list.remove() inside a comprehension for its side effect, which also
# left a stray "[None, None, None]" as the cell output.
feature_cols = [col_name for col_name in column_names if col_name not in non_feature_cols]

[None, None, None]

# Local Mode building

In [23]:
# Define a function to plot line-like graph
# https://plotly.com/python/time-series/#time-series-with-range-selector-buttons
def line_plot(data,graph_title):
    '''
    Description: Draw a plotly line chart with a range slider and
                 range-selector buttons on the x axis
    Args:
        data: The data (pandas dataset) to draw as lines
        graph_title: The title of the graph

    Return: None
    '''
    # (count, label, step) of every "look back" button
    presets = [(7, "1w", "day"), (1, "1m", "month"), (6, "6m", "month"), (1, "1y", "year")]
    buttons = [dict(count=count, label=label, step=step, stepmode="backward")
               for count, label, step in presets]
    # Last button resets the view to the whole series
    buttons.append(dict(step="all"))

    figure = px.line(data, title=graph_title)
    figure.update_xaxes(rangeslider_visible=True, rangeselector=dict(buttons=buttons))
    figure.show()

In [24]:
def partitionToPandas(partition_rdd):
    '''
    Description: Transform one Spark partition into a pandas dataset
    Args:
        partition_rdd: Iterator over the rows of the partition; each row
                       must expose asDict() (Spark Row does)

    Return:
        A one-element list holding the pandas dataset, whose columns are the
        global `column_names` (one list element per partition, as required
        by mapPartitions)
    '''
    # Materialise the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame for every row.
    records = [each_row.asDict() for each_row in partition_rdd]
    pandas_df = pd.DataFrame(records, columns=column_names)
    return [pandas_df]

In [25]:
def buildARIMA(pandas_df):
    '''
    Description: Build an ARIMA model on the data of one partition
    Args:
        pandas_df: pandas dataset of the partition; must contain the
                   "timestamp" and "market-price" columns

    Return:
        (arima_model, train_time): the model selected by auto_arima and the
        "Time" value, as a float, that the auto_arima trace reports for the
        candidate with the lowest AIC

    Raise:
        ValueError: if the auto_arima trace contains no "AIC" line
    '''
    # `from pyspark.sql.functions import *` in this notebook replaces max()/min()
    # with the Spark column aggregates, so use the real built-ins explicitly.
    import builtins

    # Only keep the target series, indexed by time
    pandas_df = pandas_df[['timestamp','market-price']].set_index("timestamp")

    # Choose the best degree of differencing (largest of the two tests)
    kpss_diffs = ndiffs(pandas_df, alpha=0.05, test='kpss', max_d=6)
    adf_diffs = ndiffs(pandas_df, alpha=0.05, test='adf', max_d=6)
    n_diffs = builtins.max(adf_diffs, kpss_diffs)

    # Auto training
    # p: AR (i.e., the number of lag observations)
    # d: The degree of differencing.
    # q: MA (the size of the "window")
    output = io.StringIO()
    # Capture the trace output: it is the only place the per-candidate time is reported
    with redirect_stdout(output):
        arima_model = pm.auto_arima(pandas_df, start_p=1, seasonal=False,
                             d=n_diffs, trace=True,
                             suppress_warnings=True,
                             error_action="ignore",
                             max_order=None,
                             stepwise=True)

    # Parse every "<model> : AIC=<aic>, Time=<t> sec" trace line into (aic, time).
    # Both values are converted to float BEFORE comparing: ordering the raw
    # strings would be lexicographic (e.g. "1000.5" < "999.1").
    candidates = []
    for line in output.getvalue().split('\n'):
        if "AIC" not in line:
            continue
        fields = line.split(':')[1].split(',')
        aic = float(fields[0].split('=')[1])
        fit_time = float(fields[1].split('=')[1].split(' ')[0])
        candidates.append((aic, fit_time))

    if not candidates:
        raise ValueError("auto_arima trace contained no AIC lines; cannot read the training time")

    # Time of the lowest-AIC candidate (first one wins on ties)
    train_time = builtins.min(candidates, key=lambda candidate: candidate[0])[1]

    return (arima_model, train_time)

In [26]:
def makePrediction(pandas_df, broadcast_models, model_name):
    '''
    Description: Make prediction on one partition by combining the forecasts
                 of every trained model with linearly increasing weights
    Args:
        pandas_df: Data of the partition in a pandas dataset (left unmodified)
        broadcast_models: Broadcast variable whose .value is the list of models
        model_name: "ARIMA" or "VectorARIMA"

    Return:
        partition_pred: Weighted predictions for the partition in a list
                        (one value per row of pandas_df), or the string
                        "Wrong model name" for an unknown model_name
    '''
    # `from pyspark.sql.functions import *` in this notebook replaces sum()
    # with the Spark column aggregate, so use the real built-in explicitly.
    import builtins

    prediction_lst = []
    num_pred = pandas_df.shape[0]
    num_models = len(broadcast_models.value)

    if model_name == "VectorARIMA":
        # Same preparation as in buildVectorARIMA, done on a copy so the
        # caller's frame keeps its "id" and "timestamp" columns
        series_df = pandas_df.drop(['id'], axis=1).set_index("timestamp")
        # Position of the target among the modelled variables; looked up by
        # name instead of assuming it is always the 4th column
        target_idx = list(series_df.columns).index("market-price")

        # Get the prediction from each model, then save to a list
        for model in broadcast_models.value:
            results = model.fit(maxlags=6, ic='aic')
            lag_order = results.k_ar
            prediction = results.forecast(series_df.values[-lag_order:], num_pred)
            prediction_lst.append([step[target_idx] for step in prediction])

    elif model_name == "ARIMA":
        # Get the prediction from each model, then save to a list
        for model in broadcast_models.value:
            prediction_lst.append(model.predict(num_pred).tolist())

    else:
        return "Wrong model name"

    # Model i (1-based, in list order) gets weight i
    weight = list(range(1, num_models+1))
    total_weight = builtins.sum(weight)

    # Weighted average of the models, computed independently for every step
    partition_pred = [
        builtins.sum(w * value for w, value in zip(weight, step_values)) / total_weight
        for step_values in zip(*prediction_lst)
    ]
    # Simple average
    #partition_pred = [value / num_models for value in map(sum,zip(*prediction_lst))]

    return partition_pred

In [27]:
def buildVectorARIMA(pandas_df):
    '''
    Description: Build a vector autoregression (VAR) model on one partition
    Args:
        pandas_df: pandas dataset of the partition; must contain the "id" and
                   "timestamp" columns (left unmodified)

    Return:
        (vector_arima, elapsed): the VAR model object and the seconds spent
        constructing it. The model is not fitted here: makePrediction calls
        .fit() on it, so that cost is not part of `elapsed`.
    '''
    # Drop the column that doesn't need to be predicted and index by time.
    # Done on a copy instead of in place so the caller's frame is untouched.
    series_df = pandas_df.drop(['id'], axis=1).set_index("timestamp")

    start = time.time()
    vector_arima = VAR(series_df)
    end = time.time()
    return (vector_arima, end-start)

In [28]:
def evaluationAssemble(y_test, y_pred, partition_num_train, partition_num_test, train_time, model_name):
    '''
    Description: Calculate evaluation metrics
    Args:
        y_test: Label of test data
        y_pred: Prediction on test data
        partition_num_train: Number of partition of Train data
        partition_num_test: Number of partition of Test data
        train_time: Time of training model
        model_name: "ARIMA" or "VectorARIMA"

    Return:
        results: All the evaluation metrics in a dict. "Proportion" is read
                 from the global `proportion`; for "VectorARIMA" the number of
                 predictors is read from the global `feature_cols`.

    Raise:
        ValueError: if model_name is neither "ARIMA" nor "VectorARIMA"
    '''
    # Number of predictors for the adjusted R2. Checked first so an unknown
    # model name fails with a clear message instead of an unbound-variable error.
    if model_name == "ARIMA":
        p = 1
    elif model_name == "VectorARIMA":
        p = len(feature_cols)
    else:
        raise ValueError(f"Unknown model name: {model_name!r}")

    # Explained variance score
    exp_var = explained_variance_score(y_test,y_pred)

    # Mean absolute error
    mae = mean_absolute_error(y_test,y_pred)

    # Root Mean squared error: square root taken explicitly because the
    # `squared=False` keyword is deprecated/removed in recent scikit-learn
    rmse = mean_squared_error(y_test,y_pred) ** 0.5

    # Mean squared logarithmic error
    msle = mean_squared_log_error(y_test,y_pred)

    # Mean absolute percentage error
    mape = mean_absolute_percentage_error(y_test,y_pred)

    # R2 score, the coefficient of determination
    r2 = r2_score(y_test,y_pred)

    # Adjusted R2 score; undefined (NaN) when there are not more samples than
    # predictors + 1, which would otherwise divide by zero or a negative number
    n = len(y_pred)
    dof = n - p - 1
    adj_r2 = 1-(1-r2)*(n-1)/dof if dof > 0 else float("nan")

    # Use dict to store each result
    results = {
        "Model": model_name,
        "P_train": partition_num_train,
        "P_test": partition_num_test,
        "Proportion": proportion,
        "RMSE": rmse,
        "MAPE": mape,
        "MAE": mae,
        "MSLE": msle,
        "Variance": exp_var,
        "R2": r2,
        "Adjusted_R2": adj_r2,
        "Time": train_time,
    }
    return results

In [29]:
def row2Pandasdf(row_list, column_names):
    '''
    Description: Transform a list of Spark Row objects to a pandas dataset
    Args:
        row_list: List of rows; each one must expose asDict() (Spark Row does)
        column_names: Column names of the pandas dataset. The format needs to be a list

    Return:
        pandas_df: Data in pandas dataset (empty, with the given columns, when
                   row_list is empty)
    '''
    # Build the frame once from all the records: DataFrame.append was removed
    # in pandas 2.0 and re-copied the whole frame for every row.
    records = [each_row.asDict() for each_row in row_list]
    return pd.DataFrame(records, columns=column_names)

In [32]:
# @title
def localMode(train_data, test_data, partition_num_train, partition_num_test, model_name):
    '''
    Description: Local mode on Spark: one model is trained per train
                 partition, the models are broadcast, and every test
                 partition is predicted with all of them
    Args:
        train_data: Train data in Spark dataframe
        test_data: Test data in Spark dataframe
        partition_num_train: Number of partition of Train data
        partition_num_test: Number of partition of Test data
        model_name: "ARIMA" or "VectorARIMA"

    Return:
        results: All the evaluation metrics in a dict, or the string
                 "Wrong model name" for an unknown model_name
    '''
    # `from pyspark.sql.functions import *` in this notebook replaces max()
    # with the Spark column aggregate, so use the real built-in explicitly.
    import builtins

    # Pick the per-partition trainer up front, before any Spark job is started
    if model_name == "ARIMA":
        build_model = buildARIMA
    elif model_name == "VectorARIMA":
        build_model = buildVectorARIMA
    else:
        return "Wrong model name"

    # Transform Train/Test to RDD type, manually set partition number
    test_sorted = test_data.orderBy("id")
    train_rdd = train_data.orderBy("id").rdd.coalesce(partition_num_train)
    test_rdd  = test_sorted.rdd.coalesce(partition_num_test)

    # Collect all the (model, train_time) pairs generated from each partition, to driver
    models = train_rdd.mapPartitions(partitionToPandas).map(build_model).collect()

    # The partitions train in parallel, so the slowest one gives the training time
    train_time = builtins.max(model[1] for model in models)
    models = [model[0] for model in models]
    # broadcast models
    broadcast_models = sc.broadcast(models)

    # Transform each partition of test_rdd to pandas dataset, then make prediction on each partition, then merge the results in a single list
    y_pred = test_rdd.mapPartitions(partitionToPandas).map(lambda x: makePrediction(x, broadcast_models, model_name)).reduce(lambda x,y: x+y)

    # Collect the test rows once, in the same "id" order used for the
    # predictions, so labels and predictions line up row by row
    y_test_rows = test_sorted.select("timestamp","market-price").collect()
    y_test = [row["market-price"] for row in y_test_rows]

    # Generate a pandas dataset on predictions. Can help to plot graph easier later.
    y_df = row2Pandasdf(y_test_rows, ["timestamp","market-price"])

    # Add prediction to y_test_df
    y_df["prediction"] = y_pred

    # Plot the prediction
    #line_plot(y_df.set_index("timestamp"), model_name)

    # Calculate evaluation metrics
    results = evaluationAssemble(y_test, y_pred, partition_num_train, partition_num_test, train_time, model_name)
    return results

In [33]:
# Run the local-mode pipeline with VectorARIMA:
# 3 train partitions (one model per partition) and 1 test partition.
# model_name == "ARIMA" or "VectorARIMA"
localMode(train_data, test_data, 3, 1, "VectorARIMA")

TypeError: ignored

In [None]:
# Run the local-mode pipeline with ARIMA:
# 3 train partitions (one model per partition) and 1 test partition.
# model_name == "ARIMA" or "VectorARIMA"
localMode(train_data, test_data, 3, 1, "ARIMA")

# OLD

## Evaluation of a simple model

In [None]:
# Get default params for MODEL_NAME from the project helpers
# (displayed below: a dict mapping each hyper-parameter name to a list of values)
params = utilities.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [100], 'regParam': [0.0], 'elasticNetParam': [0.0]}

In [None]:
# Validate performance with all the features.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
simple_res_all, simple_pred_all = utilities.evaluate_simple_model(dataset, all_features, params, GDRIVE_ALL_FEATURES_NAME, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",843.020822,0.023818,629.717382,76020470.0,0.990671,0.990669,1.728785


In [None]:
# Display the predictions of the all-features model through the project helper
utilities.show_results(simple_pred_all, MODEL_NAME, TARGET_VAL)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Validate performance with the "more relevant" features.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
simple_res_more_rel, simple_pred_more_rel = utilities.evaluate_simple_model(dataset, more_rel_features, params, GDRIVE_MORE_REL_FEATURES_NAME, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",652.761659,0.01778,475.443556,79876610.0,0.994407,0.994406,2.049921


In [None]:
# Display the predictions of the "more relevant" features model through the project helper
utilities.show_results(simple_pred_more_rel, MODEL_NAME, TARGET_VAL)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Validate performance with the "less relevant" features.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
simple_res_less_rel, simple_pred_less_rel = utilities.evaluate_simple_model(dataset, less_rel_features, params, GDRIVE_LESS_REL_FEATURES_NAME, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",40956.685757,1.560198,37050.205975,1652957000.0,-21.01904,-21.023229,1.635961


In [None]:
# Display the predictions of the "less relevant" features model through the project helper
utilities.show_results(simple_pred_less_rel, MODEL_NAME, TARGET_VAL)

Output hidden; open in https://colab.research.google.com to view.

## Hyperparameter tuning

In [None]:
# Feature subset used by the tuning and cross-validation cells: the "more relevant" list
choosen_features = more_rel_features
# Matching label, passed to the utilities helpers alongside the feature list
CHOSEN_FEATURES_LABEL = GDRIVE_MORE_REL_FEATURES_NAME

In [None]:
# Split proportion list handed to utilities.autoTuning
# (presumably the train shares to try — confirm against the helper)
PORTION_LIST = [0.6, 0.7, 0.8, 0.9]

In [None]:
# Get the hyper-parameter grid for MODEL_NAME from the project helpers
# (displayed below: several candidate values per hyper-parameter)
params = utilities.get_simple_model_params(MODEL_NAME)
params

{'maxIter': [5, 10, 50, 80, 100],
 'regParam': array([0. , 0.2, 0.4, 0.6, 0.8]),
 'elasticNetParam': array([0. , 0.2, 0.4, 0.6, 0.8])}

In [None]:
# Tune the hyper-parameters over every proportion in PORTION_LIST.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
hyp_res = utilities.autoTuning(dataset, choosen_features, params, CHOSEN_FEATURES_LABEL, PORTION_LIST, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
hyp_res

Unnamed: 0,Model,Type,Features,Proportion,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,autotuning,more_rel_features,0.9,"[100, 0.2, 0.8]",488.391059,0.01626,363.681154,22735860.0,0.988728,0.988724,0.273395


## Cross validation

In [None]:
# Get tuned params for MODEL_NAME from the project helpers
# NOTE(review): the values displayed below (regParam 0.4, elasticNetParam 0.2) do not match
# the autotuning result printed earlier ("[100, 0.2, 0.8]") — confirm which set is intended.
params = utilities.get_tuned_model_params(MODEL_NAME)
params

{'maxIter': [100], 'regParam': [0.4], 'elasticNetParam': [0.2]}

In [None]:
## Cross Validation Parameter
# Multiple Splits Time Series Cross Validation: 5 splits
mul_cv = dict(cv_type='mulTs', kSplits=5)

# Blocked Time Series Cross Validation: 10 splits
blk_cv = dict(cv_type='blkTs', kSplits=10)

In [None]:
# Multiple-splits time series cross validation.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
mul_cv_res, trained_models_mul_cv = utilities.tsCrossValidation(dataset, choosen_features, params, mul_cv, CHOSEN_FEATURES_LABEL, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
mul_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,mulTs,more_rel_features,1,"(21904, 21904)","[100, 0.4, 0.2]",325.58222,0.028155,203.309029,15745170.0,0.993497,0.993495,0.69414
1,LinearRegression,mulTs,more_rel_features,2,"(43808, 21904)","[100, 0.4, 0.2]",244.476367,0.026052,183.561119,6644103.0,0.990491,0.990488,1.711551
2,LinearRegression,mulTs,more_rel_features,3,"(65712, 21904)","[100, 0.4, 0.2]",318.139565,0.027301,248.829647,14629020.0,0.993202,0.993201,0.889911
3,LinearRegression,mulTs,more_rel_features,4,"(87616, 21904)","[100, 0.4, 0.2]",1894.200594,0.037411,1648.423494,86539410.0,0.957941,0.957932,0.979221
4,LinearRegression,mulTs,more_rel_features,5,"(109520, 21904)","[100, 0.4, 0.2]",1242.857634,0.048973,1106.780904,46287330.0,0.963564,0.963555,0.987871


In [None]:
# Blocked time series cross validation.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
# The trained models are stored under their own name so they no longer
# overwrite the models returned by the multiple-splits run.
blk_cv_res, trained_models_blk_cv = utilities.tsCrossValidation(dataset, choosen_features, params, blk_cv, CHOSEN_FEATURES_LABEL, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)
blk_cv_res

Unnamed: 0,Model,Type,Features,Splits,Train&Validation,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,blkTs,more_rel_features,1,"(10513, 2629)","[100, 0.4, 0.2]",12.302569,0.019521,11.579816,415.5722,0.355129,0.3539,0.532461
1,LinearRegression,blkTs,more_rel_features,2,"(10513, 2629)","[100, 0.4, 0.2]",111.132706,0.038788,94.53785,113856.5,0.90879,0.908616,1.014405
2,LinearRegression,blkTs,more_rel_features,3,"(10513, 2629)","[100, 0.4, 0.2]",352.403106,0.0328,289.903742,1148209.0,0.915844,0.915684,2.366994
3,LinearRegression,blkTs,more_rel_features,4,"(10513, 2629)","[100, 0.4, 0.2]",83.295361,0.015134,63.817925,1060637.0,0.993372,0.993359,0.823161
4,LinearRegression,blkTs,more_rel_features,5,"(10513, 2629)","[100, 0.4, 0.2]",237.802746,0.018028,188.769293,667254.0,0.926832,0.926692,0.513712
5,LinearRegression,blkTs,more_rel_features,6,"(10513, 2629)","[100, 0.4, 0.2]",539.082863,0.055065,519.488838,342743.0,-2.961505,-2.969056,0.68799
6,LinearRegression,blkTs,more_rel_features,7,"(10513, 2629)","[100, 0.4, 0.2]",1229.001013,0.0205,1018.982647,28488890.0,0.950351,0.950256,0.53917
7,LinearRegression,blkTs,more_rel_features,8,"(10513, 2629)","[100, 0.4, 0.2]",828.005376,0.010943,620.480366,38374900.0,0.982911,0.982879,0.537766
8,LinearRegression,blkTs,more_rel_features,9,"(10513, 2629)","[100, 0.4, 0.2]",905.182089,0.038108,785.102819,4313129.0,0.736626,0.736124,0.511625
9,LinearRegression,blkTs,more_rel_features,10,"(10513, 2629)","[100, 0.4, 0.2]",543.478665,0.016088,438.063379,2260959.0,0.859556,0.859289,0.537404


## Comparison table

In [None]:
# Columns of the Model Comparison Table: descriptive columns and metric columns
model_info = ['Model','Type', 'Features', 'Parameters']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2','Time']

# The result tables to compare: simple models, autotuning and both cross validations
comparison_lst = [simple_res_all, simple_res_more_rel, simple_res_less_rel, hyp_res, mul_cv_res, blk_cv_res]

In [None]:
# Show the Comparison Table: one utilities.modelComparison frame per result table, stacked together
pd.concat([utilities.modelComparison(cv_result, model_info, evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,Type,Features,Parameters,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",843.020822,0.023818,629.717382,76020470.0,0.990671,0.990669,1.728785
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",652.761659,0.01778,475.443556,79876610.0,0.994407,0.994406,2.049921
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",40956.685757,1.560198,37050.205975,1652957000.0,-21.01904,-21.023229,1.635961
0,LinearRegression,autotuning,more_rel_features,"[100, 0.2, 0.8]",488.391059,0.01626,363.681154,22735860.0,0.988728,0.988724,0.273395
0,LinearRegression,mulTs,more_rel_features,"[100, 0.4, 0.2]",805.051276,0.033579,678.180838,33969010.0,0.979739,0.979734,1.052539
0,LinearRegression,blkTs,more_rel_features,"[100, 0.4, 0.2]",484.168649,0.026498,403.072668,7677099.0,0.466791,0.465774,0.806469


## Training the final model

In [None]:
# Train the final model on the "more relevant" features with the current `params`.
# The Spark DataFrame loaded by this notebook is `dataset`; `df` is never
# defined here and raised NameError on a fresh run.
model = utilities.train_final_model(dataset, more_rel_features, params, MODEL_NAME, FEATURES_LABEL, TARGET_VAL)

In [None]:
# Drive folder where trained models are stored
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"
# Destination of this notebook's model: <models dir>/<MODEL_NAME> (despite the name, no file extension is added)
GDRIVE_MODEL_NAME_EXT = GDRIVE_MODELS_DIR + "/" + MODEL_NAME

In [None]:
# Save the trained model; overwrite() is requested so an earlier save at the same path is replaced
model.write().overwrite().save(GDRIVE_MODEL_NAME_EXT)