# **Bitcoin price forecasting - Linear Regression**
### Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



## Global constants, dependencies, libraries and tools

In [1]:
# Main constants
GDRIVE_DIR = "/content/drive"  # Mount point passed to drive.mount; root of every project path below
MODEL_NAME = "LinearRegression"  # Selects the estimator / default params and names the saved-model folder
SLOW_OPERATIONS = True  # Set to False to skip expensive inspection cells (e.g. dataset_info)

In [2]:
# Datasets dirs
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"

# Datasets names
DATASET_TRAIN_VALID_NAME = "bitcoin_blockchain_data_30min_train_valid"

# Datasets paths
GDRIVE_DATASET_TRAIN_VALID  = GDRIVE_DATASET_OUTPUT_DIR + "/" + DATASET_TRAIN_VALID_NAME + ".parquet"

# --------------------------------------------- #

# Features dir
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"

# Features names
ALL_FEATURES_NAME = "all_features"
MORE_REL_FEATURES_NAME = "more_rel_features"
LESS_REL_FEATURES_NAME = "less_rel_features"

# Features labels
FEATURES_LABEL = "features"
TARGET_LABEL = "next-market-price"

# Features paths
GDRIVE_ALL_FEATURES = GDRIVE_FEATURES_DIR + "/" + ALL_FEATURES_NAME + ".json"
GDRIVE_MORE_REL_FEATURES = GDRIVE_FEATURES_DIR + "/" + MORE_REL_FEATURES_NAME + ".json"
GDRIVE_LESS_REL_FEATURES = GDRIVE_FEATURES_DIR + "/" + LESS_REL_FEATURES_NAME + ".json"

# --------------------------------------------- #

# Model dir
GDRIVE_MODELS_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/models"

# Model path
GDRIVE_MODEL = GDRIVE_MODELS_DIR + "/" + MODEL_NAME

# --------------------------------------------- #

# Utilities dir
GDRIVE_UTILITIES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/utilities"

In [3]:
# Point Colaboratory to our Google Drive
from google.colab import drive

# Mount Google Drive at GDRIVE_DIR (force_remount=True asks Colab to mount again even if already mounted)
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


In [4]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# Install Spark and related dependencies
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=fc641814102f5700c832fc1c8c31a41ea818b0b287b74e687b1b0de0d457576e
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


## Import files

In [51]:
import sys
sys.path.append(GDRIVE_UTILITIES_DIR)

from imports import *
import utilities, parameters

importlib.reload(utilities)
importlib.reload(parameters)

<module 'parameters' from '/content/drive/MyDrive/BDC/project/utilities/parameters.py'>

## Create the pyspark session

In [7]:
# Spark configuration for a single-machine (local) run
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Build the session through getOrCreate so the cell can be re-run safely:
# constructing a second SparkContext directly raises an error while one is
# already active, whereas getOrCreate returns the existing session.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep the `sc` handle available for any cell that uses the context directly
sc = spark.sparkContext

## Loading dataset

In [8]:
# Load datasets into pyspark dataset objects
df = spark.read.load(GDRIVE_DATASET_TRAIN_VALID,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    )

In [9]:
def dataset_info(dataset):
    """Print a quick overview of a dataset.

    Shows the first 3 rows, prints the (rows, columns) shape and then the
    schema. Counting the rows triggers a full pass over the data.

    Args:
        dataset: object exposing ``show``, ``count``, ``columns`` and
            ``printSchema`` (a PySpark DataFrame in this notebook).
    """
    # Preview of the first rows
    dataset.show(3)

    # (number of rows, number of columns)
    shape = (dataset.count(), len(dataset.columns))
    print("Shape:", shape)

    # Column names and types
    dataset.printSchema()

In [10]:
if SLOW_OPERATIONS:
  dataset_info(df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-------------------+---+-----------

## Loading features

In [11]:
# Load the complete list of feature names from its JSON file
with open(GDRIVE_ALL_FEATURES, "r") as f:
    all_features = json.load(f)
print(all_features)

['market-price', 'total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [12]:
# Load the "more_rel_features" subset of feature names from its JSON file
with open(GDRIVE_MORE_REL_FEATURES, "r") as f:
    more_rel_features = json.load(f)
print(more_rel_features)

['market-price', 'market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'estimated-transaction-volume-usd', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [13]:
# Load the "less_rel_features" subset of feature names from its JSON file
with open(GDRIVE_LESS_REL_FEATURES, "r") as f:
    less_rel_features = json.load(f)
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


## [TOFIX] Shared functions ❗

In [108]:
from itertools import product
from dateutil.relativedelta import relativedelta

##########
# SHARED #
##########

# Return the dataset with the selected features
def select_features(dataset, features, features_label, target_label):
    """Assemble the chosen columns into a single vector column.

    Args:
        dataset: input DataFrame; must contain "timestamp", "id", every
            column listed in ``features`` and ``target_label``.
        features: names of the columns to pack into the vector.
        features_label: name of the output vector column.
        target_label: name of the target column to keep.

    Returns:
        A DataFrame with the columns "timestamp", "id", ``features_label``
        and ``target_label``.
    """
    assembler = VectorAssembler(inputCols=features, outputCol=features_label)
    assembled = assembler.transform(dataset)
    return assembled.select("timestamp", "id", features_label, target_label)

# Normalized / standardized features selection
def select_normalized_features(dataset, features, features_label, target_label):
    """Assemble the chosen columns into a vector and pass it through a Normalizer.

    The Normalizer is a plain transformer: it is applied with ``transform``
    only, there is no fitting step and nothing is learned from the data.

    Args:
        dataset: input DataFrame; must contain "timestamp", "id", every
            column listed in ``features`` and ``target_label``.
        features: names of the columns to pack into the vector.
        features_label: name of the normalized output vector column.
        target_label: name of the target column to keep.

    Returns:
        A DataFrame with the columns "timestamp", "id", ``features_label``
        and ``target_label``.
    """
    kept_columns = ("timestamp", "id", features_label, target_label)

    # Step 1: pack the raw feature columns into an intermediate vector column
    raw_vectors = (
        VectorAssembler(inputCols=features, outputCol="raw_features")
        .transform(dataset)
        .select("timestamp", "id", "raw_features", target_label)
    )

    # Step 2: normalize the intermediate vector into the final features column
    scaler = Normalizer(inputCol="raw_features", outputCol=features_label)
    return scaler.transform(raw_vectors).select(*kept_columns)

def dataset_split(dataset):
    """Split a dataset into train / validation parts by time.

    The validation part is the final month of data: every row whose
    "timestamp" is later than (latest timestamp - 1 month). Everything else is
    the training part.

    The latest timestamp is computed with a ``max`` aggregation rather than
    ``last``: ``last`` returns whichever row happens to come last, which is not
    guaranteed to be the most recent one when row order is not deterministic.

    Args:
        dataset: DataFrame with a "timestamp" column.
            Assumes the collected value supports subtracting a
            ``relativedelta`` (e.g. a datetime) - TODO confirm for other inputs.

    Returns:
        Tuple ``(train_data, valid_data)``.

    Raises:
        ValueError: if the dataset has no timestamp to split on (empty input).
    """
    # Most recent timestamp in the dataset, independent of row order
    latest_timestamp = dataset.agg({"timestamp": "max"}).collect()[0][0]
    if latest_timestamp is None:
        raise ValueError("Cannot split the dataset: no 'timestamp' value found (empty dataset?)")

    # Subtract one month from the most recent date to get the split point
    split_date = latest_timestamp - relativedelta(months=1)

    # Split the dataset around the split date
    train_data = dataset.filter(dataset['timestamp'] <= split_date)
    valid_data = dataset.filter(dataset['timestamp'] > split_date)

    return train_data, valid_data

def show_results(results, ml_model):
    """Plot the actual and the predicted next market price over time.

    Args:
        results: pandas DataFrame with the columns "timestamp",
            "next-market-price" and "prediction".
        ml_model: model name, used in the plot title and file name.
    """
    plot_title = ml_model + ' predictions'

    actual_trace = go.Scatter(
        x=results['timestamp'],
        y=results['next-market-price'].astype(float),
        mode='lines',
        name='Next Market price (usd)'
    )

    predicted_trace = go.Scatter(
        x=results['timestamp'],
        y=results['prediction'].astype(float),
        mode='lines',
        name='Predicted next market price (usd)'
    )

    # Range-selector buttons as (months to look back, label);
    # change the counts to the desired amount of months.
    range_buttons = [
        dict(count=months, label=label, step='month', stepmode='backward')
        for months, label in ((1, '1m'), (6, '6m'), (12, '1y'), (36, '3y'))
    ]
    range_buttons.append(dict(step='all'))

    layout = dict(
        title=plot_title,
        xaxis=dict(
            rangeselector=dict(buttons=range_buttons),
            rangeslider=dict(visible=True),
            type='date'
        )
    )

    fig = dict(data=[actual_trace, predicted_trace], layout=layout)
    iplot(fig, filename=plot_title)

# def modelComparison(cv_result, model_info, evaluator_lst):
#     # Calculate mean of all splits on chosen evaluator
#     col_mean_df = cv_result[evaluator_lst].mean().to_frame().T

#     # Extract model info
#     model_info_df = cv_result[model_info][:1]

#     # Concatenate by row
#     comparison_df = pd.concat([model_info_df,col_mean_df], axis=1)

#     return comparison_df

def model_selection(ml_model, param, features_label, target_label):
    """Build an untrained regressor of the requested kind.

    Args:
        ml_model: one of "LinearRegression", "GeneralizedLinearRegression",
            "RandomForestRegressor" or "GBTRegressor".
        param: dict of hyperparameters; the keys read depend on the model
            (maxIter / regParam / elasticNetParam, maxIter / regParam,
            numTrees / maxDepth, maxIter / maxDepth / stepSize).
        features_label: name of the features vector column.
        target_label: name of the label column.

    Returns:
        The configured estimator.

    Raises:
        ValueError: if ``ml_model`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
        KeyError: if ``param`` lacks a hyperparameter required by the model.
    """
    if ml_model == "LinearRegression":
        return LinearRegression(
            featuresCol=features_label,
            labelCol=target_label,
            maxIter=param['maxIter'],
            regParam=param['regParam'],
            elasticNetParam=param['elasticNetParam'],
        )

    if ml_model == "GeneralizedLinearRegression":
        return GeneralizedLinearRegression(
            featuresCol=features_label,
            labelCol=target_label,
            maxIter=param['maxIter'],
            regParam=param['regParam'],
        )

    if ml_model == "RandomForestRegressor":
        return RandomForestRegressor(
            featuresCol=features_label,
            labelCol=target_label,
            numTrees=param["numTrees"],
            maxDepth=param["maxDepth"],
        )

    if ml_model == "GBTRegressor":
        return GBTRegressor(
            featuresCol=features_label,
            labelCol=target_label,
            maxIter=param['maxIter'],
            maxDepth=param['maxDepth'],
            stepSize=param['stepSize'],
        )

    raise ValueError(
        "Unsupported model '{}': expected one of LinearRegression, "
        "GeneralizedLinearRegression, RandomForestRegressor, GBTRegressor".format(ml_model)
    )

def model_evaluation(target_label, predictions, num_features=None):
    """Compute regression metrics for a predictions DataFrame.

    Args:
        target_label: name of the column holding the true values.
        predictions: DataFrame with ``target_label`` and a "prediction" column.
        num_features: number of predictors used by the model, needed for the
            adjusted R-squared. When omitted, the number of columns of
            ``predictions`` is used, as this function always did.
            NOTE(review): that legacy fallback counts DataFrame columns, not
            model inputs, so callers that know the real feature count should
            pass it explicitly.

    Returns:
        Dict with the keys 'mse', 'rmse', 'mae', 'r2' and 'adj_r2'.
        'adj_r2' is NaN when there are not enough rows for it to be defined
        (n - p - 1 <= 0).
    """
    results = {}
    for metric_name in ('mse', 'rmse', 'mae', 'r2'):
        evaluator = RegressionEvaluator(labelCol=target_label, predictionCol="prediction", metricName=metric_name)
        results[metric_name] = evaluator.evaluate(predictions)

    # Adjusted R-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1)
    n = predictions.count()
    p = len(predictions.columns) if num_features is None else num_features
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom > 0:
        results['adj_r2'] = 1 - (1 - results['r2']) * (n - 1) / degrees_of_freedom
    else:
        # Too few rows: avoid a division by zero / meaningless negative denominator
        results['adj_r2'] = float('nan')

    return results

## [TOFIX] Simple model functions ❗

In [68]:
############################
# MODEL TRAIN / VALIDATION #
############################

# Function that create simple models (without hyperparameter tuning) and evaluate them
def model_train_valid_(dataset, params, features, model_type, model_name, features_name, features_label, target_label):
    """Train and validate a model for every combination of the given parameters.

    The dataset is split once by time (see ``dataset_split``); each parameter
    combination is fitted on the training part and evaluated on the
    validation part.

    Args:
        dataset: input DataFrame with the raw feature columns, "timestamp",
            "id" and ``target_label``.
        params: dict mapping each hyperparameter name to a list of values;
            the cartesian product of the lists is explored.
        features: names of the feature columns.
        model_type: "simple" assembles the features as they are;
            "simple_norm" and "final_validated" assemble and normalize them.
            Any other value leaves ``dataset`` untouched.
        model_name: model identifier understood by ``model_selection``.
        features_name: label of the feature set, reported in the results.
        features_label: name of the features vector column.
        target_label: name of the label column.

    Returns:
        Tuple ``(results_pd, predictions_pd)``: a pandas DataFrame with one
        row of metrics per parameter combination (previously only the last
        combination survived), and the validation predictions of the last
        combination as a pandas DataFrame.

    Raises:
        ValueError: if ``params`` yields no combination (some value list is empty).
    """
    # All combinations of params
    param_lst = [dict(zip(params, combo)) for combo in product(*params.values())]
    if not param_lst:
        raise ValueError("params yields no combination: every hyperparameter needs at least one value")

    # Select the features according to the model type
    if model_type == "simple":
        dataset = select_features(dataset, features, features_label, target_label)
    elif model_type == "simple_norm" or model_type == "final_validated":
        dataset = select_normalized_features(dataset, features, features_label, target_label)

    # The split does not depend on the parameters: compute it once
    train_data, valid_data = dataset_split(dataset)

    rows = []
    predictions = None
    for param in param_lst:
        # Chosen model
        model = model_selection(model_name, param, features_label, target_label)

        # Single-stage pipeline (features are already assembled above)
        pipeline = Pipeline(stages=[model])

        # Train the model and measure the fitting time
        start = time.time()
        pipeline_model = pipeline.fit(train_data)
        end = time.time()

        # Make predictions on the validation part
        predictions = pipeline_model.transform(valid_data).select(target_label, "prediction", 'timestamp')

        # Compute validation error by several evaluators
        eval_res = model_evaluation(target_label, predictions)

        # One result row per parameter combination
        rows.append({
            "Model": model_name,
            "Type": model_type,
            "Features": features_name,
            "Parameters": list(param.values()),
            "MSE": eval_res['mse'],
            "RMSE": eval_res['rmse'],
            "MAE": eval_res['mae'],
            "R2": eval_res['r2'],
            "Adjusted_R2": eval_res['adj_r2'],
            "Time": end - start,
        })

    # Transform the collected rows to a pandas dataset
    results_pd = pd.DataFrame(rows)

    return results_pd, predictions.toPandas()

## Simple model

In [64]:
MODEL_TYPE = "simple"

In [65]:
# Get default params
params = parameters.get_defaults_model_params(MODEL_NAME)
params

{'maxIter': [100], 'regParam': [0.0], 'elasticNetParam': [0.0]}

In [69]:
# Valid performances with all the features
simple_res_all, simple_pred_all = model_train_valid_(df, params, all_features, MODEL_TYPE, MODEL_NAME, ALL_FEATURES_NAME, FEATURES_LABEL, TARGET_LABEL)
simple_res_all

Unnamed: 0,Model,Type,Features,Parameters,MSE,RMSE,MAE,R2,Adjusted_R2,Time
0,LinearRegression,simple,all_features,"[100, 0.0, 0.0]",29.236022,5.407034,4.381591,0.999966,0.999966,2.040987


In [70]:
show_results(simple_pred_all, MODEL_NAME)

In [71]:
# Valid performances with the corr matrix features
simple_res_more_rel, simple_pred_more_rel = model_train_valid_(df, params, more_rel_features, MODEL_TYPE, MODEL_NAME, MORE_REL_FEATURES_NAME, FEATURES_LABEL, TARGET_LABEL)
simple_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,MSE,RMSE,MAE,R2,Adjusted_R2,Time
0,LinearRegression,simple,more_rel_features,"[100, 0.0, 0.0]",128.354483,11.329364,9.424813,0.999851,0.999851,1.328911


In [72]:
show_results(simple_pred_more_rel, MODEL_NAME)

In [73]:
# Valid performances with the corr matrix features
simple_res_less_rel, simple_pred_less_rel = model_train_valid_(df, params, less_rel_features, MODEL_TYPE, MODEL_NAME, LESS_REL_FEATURES_NAME, FEATURES_LABEL, TARGET_LABEL)
simple_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,MSE,RMSE,MAE,R2,Adjusted_R2,Time
0,LinearRegression,simple,less_rel_features,"[100, 0.0, 0.0]",676607100.0,26011.673055,25379.746393,-785.269831,-786.912456,2.340446


In [74]:
show_results(simple_pred_less_rel, MODEL_NAME)

## Simple model w/ data normalization

In [91]:
MODEL_TYPE = "simple_norm"

In [92]:
# Valid performances with all the features
simple_norm_res_all, simple_norm_pred_all = model_train_valid_(df, params, all_features, MODEL_TYPE, MODEL_NAME, ALL_FEATURES_NAME, FEATURES_LABEL, TARGET_LABEL)
simple_norm_res_all

Unnamed: 0,Model,Type,Features,Parameters,MSE,RMSE,MAE,R2,Adjusted_R2,Time
0,LinearRegression,simple_norm,all_features,"[100, 0.0, 0.0]",1389959000.0,37282.156811,35851.425641,-7.372336,-7.373053,2.155105


In [93]:
show_results(simple_norm_pred_all, MODEL_NAME)

Output hidden; open in https://colab.research.google.com to view.

In [94]:
# Valid performances with the corr matrix features
simple_norm_res_more_rel, simple_norm_pred_more_rel = model_train_valid_(df, params, more_rel_features, MODEL_TYPE, MODEL_NAME, MORE_REL_FEATURES_NAME, FEATURES_LABEL, TARGET_LABEL)
simple_norm_res_more_rel

Unnamed: 0,Model,Type,Features,Parameters,MSE,RMSE,MAE,R2,Adjusted_R2,Time
0,LinearRegression,simple_norm,more_rel_features,"[100, 0.0, 0.0]",113757900.0,10665.736198,7933.800274,0.314786,0.314727,1.842914


In [95]:
show_results(simple_norm_pred_more_rel, MODEL_NAME)

Output hidden; open in https://colab.research.google.com to view.

In [96]:
# Valid performances with the corr matrix features
simple_norm_res_less_rel, simple_norm_pred_less_rel = model_train_valid_(df, params, less_rel_features, MODEL_TYPE, MODEL_NAME, LESS_REL_FEATURES_NAME, FEATURES_LABEL, TARGET_LABEL)
simple_norm_res_less_rel

Unnamed: 0,Model,Type,Features,Parameters,MSE,RMSE,MAE,R2,Adjusted_R2,Time
0,LinearRegression,simple_norm,less_rel_features,"[100, 0.0, 0.0]",655064700.0,25594.230853,21953.469488,-2.945743,-2.94608,4.213806


In [97]:
show_results(simple_norm_pred_less_rel, MODEL_NAME)

Output hidden; open in https://colab.research.google.com to view.

## [TOFIX] Cross validation functions ❗

In [105]:
####################
# CROSS VALIDATION #
####################

def cross_validation(dataset, features_label, target_label, k_fold=5, seed=42):
    """Grid-search a LinearRegression with k-fold cross validation.

    The grid covers maxIter in [5, 10, 50, 80, 100] and regParam /
    elasticNetParam in [0.0, 0.2, 0.4, 0.6, 0.8]. Models are scored with a
    RegressionEvaluator left on its default metric.

    NOTE(review): CrossValidator assigns rows to folds at random, so a
    validation fold can contain rows that are earlier in time than training
    rows - confirm this is acceptable for time-ordered data.

    Args:
        dataset: DataFrame with the features vector and the label column.
        features_label: name of the features vector column.
        target_label: name of the label column.
        k_fold: number of folds.
        seed: seed for the fold assignment, fixed by default so that repeated
            runs produce the same folds and therefore comparable metrics.

    Returns:
        The fitted CrossValidatorModel (sub-models are collected).
    """
    # Linear regression wrapped in a single-stage pipeline
    model = LinearRegression(featuresCol=features_label, labelCol=target_label)
    pipeline = Pipeline(stages=[model])

    # Values explored for both regularization parameters
    reg_values = np.arange(0, 1, 0.2).round(decimals=2)

    param_grid = (
        ParamGridBuilder()
        .addGrid(model.maxIter, [5, 10, 50, 80, 100])
        .addGrid(model.regParam, reg_values)
        .addGrid(model.elasticNetParam, reg_values)
        .build()
    )

    cross_val = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=RegressionEvaluator(labelCol=target_label),
        numFolds=k_fold,
        collectSubModels=True,
        seed=seed,
    )

    # Run cross-validation, and choose the best set of parameters.
    cv_model = cross_val.fit(dataset)

    return cv_model

## Cross validation ❗

In [100]:
CHOSEN_FEATURES = more_rel_features
CHOSEN_FEATURES_LABEL = MORE_REL_FEATURES_NAME

In [109]:
norm_df = select_normalized_features(df, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

train_data, test_data = dataset_split(norm_df)

In [110]:
# Execute cross validation with linear regression
cv_models = cross_validation(train_data, FEATURES_LABEL, TARGET_LABEL)

# Summarize average error
for i, avg_rmse in enumerate(cv_models.avgMetrics):
    print("Avg. RMSE computed across k-fold cross validation for model setting #{:d}: {:3f}".format(i+1, avg_rmse))

Avg. RMSE computed across k-fold cross validation for model setting #1: 7628.021110
Avg. RMSE computed across k-fold cross validation for model setting #2: 7628.021110
Avg. RMSE computed across k-fold cross validation for model setting #3: 7628.021110
Avg. RMSE computed across k-fold cross validation for model setting #4: 7628.021110
Avg. RMSE computed across k-fold cross validation for model setting #5: 7628.021110
Avg. RMSE computed across k-fold cross validation for model setting #6: 7628.411135
Avg. RMSE computed across k-fold cross validation for model setting #7: 11616.089962
Avg. RMSE computed across k-fold cross validation for model setting #8: 11515.869037
Avg. RMSE computed across k-fold cross validation for model setting #9: 11520.088786
Avg. RMSE computed across k-fold cross validation for model setting #10: 11520.083447
Avg. RMSE computed across k-fold cross validation for model setting #11: 7628.809222
Avg. RMSE computed across k-fold cross validation for model setting #1

In [None]:
# Get the best model to extract best hyperparameters
best_model_params = cv_models.bestModel.stages[-1].extractParamMap()
print("Best parameters:")
for param, value in best_model_params.items():
    print(param.name, "=", value)

## Validate model ❗

In [111]:
# Fit a model with best parameters
model = LinearRegression(featuresCol=FEATURES_LABEL, labelCol=TARGET_LABEL, elasticNetParam=0.0, regParam=0.0, maxIter=5)
pipeline = Pipeline(stages=[model])

trained_model = pipeline.fit(train_data)

In [112]:
# Training set evaluation
training = trained_model.transform(train_data)

# Compute validation error by several evaluators
eval_res = model_evaluation(TARGET_LABEL, training)

# Use dict to store each result
results = {
    "Model": MODEL_NAME,
    "Type": "trained",
    "Features": CHOSEN_FEATURES_LABEL,
    "MSE": eval_res['mse'],
    "RMSE": eval_res['rmse'],
    "MAE": eval_res['mae'],
    "R2": eval_res['r2'],
    "Adjusted_R2": eval_res['adj_r2'],
}

# Transform dict to pandas dataset
results_pd = pd.DataFrame(results, index=[0])
results_pd

Unnamed: 0,MSE,RMSE,MAE,R2,Adjusted_R2
0,58181780.0,7627.698084,5854.065013,0.782775,0.782766


In [114]:
show_results(training.toPandas(), MODEL_NAME)

Output hidden; open in https://colab.research.google.com to view.

In [115]:
# Testing set evaluation
predictions = trained_model.transform(test_data)

# Compute validation error by several evaluators
eval_res = model_evaluation(TARGET_LABEL, predictions)

# Use dict to store each result
results = {
    "Model": MODEL_NAME,
    "Type": "predictions",
    "Features": CHOSEN_FEATURES_LABEL,
    "MSE": eval_res['mse'],
    "RMSE": eval_res['rmse'],
    "MAE": eval_res['mae'],
    "R2": eval_res['r2'],
    "Adjusted_R2": eval_res['adj_r2'],
}

# Transform dict to pandas dataset
results_pd = pd.DataFrame(results, index=[0])
results_pd

Unnamed: 0,MSE,RMSE,MAE,R2,Adjusted_R2
0,11066200.0,3326.590039,2595.856104,-11.859782,-11.904621


In [116]:
show_results(predictions.toPandas(), MODEL_NAME)

## Train final model ❗

In [117]:
train_valid_data = select_normalized_features(df, CHOSEN_FEATURES, FEATURES_LABEL, TARGET_LABEL)

In [118]:
# Fit a model with best parameters
model = LinearRegression(featuresCol=FEATURES_LABEL, labelCol=TARGET_LABEL, elasticNetParam=0.0, regParam=0.0, maxIter=5)
pipeline = Pipeline(stages=[model])

trained_model = pipeline.fit(train_valid_data)

In [119]:
# Training set evaluation
training = trained_model.transform(train_valid_data)

# Compute validation error by several evaluators
eval_res = model_evaluation(TARGET_LABEL, training)

# Use dict to store each result
results = {
    "Model": MODEL_NAME,
    "Type": "trained_final",
    "Features": CHOSEN_FEATURES_LABEL,
    "MSE": eval_res['mse'],
    "RMSE": eval_res['rmse'],
    "MAE": eval_res['mae'],
    "R2": eval_res['r2'],
    "Adjusted_R2": eval_res['adj_r2'],
}

# Transform dict to pandas dataset
results_pd = pd.DataFrame(results, index=[0])
results_pd

Unnamed: 0,MSE,RMSE,MAE,R2,Adjusted_R2
0,57649630.0,7592.735531,5824.634389,0.783723,0.783715


In [120]:
show_results(training.toPandas(), MODEL_NAME)

Output hidden; open in https://colab.research.google.com to view.

## Output

In [60]:
# Save the trained model
# `trained_model` is the pipeline fitted on the full train+validation data in
# the "Train final model" section; the name used here before (lr_model_final)
# is not defined anywhere in this notebook.
trained_model.write().overwrite().save(GDRIVE_MODEL)