# Bitcoin Price Prediction - Local Mode Linear Regression

* **Description**: COMP4103(Big Data)--Group Project
* **Author**: Aaron
* **Version**: 0.3

**Updates:**
1. Update the way training time is calculated so that it is measured accurately.
2. Add Cross Validation part

**Issues:**  
1. N/A

**To be done:**  
1. Visualize the influence of different numbers of partitions

## 1. Load related packages

In [1]:
# Apache Spark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Python
import numpy as np
import pandas as pd
from itertools import product
import time

# Graph packages
# https://plotly.com/python/getting-started/#jupyterlab-support
# https://plotly.com/python/time-series/
import plotly.express as px

# Scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_absolute_percentage_error, r2_score

# Load the customized Time Series Cross Validation
from tsCrossValidation import mulTsCrossValidation, blockedTsCrossValidation, wfTsCrossValidation, modelComparison

## 2. Create a Spark Session

In [2]:
# Build (or reuse) a SparkSession running in local mode
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Bitcoin Prediction - local")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext (used later for broadcasting)
sc = spark.sparkContext

In [3]:
spark

## 3. Load Data

In [4]:
# Load the CSV into a Spark dataframe: the first line is the header,
# and column types are inferred from the data
filename = "bitcoin_1m_1min.csv"
dataset = (
    spark.read
    .format("csv")
    .option("inferSchema", 'True')
    .option("header", True)
    .load(filename)
)

## 4. Train/Test Data

In [5]:
'''
Description: Split a Spark dataframe into train/test parts and keep the original time-series order
Args:
    dataSet: The Spark dataframe to split. It must have a numeric "id" column;
             the split assumes ids are consecutive and start at 0 (TODO confirm for new data)
    proportion: Fraction of the records (between 0 and 1) that goes to the train part

Return: 
    train_data: Rows whose id is below the split point
    test_data: Rows whose id is at or above the split point
'''
def trainSplit(dataSet, proportion):
    # Work on the argument, not on the notebook-level `dataset`,
    # so that any dataframe passed in is the one that gets split
    records_num = dataSet.count()
    split_point = round(records_num * proportion)

    id_col = F.col("id")
    train_data = dataSet.filter(id_col < split_point)
    test_data = dataSet.filter(id_col >= split_point)

    return (train_data, test_data)

In [6]:
# Have a look at the last 5 rows of the data (id, timestamp, close price and label column)
dataset.select("id","Timestamp","Close","NEXT_BTC_CLOSE").tail(5)

[Row(id=41755, Timestamp='2021-03-30 23:55:00', Close=58714.31, NEXT_BTC_CLOSE=58686.0),
 Row(id=41756, Timestamp='2021-03-30 23:56:00', Close=58686.0, NEXT_BTC_CLOSE=58685.81),
 Row(id=41757, Timestamp='2021-03-30 23:57:00', Close=58685.81, NEXT_BTC_CLOSE=58723.84),
 Row(id=41758, Timestamp='2021-03-30 23:58:00', Close=58723.84, NEXT_BTC_CLOSE=58760.59),
 Row(id=41759, Timestamp='2021-03-30 23:59:00', Close=58760.59, NEXT_BTC_CLOSE=58778.18)]

In [7]:
# Chronological split: the first 70% of the records train the models, the remaining 30% test them
proportion = 0.7
train_data, test_data = trainSplit(dataset, proportion)

# Mark both parts for caching (cache() returns the same dataframe)
train_data = train_data.cache()
test_data = test_data.cache()

# Report how many records ended up in each part
print("Training data: {}\nTest data: {}".format(train_data.count(), test_data.count()))

Training data: 29232
Test data: 12528


In [8]:
# Save all column names (used to rebuild pandas dataframes on each partition)
column_names = dataset.columns
# Columns that are not model inputs: the row id, the label and the timestamp
non_feature_cols = ['id', "NEXT_BTC_CLOSE", 'Timestamp']
# Every remaining column is a feature; the dataset's column order is kept.
# A filtering comprehension replaces the former remove()-for-side-effect list,
# which also made the cell display a meaningless [None, None, None].
feature_cols = [name for name in column_names if name not in non_feature_cols]

[None, None, None]

## 5. Local Mode building

In [9]:
# Helper that draws a line chart with a range slider and range-selector buttons
# https://plotly.com/python/time-series/#time-series-with-range-selector-buttons
'''
Description: Draw a plotly line graph with a range slider and 1w/1m/6m/1y/all selector buttons
Args:
    data: The data (pandas dataframe) to draw as lines
    graph_title: The title of the graph
    
Return: None (the figure is shown directly)
'''
def line_plot(data,graph_title):
    # (count, label, step) of every "look back" button
    lookback_specs = [
        (7, "1w", "day"),
        (1, "1m", "month"),
        (6, "6m", "month"),
        (1, "1y", "year"),
    ]
    selector_buttons = [
        dict(count=count, label=label, step=step, stepmode="backward")
        for count, label, step in lookback_specs
    ]
    # Last button resets the view to the whole series
    selector_buttons.append(dict(step="all"))

    figure = px.line(data, title=graph_title)
    figure.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(buttons=selector_buttons),
    )
    figure.show()

In [10]:
'''
Description: Transform one Spark partition into a single pandas dataframe
Args:
    partition_rdd: Iterator over the Row() objects of one partition
                   (this function is passed to mapPartitions)
    
Return: 
    A one-element list holding the pandas dataframe of the partition, so that
    mapPartitions produces exactly one dataframe per partition.
    Columns follow the notebook-level `column_names`.
'''
def partitionToPandas(partition_rdd):
    # Build the dataframe in one go. The former row-by-row DataFrame.append
    # copied the whole frame for every row and no longer exists in pandas 2.x.
    records = [each_row.asDict() for each_row in partition_rdd]
    pandas_df = pd.DataFrame(records, columns=column_names)
    return [pandas_df]

In [11]:
'''
Description: Fit a linear regression model on the data of one partition
Args:
    pandas_df: Data of one partition in a pandas dataframe. It must contain the
               columns listed in `feature_cols` and the 'NEXT_BTC_CLOSE' label column
    
Return: 
    (lr_reg, train_time): The fitted linear regression model and the time, in seconds, spent in fit()
'''
def buildModel(pandas_df):
    X_train = pandas_df.loc[:,feature_cols]
    y_train = pandas_df['NEXT_BTC_CLOSE']

    # perf_counter is a monotonic, high-resolution clock meant for measuring short
    # intervals; time.time() can be too coarse (or jump) for a fit that takes milliseconds
    start = time.perf_counter()
    lr_reg = LinearRegression().fit(X_train, y_train)
    train_time = time.perf_counter() - start

    return (lr_reg, train_time)

In [12]:
'''
Description: Predict on one partition by combining the predictions of all trained models
Args:
    pandas_df: Data of one partition in a pandas dataframe
    broadcast_models: Broadcast variable whose .value is the list of trained models
    
Return: 
    partition_pred: One combined prediction per row of the partition, as a list.
                    The k-th model of the list (counting from 1) gets weight k, and
                    the weighted sum is divided by the sum of all weights.
'''
def makePrediction(pandas_df,broadcast_models):
    features = pandas_df.loc[:, feature_cols]
    models = broadcast_models.value

    # Weights 1..number of models, in the order the models appear in the list
    weights = list(range(1, len(models) + 1))
    total_weight = sum(weights)

    # One list of predictions per model
    per_model_preds = [model.predict(features).tolist() for model in models]

    # Scale every model's predictions by that model's weight
    scaled_preds = [
        [pred * model_weight for pred in preds]
        for preds, model_weight in zip(per_model_preds, weights)
    ]

    # Row by row: add up the scaled predictions of all models and normalise
    # (a plain average would divide the unscaled sum by the number of models instead)
    return [sum(row_preds) / total_weight for row_preds in zip(*scaled_preds)]

In [13]:
'''
Description: Calculate evaluation metrics
Args:
    y_test: Label of test data
    y_pred: Prediction on test data
    partition_num_train: Number of partition of Train data
    partition_num_test: Number of partition of Test data
    train_time: Time of training model
Return: 
    results: All the evaluation metrics in a dict. "Proportion" is read from the
             notebook-level `proportion`; "Adjusted_R2" is NaN when there are not
             more test samples than features + 1, because it is undefined there.
'''
def evaluationAssemble(y_test, y_pred, partition_num_train, partition_num_test, train_time):
    # Explained variance score
    exp_var = explained_variance_score(y_test, y_pred)

    # Mean absolute error
    mae = mean_absolute_error(y_test, y_pred)

    # Root mean squared error. The square root is taken here because the
    # `squared=False` shortcut of mean_squared_error is deprecated and has been
    # removed from newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Mean squared logarithmic error
    msle = mean_squared_log_error(y_test, y_pred)

    # Mean absolute percentage error
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # R2 score, the coefficient of determination
    r2 = r2_score(y_test, y_pred)

    # Adjusted R2 score: 1 - (1 - R2) * (n - 1) / (n - p - 1)
    n = len(y_pred)
    p = len(feature_cols)
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom > 0:
        adj_r2 = 1 - (1 - r2) * (n - 1) / degrees_of_freedom
    else:
        # Avoids a ZeroDivisionError (n == p + 1) and a meaningless value (n < p + 1)
        adj_r2 = float("nan")

    # Use dict to store each result
    results = {
        "Model": "Linear Regression",
        "P_train": partition_num_train,
        "P_test": partition_num_test,
        "Proportion": proportion,
        "RMSE": rmse,
        "MAPE": mape,
        "MAE": mae,
        "MSLE": msle,
        "Variance": exp_var,
        "R2": r2,
        "Adjusted_R2": adj_r2,
        "Time": train_time,
    }
    return results

In [14]:
'''
Description: Transform a list of Spark Row objects to a pandas dataframe 
Args:
    row_list: List (or any iterable) of Spark Row() objects
    column_names: Column names of the pandas dataframe, as a list. They also fix the column order
    
Return: 
    pandas_df: Data in pandas dataframe, one line per Row (empty if row_list is empty)
'''
def row2Pandasdf(row_list, column_names):
    # Build the dataframe in one go. The former row-by-row DataFrame.append
    # copied the whole frame for every row and no longer exists in pandas 2.x.
    records = [each_row.asDict() for each_row in row_list]
    pandas_df = pd.DataFrame(records, columns=column_names)
    return pandas_df

In [15]:
'''
Description: Local mode on Spark using Scikit-learn. One model is trained per train
             partition; every test partition is predicted with all of the models.
Args:
    train_data: Train data in Spark dataframe
    test_data: Test data in Spark dataframe
    partition_num_train: Number of partition of Train data
    partition_num_test: Number of partition of Test data
    plot: If True, also plot label and prediction over time (default: False)
    
Return: 
    results: All the evaluation metrics in a dict
'''
def localMode(train_data, test_data, partition_num_train, partition_num_test, plot=False):
    # Sort the test data once; predictions AND labels are both taken from this
    # ordered view so that they are guaranteed to line up row by row
    ordered_test = test_data.orderBy("id")

    # Transform Train/Test to RDD type and set the partition number.
    # NOTE: coalesce only merges partitions, it cannot raise their number.
    train_rdd = train_data.orderBy("id").rdd.coalesce(partition_num_train)
    test_rdd = ordered_test.rdd.coalesce(partition_num_test)

    # Train one model per partition and collect (model, training time) pairs to the driver
    fitted = train_rdd.mapPartitions(partitionToPandas).map(buildModel).collect()
    models = [model for model, _ in fitted]
    # Partitions are trained in parallel, so the slowest one defines the training time
    train_time = max(seconds for _, seconds in fitted)

    # Broadcast the models so that every test partition can use all of them
    broadcast_models = sc.broadcast(models)
    try:
        # collect() returns the per-partition prediction lists in partition order.
        # (reduce() with list concatenation is not commutative, which RDD.reduce
        # requires, and it fails on an empty RDD.)
        partition_preds = (
            test_rdd.mapPartitions(partitionToPandas)
            .map(lambda x: makePrediction(x, broadcast_models))
            .collect()
        )
    finally:
        # Release the executor copies of the models; this function is called once per CV split
        broadcast_models.unpersist()

    # Merge the per-partition predictions into a single flat list
    y_pred = [pred for preds in partition_preds for pred in preds]

    # Labels (and timestamps for plotting), in the same order as the predictions
    y_test_rows = ordered_test.select("Timestamp", "NEXT_BTC_CLOSE").collect()
    y_test = [row["NEXT_BTC_CLOSE"] for row in y_test_rows]

    if plot:
        # Build the pandas dataframe only when it is really needed
        y_df = row2Pandasdf(y_test_rows, ["Timestamp", "NEXT_BTC_CLOSE"])
        y_df["prediction"] = y_pred
        line_plot(y_df.set_index("Timestamp"), "Predict by Linear Regression")

    # Calculate evaluation metrics
    results = evaluationAssemble(y_test, y_pred, partition_num_train, partition_num_test, train_time)
    return results

In [16]:
## Cross Validation Parameter
# Each dict selects one kind of cross validation through 'cv_type';
# the other entries are handed to the matching split helper.

# Multiple Splits Time Series Cross Validation
mul_cv = dict(cv_type='mulTs', kSplits=5)

# Blocked Time Series Cross Validation
blk_cv = dict(cv_type='blkTs', kSplits=10)

# Walk Forward Validation
wf_cv = dict(cv_type='wfTs', min_obser=41710, expand_window=1)

## 7. Time Series Cross Validation

In [17]:
'''
Description: Cross Validation on Time Series data
Args:
    dataSet: The dataSet which needs to be splited (Spark dataframe with an "id" column)
    partition_num_train: Number of partition of Train data
    partition_num_test: Number of partition of Test data
    cv_info: Dict describing the Cross Validation. 'cv_type' is one of
             'mulTs' (needs 'kSplits'), 'blkTs' (needs 'kSplits') or
             'wfTs' (needs 'min_obser' and 'expand_window')
Return: 
    tsCv_df: All the splits performance of each model in a pandas dataframe
Raises:
    ValueError: If cv_info['cv_type'] is not one of the supported types
'''
def tsCrossValidation(dataSet, partition_num_train, partition_num_test, cv_info):

    # Get the number of samples
    num = dataSet.count()

    # Identify the type of Cross Validation and get the split positions
    cv_type = cv_info['cv_type']
    if cv_type == 'mulTs':
        split_position_df = mulTsCrossValidation(num, cv_info['kSplits'])
    elif cv_type == 'blkTs':
        split_position_df = blockedTsCrossValidation(num, cv_info['kSplits'])
    elif cv_type == 'wfTs':
        split_position_df = wfTsCrossValidation(num, cv_info['min_obser'], cv_info['expand_window'])
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(f"Unknown cv_type {cv_type!r}; expected 'mulTs', 'blkTs' or 'wfTs'")

    # Save the result of every split in a list
    result_lst = []

    for position in split_position_df.itertuples():
        # Train on ids [start, split - 1], test on ids [split, end - 1] (between() is inclusive)
        train_data = dataSet.filter(F.col("id").between(position.start, position.split - 1))
        test_data = dataSet.filter(F.col("id").between(position.split, position.end - 1))

        # Cache it
        train_data.cache()
        test_data.cache()

        try:
            # Train the local mode and store the result of this split
            result_lst.append(localMode(train_data, test_data, partition_num_train, partition_num_test))
        finally:
            # Release the cache even if this split failed
            train_data.unpersist()
            test_data.unpersist()

    # Transform the list of dicts to a pandas dataframe
    tsCv_df = pd.DataFrame(result_lst)
    return tsCv_df

### 7.1. Local Mode

In [18]:
# LinearRegression, Multiple Splits Time Series Cross Validation
# (3 train partitions -> 3 models per split, 2 test partitions)
lr_mul_cv = tsCrossValidation(dataset, 3, 2, mul_cv)
lr_mul_cv

Unnamed: 0,Model,P_train,P_test,Proportion,RMSE,MAPE,MAE,MSLE,Variance,R2,Adjusted_R2,Time
0,Linear Regression,3,2,0.7,63.212106,0.000885,47.112788,1.394735e-06,0.999386,0.999356,0.999355,0.004367
1,Linear Regression,3,2,0.7,73.156342,0.000932,53.711752,1.631152e-06,0.998705,0.998699,0.998697,0.004028
2,Linear Regression,3,2,0.7,60.938185,0.000788,45.365867,1.123086e-06,0.997983,0.997979,0.997977,0.002896
3,Linear Regression,3,2,0.7,69.053209,0.000935,50.99463,1.620982e-06,0.998834,0.998833,0.998832,0.00203
4,Linear Regression,3,2,0.7,47.881707,0.000651,36.401047,7.341378e-07,0.999258,0.999257,0.999256,0.003279


In [19]:
# LinearRegression, Blocked Time Series Cross Validation (3 train partitions, 2 test partitions)
lr_blk_cv = tsCrossValidation(dataset, 3, 2, blk_cv)
lr_blk_cv

Unnamed: 0,Model,P_train,P_test,Proportion,RMSE,MAPE,MAE,MSLE,Variance,R2,Adjusted_R2,Time
0,Linear Regression,3,2,0.7,89.371224,0.001352,66.368067,3.313506e-06,0.984954,0.984867,0.984739,0.002374
1,Linear Regression,3,2,0.7,51.812652,0.00075,37.858859,1.051126e-06,0.991162,0.990888,0.990811,0.002444
2,Linear Regression,3,2,0.7,59.137672,0.000843,46.273783,1.159234e-06,0.996418,0.996348,0.996317,0.00208
3,Linear Regression,3,2,0.7,78.076059,0.000894,51.855277,1.783066e-06,0.996969,0.996839,0.996812,0.002463
4,Linear Regression,3,2,0.7,93.509675,0.001292,71.016209,2.904678e-06,0.98985,0.988373,0.988275,0.002463
5,Linear Regression,3,2,0.7,64.556366,0.000825,47.62647,1.254672e-06,0.978128,0.978121,0.977936,0.002607
6,Linear Regression,3,2,0.7,51.826217,0.000692,39.685982,8.172531e-07,0.983868,0.983434,0.983294,0.00246
7,Linear Regression,3,2,0.7,86.71728,0.001202,64.349018,2.648839e-06,0.997216,0.997193,0.997169,0.002034
8,Linear Regression,3,2,0.7,49.803957,0.00067,37.227355,8.027395e-07,0.994568,0.9943,0.994252,0.001904
9,Linear Regression,3,2,0.7,41.916637,0.000548,32.237616,5.070384e-07,0.94664,0.945855,0.945397,0.002457


In [20]:
# LinearRegression, Walk Forward Validation (3 train partitions, 2 test partitions)
lr_wf_cv = tsCrossValidation(dataset, 3, 2, wf_cv)
lr_wf_cv



Unnamed: 0,Model,P_train,P_test,Proportion,RMSE,MAPE,MAE,MSLE,Variance,R2,Adjusted_R2,Time
0,Linear Regression,3,2,0.7,13.73248,0.000234,13.73248,5.470324e-08,1.0,,,0.003386
1,Linear Regression,3,2,0.7,7.36467,0.000125,7.36467,1.573569e-08,1.0,,,0.002359
2,Linear Regression,3,2,0.7,3.204114,5.5e-05,3.204114,2.978708e-09,1.0,,,0.003513
3,Linear Regression,3,2,0.7,29.588175,0.000504,29.588175,2.539024e-07,1.0,,,0.00235
4,Linear Regression,3,2,0.7,34.500734,0.000588,34.500734,3.45282e-07,1.0,,,0.002267
5,Linear Regression,3,2,0.7,56.62474,0.000966,56.62474,9.316285e-07,1.0,,,0.003322
6,Linear Regression,3,2,0.7,13.304497,0.000227,13.304497,5.149444e-08,1.0,,,0.002326
7,Linear Regression,3,2,0.7,1.129125,1.9e-05,1.129125,3.710051e-10,1.0,,,0.003175
8,Linear Regression,3,2,0.7,8.466584,0.000144,8.466584,2.085902e-08,1.0,,,0.002346
9,Linear Regression,3,2,0.7,8.435404,0.000144,8.435404,2.070118e-08,1.0,,,0.003314


In [21]:
# Define the model_info columns and the evaluators shown in the Model Comparison Table
model_info = ['Model']
evaluator_lst = ['RMSE','MAPE','MAE','Variance','R2','Adjusted_R2','Time']

# The Cross Validation results we would like to compare
comparison_lst = [lr_mul_cv,lr_blk_cv,lr_wf_cv]

In [22]:
# Show the Comparison Table: one modelComparison result per Cross Validation type, stacked vertically
pd.concat([modelComparison(cv_result,model_info,evaluator_lst) for cv_result in comparison_lst])

Unnamed: 0,Model,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,Linear Regression,62.84831,0.000838,46.717217,0.998833,0.998825,0.998824,0.00332
0,Linear Regression,66.672774,0.000907,49.449864,0.985977,0.985622,0.9855,0.002329
0,Linear Regression,22.360924,0.000381,22.360924,1.0,,,0.013504


### 7.2. Sklearn Only

In [23]:
# With partition_num_train == 1 (and partition_num_test == 1) a single model is trained
# on the whole train split, which means we only use Sklearn.
sklr_mul_cv = tsCrossValidation(dataset, 1, 1, mul_cv)
sklr_blk_cv = tsCrossValidation(dataset, 1, 1, blk_cv)
sklr_wf_cv = tsCrossValidation(dataset, 1, 1, wf_cv)
# The Cross Validation results we would like to compare
comparison_lst = [sklr_mul_cv,sklr_blk_cv,sklr_wf_cv]
pd.concat([modelComparison(cv_result,model_info,evaluator_lst) for cv_result in comparison_lst])



Unnamed: 0,Model,RMSE,MAPE,MAE,Variance,R2,Adjusted_R2,Time
0,Linear Regression,62.165891,0.000829,46.224582,0.998841,0.998839,0.998838,0.002716
0,Linear Regression,65.344015,0.000892,48.621349,0.985944,0.985914,0.985795,0.001903
0,Linear Regression,22.306136,0.00038,22.306136,1.0,,,0.004803
