<a href="https://colab.research.google.com/github/CorsiDanilo/big-data-computing-project/blob/main/2_Model_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bitcoin price forecasting with PySpark
## Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Global Constants


In [1]:
# Location of the Java 8 installation exported as JAVA_HOME for PySpark
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
# Mount point of Google Drive inside the Colab runtime
GDRIVE_DIR = "/content/drive"

# Dataset folders on Google Drive: raw input, scratch space, final output
GDRIVE_DATASET_RAW_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Dataset base name and the corresponding "/<name>.parquet" path suffix
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1m"
GDRIVE_DATASET_NAME_EXT = f"/{GDRIVE_DATASET_NAME}.parquet"

# Full path of the raw dataset
GDRIVE_DATASET = f"{GDRIVE_DATASET_RAW_DIR}{GDRIVE_DATASET_NAME_EXT}"

# When True, cells also run their optional show()/count() diagnostics
SLOW_OPERATION = False

#  Import useful Python packages

In [2]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

# **Spark + Google Colab Setup** ❗

In [3]:
# Install PySpark and PyDrive with pip and a headless Java 8 JDK with apt.
# NOTE(review): the pyspark install below is unpinned, so the installed version
# depends on when this cell is run.
!pip install pyspark
# Alternatively, if you want to install a specific version of pyspark:
#!pip install pyspark==3.2.1
!pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
!apt install openjdk-8-jdk-headless -qq
import os
# Export the Java location (JAVA_HOME constant) into the process environment
os.environ["JAVA_HOME"] = JAVA_HOME

import pyspark
# NOTE(review): later cells use names such as row_number, date_format, avg and col
# unqualified, so they rely on the wildcard imports below. The wildcard import of
# pyspark.sql.functions is known to shadow Python builtins (e.g. sum, min, max).
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

# Qualified alias for the same functions module
from pyspark.sql import functions as F

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=435d0cf3202e8c165c97c992c58ddd14ad4e99cb09c610ae7a6347990507c996
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  fonts-wqy-zenhei fonts-indic
The follow

In [4]:
#TODO: to be cleaned up ❗
# NOTE(review): numpy, pandas, matplotlib.pyplot and seaborn are already imported
# in an earlier cell, and MinMaxScaler is imported twice in this one.
#General System Utilities
import sys
from datetime import datetime
import pickle

#Data Processing Libraries
import numpy as np
import pandas as pd
from pandas import concat
import matplotlib.pyplot as plt
from fastai.tabular import *
import six

#Pyspark/SQL libs
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType, IntegerType, FloatType
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

#DS/DL Libs
import sklearn
from sklearn.linear_model import LinearRegression as sklearnLR
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU
from keras import optimizers
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Spark configuration for a single-machine session that uses all local cores
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the one that is already running.
# Constructing SparkContext directly raises an error when this cell is re-run,
# because only one context may be active; getOrCreate makes the cell re-runnable.
# Note that `conf` is only applied when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [6]:
# Point Colaboratory to our Google Drive

from google.colab import drive

# Mount Drive at GDRIVE_DIR; force_remount=True is passed so the mount is
# redone even if it was already mounted
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# **Model preparation** ❗


In [7]:
# Load the raw dataset into a PySpark DataFrame.
# Parquet files carry their own schema, so the CSV-style reader options
# (sep / inferSchema / header) are not needed here.
df = spark.read.parquet(GDRIVE_DATASET)

Linear Regression models typically take in a single vector input, so we’ll need to vectorize all of our features into a single column. Thankfully, pyspark offers the VectorAssembler class to do just that.

To build and compare the performance of our three feature sets — 14 (all features, our baseline), 4 (relevant features), and 7 (RFE-selected features) — we’ll start by assembling 3 independent VectorAssemblers, one for each feature list:

In [8]:
# Baseline: every available feature column
all_columns = [
    'market-cap', 'total-bitcoins', 'trade-volume', 'blocks-size',
    'avg-block-size', 'n-transactions-total', 'n-transactions-per-block',
    'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd',
    'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd',
]
# Hand-picked "relevant" subset
rel_columns = [
    'market-cap', 'estimated-transaction-volume-usd', 'blocks-size',
    'n-unique-addresses',
]
# Subset labelled as chosen by recursive feature elimination (RFE)
selected_features_rfe = [
    'total-bitcoins', 'blocks-size', 'avg-block-size',
    'n-transactions-per-block', 'miners-revenue', 'n-unique-addresses',
    'n-transactions',
]
# Name of the target column
dep_var = 'market-price'

# One assembler per feature list; each packs its columns into a 'features' vector
vectorAssembler, vectorAssembler2, vectorAssembler3 = (
    VectorAssembler(inputCols=feature_cols, outputCol='features')
    for feature_cols in (all_columns, rel_columns, selected_features_rfe)
)


In [9]:
from pyspark.sql.window import Window

# Global ordering by timestamp, used to assign a 0-based row index.
# The window has no partitionBy clause, so it spans the whole DataFrame.
window = Window.orderBy("timestamp")
output_columns = ['timestamp', 'index', 'features', dep_var]

# Same three steps for each feature set (all / relevant / RFE-selected):
# assemble the feature vector, add the row index, keep only the needed columns
v_all_df, v_rel_df, v_sel_df = (
    assembler.transform(df)
    .withColumn("index", row_number().over(window) - 1)
    .select(output_columns)
    for assembler in (vectorAssembler, vectorAssembler2, vectorAssembler3)
)

if SLOW_OPERATION:
  for vectorized_df in (v_all_df, v_rel_df, v_sel_df):
    vectorized_df.show(3)

Calling the `show` method on each of the resulting DataFrames (only executed when `SLOW_OPERATION` is `True`) yields the vectorized inputs (X) and targets (y):

Great! Now we can move on to partitioning each of these DataFrames into training and test sets.



I created a utility method for creating training and testing inputs and labels:



In [10]:
def regression_data_builder(spark_df, part_index):
    """Split a DataFrame into train and test parts by its "index" column.

    Args:
        spark_df: DataFrame that has an "index" column.
        part_index: boundary value; the row whose index equals it goes to train.

    Returns:
        A (train_df, test_df) tuple: rows with index <= part_index, and rows
        with index > part_index.
    """
    row_index = col("index")
    return (
        spark_df.filter(row_index <= part_index),
        spark_df.filter(row_index > part_index),
    )

From here, we can easily create the 3 train/test pairs. The 3 DataFrames created in the previous step already carry a row index, so we store the index of an 80/20 split in a variable called “valid_index” and partition the data accordingly:



In [11]:
# Total number of rows in the source DataFrame
total_rows = df.count()

# Index at 80% of the rows; rows with index <= valid_index go to the train set
valid_index = int(total_rows * 0.8)

all_train_df, all_test_df = regression_data_builder(v_all_df, valid_index)
rel_train_df, rel_test_df = regression_data_builder(v_rel_df, valid_index)
sel_train_df, sel_test_df = regression_data_builder(v_sel_df, valid_index)

if SLOW_OPERATION:
  # Report each split under its real name (the test splits used to be
  # announced as "train" as well)
  splits = [
      ("train", all_train_df), ("test", all_test_df),
      ("train", rel_train_df), ("test", rel_test_df),
      ("train", sel_train_df), ("test", sel_test_df),
  ]
  for split_name, split_df in splits:
    print("The shape of the {} dataset is {:d} rows by {:d} columns".format(split_name, split_df.count(), len(split_df.columns)))
    split_df.show(3)

Checking the shapes of our DataFrames ensures we got this step right. Everything looks good! Let’s move on to fitting the models.



In [12]:
def compute_avg_df(dataset):
  """Average the market price per calendar day.

  Args:
    dataset: DataFrame with a "timestamp" column and a "market-price" column.

  Returns:
    A DataFrame with one row per day, sorted by day, with columns "date"
    (a "yyyy-MM-dd" string) and "market-price" (the daily average).
  """
  dataset = dataset.withColumn("date", date_format(col("timestamp"), "yyyy-MM-dd"))

  # groupBy gives no ordering guarantee, so sort explicitly: consumers that
  # draw lines between consecutive rows need the days in chronological order
  # (the "yyyy-MM-dd" strings sort chronologically).
  dataset = dataset.groupBy("date").agg(
      avg("market-price").alias("market-price")
  ).orderBy("date")

  return dataset

In [13]:
# Daily average market price of the train and test splits (all-features variant)
avg_train_df = compute_avg_df(all_train_df)
avg_test_df = compute_avg_df(all_test_df)

In [14]:
def show_train_test(train, test):
  """Plot the train and test market-price series on one interactive chart.

  The chart has a range selector (1m / 6m / 1y / 3y / all) and a range slider
  on the date axis.

  Args:
    train: table with 'date' and 'market-price' columns for the train split.
    test: table with 'date' and 'market-price' columns for the test split.
  """
  def _price_trace(frame, label):
    # One line trace of market price over date
    return go.Scatter(
        x = frame['date'],
        y = frame['market-price'].astype(float),
        mode = 'lines',
        name = label
    )

  # (number of months, button label) for each look-back button;
  # change the counts to the desired amount of months
  look_back_windows = [(1, '1m'), (6, '6m'), (12, '1y'), (36, '3y')]
  selector_buttons = [
      dict(count=months, label=label, step='month', stepmode='backward')
      for months, label in look_back_windows
  ]
  selector_buttons.append(dict(step='all'))

  date_axis = dict(
      rangeselector=dict(buttons=selector_buttons),
      rangeslider=dict(visible = True),
      type='date'
  )
  layout = dict(title='Train and Test set with the Slider ', xaxis=date_axis)

  traces = [_price_trace(train, 'Train'), _price_trace(test, 'Test')]
  iplot(dict(data=traces, layout=layout), filename = "Train and Test set  with Rangeslider")

In [15]:
# Convert the daily averages to pandas and plot train vs. test
show_train_test(avg_train_df.toPandas(), avg_test_df.toPandas())

# Output

Saving the final train and test datasets

In [16]:
def output(dataset, type, timeout=60):
  """Save a DataFrame as a single, predictably named Parquet file.

  The DataFrame is written to GDRIVE_DATASET_TEMP_DIR, then the resulting
  part file is moved to
  GDRIVE_DATASET_OUTPUT_DIR/<GDRIVE_DATASET_NAME>_<type>.parquet.

  Args:
    dataset: Spark DataFrame to save.
    type: suffix identifying the dataset (e.g. "all_train"). The name shadows
      the builtin `type`; it is kept so existing callers are unaffected.
    timeout: maximum number of seconds to wait for the part file to show up.

  Raises:
    TimeoutError: if no part file appears in the temp directory in time.
  """
  import glob
  import os
  import shutil
  import time

  # Collapse to one partition so that exactly one part file is written;
  # only one file is moved below, so extra part files would be silently lost.
  dataset.coalesce(1).write.parquet(GDRIVE_DATASET_TEMP_DIR, mode='overwrite')

  pattern = os.path.join(GDRIVE_DATASET_TEMP_DIR, "part*.parquet")
  deadline = time.monotonic() + timeout

  # Poll for the part file, but give up after `timeout` seconds instead of
  # looping forever.
  while True:
      parquet_files = sorted(glob.glob(pattern))
      if parquet_files:
          # .parquet file found!
          file_path = parquet_files[0]
          break
      if time.monotonic() >= deadline:
          raise TimeoutError(
              "No .parquet part file appeared in {} within {} seconds".format(
                  GDRIVE_DATASET_TEMP_DIR, timeout))
      print(".parquet file not found. I'll try again after 1 second...")
      time.sleep(1)

  print(".parquet file found:", file_path)

  # Make sure the destination folder exists before moving the file into it
  os.makedirs(GDRIVE_DATASET_OUTPUT_DIR, exist_ok=True)
  new_file_path = GDRIVE_DATASET_OUTPUT_DIR + "/" + GDRIVE_DATASET_NAME + "_" + type + ".parquet"

  # rename and move the file
  shutil.move(file_path, new_file_path)

  print("File renamed and moved successfully!")

In [17]:
# Save the train/test splits built from all feature columns
output(all_train_df, "all_train")
output(all_test_df, "all_test")

.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-9f252738-4561-4544-936b-aaa18583ae22-c000.snappy.parquet
File renamed and moved successfully!
.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-07327039-5ca0-4fa1-a4f6-ee5faf9e1314-c000.snappy.parquet
File renamed and moved successfully!


In [18]:
# Save the train/test splits built from the relevant feature columns
output(rel_train_df, "rel_train")
output(rel_test_df, "rel_test")

.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-3575b91a-24a9-4e65-8955-ee70dd985bd0-c000.snappy.parquet
File renamed and moved successfully!
.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-11a9fc5e-ae1b-47b5-af6a-e3337c5d66ea-c000.snappy.parquet
File renamed and moved successfully!


In [19]:
# Save the train/test splits built from the RFE-selected feature columns
output(sel_train_df, "sel_train")
output(sel_test_df, "sel_test")

.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-97fc0c54-e816-4dc5-81a9-6c6b992d2ee4-c000.snappy.parquet
File renamed and moved successfully!
.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-6e7d3499-22f7-4eb6-85f9-3acbd18addc1-c000.snappy.parquet
File renamed and moved successfully!
