<a href="https://colab.research.google.com/github/CorsiDanilo/big-data-computing-project/blob/main/2_Train_test_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bitcoin price forecasting with PySpark
## Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



Description: In this notebook I split the dataset into a train set and a test set and save them as separate files on Google Drive.

# Global Constants


In [1]:
# Path exported as the JAVA_HOME environment variable during setup, and the
# mount point of the Google Drive.
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
GDRIVE_DIR = "/content/drive"

# Dataset folders on the Drive: raw input, temporary write area, final output.
GDRIVE_DATASET_RAW_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Base name of the dataset and the file name (with leading slash) derived from it.
GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_1h"
GDRIVE_DATASET_NAME_EXT = f"/{GDRIVE_DATASET_NAME}.parquet"

# Full path of the raw dataset.
GDRIVE_DATASET = f"{GDRIVE_DATASET_RAW_DIR}{GDRIVE_DATASET_NAME_EXT}"

# Gates the count()/show() inspection of the train and test sets.
SLOW_OPERATION = True

#  Import Python packages ❗

In [2]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

#General System Utilities
import sys
from datetime import datetime
import pickle

#Data Processing Libraries
import numpy as np
import pandas as pd
from pandas import concat
import matplotlib.pyplot as plt
from fastai.tabular import *
import six

#DS/DL Libs
import sklearn
from sklearn.linear_model import LinearRegression as sklearnLR
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU
from keras import optimizers
from sklearn.preprocessing import MinMaxScaler

# Spark + Google Colab Setup

In [3]:
# Install PySpark (no version pin: pip picks whichever release is current at run time)
!pip install pyspark
# Alternatively, if you want to install a specific version of pyspark:
#!pip install pyspark==3.2.1
!pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# Install the headless OpenJDK 8 package
!apt install openjdk-8-jdk-headless -qq
import os
# Export the JAVA_HOME constant as an environment variable for this process
os.environ["JAVA_HOME"] = JAVA_HOME

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.sql import functions as F

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=46af6e30b5c325463d4ca5458b462b99b48d69ca0d1771d30a8054bd2b33a03a
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  fonts-wqy-zenhei fonts-indi

In [4]:
# Build the configuration of the local Spark session
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the running one when this cell is executed again:
# constructing a second SparkContext in the same process raises an error.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [5]:
# Point Colaboratory to our Google Drive

from google.colab import drive

# Mount the Drive at GDRIVE_DIR, requesting a fresh mount each time the cell runs
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# **Model preparation** ❗


In [6]:
# Load the raw dataset into a PySpark DataFrame.
# Parquet files carry their own schema, so the CSV-only reader options
# (sep, inferSchema, header) are not passed.
df = spark.read.parquet(GDRIVE_DATASET)

After loading the dataset, we number the rows in timestamp order, store the index of the 80/20 split point in a variable called `valid_index`, and partition the data accordingly:

In [7]:
from pyspark.sql.window import Window

# Number the rows chronologically, starting from 0.
# A window without partitionBy makes Spark process all rows in a single partition.
window = Window.orderBy("timestamp")

df = df.withColumn("index", row_number().over(window) - 1)

# Calculates the total number of rows in the DataFrame
total_rows = df.count()

# Calculates the index corresponding to 80% of the rows
valid_index = int(total_rows * 0.8)

# The comparison is inclusive and the index is 0-based, so the train set
# holds valid_index + 1 rows and the test set holds the remainder.
train_df = df.filter(col("index") <= valid_index)
test_df = df.filter(col("index") > valid_index)

# Optional inspection: each count() triggers a full Spark job
if SLOW_OPERATION:
  print("The shape of the train dataset is {:d} rows by {:d} columns".format(train_df.count(), len(train_df.columns)))
  train_df.show(3)
  # This message used to say "train dataset" for the test set as well
  print("The shape of the test dataset is {:d} rows by {:d} columns".format(test_df.count(), len(test_df.columns)))
  test_df.show(3)

The shape of the train dataset is 80584 rows by 17 columns
+------------------+--------------------+-----------------+------------+-----------------+--------------------+--------------------+------------------------+-----------------+----------------+-----------------+--------------------+------------------+-----------------+--------------------------------+-------------------+-----+
|      market-price|          market-cap|   total-bitcoins|trade-volume|      blocks-size|      avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|      difficulty|   miners-revenue|transaction-fees-usd|n-unique-addresses|   n-transactions|estimated-transaction-volume-usd|          timestamp|index|
+------------------+--------------------+-----------------+------------+-----------------+--------------------+--------------------+------------------------+-----------------+----------------+-----------------+--------------------+------------------+-----------------+-----------------

# Visualizing train / test set ❗

In this section we visualize the split we just made by plotting the market price of the train and test sets (the market price is the value we will later forecast).

In [8]:
# def compute_avg_df(dataset):
#   dataset = dataset.withColumn("date", date_format(col("timestamp"), "yyyy-MM-dd"))

#   dataset = dataset.groupBy("date").agg(
#       avg("market-price").alias("market-price")
#   )

#   return dataset

In [9]:
# avg_train_df = compute_avg_df(train_df)
# avg_test_df = compute_avg_df(test_df)

In [16]:
def show_train_test(train, test):
  """Plot the market price of the train and test sets on one interactive chart.

  Args:
    train: table indexed with the 'timestamp' and 'market-price' keys,
      drawn as the 'Train' line.
    test: table with the same two keys, drawn as the 'Test' line.

  The 'market-price' values are converted with ``.astype(float)`` before
  plotting. The date axis gets a range selector (1m / 6m / 1y / 3y / all) and
  a range slider, and the figure is displayed with ``iplot``.
  """
  # One line trace per split, built in the order Train, Test
  traces = [
      go.Scatter(
          x = frame['timestamp'],
          y = frame['market-price'].astype(float),
          mode = 'lines',
          name = label
      )
      for label, frame in (('Train', train), ('Test', test))
  ]

  # Look-back buttons as (number of months, button label); change the
  # counts to the desired amount of months.
  lookbacks = [(1, '1m'), (6, '6m'), (12, '1y'), (36, '3y')]
  selector_buttons = [
      dict(count=months, label=label, step='month', stepmode='backward')
      for months, label in lookbacks
  ]
  # Final button resets the view to the whole series
  selector_buttons.append(dict(step='all'))

  x_axis = dict(
      rangeselector=dict(buttons=selector_buttons),
      rangeslider=dict(visible = True),
      type='date'
  )

  figure = dict(
      data=traces,
      layout=dict(title='Train and Test set with the Slider ', xaxis=x_axis)
  )
  iplot(figure, filename = "Train and Test set  with Rangeslider")

In [17]:
# Convert both Spark DataFrames to pandas and plot them together
show_train_test(train_df.toPandas(), test_df.toPandas())


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



# Output

In this last section we are going to save the final training and test datasets.

In [12]:
def output(dataset, type, timeout_s=60):
  """Save `dataset` as one parquet file in the output folder.

  The DataFrame is written to GDRIVE_DATASET_TEMP_DIR (overwriting its
  content), then the resulting part file is moved to
  GDRIVE_DATASET_OUTPUT_DIR and renamed to
  ``<GDRIVE_DATASET_NAME>_<type>.parquet``.

  Args:
    dataset: Spark DataFrame to persist.
    type: suffix appended to the dataset name (e.g. "train" or "test").
      The name shadows the builtin but is kept for existing callers.
    timeout_s: maximum number of seconds to wait for the part file to
      become visible in the temp folder before giving up.

  Raises:
    FileNotFoundError: if no part file shows up within `timeout_s` seconds.
  """
  import glob
  import os
  import shutil
  import time

  # Only one part file is moved below, so force a single partition:
  # without it a multi-partition DataFrame would silently lose rows.
  dataset.coalesce(1).write.parquet(GDRIVE_DATASET_TEMP_DIR, mode='overwrite')

  pattern = os.path.join(GDRIVE_DATASET_TEMP_DIR, "part*.parquet")
  deadline = time.monotonic() + timeout_s

  # Poll until the written file is visible, but never wait forever
  while True:
      parquet_files = glob.glob(pattern)
      if parquet_files:
          # .parquet file found!
          file_path = parquet_files[0]
          break
      if time.monotonic() >= deadline:
          raise FileNotFoundError(
              "No file matching {} appeared within {} seconds".format(pattern, timeout_s)
          )
      print(".parquet file not found. I'll try again after 1 second...")
      time.sleep(1)

  print(".parquet file found:", file_path)

  # Make sure the destination folder exists before moving the file into it
  os.makedirs(GDRIVE_DATASET_OUTPUT_DIR, exist_ok=True)
  new_file_path = GDRIVE_DATASET_OUTPUT_DIR + "/" + GDRIVE_DATASET_NAME + "_" + type + ".parquet"

  # rename and move the file
  shutil.move(file_path, new_file_path)

  print("File renamed and moved successfully!")

In [13]:
# Persist each split under its own file-name suffix, train first
for split_df, split_name in ((train_df, "train"), (test_df, "test")):
  output(split_df, split_name)

.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-75b4da6f-c0ec-4a92-b6ff-366a27e152a7-c000.snappy.parquet
File renamed and moved successfully!
.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-88f468fc-82ab-49f8-b0e2-9abf109c9138-c000.snappy.parquet
File renamed and moved successfully!
