<a href="https://colab.research.google.com/github/CorsiDanilo/big-data-computing-project/blob/main/2_BDC_Project_Bitcoin_price_forecasting_(Model_preparation).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bitcoin price forecasting with PySpark
## Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



# Global Constants


In [10]:
# TODO: to be fixed ❗

# JDK location exported as JAVA_HOME by the Spark setup cell, and the
# directory where Google Drive gets mounted.
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
GDRIVE_DIR = "/content/drive"

# Dataset folders inside the mounted Drive.
GDRIVE_DATASET_RAW_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = f"{GDRIVE_DIR}/MyDrive/BDC/project/datasets/output"

# Base name of the cleaned dataset and the "/<name>.csv" suffix built from it.
GDRIVE_DATASET_NAME = "bitcoin_2015_2023_cleaned"
GDRIVE_DATASET_NAME_EXT = f"/{GDRIVE_DATASET_NAME}.csv"

# Full path of the CSV file read by the loading cell.
GDRIVE_DATASET = f"{GDRIVE_DATASET_OUTPUT_DIR}{GDRIVE_DATASET_NAME_EXT}"

# When True, the preview cells run their Spark actions (count / show).
SLOW_OPERATION = True

#  Import useful Python packages

In [2]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

# **Spark + Google Colab Setup**

## Install PySpark and related dependencies





In [3]:
!pip install pyspark
# Alternatively, if you want to install a specific version of pyspark:
#!pip install pyspark==3.2.1
!pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
!apt install openjdk-8-jdk-headless -qq
import os
# Export the JDK location before any Spark object is created further down.
os.environ["JAVA_HOME"] = JAVA_HOME

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
# NOTE(review): this wildcard import binds PySpark's column functions at
# notebook level, including `round`, which replaces Python's built-in of the
# same name. Later cells call `round`, `avg`, `date_format` and `udf`
# unqualified and therefore depend on this line having been executed.
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

# Namespaced alias for the same functions module.
from pyspark.sql import functions as F

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=991bcbc733cba8fe230ffcc030855bf59b94ad69d0b0280b6d492016512a77a5
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  fonts-wqy-zenhei fonts-indic
The follow

##  Create Spark context

In [4]:
# Spark settings for a single-machine session that uses every local core.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPriceForecasting")
    .setMaster("local[*]")
)

# Create the context, or reuse the one that is already running.
# Calling the SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once", which made this cell
# fail whenever it was re-run. getOrCreate() makes the cell re-runnable; note
# that `conf` only takes effect when the context is actually created here.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

##  Link Colab to our Google Drive

In [5]:
# Point Colaboratory to our Google Drive

from google.colab import drive

# Mount Drive at GDRIVE_DIR; every GDRIVE_DATASET_* path is built on top of it.
# force_remount=True requests a fresh mount even if Drive is already mounted.
drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# **Model preparation**

Prepare the data: make sure your dataset is in a format suitable for training the model. You should have an output label column (the response variable) and the features (the independent variables) in separate columns.

Create a VectorAssembler: a VectorAssembler combines the features into a single vector column. This step is necessary because PySpark requires the features to be in a single vector in order to train the Random Forest model.

In [6]:
# Read the cleaned CSV into a Spark DataFrame: comma-separated, first line is
# the header, column types inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load(GDRIVE_DATASET)
)

Check the shape of the loaded dataset, then print its schema and its first rows.

In [11]:
# Optional preview: count() is a Spark action over the whole dataset, so it
# only runs when SLOW_OPERATION is enabled.
if SLOW_OPERATION:
  n_rows, n_cols = df.count(), len(df.columns)
  print("The shape of the dataset is {:d} rows by {:d} columns".format(n_rows, n_cols))
  df.printSchema()
  df.show(5)

The shape of the dataset is 3766763 rows by 3 columns
root
 |-- date: timestamp (nullable = true)
 |-- close: double (nullable = true)
 |-- volume_usd: double (nullable = true)

+-------------------+--------+----------+
|               date|   close|volume_usd|
+-------------------+--------+----------+
|2023-02-21 00:33:00|24859.34|       0.0|
|2023-02-21 00:32:00|24859.34|   2562.98|
|2023-02-21 00:31:00|24821.96|   2249.87|
|2023-02-21 00:30:00|24818.09|     54.68|
|2023-02-21 00:29:00|24812.25|   2249.86|
+-------------------+--------+----------+
only showing top 5 rows



In [8]:
# def model_preparation(dataset):
#   from pyspark.ml.feature import VectorAssembler

#   assembler = VectorAssembler(
#       inputCols=["close"],
#       outputCol="features"
#   )

#   dataset = assembler.transform(dataset)

#   from pyspark.sql.functions import date_format, to_timestamp

#   # transform date column into string
#   dataset = dataset.withColumn("date_str", date_format(to_timestamp("date", "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"))

#   # encode the date to a column of label indicies
#   from pyspark.ml.feature import StringIndexer

#   label_stringIdx = StringIndexer(inputCol = 'date_str', outputCol = 'labelIndex')
#   dataset = label_stringIdx.fit(dataset).transform(dataset)

#   # divide the dataset into train set and test set
#   from pyspark.sql.functions import percent_rank
#   from pyspark.sql import Window

#   dataset = dataset.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("date_str")))
#   train = dataset.where("rank <= .8")
#   test = dataset.where("rank > .8")

#   return train.drop("rank", "date_str"), test.drop("rank", "date_str")

In [9]:
# def model_preparation(dataset):
#   # Preprocessing: StringIndexer for categorical labels
#   stringIndexer  = StringIndexer(inputCol="date", outputCol="label")

#   # Define the feature and label columns & Assemble the feature vector
#   assembler = VectorAssembler(inputCols="close", outputCol="features")

#   return train.drop("rank", "date_str"), test.drop("rank", "date_str")

In [18]:
def model_preparation(dataset):
  from pyspark.ml.feature import VectorAssembler

  assembler = VectorAssembler(inputCols=['close', 'volume_usd'], outputCol='features')
  dataset = assembler.transform(dataset)
  dataset = dataset.select('features', 'close')

  print("The shape of the dataset is {:d} rows by {:d} columns".format(dataset.count(), len(dataset.columns)))
  dataset.printSchema()
  dataset.show(5)

  # # transform date column into string
  # dataset = dataset.withColumn("date_str", date_format(to_timestamp("date", "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"))

  # # encode the date to a column of label indicies
  # from pyspark.ml.feature import StringIndexer

  # label_stringIdx = StringIndexer(inputCol = 'date_str', outputCol = 'labelIndex')
  # dataset = label_stringIdx.fit(dataset).transform(dataset)

  # # divide the dataset into train set and test set
  # from pyspark.sql.functions import percent_rank
  # from pyspark.sql import Window

  # dataset = dataset.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("date_str")))
  # train = dataset.where("rank <= .8")
  # test = dataset.where("rank > .8")

  # return train.drop("rank", "date_str"), test.drop("rank", "date_str")

In [19]:
# train_df, test_df = model_preparation(df)
model_preparation(df)

IllegalArgumentException: ignored

In [None]:
# Optional preview of both splits; each count() is a full Spark action, so the
# block only runs when SLOW_OPERATION is enabled.
if SLOW_OPERATION:
  train_rows, train_cols = train_df.count(), len(train_df.columns)
  print("The shape of the train dataset is {:d} rows by {:d} columns".format(train_rows, train_cols))
  train_df.show(5)

  test_rows, test_cols = test_df.count(), len(test_df.columns)
  print("The shape of the test dataset is {:d} rows by {:d} columns".format(test_rows, test_cols))
  test_df.show(5)

In [None]:
def compute_daily_df(dataset):
  """Aggregate a dataset to one row per day holding the average close price.

  Args:
    dataset: Spark DataFrame with a `date` and a `close` column; `features`
      and `labelIndex` columns are discarded if present.

  Returns:
    DataFrame with `date` formatted as a "yyyy-MM-dd" string and `close` set
    to the daily average rounded to 2 decimals, sorted by `date`.
  """
  # Model-only columns are not needed for the daily aggregation.
  trimmed = dataset.drop("features", "labelIndex")

  # Reduce every timestamp to its day so that rows of the same day share a key.
  by_day = trimmed.withColumn("date", F.date_format(trimmed.date, "yyyy-MM-dd"))

  daily_mean = by_day.groupBy("date").agg(F.avg("close").alias("close")).sort("date")

  return daily_mean.withColumn("close", F.round(daily_mean["close"], 2))

In [None]:
def show_daily_train_test(train, test):
  """Plot the daily average close price of the train and test sets.

  Both inputs are aggregated with `compute_daily_df`, converted to pandas and
  drawn as two line traces on one plotly figure with a range selector and a
  range slider on the date axis.

  Args:
    train: Spark DataFrame holding the train split.
    test: Spark DataFrame holding the test split.
  """
  train_daily = compute_daily_df(train).toPandas()
  test_daily = compute_daily_df(test).toPandas()

  def _line(frame, label):
    # One line trace of the daily close price for a single split.
    return go.Scatter(
        x=frame['date'],
        y=frame['close'].astype(float),
        mode='lines',
        name=label
    )

  # (number of months, button label) - edit this list to change the
  # look-back windows offered by the range selector.
  month_windows = [(1, '1m'), (6, '6m'), (12, '1y'), (36, '3y')]
  buttons = [
      dict(count=months, label=label, step='month', stepmode='backward')
      for months, label in month_windows
  ]
  buttons.append(dict(step='all'))

  layout = dict(
      title='Train and Test set with the Slider ',
      xaxis=dict(
          rangeselector=dict(buttons=buttons),
          rangeslider=dict(visible=True),
          type='date'
      )
  )

  traces = [_line(train_daily, 'Train set'), _line(test_daily, 'Test set')]
  iplot(dict(data=traces, layout=layout), filename="Train and Test set  with Rangeslider")

In [None]:
# Plot the daily-averaged close price of both splits on one interactive chart.
show_daily_train_test(train_df, test_df)

# Output

Saving the final train and test datasets

In [None]:
def output(dataset, typology, timeout_s=60):
  """Save `dataset` as a single CSV file in the output folder.

  The `date` column is converted to a string and the `features` vector is
  serialized with `str()`, because the CSV writer cannot store vector columns.
  Spark writes one part file into GDRIVE_DATASET_TEMP_DIR; that file is then
  moved to GDRIVE_DATASET_OUTPUT_DIR as "<GDRIVE_DATASET_NAME>_<typology>.csv".

  Args:
    dataset: Spark DataFrame with a `date` column and a `features` column.
    typology: suffix that identifies the split in the file name
      (e.g. "train" or "test").
    timeout_s: maximum number of seconds to wait for the part file to become
      visible in the temp folder.

  Raises:
    TimeoutError: if no part file shows up within `timeout_s` seconds.
  """
  import glob
  import os
  import shutil
  import time

  # Imported explicitly so the function does not depend on the notebook-level
  # wildcard imports.
  from pyspark.sql.functions import col, to_timestamp, udf
  from pyspark.sql.types import StringType

  # transform date column into string
  dataset = dataset.withColumn("date", to_timestamp(col("date"), "yyyy-MM-dd HH:mm:ss").cast("string"))

  # definition of Vector to String conversion function
  vector_to_string = udf(lambda vector: str(vector), StringType())

  # applying the function to the features column
  dataset = dataset.withColumn("features", vector_to_string(dataset["features"]))

  # save the dataset in CSV format (a single partition -> a single part file)
  dataset.repartition(1).write.csv(GDRIVE_DATASET_TEMP_DIR, header=True, mode='overwrite')

  # Poll the temp folder for the part file, but give up after `timeout_s`
  # seconds instead of looping forever if the file never appears.
  pattern = os.path.join(GDRIVE_DATASET_TEMP_DIR, "part*.csv")
  deadline = time.monotonic() + timeout_s
  while True:
    csv_files = glob.glob(pattern)
    if csv_files:
      # .csv file found!
      file_path = csv_files[0]
      break
    if time.monotonic() >= deadline:
      raise TimeoutError("No part*.csv file appeared in {} within {} seconds".format(GDRIVE_DATASET_TEMP_DIR, timeout_s))
    print(".csv file not found. I'll try again after 1 second...")
    time.sleep(1)

  print(".csv file found:", file_path)

  # Make sure the destination folder exists before moving the file into it.
  os.makedirs(GDRIVE_DATASET_OUTPUT_DIR, exist_ok=True)
  new_file_path = GDRIVE_DATASET_OUTPUT_DIR + "/" + GDRIVE_DATASET_NAME + "_" + typology + ".csv"

  # rename and move the file
  shutil.move(file_path, new_file_path)

  print("File renamed and moved successfully!")

In [None]:
# Persist each split as "<GDRIVE_DATASET_NAME>_<typology>.csv" in the output folder.
output(train_df, "train")
output(test_df, "test")

.csv file found: /content/drive/MyDrive/Computer_Science/BDC/project/datasets/temp/part-00000-c85b88ff-d0a5-44b3-a848-1f640d5df9ec-c000.csv
File renamed and moved successfully!
.csv file found: /content/drive/MyDrive/Computer_Science/BDC/project/datasets/temp/part-00000-2f02edff-d40b-49fd-a2f8-883e4d0b3fd7-c000.csv
File renamed and moved successfully!
