# Bitcoin price forecasting with PySpark - Feature Engineering
## Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author
Corsi Danilo - corsi.1742375@studenti.uniroma1.it



Description: In this notebook I am going to explore the data and visualize the correlations check their stationarity and choose features to use to train the models.

# Dependencies, Libraries and Tools

In [1]:
# Define some global variables
JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
SLOW_OPERATION = True

In [2]:
# Ignore warnings
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#Install some useful dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import cycle

import plotly.express as px

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import gc

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install -U -q PyDrive # To use files that are stored in Google Drive directly (e.g., without downloading them from an external URL)
# !apt install openjdk-8-jdk-headless -qq
# import os
# os.environ["JAVA_HOME"] = JAVA_HOME

In [4]:
# Install Spark and related dependencies
!pip install pyspark

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '4G').\
                set('spark.driver.memory', '45G').\
                set('spark.driver.maxResultSize', '10G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPriceForecasting").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=cb9ac15120d3596ca6a49e56d960eec816d5f0aad31f1632a02a32c78158a512
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# Link to Google Drive

In [5]:
# Define some constants related to Google Drive
GDRIVE_DIR = "/content/drive"

GDRIVE_DATASET_RAW_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/raw"
GDRIVE_DATASET_TEMP_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/temp"
GDRIVE_DATASET_OUTPUT_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/datasets/output"

GDRIVE_DATASET_NAME = "bitcoin_blockchain_data_30min"

GDRIVE_DATASET_NAME_EXT = "/" + GDRIVE_DATASET_NAME + ".parquet"

GDRIVE_DATASET = GDRIVE_DATASET_RAW_DIR + GDRIVE_DATASET_NAME_EXT

In [6]:
# Point Colaboratory to our Google Drive
from google.colab import drive

drive.mount(GDRIVE_DIR, force_remount=True)

Mounted at /content/drive


# Loading the dataset

In [None]:
import pyspark.sql.functions as F

# Load datasets into pyspark dataframe objects
df = spark.read.load(GDRIVE_DATASET,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    ) \
                     .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))-1)

In [None]:
def dataset_info(dataset):
  dataset.show(3)

  # Get the number of rows
  num_rows = dataset.count()

  # Get the number of columns
  num_columns = len(dataset.columns)

  # Print the shape of the DataFrame
  print("Shape:", (num_rows, num_columns))

  # Print the schema of the DataFrame
  dataset.printSchema()

In [None]:
if SLOW_OPERATION:
  dataset_info(df)

+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+-------------------+---+
|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|          timestamp| id|
+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+-------------------+---+
|            430

# Adding useful features

In [None]:
# Creation of a new dataset for the new features
new_features_df = df.select("timestamp", "id", "market-price")

In [None]:
# Add the column 'tomorrow-market-price' which will be the target feature on which to make predictions
from pyspark.sql.window import Window
import pyspark.sql.functions as F

new_features_df = new_features_df.withColumn("next-market-price", F.lag("market-price", offset=-1) \
        .over(Window.orderBy("id"))) \
        .dropna()

In [None]:
# Generate additional valuable features
# Rate of Change allows investors to spot security momentum and other trends
# Typically a 12-day Rate-of-Change is used but for simplicity, I used it for every 30-min interval
new_features_df = new_features_df.withColumn("rate-of-change", (F.col("next-market-price") / F.col("market-price") - 1) * 100)

In [None]:
# Computing Simple Moving Averages
# Adapted from: https://stackoverflow.com/questions/45806194/pyspark-rolling-average-using-timeseries-data
def simple_moving_average(dataframe, period, days, col="next-market-price", orderby="id"):
    dataframe = dataframe.withColumn(f"sma-{days}-days", F.avg(col) \
          .over(Window.orderBy(orderby) \
          .rowsBetween(-period,0)))
    return dataframe

In [None]:
# MA number 5/7/10/20/50/100/200 days;
MA5 = 60 * 24 * 5
MA7 = 60 * 24 * 7
MA10 = 60 * 24 * 10
MA20 = 60 * 24 * 20
MA50 = 60 * 24 * 50
MA100 = 60 * 24 * 100

# Periods selected based on this article:
# https://www.investopedia.com/ask/answers/122414/what-are-most-common-periods-used-creating-moving-average-
# ma-lines.asp#:~:text=Traders%20and%20market%20analysts%20commonly,averages%20are%20the%20most%20common.

# To analyze short-term trends
new_features_df = simple_moving_average(new_features_df, MA5, 5) # these might have to be 240 - 1 actually
new_features_df = simple_moving_average(new_features_df, MA7, 7)
new_features_df = simple_moving_average(new_features_df, MA10, 10)
new_features_df = simple_moving_average(new_features_df, MA20, 20)
new_features_df = simple_moving_average(new_features_df, MA50, 50)
# To analyze long-term trends
new_features_df = simple_moving_average(new_features_df, MA100, 100)

In [None]:
# Drop "market-price column"
new_features_df = new_features_df.drop("market-price")

In [None]:
if SLOW_OPERATION:
  dataset_info(new_features_df)

+-------------------+---+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id| next-market-price|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days|
+-------------------+---+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|2016-01-01 00:00:00|  0|430.97041666666667| 0.01866292247827417|430.97041666666667|430.97041666666667|430.97041666666667|430.97041666666667|430.97041666666667|430.97041666666667|
|2016-01-01 00:30:00|  1|431.05083333333334|0.018659440081436607|        431.010625|        431.010625|        431.010625|        431.010625|        431.010625|        431.010625|
|2016-01-01 01:00:00|  2|431.13124999999997| 0.01865595898393746| 431.0508333333333| 431.05083333333

In [None]:
# Merge original dataset with the one with the new features
merged_df = df.join(new_features_df, on=['timestamp','id'], how='inner')

DataFrame[timestamp: timestamp_ntz, id: int, market-price: double, total-bitcoins: double, market-cap: double, trade-volume: double, blocks-size: double, avg-block-size: double, n-transactions-total: double, n-transactions-per-block: double, hash-rate: double, difficulty: double, miners-revenue: double, transaction-fees-usd: double, n-unique-addresses: double, n-transactions: double, estimated-transaction-volume-usd: double, next-market-price: double, rate-of-change: double, sma-5-days: double, sma-7-days: double, sma-10-days: double, sma-20-days: double, sma-50-days: double, sma-100-days: double]

In [None]:
# Reorder the columns
new_columns = ["timestamp", "id"] + [col for col in merged_df.columns if col not in ["timestamp", "id", "next-market-price"]] + ["next-market-price"]
merged_df = merged_df.select(*new_columns)

In [None]:
# Set the "timestamp" column as the index of the Pandas DataFrame
merged_df.toPandas().set_index("timestamp", inplace=True)
merged_df.cache()

In [None]:
if SLOW_OPERATION:
  dataset_info(merged_df)

+-------------------+---+------------------+--------------+-------------------+--------------+------------------+------------------+--------------------+------------------------+-----------------+-------------------+------------------+--------------------+------------------+------------------+--------------------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|          timestamp| id|      market-price|total-bitcoins|         market-cap|  trade-volume|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|        hash-rate|         difficulty|    miners-revenue|transaction-fees-usd|n-unique-addresses|    n-transactions|estimated-transaction-volume-usd|      rate-of-change|        sma-5-days|        sma-7-days|       sma-10-days|       sma-20-days|       sma-50-days|      sma-100-days| next-market-price|
+-------------------+---+-----------

# Showing train / validation and test sets

In [None]:
# # Calculates the total number of rows in the DataFrame
# total_rows = merged_df.count()

# # Calculates the index corresponding to 95% of the rows (train / valid set) and 5% (test set)
# index = int(total_rows * 0.95)

# # Dividi il dataset in base all'indice
# train_valid_df = merged_df.filter(merged_df['id'] < index)
# test_df = merged_df.filter(df['id'] >= index)

In [None]:
# Split the DataFrame based on a specific date
split_date = "2023-07-01"
train_valid_df = merged_df.filter(col("timestamp") < split_date)
test_df = merged_df.filter(col("timestamp") >= split_date)

In [None]:
# Data visualization with rangeslider
def data_visualization(train_valid, test):
  trace1 = go.Scatter(
      x = train_valid['timestamp'],
      y = train_valid["market-price"].astype(float),
      mode = 'lines',
      name = "Train / Validation set"
  )

  trace2 = go.Scatter(
      x = test['timestamp'],
      y = test['market-price'].astype(float),
      mode = 'lines',
      name = "Test set"
  )

  layout = dict(
      title="Train, valid and test set with rangeslider",
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace1, trace2]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Train, valid and test set with rangeslider")

In [None]:
data_visualization(train_valid_df.toPandas(), test_df.toPandas())

Output hidden; open in https://colab.research.google.com to view.

# Saving enginered dataset

In [None]:
def output(dataset, type):
  from pyspark.sql.functions import date_format, to_timestamp, col

  dataset.write.parquet(GDRIVE_DATASET_TEMP_DIR, mode='overwrite')

  import os
  import glob
  import time

  while True:
      parquet_files = glob.glob(os.path.join(GDRIVE_DATASET_TEMP_DIR, "part*.parquet"))
      if len(parquet_files) > 0:
          # .parquet file found!
          file_path = parquet_files[0]
          break
      else:
          print(".parquet file not found. I'll try again after 1 second...")
          time.sleep(1)

  print(".parquet file found:", file_path)

  new_file_path = GDRIVE_DATASET_OUTPUT_DIR + "/" + GDRIVE_DATASET_NAME + "_" + type +".parquet"

  import shutil

  # rename and move the file
  shutil.move(file_path, new_file_path)

  print("File renamed and moved successfully!")

In [None]:
output(train_valid_df, "eng")

.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-d7e51646-6c9c-4f16-94ed-9d352046d288-c000.snappy.parquet
File renamed and moved successfully!


In [None]:
output(test_df, "test")

.parquet file found: /content/drive/MyDrive/BDC/project/datasets/temp/part-00000-fcb73d40-7a6d-4d01-bbcb-ba0ebf69fb93-c000.snappy.parquet
File renamed and moved successfully!


# Data visualization

In [None]:
merged_df_pd = merged_df.toPandas()

In [None]:
# Data visualization with rangeslider
def data_visualization(dataset, key, value):
  trace = go.Scatter(
      x = dataset['timestamp'],
      y = dataset[value].astype(float),
      mode = 'lines',
      name = key
  )

  layout = dict(
      title=key,
      xaxis=dict(
          rangeselector=dict(
              buttons=list([
                  #change the count to desired amount of months.
                  dict(count=1,
                      label='1m',
                      step='month',
                      stepmode='backward'),
                  dict(count=6,
                      label='6m',
                      step='month',
                      stepmode='backward'),
                  dict(count=12,
                      label='1y',
                      step='month',
                      stepmode='backward'),
                  dict(count=36,
                      label='3y',
                      step='month',
                      stepmode='backward'),
                  dict(step='all')
              ])
          ),
          rangeslider=dict(
              visible = True
          ),
          type='date'
      )
  )

  data = [trace]
  fig = dict(data=data, layout=layout)
  iplot(fig, filename = "Data visualization with rangeslider")

In [None]:
# List of features according to categories
currency_statistics = {'Market price (USD)':'market-price', 'Market cap (USD)':'market-cap', 'N. total bitcoins':'total-bitcoins', 'Trade volume (USD)':'trade-volume'}
block_details = {'Blocks size (MB)':'blocks-size', 'Avg. block size (MB)':'avg-block-size', 'N. total transactions':'n-transactions-total', 'N. transactions per block':'n-transactions-per-block'}
mining_information = {'Hash rate (TH/s)':'hash-rate', 'Difficulty (T)':'difficulty', 'Miners revenue (USD)':'miners-revenue', 'Transaction fees (USD)':'transaction-fees-usd'}
network_activity = {"N. unique addresses":'n-unique-addresses', 'N. transactions':'n-transactions', 'Estimated transaction volume (USD)':'estimated-transaction-volume-usd'}
additional_features = {"Rate of change (%)":"rate-of-change", "Simple moving avg. (5d)":"sma-5-days", "Simple moving avg. (7d)":"sma-7-days", "Simple moving avg. (10d)":"sma-10-days", "Simple moving avg. (20d)":"sma-20-days", "Simple moving avg. (50d)":"sma-50-days", "Simple moving avg. (100d)":"sma-100-days"}

In [None]:
if SLOW_OPERATION:
  for key, value in currency_statistics.items():
    data_visualization(merged_df_pd, key, value)

In [None]:
if SLOW_OPERATION:
  for key, value in block_details.items():
    data_visualization(merged_df_pd, key, value)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
if SLOW_OPERATION:
  for key, value in mining_information.items():
    data_visualization(merged_df_pd, key, value)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
if SLOW_OPERATION:
  for key, value in network_activity.items():
    data_visualization(merged_df_pd, key, value)

Output hidden; open in https://colab.research.google.com to view.

## TO FIX: sma_visualization ❗

In [None]:
# # Data visualization with rangeslider
# def sma_visualization(dataset):
#   trace1 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["market-price"].astype(float),
#       mode = 'lines',
#       name = "Market price (usd)"
#   )

#   trace2 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["sma-5-days"].astype(float),
#       mode = 'lines',
#       name = "Sma 5 days (usd)"
#   )

#   trace3 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["sma-7-days"].astype(float),
#       mode = 'lines',
#       name = "Sma 7 days (usd)"
#   )

#   trace4 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["sma-10-days"].astype(float),
#       mode = 'lines',
#       name = "Sma 10 days (usd)"
#   )

#   trace5 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["sma-20-days"].astype(float),
#       mode = 'lines',
#       name = "Sma 20 days (usd)"
#   )

#   trace6 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["sma-50-days"].astype(float),
#       mode = 'lines',
#       name = "Sma 50 days (usd)"
#   )

#   trace7 = go.Scatter(
#       x = dataset['timestamp'],
#       y = dataset["sma-100-days"].astype(float),
#       mode = 'lines',
#       name = "Sma 100 days (usd)"
#   )

#   layout = dict(
#       title="Data visualization with rangeslider",
#       xaxis=dict(
#           rangeselector=dict(
#               buttons=list([
#                   #change the count to desired amount of months.
#                   dict(count=1,
#                       label='1m',
#                       step='month',
#                       stepmode='backward'),
#                   dict(count=6,
#                       label='6m',
#                       step='month',
#                       stepmode='backward'),
#                   dict(count=12,
#                       label='1y',
#                       step='month',
#                       stepmode='backward'),
#                   dict(count=36,
#                       label='3y',
#                       step='month',
#                       stepmode='backward'),
#                   dict(step='all')
#               ])
#           ),
#           rangeslider=dict(
#               visible = True
#           ),
#           type='date'
#       )
#   )

#   data = [trace1, trace2, trace3, trace4, trace5, trace6, trace7]
#   fig = dict(data=data, layout=layout)
#   iplot(fig, filename = "Data visualization with rangeslider")

In [None]:
# if SLOW_OPERATION:
#     sma_visualization(merged_df_pd)

# Checking stationarity ❓
Source: https://www.kaggle.com/code/debashis74017/time-series-forecasting-itcoin-price?scriptVersionId=113747601&cellId=25

Stationarity means that the statistical properties of a time series i.e. mean, variance and covariance do not change over time. Many statistical models require the series to be stationary to make effective and precise predictions.

Two statistical tests would be used to check the stationarity of a time series:
* Augmented Dickey Fuller (“ADF”) test
* Kwiatkowski-Phillips-Schmidt-Shin (“KPSS”) test.

## ADF Test ❓
ADF test is used to determine the presence of unit root in the series, and hence helps in understand if the series is stationary or not. The null and alternate hypothesis of this test are:

* Null Hypothesis: The series has a unit root.

* Alternate Hypothesis: The series has no unit root.

If the null hypothesis in failed to be rejected, this test may provide evidence that the series is non-stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller

if SLOW_OPERATION:
  result = adfuller(df.toPandas()['market-price'], autolag='AIC')
  print(f'ADF Statistic: {result[0]}')
  print(f'p-value: {result[1]}')
  for key, value in result[4].items():
      print('Critial Values:')
      print(f'   {key}, {value}')

ADF Statistic: -1.4094573813848776
p-value: 0.5777338267506308
Critial Values:
   1%, -3.4303992641851133
Critial Values:
   5%, -2.861561774057184
Critial Values:
   10%, -2.566781589558402


ADF Stats value is greater than all critical values, and p-value is also greater than 0.05. So we can strongly reject the null hypothesis, and conclude that, Price value is Non-Stationary.

Let's apply log transformation to the data and test again.

In [None]:
from numpy import log

if SLOW_OPERATION:
  result = adfuller((log(df.toPandas()['market-price'])), autolag='AIC')
  print(f'ADF Statistic: {result[0]}')
  print(f'p-value: {result[1]}')
  for key, value in result[4].items():
      print('Critial Values:')
      print(f'   {key}, {value}')

ADF Statistic: -1.8813880105943408
p-value: 0.34086679123733743
Critial Values:
   1%, -3.4303992641851133
Critial Values:
   5%, -2.861561774057184
Critial Values:
   10%, -2.566781589558402


After applying Log transformation also, ADF Stats value is greater than all critical values, and p-value is also greater than 0.05. It seems, Price value is purely Non-Stationary.

## KPSS test - Kwiatkowski Phillips Schmidt Shin ❓
KPSS is another test for checking the stationarity of a time series. The null and alternate hypothesis for the KPSS test are opposite that of the ADF test:

* Null Hypothesis: The process is trend stationary.

* Alternate Hypothesis: The series has a unit root (series is not stationary).

In [None]:
from statsmodels.tsa.stattools import kpss

if SLOW_OPERATION:
  result = kpss(df.toPandas()['market-price'], regression='c')
  print('\nKPSS Statistic: %f' % result[0])
  print('p-value: %f' % result[1])
  for key, value in result[3].items():
      print('Critial Values:')
      print(f'   {key}, {value}');


KPSS Statistic: 38.377644
p-value: 0.010000
Critial Values:
   10%, 0.347
Critial Values:
   5%, 0.463
Critial Values:
   2.5%, 0.574
Critial Values:
   1%, 0.739



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.




The output of the KPSS test contains 4 values:

* **The KPSS statistic**: is the actual test statistic that is computed while performing the test.
* **p-value**: is the probability score based on which you can decide whether to reject the null hypothesis or not. If the p-value is less than a predefined alpha level (typically 0.05), we reject the null hypothesis.
* **Number of lags used by the test**: is the number of lags of the series that was actually used by the model equation of the kpss test. By default, the statsmodels kpss() uses the ‘legacy’ method. In legacy method, int(12 * (n / 100)**(1 / 4)) number of lags is included, where n is the length of the series.
* **Critical values**: in order to reject the null hypothesis, the test statistic should be greater than the provided critical values. If it is in fact higher than the target critical value, then that should automatically reflect in a low p-value. If the p-value is less than 0.05, the kpss statistic will be greater than the 5% critical value.

Here we find that, KPSS stats value is too high than critical values, so, we concluded that this time series is Non-Stationary

#  Feature selection

In [96]:
new_columns = [col for col in merged_df.columns if col not in ["timestamp", "id"]]
merged_df_no_index = merged_df.select(*new_columns)

['total-bitcoins', 'market-cap', 'trade-volume', 'blocks-size', 'avg-block-size', 'n-transactions-total', 'n-transactions-per-block', 'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees-usd', 'n-unique-addresses', 'n-transactions', 'estimated-transaction-volume-usd', 'rate-of-change', 'sma-5-days', 'sma-7-days', 'sma-10-days', 'sma-20-days', 'sma-50-days', 'sma-100-days']


In [None]:
all_features = merged_df_no_index.columns[1:]
print(all_features)

In [109]:
# Assemble the features into a vector column
assembler = VectorAssembler(inputCols=merged_df_no_index.columns, outputCol="features")
df_vector = assembler.transform(merged_df_no_index).select("market-price", "features")

# Calculate the correlation matrix using Pearson method
correlation_matrix = Correlation.corr(df_vector, "features", method="pearson").head()

# Get the correlation values with the "market-price" column
correlation_with_market_price = correlation_matrix[0].toArray()[0]

# Create a dictionary with feature names and their correlation values
feature_correlations = dict(zip(merged_df_no_index.columns, correlation_with_market_price))

# Sort the features based on their correlation with "market-price"
sorted_features = dict(sorted(feature_correlations.items(), key=lambda x: x[1], reverse=True))

# Print the sorted features and their correlation values
more_rel_features = []
less_rel_features = []
# Set the threshold value
threshold = 0.7
for feature, correlation in sorted_features.items():
    print(f"{feature}: {correlation}")
    if (correlation > threshold) & (feature != 'market-price'):
      more_rel_features.append(feature)
    elif (feature != 'market-price'):
      less_rel_features.append(feature)

market-price: 1.0
market-cap: 0.99921219667059
miners-revenue: 0.9421098142530814
sma-5-days: 0.9106313788564082
sma-7-days: 0.8841631082917157
estimated-transaction-volume-usd: 0.8416797296365173
sma-10-days: 0.8386997050604912
n-transactions-total: 0.7373435302022633
blocks-size: 0.7303619464401548
sma-100-days: 0.7128394512519217
total-bitcoins: 0.7070959368592018
sma-20-days: 0.6933036892970385
sma-50-days: 0.6808792511340275
n-unique-addresses: 0.6425038917319674
difficulty: 0.634774597544706
hash-rate: 0.6331911408499087
avg-block-size: 0.5036061542777199
transaction-fees-usd: 0.3787916787607916
trade-volume: 0.274232595613375
n-transactions-per-block: 0.13843755483141754
n-transactions: 0.10753876639980868
rate-of-change: -0.02638801928483736


In [112]:
# Print the selected keys
print(more_rel_features)

['market-cap', 'miners-revenue', 'sma-5-days', 'sma-7-days', 'estimated-transaction-volume-usd', 'sma-10-days', 'n-transactions-total', 'blocks-size', 'sma-100-days', 'total-bitcoins']


In [113]:
# Print the selected keys
print(less_rel_features)

['sma-20-days', 'sma-50-days', 'n-unique-addresses', 'difficulty', 'hash-rate', 'avg-block-size', 'transaction-fees-usd', 'trade-volume', 'n-transactions-per-block', 'n-transactions', 'rate-of-change']


# Output

In [114]:
GDRIVE_FEATURES_DIR = GDRIVE_DIR + "/MyDrive/BDC/project/features"

GDRIVE_ALL_FEATURES_NAME = "all_features"
GDRIVE_MORE_REL_FEATURES_NAME = "more_rel_features"
GDRIVE_LESS_REL_FEATURES_NAME = "less_rel_features"

GDRIVE_ALL_FEATURES_NAME_EXT = "/" + GDRIVE_ALL_FEATURES_NAME + ".json"
GDRIVE_MORE_REL_FEATURES_NAME_EXT = "/" + GDRIVE_MORE_REL_FEATURES_NAME + ".json"
GDRIVE_LESS_REL_FEATURES_NAME_EXT = "/" + GDRIVE_LESS_REL_FEATURES_NAME + ".json"

GDRIVE_ALL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_ALL_FEATURES_NAME_EXT
GDRIVE_MORE_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_MORE_REL_FEATURES_NAME_EXT
GDRIVE_LESS_REL_FEATURES = GDRIVE_FEATURES_DIR + GDRIVE_LESS_REL_FEATURES_NAME_EXT

In [115]:
with open(GDRIVE_ALL_FEATURES, 'w') as file:
    json.dump(all_features, file)

In [116]:
with open(GDRIVE_MORE_REL_FEATURES, 'w') as file:
    json.dump(more_rel_features, file)

In [117]:
with open(GDRIVE_LESS_REL_FEATURES, 'w') as file:
    json.dump(less_rel_features, file)