# **Bitcoin price prediction - Feature Engineering**
## Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: adding useful features regarding the price of Bitcoin, visualizing data and performing feature selection.


# Global constants, dependencies, libraries and tools

In [None]:
# Main constants
LOCAL_RUNNING = True
SLOW_OPERATIONS = True # Decide whether or not to use operations that might slow down notebook execution
ROOT_DIR = "D:/Documents/Repository/BDC/project" if LOCAL_RUNNING else "/content/drive"

In [None]:
if not LOCAL_RUNNING:
    # Point Colaboratory to Google Drive
    from google.colab import drive

    # Define GDrive paths
    drive.mount(ROOT_DIR, force_remount=True)

In [None]:
# Set main dir
MAIN_DIR = ROOT_DIR + "" if LOCAL_RUNNING else ROOT_DIR + "/MyDrive/BDC/project"

###################
# --- DATASET --- #
###################

# Datasets dirs
DATASET_RAW_DIR = MAIN_DIR + "/datasets/raw"
DATASET_OUTPUT_DIR = MAIN_DIR + "/datasets/output"
DATASET_TEMP_DIR = MAIN_DIR + "/datasets/temp"

# Datasets names
DATASET_NAME = "bitcoin_blockchain_data_15min"

# Datasets paths
DATASET_RAW = DATASET_RAW_DIR + "/" + DATASET_NAME + ".parquet"

####################
# --- FEATURES --- #
####################

# Features dir
FEATURES_DIR = MAIN_DIR + "/features"

# Features names
FEATURES_RELEVANCE_NAME = "features_relevance"
ALL_FEATURES_NAME = "all_features"
MOST_REL_FEATURES_NAME = "most_rel_features"
LEAST_REL_FEATURES_NAME = "least_rel_features"

# Features paths
FEATURES_RELEVANCE = FEATURES_DIR + "/" + FEATURES_RELEVANCE_NAME + ".json"
ALL_FEATURES = FEATURES_DIR + "/" + ALL_FEATURES_NAME + ".json"
MOST_REL_FEATURES = FEATURES_DIR + "/" + MOST_REL_FEATURES_NAME + ".json"
LEAST_REL_FEATURES = FEATURES_DIR + "/" + LEAST_REL_FEATURES_NAME + ".json"

#####################
# --- UTILITIES --- #
#####################

# Utilities dir
UTILITIES_DIR = MAIN_DIR + "/utilities"

In [None]:
# Suppression of warnings for better reading
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
if not LOCAL_RUNNING:
    # Install Spark and related dependencies
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

# Import files

In [None]:
# Import my files
import sys
sys.path.append(UTILITIES_DIR)

from imports import *

# Create the pyspark session

In [None]:
# Spark configuration for a single-machine run (master "local[*]")
# NOTE(review): 'spark.driver.maxResultSize' (109G) is far above
# 'spark.driver.memory' (12G) — looks like a typo, confirm the intended value.
conf = (
    SparkConf()
    .set('spark.ui.port', "4050")
    .set('spark.executor.memory', '12G')
    .set('spark.driver.memory', '12G')
    .set('spark.driver.maxResultSize', '109G')
    .set("spark.kryoserializer.buffer.max", "1G")
    .setAppName("BitcoinPricePrediction")
    .setMaster("local[*]")
)

# Create (or reuse) the context and the session.
# Constructing SparkContext directly raises when this cell is executed a
# second time in the same kernel; getOrCreate() returns the live context instead.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Loading dataset

In [None]:
# Load the raw parquet dataset into a pyspark DataFrame and append a
# 0-based, consecutive "id" column (row_number() starts at 1, hence the -1).
# The window has no partitionBy, so the numbering runs over the whole dataset.
# NOTE(review): sep / inferSchema / header look like CSV reader options;
# presumably they have no effect with format="parquet" — confirm and drop.
df = spark.read.load(DATASET_RAW,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    ) \
                     .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))-1) # Adding 0-based "id" column

In [None]:
def dataset_info(dataset):
  """Print a quick summary of a dataset.

  Shows the first 3 rows, then the shape as a (rows, columns) tuple and
  finally the schema.

  Args:
    dataset: object exposing show(), count(), columns and printSchema()
      (a PySpark DataFrame in this notebook).
  """
  # Preview of the first rows
  dataset.show(3)

  # (number of rows, number of columns)
  shape = (dataset.count(), len(dataset.columns))
  print("Shape:", shape)

  # Column names and types
  dataset.printSchema()

In [None]:
if SLOW_OPERATIONS:
  dataset_info(df)

# Adding useful features
Here I am going to add some features that could help us predict the Bitcoin price:

*   **next-market-price:** represents the price of Bitcoin at the next time step, i.e. the market price of the following row of the dataset (this will be the target variable on which to make predictions)
*   **rate-of-change:** indicator that measures the percentage of price changes over a period of time, allows investors to spot security momentum and other trends
*   **sma-x-days:** indicators that calculate the average price over a specified number of days. They are commonly used by traders to identify trends and potential buy or sell signals


In [None]:
# Creation of a new dataset for the new features
new_features_df = df.select("timestamp", "id", "market-price")

In [None]:
# Adding "next-market-price" column: the "market-price" of the following row
# (rows ordered by "id"); the last row has no successor and is dropped by dropna()
following_price = F.lead("market-price", 1).over(Window.orderBy("id"))
new_features_df = new_features_df.withColumn("next-market-price", following_price).dropna()

In [None]:
# Adding "rate-of-change" column: percentage change from "market-price" to "next-market-price"
# NOTE(review): this value is computed from the target column, so together with
# "market-price" it determines "next-market-price" exactly — confirm that using
# it as a model input is intended.
new_features_df = new_features_df.withColumn("rate-of-change", (F.col("next-market-price") / F.col("market-price") - 1) * 100)

In [None]:
def simple_moving_average(dataset, period, days, col="next-market-price", orderby="id"):
    """Append a trailing simple moving average column named ``sma-{days}-days``.

    Args:
        dataset: Spark DataFrame containing `col` and `orderby`.
        period: Number of rows averaged, the current row included.
        days: Label used only to build the name of the new column.
        col: Column to average.
        orderby: Column that defines the row order of the window.

    Returns:
        The dataset with the additional moving-average column.
    """
    # rowsBetween() is inclusive on both ends, so a window of `period` rows
    # reaches back `period - 1` rows from the current one; (-period, 0)
    # averaged period + 1 rows.
    # NOTE(review): the default `col` is the prediction target — confirm that
    # averaging it (current row included) is intended.
    lookback = max(period - 1, 0)
    trailing_window = Window.orderBy(orderby).rowsBetween(-lookback, 0)
    return dataset.withColumn(f"sma-{days}-days", F.avg(col).over(trailing_window))

In [None]:
# Moving averages days (5/7/10/20/50/100)
# The SMA window is counted in rows. DATASET_NAME indicates one row every
# 15 minutes, i.e. 96 rows per day; the previous 60 * 24 factor assumed one
# row per minute and made every window 15 times longer than its label.
# TODO confirm the sampling interval against the raw data.
ROWS_PER_DAY = 24 * 60 // 15

MA5 = ROWS_PER_DAY * 5
MA7 = ROWS_PER_DAY * 7
MA10 = ROWS_PER_DAY * 10
MA20 = ROWS_PER_DAY * 20
MA50 = ROWS_PER_DAY * 50
MA100 = ROWS_PER_DAY * 100

# Computing the SMA: one "sma-<days>-days" column per (period, days) pair
for period, days in ((MA5, 5), (MA7, 7), (MA10, 10), (MA20, 20), (MA50, 50), (MA100, 100)):
    new_features_df = simple_moving_average(new_features_df, period, days)

In [None]:
# Drop "market-price" column
new_features_df = new_features_df.drop("market-price")

In [None]:
if SLOW_OPERATIONS:
  dataset_info(new_features_df)

In [None]:
# Merge original dataset with the one with the new features
merged_df = df.join(new_features_df, on=['timestamp','id'], how='inner')

In [None]:
# Rearranges columns
new_columns = ["timestamp", "id"] + [col for col in merged_df.columns if col not in ["timestamp", "id", "next-market-price"]] + ["next-market-price"]
merged_df = merged_df.select(*new_columns)

In [None]:
# Intentionally no statement here.
# The previous `merged_df.toPandas().set_index("timestamp", inplace=True)`
# modified a temporary pandas copy that was discarded right away: merged_df
# stayed an unchanged Spark DataFrame, and the only effect was collecting the
# whole dataset to the driver. The pandas conversions used for plotting are
# done where they are needed, further down in the notebook.

In [None]:
if SLOW_OPERATIONS:
  dataset_info(merged_df)

# Splitting dataset
Here we are going to split the dataset into two sets:
* **Train / validation set:** will be used to train the models and validate the performances
* **Test set:** will be used to perform price prediction on never-before-seen data (the last 3 months of the original dataset will be used).

In [None]:
# Retrieve the most recent timestamp value.
# max() is used instead of last(): last() returns whatever row happens to come
# last, and the row order of merged_df is not guaranteed after the join.
last_value = merged_df.agg(F.max("timestamp")).collect()[0][0]

# Subtract three months from the most recent timestamp value
split_date = last_value - relativedelta(months=3)

# Split the dataset based on the desired date (the split date itself goes to train / validation)
train_valid_df = merged_df.filter(F.col("timestamp") <= split_date)
test_df = merged_df.filter(F.col("timestamp") > split_date)

In [None]:
def data_visualization(train_valid, test):
  """Plot the market price of the train / validation and test sets.

  Both arguments are indexed with 'timestamp' and "market-price"; each one
  becomes a line in a single figure with a range selector and a range slider.

  Args:
    train_valid: table holding the train / validation rows.
    test: table holding the test rows.
  """
  # One line per split
  splits = ((train_valid, "Train / Validation set"), (test, "Test set"))
  traces = [
      go.Scatter(
          x=frame['timestamp'],
          y=frame["market-price"].astype(float),
          mode='lines',
          name=split_name
      )
      for frame, split_name in splits
  ]

  # Range-selector presets as (months back, button label); change the counts
  # to the desired amount of months.
  presets = ((1, '1m'), (6, '6m'), (12, '1y'), (36, '3y'))
  buttons = [
      dict(count=months, label=button_label, step='month', stepmode='backward')
      for months, button_label in presets
  ]
  buttons.append(dict(step='all'))

  x_axis = dict(
      rangeselector=dict(buttons=buttons),
      rangeslider=dict(visible=True),
      type='date'
  )
  figure = dict(
      data=traces,
      layout=dict(title="Train / Validation and Test sets", xaxis=x_axis)
  )
  iplot(figure, filename="Train / Validation and Test sets")

In [None]:
if SLOW_OPERATIONS:
    data_visualization(train_valid_df.toPandas(), test_df.toPandas())

# Saving datasets

In [None]:
def output(dataset, dataset_type):
  """Save a dataset as one parquet file in DATASET_OUTPUT_DIR.

  The dataset is first written to DATASET_TEMP_DIR, then the resulting part
  file is moved to "<DATASET_OUTPUT_DIR>/<DATASET_NAME>_<dataset_type>.parquet".

  Args:
    dataset: Spark DataFrame to save.
    dataset_type: suffix of the output file name (e.g. "train_valid", "test").

  Raises:
    FileNotFoundError: if no part file shows up in DATASET_TEMP_DIR.
  """
  # Spark writes one part file per partition and only one file is moved below,
  # so the data is collapsed into a single partition first; otherwise the rows
  # of every other part file would be left behind in the temp dir.
  dataset.coalesce(1).write.parquet(DATASET_TEMP_DIR, mode='overwrite')

  # Look for the part file, retrying for a bounded time instead of forever
  max_attempts = 60
  for _ in range(max_attempts):
      parquet_files = glob.glob(os.path.join(DATASET_TEMP_DIR, "part*.parquet"))
      if parquet_files:
          # .parquet file found!
          file_path = parquet_files[0]
          break
      print(".parquet file not found. I'll try again after 1 second...")
      time.sleep(1)
  else:
      raise FileNotFoundError(
          "No part*.parquet file found in " + DATASET_TEMP_DIR
          + " after " + str(max_attempts) + " attempts"
      )

  print(".parquet file found:", file_path)

  # Make sure the destination exists before moving the file into it
  os.makedirs(DATASET_OUTPUT_DIR, exist_ok=True)
  new_file_path = DATASET_OUTPUT_DIR + "/" + DATASET_NAME + "_" + dataset_type + ".parquet"

  # Rename and move the file
  shutil.move(file_path, new_file_path)

  print("File renamed and moved successfully!")

In [None]:
# Save the train / validation set
output(train_valid_df, "train_valid")

In [None]:
# Save the test set
output(test_df, "test")

# Data visualization

Here we are going to display the features taken under consideration according to their categories.

❗TO DELETE

In [None]:
# Output directory and dataset name
DATASET_OUTPUT_DIR = MAIN_DIR + "/datasets/output"
DATASET_NAME = "bitcoin_blockchain_data_15min"

# Paths of the two saved splits
DATASET_TRAIN_VALID = f"{DATASET_OUTPUT_DIR}/{DATASET_NAME}_train_valid.parquet"
DATASET_TEST = f"{DATASET_OUTPUT_DIR}/{DATASET_NAME}_test.parquet"

# Load both splits into pyspark dataset objects with the same reader options
train, test = [
    spark.read.load(path, format="parquet", sep=",", inferSchema="true", header="true")
    for path in (DATASET_TRAIN_VALID, DATASET_TEST)
]

# Stack the two splits back into a single dataset
merged_df = train.union(test)

In [None]:
# Convert the PySpark dataset into Pandas
merged_df_pd = merged_df.toPandas()

In [None]:
# List of features according to categories
currency_statistics = {'Market price (USD)':'market-price', 'Market cap (USD)':'market-cap', 'N. total bitcoins':'total-bitcoins', 'Trade volume (USD)':'trade-volume'}
block_details = {'Blocks size (MB)':'blocks-size', 'Avg. block size (MB)':'avg-block-size', 'N. total transactions':'n-transactions-total', 'N. transactions per block':'n-transactions-per-block'}
mining_information = {'Hash rate (TH/s)':'hash-rate', 'Difficulty (T)':'difficulty', 'Miners revenue (USD)':'miners-revenue', 'Transaction fees (USD)':'transaction-fees-usd'}
network_activity = {"N. unique addresses":'n-unique-addresses', 'N. transactions':'n-transactions', 'Estimated transaction volume (USD)':'estimated-transaction-volume-usd'}
additional_features = {"Rate of change (%)":"rate-of-change", "Simple moving avg. (5d)":"sma-5-days", "Simple moving avg. (7d)":"sma-7-days", "Simple moving avg. (10d)":"sma-10-days", "Simple moving avg. (20d)":"sma-20-days", "Simple moving avg. (50d)":"sma-50-days", "Simple moving avg. (100d)":"sma-100-days"}

In [None]:
def data_visualization(dataset, key, value):
  """Plot one column of the dataset over time.

  Args:
    dataset: table indexed with 'timestamp' and the column named by `value`.
    key: human-readable name, used as trace name and figure title.
    value: name of the column to plot (cast to float).
  """
  line = go.Scatter(
      x=dataset['timestamp'],
      y=dataset[value].astype(float),
      mode='lines',
      name=key
  )

  # Range-selector presets as (months back, button label); change the counts
  # to the desired amount of months.
  presets = ((1, '1m'), (6, '6m'), (12, '1y'), (36, '3y'))
  buttons = [
      dict(count=months, label=button_label, step='month', stepmode='backward')
      for months, button_label in presets
  ]
  buttons.append(dict(step='all'))

  x_axis = dict(
      rangeselector=dict(buttons=buttons),
      rangeslider=dict(visible=True),
      type='date'
  )
  figure = dict(data=[line], layout=dict(title=key, xaxis=x_axis))
  iplot(figure, filename="Data visualization with rangeslider")

In [None]:
# Currency Statistics
if SLOW_OPERATIONS:
  for key, value in currency_statistics.items():
    data_visualization(merged_df_pd, key, value)

Concerning currency statistics, we can see that the price of Bitcoin rose in the period from late 2020 to mid-2022, while the number of Bitcoins issued is slowly approaching its maximum supply (i.e. 21 million); it is thought that the last BTC will be mined in 2140.

In [None]:
# Block Details
if SLOW_OPERATIONS:
  for key, value in block_details.items():
    data_visualization(merged_df_pd, key, value)

Concerning block details, we can see that over time the number of transactions has increased exponentially, along with the size of the blocks. The peak around the end of January 2023 is due to the creation of the Ordinals protocol that allows the creation of 'digital artefacts' on the Bitcoin network (These can include JPEG images, PDFs and audio and video files).

In [None]:
# Mining Information
if SLOW_OPERATIONS:
  for key, value in mining_information.items():
    data_visualization(merged_df_pd, key, value)

Regarding mining information, we can see how the difficulty of the network along with the hash rate has also increased exponentially, the greater the hashing (computing) power in the network, the greater its security and resistance to attacks. While the miners revenue more or less follows the price trend of Bitcoin itself (this is also thanks to the transaction fees that are distributed to the miners). The two biggest spikes in transaction fees are due to a combination of ASIC shortages, huge price increases of BTC outpacing difficulty and the sudden hashrate drop, resulting in slower block times, backlog of transactions and extra fees per block (20 - 21 April 2021) and the increase in demand for block space attributed to the increase in Ordinals (8 May 2023).

In [None]:
# Network Activity
if SLOW_OPERATIONS:
  for key, value in network_activity.items():
    data_visualization(merged_df_pd, key, value)

Regarding Network Activity, we can see how this also increases as time goes by, a sign that the Bitcoin protocol is becoming more and more popular and people are willing to pay to use it.

In [None]:
# Additional Features: Rate of change
if SLOW_OPERATIONS:
  first_pair = next(iter(additional_features.items()))
  data_visualization(merged_df_pd, first_pair[0], first_pair[1])

In [None]:
def sma_visualization(dataset, features, title):
  """Plot the market price together with a group of feature columns.

  Args:
    dataset: table indexed with 'timestamp', "market-price" and every column
      named in `features`.
    features: sequence of (label, column name) pairs; each pair becomes one
      line of the figure, named after its label.
    title: figure title, also passed to iplot as filename.
  """
  price_trace = go.Scatter(
      x=dataset['timestamp'],
      y=dataset["market-price"].astype(float),
      mode='lines',
      name="Market price (usd)"
  )

  # One trace per pair. The previous version indexed features[0], [1] and [2]
  # explicitly, so it failed with fewer than three pairs and silently ignored
  # any pair after the third; with exactly three pairs the figure is unchanged.
  feature_traces = [
      go.Scatter(
          x=dataset['timestamp'],
          y=dataset[column].astype(float),
          mode='lines',
          name=label
      )
      for label, column in features
  ]

  # Range-selector presets as (months back, button label); change the counts
  # to the desired amount of months.
  presets = ((1, '1m'), (6, '6m'), (12, '1y'), (36, '3y'))
  buttons = [
      dict(count=months, label=button_label, step='month', stepmode='backward')
      for months, button_label in presets
  ]
  buttons.append(dict(step='all'))

  layout = dict(
      title=title,
      xaxis=dict(
          rangeselector=dict(buttons=buttons),
          rangeslider=dict(visible=True),
          type='date'
      )
  )

  fig = dict(data=[price_trace] + feature_traces, layout=layout)
  iplot(fig, filename=title)

In [None]:
# Extract the short term SMA
short_term_sma = list(additional_features.items())[1:4]
print(short_term_sma)

# Extract the long term SMA
long_term_sma = list(additional_features.items())[-3:]
print(long_term_sma)

In [None]:
# Additional Features: Short term SMA
if SLOW_OPERATIONS:
  sma_visualization(merged_df_pd, short_term_sma, "Short term SMA (usd)")

In [None]:
# Additional Features: Long term SMA
if SLOW_OPERATIONS:
  sma_visualization(merged_df_pd,long_term_sma, "Long term SMA (usd)")

With regard to added features, we can see that the main price variations are in the medium to long term (months - years) rather than in the short term (days).

#  Feature selection
Here we are going to select features based on their correlation and importance with respect to the market price using the Pearson method and Random Forest Regressor.

❗TO DELETE

In [None]:
# TO DELETE ❗
# Output directory and dataset name
DATASET_OUTPUT_DIR = MAIN_DIR + "/datasets/output"
DATASET_NAME = "bitcoin_blockchain_data_15min"

# Paths of the two saved splits
DATASET_TRAIN_VALID = f"{DATASET_OUTPUT_DIR}/{DATASET_NAME}_train_valid.parquet"
DATASET_TEST = f"{DATASET_OUTPUT_DIR}/{DATASET_NAME}_test.parquet"

# Load both splits into pyspark dataset objects with the same reader options
train, test = [
    spark.read.load(path, format="parquet", sep=",", inferSchema="true", header="true")
    for path in (DATASET_TRAIN_VALID, DATASET_TEST)
]

# Stack the two splits back into a single dataset
merged_df = train.union(test)

In [None]:
# Prepare dataset to feature selection
new_columns = ["next-market-price"] + [col for col in merged_df.columns if col not in ["timestamp", "id", "next-market-price"]]
merged_df_no_indexes = merged_df.select(*new_columns)
merged_df_no_indexes.show()

In [None]:
# Assemble the data to apply PySpark methods
assembler = VectorAssembler(inputCols=merged_df_no_indexes.columns, outputCol='features')
assembled_data = assembler.transform(merged_df_no_indexes)

In [None]:
# Compute the correlation matrix
correlation_matrix = Correlation.corr(assembled_data, 'features').head()

# Column 0 of the matrix holds the correlation of every column with
# "next-market-price", which is the first column of merged_df_no_indexes
correlation_scores = correlation_matrix[0].toArray()
feature_names = merged_df_no_indexes.columns

# Rank the features from the highest to the lowest correlation.
# The scores are still stored as strings (later cells convert them with
# float()), but the ranking uses their numeric value: comparing the strings is
# lexicographic, which misorders negative values and scientific notation
# (e.g. "5e-05" would rank above "1.0").
feature_correlations = sorted(
    ((name, str(row[0])) for name, row in zip(feature_names, correlation_scores)),
    key=lambda pair: float(pair[1]),
    reverse=True,
)

# Print the results
for label, value in feature_correlations:
    print(f"Feature: {label}, Correlation: {value}")

In [None]:
# Build a feature vector WITHOUT the label. The 'features' column of
# assembled_data was assembled from every column of merged_df_no_indexes,
# "next-market-price" included, so training on it lets the forest read the
# label straight from its input and distorts every importance score.
label_col = "next-market-price"
rf_feature_names = [name for name in merged_df_no_indexes.columns if name != label_col]
rf_assembler = VectorAssembler(inputCols=rf_feature_names, outputCol='rf_features')
rf_data = rf_assembler.transform(merged_df_no_indexes)

# Define and train the Random Forest Model
rf = RandomForestRegressor(featuresCol='rf_features', labelCol=label_col, seed=42)
results = rf.fit(rf_data)

# Get most important features (position i of the importance vector belongs to
# the i-th assembled input column)
importance_scores = results.featureImportances
feature_importances = sorted(
    ((name, importance_scores[i]) for i, name in enumerate(rf_feature_names)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Rescale so that the most important feature scores 1 (values end up between
# 0 and 1, assuming importances are non-negative); skipped when every
# importance is 0 to avoid dividing by zero
max_value = max((score for _, score in feature_importances), default=0)
if max_value > 0:
    feature_importances = [(name, score / max_value) for name, score in feature_importances]

# Print the results
for label, value in feature_importances:
    print(f"Feature: {label}, Importance: {value}")

In [None]:
# Map every feature to its scores, starting from the correlation values ...
feature_relevances = {
    name: {'Correlation': score} for name, score in feature_correlations
}

# ... then attach the importance values, creating the entry for any feature
# that has no correlation value
for feature, importance in feature_importances:
    feature_relevances.setdefault(feature, {})['Importance'] = importance

# Print feature relevance
for feature, value in feature_relevances.items():
    print(f"Feature: {feature}, {value}")

❗TO FIX

---
Since both correlation and importance scores gave more or less the same features, I decided to use those based on correlation. I divided features into 3 groups to see the differences according to their use:
* **All:** contains all features
* **Most relevant:** contains features that have a correlation value >= 0.5
* **Least relevant:** contains the features that have a correlation value < 0.5

In [None]:
# Candidate features with their numeric correlation, in ranking order.
# The target is excluded by name: dropping position 0 is only correct when
# "next-market-price" happens to be ranked first.
ranked_features = [
    (name, float(score))
    for name, score in feature_correlations
    if name != "next-market-price"
]

# NOTE(review): the threshold is applied to the signed correlation, so strongly
# negative correlations land in the least relevant group — confirm whether the
# absolute value was intended.
all_features = [name for name, _ in ranked_features]
most_rel_features = [name for name, score in ranked_features if score >= 0.5]
least_rel_features = [name for name, score in ranked_features if score < 0.5]

In [None]:
all_features

In [None]:
most_rel_features

In [None]:
least_rel_features

# Saving selected features

In [None]:
# Save all the features and their relevance value
with open(FEATURES_RELEVANCE, 'w') as file:
    json.dump(feature_relevances, file)

In [None]:
# Save all the features
with open(ALL_FEATURES, 'w') as file:
    json.dump(all_features, file)

In [None]:
# Save the most relevant features
with open(MOST_REL_FEATURES, 'w') as file:
    json.dump(most_rel_features, file)

In [None]:
# Save the least relevant features
with open(LEAST_REL_FEATURES, 'w') as file:
    json.dump(least_rel_features, file)