# **Bitcoin price prediction - Feature Engineering**
## Big Data Computing final project - A.Y. 2022 - 2023
Prof. Gabriele Tolomei

MSc in Computer Science

La Sapienza, University of Rome

### Author: Corsi Danilo (1742375) - corsi.1742375@studenti.uniroma1.it


---


Description: manipulation, visualization and feature extraction.


# Global constants, dependencies, libraries and tools

In [1]:
# Main constants
LOCAL_RUNNING = True
SLOW_OPERATIONS = False # Decide whether or not to use operations that might slow down notebook execution
ROOT_DIR = "D:/Documents/Repository/BDC/project" if LOCAL_RUNNING else "/content/drive"

In [2]:
if not LOCAL_RUNNING:
    # Point Colaboratory to Google Drive
    from google.colab import drive

    # Define GDrive paths
    drive.mount(ROOT_DIR, force_remount=True)

    # Install Spark and related dependencies
    !pip install pyspark
    !pip install -U -q PyDrive -qq
    !apt install openjdk-8-jdk-headless -qq

## Import my utilities

In [3]:
# Set main dir
MAIN_DIR = ROOT_DIR + "" if LOCAL_RUNNING else ROOT_DIR + "/MyDrive/BDC/project"

# Utilities dir
UTILITIES_DIR = MAIN_DIR + "/utilities"

# Import my utilities
import sys
sys.path.append(UTILITIES_DIR)

from imports import *
from config import *
import feature_engineering_utilities

importlib.reload(feature_engineering_utilities)

<module 'feature_engineering_utilities' from 'D:\\Documents/Repository/BDC/project/utilities\\feature_engineering_utilities.py'>

In [4]:
###################
# --- DATASET --- #
###################

# Datasets dirs
DATASET_RAW_DIR = MAIN_DIR + "/datasets/raw"
DATASET_OUTPUT_DIR = MAIN_DIR + "/datasets/output"
DATASET_TEMP_DIR = MAIN_DIR + "/datasets/temp"

# Datasets paths
DATASET_RAW = DATASET_RAW_DIR + "/" + DATASET_NAME + ".parquet"

####################
# --- FEATURES --- #
####################

# Features dir
FEATURES_DIR = MAIN_DIR + "/features"

# Features paths
FEATURES_CORRELATION = FEATURES_DIR + "/" + FEATURES_CORRELATION_LABEL + ".json"
BASE_FEATURES = FEATURES_DIR + "/" + BASE_FEATURES_LABEL + ".json"
BASE_AND_MOST_CORR_FEATURES = FEATURES_DIR + "/" + BASE_AND_MOST_CORR_FEATURES_LABEL + ".json"
BASE_AND_LEAST_CORR_FEATURES = FEATURES_DIR + "/" + BASE_AND_LEAST_CORR_FEATURES_LABEL + ".json"

In [5]:
# Suppression of warnings for better reading
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pio.renderers.default = 'vscode+colab' # To correctly render plotly plots

# Create the pyspark session

In [6]:
# Create the session
conf = SparkConf().\
                set('spark.ui.port', "4050").\
                set('spark.executor.memory', '12G').\
                set('spark.driver.memory', '12G').\
                set('spark.driver.maxResultSize', '109G').\
                set("spark.kryoserializer.buffer.max", "1G").\
                setAppName("BitcoinPricePrediction").\
                setMaster("local[*]")

# Create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Loading dataset

In [7]:
# Load datasets into pyspark dataset objects
df = spark.read.load(DATASET_RAW,
                         format="parquet",
                         sep=",",
                         inferSchema="true",
                         header="true"
                    ) \
                     .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))-1) # Adding "id" column

In [8]:
feature_engineering_utilities.dataset_info(df)

+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-------------------+---+
|     market-price|    opening-price|    highest-price|     lowest-price|    closing-price|  trade-volume-btc|   total-bitcoins|          market-cap|    trade-volume-usd|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|n-transactions|estimated-transaction-volume-usd|          timestamp| id|
+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+------------------

# Adding useful features
After obtaining all the data, other features were added such as:
- `next-market-price:` that represents the price of Bitcoin for the next day, on which predictions will be made
- `simple-moving-averages:` indicators that calculate the average price over a specified number of days

In [9]:
# Creation of a new dataset for the new features
new_features_df = df.select("timestamp", "id", "market-price")

In [10]:
# Adding 'next-market-price' column
new_features_df = new_features_df.withColumn("next-market-price", F.lag("market-price", offset=-1) \
        .over(Window.orderBy("id"))) \
        .dropna()

In [11]:
def simple_moving_average(dataset, period, days, col="market-price", orderby="id"):
    dataset = dataset.withColumn(f"sma-{days}-days", F.avg(col) \
          .over(Window.orderBy(orderby) \
          .rowsBetween(-period,0)))
    return dataset

In [12]:
# Moving averages days (5/7/10/20/50/100)
MA5 = 60 * 24 * 5
MA7 = 60 * 24 * 7
MA10 = 60 * 24 * 10
MA20 = 60 * 24 * 20
MA50 = 60 * 24 * 50
MA100 = 60 * 24 * 100
moving_averages = [MA5, MA7, MA10, MA20, MA50, MA100]
days_list = [5, 7, 10, 20, 50, 100]

# Computing SMAs
for i, moving_avg in enumerate(moving_averages):
    new_features_df = simple_moving_average(new_features_df, moving_avg, days_list[i])

In [13]:
feature_engineering_utilities.dataset_info(new_features_df)

+-------------------+---+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|          timestamp| id|     market-price|next-market-price|       sma-5-days|       sma-7-days|      sma-10-days|      sma-20-days|      sma-50-days|     sma-100-days|
+-------------------+---+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|2019-11-16 00:00:00|  0|          8457.69|8457.950520833334|          8457.69|          8457.69|          8457.69|          8457.69|          8457.69|          8457.69|
|2019-11-16 00:15:00|  1|8457.950520833334|8458.211041666667|8457.820260416667|8457.820260416667|8457.820260416667|8457.820260416667|8457.820260416667|8457.820260416667|
|2019-11-16 00:30:00|  2|8458.211041666667|     8458.4715625|8457.950520833334|8457.950520833334|8457.950520833334|8457.950520833334|8457.950520833334

In [14]:
# Drop "market-price" column
new_features_df = new_features_df.drop("market-price")

# Merge original dataset with the one with the new features
merged_df = df.join(new_features_df, on=['timestamp','id'], how='inner')

# Persist the dataframe
merged_df.cache()

DataFrame[timestamp: timestamp_ntz, id: int, market-price: double, opening-price: double, highest-price: double, lowest-price: double, closing-price: double, trade-volume-btc: double, total-bitcoins: double, market-cap: double, trade-volume-usd: double, blocks-size: double, avg-block-size: double, n-transactions-total: double, n-transactions-per-block: double, hash-rate: double, difficulty: double, miners-revenue: double, transaction-fees-usd: double, n-unique-addresses: double, n-transactions: double, estimated-transaction-volume-usd: double, next-market-price: double, sma-5-days: double, sma-7-days: double, sma-10-days: double, sma-20-days: double, sma-50-days: double, sma-100-days: double]

In [15]:
feature_engineering_utilities.dataset_info(merged_df)

+-------------------+---+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+-----------------+--------------------+--------------------+------------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|          timestamp| id|     market-price|    opening-price|    highest-price|     lowest-price|    closing-price|  trade-volume-btc|   total-bitcoins|          market-cap|    trade-volume-usd|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|n-transactions|estimated-transaction-volume-usd|next-market-price|   

In [16]:
# Rearranges columns
new_columns = ["timestamp", "id"] + [col for col in merged_df.columns if col not in ["timestamp", "id", "next-market-price"]] + ["next-market-price"]
merged_df = merged_df.select(*new_columns)

# Set the "timestamp" column as the index of the Pandas dataset
merged_df.toPandas().set_index("timestamp", inplace=True)

# Splitting dataset
Then the whole dataset will be splitted into two sets:
- `Train / Validation set:` that will be used to train the models and validate the performances
- `Test set:` that will be used to perform price prediction on never-before-seen data, in this case the last 3 months of the original dataset will be used

In [17]:
# Retrieve the last timestamp value
last_value = merged_df.agg(last("timestamp")).collect()[0][0]

# Subtract three month from the last timestamp value
split_date = last_value - relativedelta(months=3)

# Split the dataset based on the desired date
train_valid_df = merged_df[merged_df['timestamp'] <= split_date]
test_df = merged_df[merged_df['timestamp'] > split_date]

In [18]:
if SLOW_OPERATIONS:
    feature_engineering_utilities.dataset_visualization(train_valid_df.toPandas(), test_df.toPandas(), "Train / Validation and Test sets")

# Saving datasets

In [19]:
def output(dataset, dataset_type):
  dataset.write.parquet(DATASET_TEMP_DIR, mode='overwrite')

  while True:
      parquet_files = glob.glob(os.path.join(DATASET_TEMP_DIR, "part*.parquet"))
      if len(parquet_files) > 0:
          # .parquet file found!
          file_path = parquet_files[0]
          break
      else:
          print(".parquet file not found. I'll try again after 1 second...")
          time.sleep(1)

  print(".parquet file found:", file_path)

  new_file_path = DATASET_OUTPUT_DIR + "/" + DATASET_NAME + "_" + dataset_type + ".parquet"

  # Rename and move the file
  shutil.move(file_path, new_file_path)

  print("File renamed and moved successfully!")

In [20]:
# Save the train / validation set
output(train_valid_df, "train_valid")

.parquet file found: D:/Documents/Repository/BDC/project/datasets/temp\part-00000-b3512a1a-4a43-4eab-959a-d85b6b8b8af5-c000.snappy.parquet
File renamed and moved successfully!


In [21]:
# Save the test set
output(test_df, "test")

.parquet file found: D:/Documents/Repository/BDC/project/datasets/temp\part-00000-23117a30-f8da-447c-bb69-a129870cab1c-c000.snappy.parquet
File renamed and moved successfully!


# Data visualization

Here I am going to display the features taken under consideration according to their categories.

In [22]:
# Convert the PySpark dataset into Pandas
merged_df_pd = merged_df.toPandas()
merged_df_pd

Unnamed: 0,timestamp,id,market-price,opening-price,highest-price,lowest-price,closing-price,trade-volume-btc,total-bitcoins,market-cap,...,n-unique-addresses,n-transactions,estimated-transaction-volume-usd,sma-5-days,sma-7-days,sma-10-days,sma-20-days,sma-50-days,sma-100-days,next-market-price
0,2019-11-16 00:00:00,0,8457.690000,8463.790000,8528.440000,8430.770000,8484.070000,50.607771,1.805144e+07,1.531394e+11,...,495077.000000,283468.000000,4.618346e+08,8457.690000,8457.690000,8457.690000,8457.690000,8457.690000,8457.690000,8457.950521
1,2019-11-16 00:15:00,1,8457.950521,8464.019583,8529.472500,8430.178229,8484.321979,51.011410,1.805146e+07,1.531540e+11,...,494408.000000,283269.062500,4.605949e+08,8457.820260,8457.820260,8457.820260,8457.820260,8457.820260,8457.820260,8458.211042
2,2019-11-16 00:30:00,2,8458.211042,8464.249167,8530.505000,8429.586458,8484.573958,51.415050,1.805147e+07,1.531686e+11,...,493739.000000,283070.125000,4.593551e+08,8457.950521,8457.950521,8457.950521,8457.950521,8457.950521,8457.950521,8458.471563
3,2019-11-16 00:45:00,3,8458.471563,8464.478750,8531.537500,8428.994688,8484.825937,51.818689,1.805149e+07,1.531833e+11,...,493070.000000,282871.187500,4.581154e+08,8458.080781,8458.080781,8458.080781,8458.080781,8458.080781,8458.080781,8458.732083
4,2019-11-16 01:00:00,4,8458.732083,8464.708333,8532.570000,8428.402917,8485.077917,52.222329,1.805151e+07,1.531979e+11,...,492401.000000,282672.250000,4.568756e+08,8458.211042,8458.211042,8458.211042,8458.211042,8458.211042,8458.211042,8458.992604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139963,2023-11-12 22:45:00,139963,37063.213854,37058.379167,37412.822917,36389.557292,36523.256250,3165.546566,1.954142e+07,7.207361e+11,...,681273.604167,442820.541667,3.735292e+09,29190.852369,28853.799572,29112.500071,27489.266701,30249.831324,27800.756631,37062.367083
139964,2023-11-12 23:00:00,139964,37062.367083,37057.483333,37414.858333,36385.845833,36517.425000,3187.447119,1.954143e+07,7.206994e+11,...,679079.083333,439957.833333,3.757942e+09,29192.159918,28854.571619,29113.299324,27489.818348,30249.495430,27800.822802,37061.520313
139965,2023-11-12 23:15:00,139965,37061.520313,37056.587500,37416.893750,36382.134375,36511.593750,3209.347671,1.954144e+07,7.206627e+11,...,676884.562500,437095.125000,3.780593e+09,29193.465002,28855.343669,29114.098193,27490.369855,30249.159440,27800.888965,37060.673542
139966,2023-11-12 23:30:00,139966,37060.673542,37055.691667,37418.929167,36378.422917,36505.762500,3231.248224,1.954145e+07,7.206260e+11,...,674690.041667,434232.416667,3.803244e+09,29194.767621,28856.115723,29114.896680,27490.921219,30248.823355,27800.955122,37059.826771


In [23]:
# List of features according to categories
ohlcv_statistics = {'Opening price (USD)':'opening-price', 'Highest price (USD)':'highest-price', 'Lowest price (USD)':'lowest-price', 'Closing price (USD)':'closing-price', 'Trade volume (BTC)':'trade-volume-btc'}
currency_statistics = {'Market price (USD)':'market-price', 'Market cap (USD)':'market-cap', 'N. total bitcoins':'total-bitcoins', 'Trade volume (USD)':'trade-volume-usd'}
block_details = {'Blocks size (MB)':'blocks-size', 'Avg. block size (MB)':'avg-block-size', 'N. total transactions':'n-transactions-total', 'N. transactions per block':'n-transactions-per-block'}
mining_information = {'Hash rate (TH/s)':'hash-rate', 'Difficulty (T)':'difficulty', 'Miners revenue (USD)':'miners-revenue', 'Transaction fees (USD)':'transaction-fees-usd'}
network_activity = {"N. unique addresses":'n-unique-addresses', 'N. transactions':'n-transactions', 'Estimated transaction volume (USD)':'estimated-transaction-volume-usd'}
simple_moving_avg = {"Simple moving avg. (5d)":"sma-5-days", "Simple moving avg. (7d)":"sma-7-days", "Simple moving avg. (10d)":"sma-10-days", "Simple moving avg. (20d)":"sma-20-days", "Simple moving avg. (50d)":"sma-50-days", "Simple moving avg. (100d)":"sma-100-days"}

In [24]:
# OHLC Statistics
ohlc_statistics = list(ohlcv_statistics.items())[:4]
print(ohlc_statistics)

# Volume Statistics
volume_statistics = list(ohlcv_statistics.items())[4:]
print(volume_statistics)

[('Opening price (USD)', 'opening-price'), ('Highest price (USD)', 'highest-price'), ('Lowest price (USD)', 'lowest-price'), ('Closing price (USD)', 'closing-price')]
[('Trade volume (BTC)', 'trade-volume-btc')]


In [25]:
# OHLCV Statistics
if SLOW_OPERATIONS:
  feature_engineering_utilities.ohlc_visualization(merged_df_pd, ohlc_statistics, "OHLC Statistics (usd)")
  feature_engineering_utilities.features_visualization(merged_df_pd, volume_statistics[0][0], volume_statistics[0][1])

The OHLCV stastistics chart is a type of bar chart that shows the open, high, low, close and volume values for each period. They are useful because they show the five main points of a period, with the closing price being considered the most important by many traders. Due to an Algo Bug on Binance's U.S. Exchange I have a [strange dump on 21 October 2021](https://www.bloomberg.com/news/articles/2021-10-21/bitcoin-appears-to-crash-87-on-binance-in-apparent-mistake#xj4y7vzkg) regarding the lower price.

In [26]:
# Currency Statistics
if SLOW_OPERATIONS:
  for key, value in currency_statistics.items():
    feature_engineering_utilities.features_visualization(merged_df_pd, key, value)

Concerning currency statistics, we can see that in the period from late 2020 to mid-2022 there has been a rise in the price of Bitcoin, while the amount of Bitcoins issued is slowly peaking (i.e. 21 million), it is thought that the last BTC will be mined in 2140.

In [27]:
# Block Details
if SLOW_OPERATIONS:
  for key, value in block_details.items():
    feature_engineering_utilities.features_visualization(merged_df_pd, key, value)

Concerning block details, we can see that over time the number of transactions has increased exponentially, along with the size of the blocks. The peak around the end of January 2023 is due to the creation of the Ordinals protocol that allows the creation of 'digital artefacts' on the Bitcoin network (These can include JPEG images, PDFs and audio and video files).

In [28]:
# Mining Information
if SLOW_OPERATIONS:
  for key, value in mining_information.items():
    feature_engineering_utilities.features_visualization(merged_df_pd, key, value)

Regarding mining information, we can see how the difficulty of the network along with the hash rate has also increased exponentially, the greater the hashing (computing) power in the network, the greater its security and resistance to attacks. While the miners revenue more or less follows the price trend of Bitcoin itself (this is also thanks to the transaction fees that are distributed to the miners). The two biggest spikes in transaction fees are due to a combination of ASIC shortages, huge price increases of BTC outpacing difficulty and the sudden hashrate drop, resulting in slower block times, backlog of transactions and extra fees per block ([20 - 21 April 2021](https://www.coindesk.com/markets/2021/04/21/bitcoin-transactions-are-more-expensive-than-ever/)) and the increase in demand for block space attributed to the increase in Ordinals ([8 May 2023](https://www.coindesk.com/tech/2023/05/08/ordinals-upend-bitcoin-mining-pushing-transaction-fees-above-mining-reward-for-first-time-in-years/))

In [29]:
# Network Activity
if SLOW_OPERATIONS:
  for key, value in network_activity.items():
    feature_engineering_utilities.features_visualization(merged_df_pd, key, value)

Regarding Network Activity, we can see how this also increases as time goes by, a symbol that the Bitcoin protocol is becoming more and more popular and people are willing to pay to use it.

In [30]:
# Extract the short term SMA
short_term_sma = list(simple_moving_avg.items())[:3]
print(short_term_sma)

# Extract the long term SMA
long_term_sma = list(simple_moving_avg.items())[3:]
print(long_term_sma)

[('Simple moving avg. (5d)', 'sma-5-days'), ('Simple moving avg. (7d)', 'sma-7-days'), ('Simple moving avg. (10d)', 'sma-10-days')]
[('Simple moving avg. (20d)', 'sma-20-days'), ('Simple moving avg. (50d)', 'sma-50-days'), ('Simple moving avg. (100d)', 'sma-100-days')]


In [31]:
# Additional Features: Short term SMA
if SLOW_OPERATIONS:
  feature_engineering_utilities.sma_visualization(merged_df_pd, short_term_sma, "Short term SMA (usd)")

In [32]:
# Additional Features: Long term SMA
if SLOW_OPERATIONS:
  feature_engineering_utilities.sma_visualization(merged_df_pd,long_term_sma, "Long term SMA (usd)")

Taking into consideration the Simple Moving Averages (which instead give us a more medium to long term view of the price) we can see that the main price variations occur precisely in the latter, this tells us that although Bitcoin has high price volatility this often occurs days or even months later, except in some cases where unpredictable pumps or dumps can occur due to sudden news.

#  Feature selection
Then all the features have been divided into three distinct final groups:
- `Base features:` contains all the price features
- `Base + most / least correlated features:` contains the previous ones plus the additional blockchain features divided based on their correlation value with the price
- If this value is greater than equal to 0.6 they will be considered most correlated, least correlated otherwise

<img src="https://github.com/CorsiDanilo/bitcoin-price-prediction-with-pyspark/blob/main/notebooks/images/grouped_features.png?raw=1">

In [33]:
# List of features
all_features = [ohlcv_statistics, currency_statistics, block_details, mining_information, network_activity, simple_moving_avg]

# Count occurrences
count = 0
for features in all_features:
    for key, value in features.items():
        count += 1

print(f"Number of features: {count}")

Number of features: 26


In [34]:
# Count the number of columns
num_columns = len(merged_df.columns)
print(f"Number of columns in the dataset: {num_columns}")

Number of columns in the dataset: 29


In [35]:
# Prepare dataset to feature selection
new_columns = ["next-market-price"] + [
    col for col in merged_df.columns if col not in
    [
    'timestamp',
    'id',
    'next-market-price',
    'opening-price',
    'highest-price',
    'lowest-price',
    'closing-price',
    'trade-volume-btc',
    'market-price',
    'market-cap',
    'total-bitcoins',
    'trade-volume-usd'
 ]]
merged_df_only_blockchain_data = merged_df.select(*new_columns)
merged_df_only_blockchain_data.show()

+-----------------+------------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|next-market-price|       blocks-size|    avg-block-size|n-transactions-total|n-transactions-per-block|          hash-rate|        difficulty|      miners-revenue|transaction-fees-usd|n-unique-addresses|n-transactions|estimated-transaction-volume-usd|       sma-5-days|       sma-7-days|      sma-10-days|      sma-20-days|      sma-50-days|     sma-100-days|
+-----------------+------------------+------------------+--------------------+------------------------+-------------------+------------------+--------------------+--------------------+------------------+--------------+--------------------------------+-----------------+-----------

In [36]:
# Assemble the data to apply PySpark methods
assembler = VectorAssembler(inputCols=merged_df_only_blockchain_data.columns, outputCol='features')
assembled_data = assembler.transform(merged_df_only_blockchain_data)

In [37]:
# Compute the correlation matrix
correlation_matrix = Correlation.corr(assembled_data, 'features').head()

# Get the highest correlated features
correlation_scores = correlation_matrix[0].toArray()
feature_names = merged_df_only_blockchain_data.columns
feature_correlations = sorted([(feature_names[i], str(correlation_scores[i][0])) for i in range(len(feature_names))], key=lambda x: x[1], reverse=True)

# Print the results
for label, value in feature_correlations:
    print(f"Feature: {label}, Correlation: {value}")

Feature: next-market-price, Correlation: 1.0
Feature: miners-revenue, Correlation: 0.963273852367899
Feature: sma-5-days, Correlation: 0.9139958505084166
Feature: sma-7-days, Correlation: 0.8726142496801989
Feature: sma-10-days, Correlation: 0.8196340105607194
Feature: estimated-transaction-volume-usd, Correlation: 0.709530895856149
Feature: sma-20-days, Correlation: 0.6698547541848738
Feature: sma-100-days, Correlation: 0.4662914958156163
Feature: transaction-fees-usd, Correlation: 0.4155959803945346
Feature: n-unique-addresses, Correlation: 0.39679495661658293
Feature: sma-50-days, Correlation: 0.3134776959373877
Feature: n-transactions-total, Correlation: 0.2941482673105254
Feature: blocks-size, Correlation: 0.27146168953026767
Feature: hash-rate, Correlation: 0.09616890176693868
Feature: difficulty, Correlation: 0.09184540457259807
Feature: avg-block-size, Correlation: 0.0649178951579983
Feature: n-transactions-per-block, Correlation: -0.12932856951838906
Feature: n-transactions, C

Finally, I decided to divide the features into two distinct groups:
- `Base features:` contains all the Currency Statistics features
- `Base and additional features:` contains the Base features plus the additional features divided based on their correlation value with the price: 
    - If >= 0.6, then then they will be considered `most correlated`.
    - If < 0.6, then then they will be considered `least correlated`.

The strategy for the model's train / validation phase will be:
- Test models with base features
- See if by adding the additional most and least correlated features to them improves the performance

In [38]:
base_features = list(ohlcv_statistics.values()) + list(currency_statistics.values())
most_corr_features = [x[0] for x in feature_correlations[1:] if float(x[1]) >= 0.6]
base_and_most_corr_features = base_features + most_corr_features
least_corr_features = [x[0] for x in feature_correlations[1:] if float(x[1]) < 0.6]
base_and_least_corr_features = base_features + least_corr_features

In [39]:
base_features

['opening-price',
 'highest-price',
 'lowest-price',
 'closing-price',
 'trade-volume-btc',
 'market-price',
 'market-cap',
 'total-bitcoins',
 'trade-volume-usd']

In [40]:
most_corr_features

['miners-revenue',
 'sma-5-days',
 'sma-7-days',
 'sma-10-days',
 'estimated-transaction-volume-usd',
 'sma-20-days']

In [41]:
base_and_most_corr_features

['opening-price',
 'highest-price',
 'lowest-price',
 'closing-price',
 'trade-volume-btc',
 'market-price',
 'market-cap',
 'total-bitcoins',
 'trade-volume-usd',
 'miners-revenue',
 'sma-5-days',
 'sma-7-days',
 'sma-10-days',
 'estimated-transaction-volume-usd',
 'sma-20-days']

In [42]:
least_corr_features

['sma-100-days',
 'transaction-fees-usd',
 'n-unique-addresses',
 'sma-50-days',
 'n-transactions-total',
 'blocks-size',
 'hash-rate',
 'difficulty',
 'avg-block-size',
 'n-transactions-per-block',
 'n-transactions']

In [43]:
base_and_least_corr_features

['opening-price',
 'highest-price',
 'lowest-price',
 'closing-price',
 'trade-volume-btc',
 'market-price',
 'market-cap',
 'total-bitcoins',
 'trade-volume-usd',
 'sma-100-days',
 'transaction-fees-usd',
 'n-unique-addresses',
 'sma-50-days',
 'n-transactions-total',
 'blocks-size',
 'hash-rate',
 'difficulty',
 'avg-block-size',
 'n-transactions-per-block',
 'n-transactions']

# Saving selected features

In [44]:
# Save currency and ohlcv features
with open(BASE_FEATURES, 'w') as file:
    json.dump(base_features, file)

In [45]:
# Save currency, ohlcv and blockchain most correlated features
with open(BASE_AND_MOST_CORR_FEATURES, 'w') as file:
    json.dump(base_and_most_corr_features, file)

In [46]:
# Save currency, ohlcv and blockchain least correlated features
with open(BASE_AND_LEAST_CORR_FEATURES, 'w') as file:
    json.dump(base_and_least_corr_features, file)

In [47]:
# Export notebook in html format (remember to save the notebook and change the model name)
if LOCAL_RUNNING:
    !jupyter nbconvert --to html 2-feature-engineering.ipynb --output 2-feature-engineering --output-dir='./exports'

  warn(
[NbConvertApp] Converting notebook 2-feature-engineering.ipynb to html
[NbConvertApp] Writing 410111 bytes to exports\2-feature-engineering.html
