In [1]:
# !pip install ipynb
# !pip install tensorflow
# !pip install keras

In [2]:
# Import Statements
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import ipynb.fs.full.blockchain_api as blockchain_data
warnings.filterwarnings('ignore')

# blockchain_data.get_block_details(dir='./resource/block_details')
# blockchain_data.get_market_signals(dir='./resource/market_signals')
# blockchain_data.get_mining_information(dir='./resource/mining_information')
# blockchain_data.get_network_activity(dir='./resource/network_activity')
# blockchain_data.get_wallet_activity(dir='./resource/wallet_activity')

# Import Data

In [3]:
# Bitcoin Price: minute-level exchange data; the epoch-second 'Timestamp' column becomes the index
bitcoin_price = (
    pd.read_csv('./resource/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], unit='s'))
    .set_index('Timestamp')
)

# Blockchain Information (Block Details) -- every file is indexed by its 'Timestamp' column
avg_block_size = pd.read_csv('./resource/block_details/avg-block-size', index_col='Timestamp')
block_size = pd.read_csv('./resource/block_details/blocks-size', index_col='Timestamp')
median_confirmation_time = pd.read_csv('./resource/block_details/median-confirmation-time', index_col='Timestamp')
n_transactions_per_block = pd.read_csv('./resource/block_details/n-transactions-per-block', index_col='Timestamp')
n_transactions_total = pd.read_csv('./resource/block_details/n-transactions-total', index_col='Timestamp')
block_details_list = [
    avg_block_size,
    block_size,
    median_confirmation_time,
    n_transactions_per_block,
    n_transactions_total,
]

# Blockchain Information (Market Signals) -- loaded with the default index; 'Timestamp' stays a
# regular column here and is parsed later, in the cleaning cell
market_value_to_realised_value = pd.read_csv('./resource/market_signals/mvrv')
network_value_to_transactions = pd.read_csv('./resource/market_signals/nvt')
network_value_to_transactions_signal = pd.read_csv('./resource/market_signals/nvts')
market_signals_list = [
    market_value_to_realised_value,
    network_value_to_transactions,
    network_value_to_transactions_signal,
]

# Blockchain Information (Mining Information)
cost_per_transaction = pd.read_csv('./resource/mining_information/cost-per-transaction', index_col='Timestamp')
cost_per_transaction_percent = pd.read_csv('./resource/mining_information/cost-per-transaction-percent', index_col='Timestamp')
difficulty = pd.read_csv('./resource/mining_information/difficulty', index_col='Timestamp')
fees_usd_per_transaction = pd.read_csv('./resource/mining_information/fees-usd-per-transaction', index_col='Timestamp')
hash_rate = pd.read_csv('./resource/mining_information/hash-rate', index_col='Timestamp')
miners_revenue = pd.read_csv('./resource/mining_information/miners-revenue', index_col='Timestamp')
transaction_fees = pd.read_csv('./resource/mining_information/transaction-fees', index_col='Timestamp')
transaction_fees_usd = pd.read_csv('./resource/mining_information/transaction-fees-usd', index_col='Timestamp')
mining_information_list = [
    cost_per_transaction,
    cost_per_transaction_percent,
    difficulty,
    fees_usd_per_transaction,
    hash_rate,
    miners_revenue,
    transaction_fees,
    transaction_fees_usd,
]

# Blockchain Information (Network Activity)
estimated_transaction_volume = pd.read_csv('./resource/network_activity/estimated-transaction-volume', index_col='Timestamp')
estimated_transaction_volume_usd = pd.read_csv('./resource/network_activity/estimated-transaction-volume-usd', index_col='Timestamp')
n_transactions = pd.read_csv('./resource/network_activity/n-transactions', index_col='Timestamp')
n_transactions_exlcuding_popular = pd.read_csv('./resource/network_activity/n-transactions-excluding-popular', index_col='Timestamp')
n_unique_addresses = pd.read_csv('./resource/network_activity/n-unique-addresses', index_col='Timestamp')
output_volume = pd.read_csv('./resource/network_activity/output-volume', index_col='Timestamp')
network_activity_list = [
    estimated_transaction_volume,
    estimated_transaction_volume_usd,
    n_transactions,
    n_transactions_exlcuding_popular,
    n_unique_addresses,
    output_volume,
]

# Blockchain Information (Wallet Activity) -- 'Timestamp' is parsed in the cleaning cell
n_wallets = pd.read_csv('./resource/wallet_activity/my-wallet-n-users')

# Data Cleaning

In [4]:
# Bitcoin Price: collapse the 1-minute bars into one row per calendar day
weighted_price = bitcoin_price['Weighted_Price'].resample('D').mean()
volume_currency = bitcoin_price['Volume_(Currency)'].resample('D').sum()
volume_btc = bitcoin_price['Volume_(BTC)'].resample('D').sum()
close = bitcoin_price['Close'].resample('D').last()
low = bitcoin_price['Low'].resample('D').min()
high = bitcoin_price['High'].resample('D').max()
# Deliberately NOT named `open`: that would shadow the builtin open() for the rest of the session
open_price = bitcoin_price['Open'].resample('D').first()
resampled_bitcoin_price = pd.concat(
    [open_price, high, low, close, volume_btc, volume_currency, weighted_price],
    axis=1,
)
# Drop 2021-03-31: it only has 1 row (1 minute) of data, so its daily aggregate is unreliable.
# .copy() so the NaN assignment below writes to an independent frame, not a slice view.
resampled_bitcoin_price = resampled_bitcoin_price.loc[:'2021-03-30'].copy()
# Blank out 2015-01-06..2015-01-08 entirely so interpolate() rebuilds those days from their neighbours
resampled_bitcoin_price.loc['2015-01-06':'2015-01-08'] = np.nan
resampled_bitcoin_price = resampled_bitcoin_price.interpolate()

# Blockchain Information (Block Details)
block_details = pd.concat(block_details_list, axis=1)
block_details.index = pd.to_datetime(block_details.index)
# Remove all data after 2021-03-30 to follow the resampled_bitcoin_price date range
block_details = block_details.loc[:'2021-03-30']

# Blockchain Information (Market Signals)
# Each raw frame is turned into a daily-mean frame WITHOUT mutating the loaded data, and the
# pieces are concatenated once instead of growing a frame inside the loop.
daily_market_signals = []
for signal in market_signals_list:
    daily_signal = (
        signal
        .assign(Timestamp=pd.to_datetime(signal['Timestamp']))
        .set_index('Timestamp')
        .resample('D')
        .mean()
    )
    daily_market_signals.append(daily_signal)
market_signals = pd.concat(daily_market_signals, axis=1)
# Remove all data after 2021-03-30 to follow the resampled_bitcoin_price date range
market_signals = market_signals.loc[:'2021-03-30']

# Blockchain Information (Mining Information)
mining_information = pd.concat(mining_information_list, axis=1)
mining_information.index.name = 'Timestamp'
mining_information.index = pd.to_datetime(mining_information.index)
mining_information = mining_information.loc[:'2021-03-30']

# Blockchain Information (Network Activity)
network_activity = pd.concat(network_activity_list, axis=1)
network_activity.index.name = 'Timestamp'
network_activity.index = pd.to_datetime(network_activity.index)
network_activity = network_activity.loc[:'2021-03-30']
# Fill gaps in the address counts by interpolation, rounded back to whole numbers
network_activity['n-unique-addresses'] = network_activity['n-unique-addresses'].interpolate().round()
print(f'Check for anymore missing values\n{network_activity.isnull().sum()}')

# Blockchain Information (Wallet Activity)
# Index the wallet counts by parsed timestamp; the loaded `n_wallets` frame is left untouched
new_n_wallets_data = (
    n_wallets
    .assign(Timestamp=pd.to_datetime(n_wallets['Timestamp']))
    .set_index('Timestamp')
)
# New wallets per day: difference between consecutive observations, summed within each day
new_n_wallets_created = (
    new_n_wallets_data
    .diff()
    .resample('D')
    .sum()
    .rename(columns={'my-wallet-n-users': 'new-n-wallets-created'})
)
print('Number of Null Values:', new_n_wallets_created.isnull().sum())
# Clean the original column: last observation of each day, gaps interpolated and rounded
resampled_n_wallets = new_n_wallets_data.resample('D').last()
resampled_n_wallets = resampled_n_wallets.interpolate().round()
print("Number of Null Values:", resampled_n_wallets.isnull().sum())
# Concatenate the two columns
wallets_activity = pd.concat([new_n_wallets_created, resampled_n_wallets], axis=1)
wallets_activity = wallets_activity.loc['2011-12-31':'2021-03-30']

Check for anymore missing values
estimated-transaction-volume        0
estimated-transaction-volume-usd    0
n-transactions                      0
n-transactions-excluding-popular    0
n-unique-addresses                  0
output-volume                       0
dtype: int64
Number of Null Values: new-n-wallets-created    0
dtype: int64
Number of Null Values: my-wallet-n-users    0
dtype: int64


In [43]:
# Reframe a time series as a supervised-learning table of lagged inputs and forecast outputs
def timeseries_to_supervised(data, columns, n_in=1, n_out=1, dropnan=True):
    """Lay shifted copies of ``data`` side by side to form a lag/lead table.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame``
        The series, one row per time step.
    columns : iterable of str
        Base names used to label the generated columns (one per column of ``data``).
    n_in : int
        Number of lagged steps to include, labelled ``name(t-n_in)`` ... ``name(t-1)``.
    n_out : int
        Number of forecast steps to include, labelled ``name(t)``, ``name(t+1)``, ...
    dropnan : bool
        When true, rows containing any NaN (including those created by shifting) are dropped.

    Returns
    -------
    pd.DataFrame
        Column blocks ordered from the oldest lag to the furthest forecast step.
    """
    frame = pd.DataFrame(data)
    lags = range(n_in, 0, -1)   # t-n_in, ..., t-1
    leads = range(n_out)        # t, t+1, ..., t+n_out-1

    # Positive shifts look back in time, negative shifts look ahead
    shifted = [frame.shift(step) for step in lags]
    shifted += [frame.shift(-step) for step in leads]

    labels = [f'{name}(t-{step})' for step in lags for name in columns]
    labels += [
        f'{name}(t)' if step == 0 else f'{name}(t+{step})'
        for step in leads
        for name in columns
    ]

    table = pd.concat(shifted, axis=1)
    table.columns = labels
    if dropnan:
        table.dropna(inplace=True)
    return table

# Split reframed supervised timeseries data to train and test
def train_test_split(data, n_timesteps=1, n_features=11, n_test=1):
    """Split a supervised table chronologically into train and test parts.

    The first ``n_timesteps * n_features`` columns are the inputs (X); the very last
    column is the target (y). The final ``n_test`` rows form the test set, and everything
    before them is the training set.

    Parameters
    ----------
    data : pd.DataFrame or array-like, 2-D
    n_timesteps : int
        Number of lagged steps per row.
    n_features : int
        Number of features per step.
    n_test : int
        Number of trailing rows held out for testing. ``0`` keeps every row for training
        and returns empty test arrays.

    Returns
    -------
    tuple of np.ndarray
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    data = np.asarray(data)
    if n_test < 0:
        raise ValueError(f'n_test must be non-negative, got {n_test}')
    n_obs = n_timesteps * n_features
    # Use an explicit split position instead of data[:-n_test]: with n_test == 0 the
    # negative-index form would give an EMPTY training set and put every row in the test set.
    split = max(data.shape[0] - n_test, 0)
    X_train, X_test = data[:split, :n_obs], data[split:, :n_obs]
    y_train, y_test = data[:split, -1], data[split:, -1]
    return X_train, X_test, y_train, y_test #np.array

def difference(data, interval=1):
    """Return the element-wise change between observations ``interval`` steps apart.

    Entry ``k`` of the result is ``data[k + interval] - data[k]``, so the result has
    ``len(data) - interval`` entries. A DataFrame is converted to a NumPy array first;
    other inputs are indexed as given.
    """
    if isinstance(data, pd.DataFrame):
        data = np.array(data)
    deltas = [data[pos] - data[pos - interval] for pos in range(interval, len(data))]
    return np.array(deltas) #np.array

# Undo differencing for a single predicted change
def inverse_difference(original_data, y_pred, interval=1):
    """Add the predicted change ``y_pred`` back onto an undifferenced observation.

    The observation used is ``original_data[-interval]``, i.e. the one ``interval``
    positions from the end of ``original_data``.
    """
    anchor = original_data[-interval]
    return y_pred + anchor

def scale(train, test):
    """Fit a ``MinMaxScaler`` on ``train`` and apply it to both ``train`` and ``test``.

    The scaler is fitted on the training data only, so the test data is transformed
    with the training data's statistics.

    Parameters
    ----------
    train, test : np.ndarray
        1-D inputs are reshaped to a single column (``(-1, 1)``) before scaling. Each
        input is reshaped independently, so a 1-D array may be paired with a 2-D one.

    Returns
    -------
    tuple
        ``(scaler, scaled_train, scaled_test)``; keep ``scaler`` to invert the
        transformation later.

    Notes
    -----
    ``MinMaxScaler`` must already be imported (``sklearn.preprocessing``) when this
    function is called.
    """
    # Reshape independently: requiring BOTH inputs to be 1-D would leave a lone 1-D
    # input un-reshaped and hand it to the scaler as-is.
    if len(train.shape) == 1:
        train = train.reshape(-1, 1)
    if len(test.shape) == 1:
        test = test.reshape(-1, 1)
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return scaler, train, test

def inverse_scale(scaler, y_pred):
    """Map scaled values back to their original range via ``scaler.inverse_transform``."""
    return scaler.inverse_transform(y_pred)

In [6]:
# Put every cleaned source side by side in one frame (columns concatenated, rows aligned on the index)
df = pd.concat(
    [
        resampled_bitcoin_price,
        block_details,
        market_signals,
        mining_information,
        network_activity,
        wallets_activity,
    ],
    axis=1,
)

# Columns fed to the models; 'Weighted_Price' is kept last because the split helper
# takes the final column as the prediction target
selected_features = [
    'miners-revenue',
    'estimated-transaction-volume-usd',
    'transaction-fees-usd',
    'difficulty',
    'hash-rate',
    'fees-usd-per-transaction',
    'blocks-size',
    'my-wallet-n-users',
    'n-transactions-total',
    'Volume_(Currency)',
    'Weighted_Price',
]

# Keep only the selected features (in that order), then only rows from 2016 onwards
feature_extracted_df = df.reindex(columns=selected_features).loc['2016':]

# Conduct Differencing: model day-over-day changes instead of raw levels
raw_values = feature_extracted_df.values
diff_values = difference(raw_values, 1)

# Model Building

In [42]:
# Stateless LSTM

from math import sqrt
from numpy import concatenate
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

def forecast_lstm(model, batch_size, X_test):
    """Predict for a single 2-D sample.

    ``X_test`` is a 2-D array; it is wrapped in a leading axis of length 1 so that it is
    passed to ``model.predict`` as a batch containing one sample. Whatever
    ``model.predict`` returns is returned unchanged.
    """
    n_rows, n_cols = X_test.shape[0], X_test.shape[1]
    single_sample = X_test.reshape((1, n_rows, n_cols))
    return model.predict(single_sample, batch_size=batch_size)

def update_model(model, X_train, y_train, batch_size, updates):
    """Continue training ``model`` in place for ``updates`` further epochs.

    Sample order is preserved (``shuffle=False``) and progress is printed
    (``verbose=1``). Nothing is returned.
    """
    fit_options = dict(epochs=updates, batch_size=batch_size, verbose=1, shuffle=False)
    model.fit(X_train, y_train, **fit_options)

def fit_lstm(X_train, y_train, batch_size=100, n_epochs=50, neurons=50):
    """Build, compile and train a one-layer stateless LSTM regressor.

    Parameters
    ----------
    X_train : np.ndarray, 3-D
        Axes 1 and 2 give the per-sample input shape of the network.
    y_train : array-like
        Training targets.
    batch_size : int
        Used both as the fixed batch dimension of the input layer and as the training
        batch size.
    n_epochs : int
        Number of training epochs.
    neurons : int
        Number of LSTM units.

    Returns
    -------
    The fitted Keras ``Sequential`` model (LSTM -> Dense(1), MSE loss, Adam optimizer).
    """
    n_steps, n_feats = X_train.shape[1], X_train.shape[2]
    recurrent_layer = LSTM(
        neurons,
        batch_input_shape=(batch_size, n_steps, n_feats),
        stateful=False,
    )
    output_layer = Dense(1)  # single regression output

    model = Sequential()
    for layer in (recurrent_layer, output_layer):
        model.add(layer)
    model.compile(loss='mean_squared_error', optimizer='adam')
    # shuffle=False keeps the samples in their original (chronological) order
    model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, verbose=1, shuffle=False)
    return model

def experiment_lstm(repeats, data, updates, n_timesteps, n_features, n_test, n_epochs, neurons):
    """Train and evaluate the LSTM ``repeats`` times with walk-forward validation.

    Relies on two notebook-level globals in addition to its arguments:
    ``feature_extracted_df`` (column names and the true, unprocessed target values)
    and ``raw_values`` (the undifferenced levels used to invert the differencing).

    Parameters
    ----------
    repeats : int
        Number of independent fit/evaluate runs.
    data : array-like
        Series to reframe into a supervised table.
        NOTE(review): presumably the differenced values (``diff_values``), since the
        predictions are passed through ``inverse_difference`` -- confirm against callers.
    updates : int
        Epochs of additional training before each test step after the first.
    n_timesteps : int
        Number of lagged steps used as model input.
    n_features : int
        Number of features per step.
    n_test : int
        Number of trailing rows held out and predicted one at a time.
    n_epochs : int
        Epochs for the initial fit of each repeat.
    neurons : int
        Number of LSTM units.

    Returns
    -------
    tuple of list
        ``(error_scores, prediction_list)``: one MAE and one list of ``n_test``
        predictions per repeat.
    """
    supervised_values = timeseries_to_supervised(data, feature_extracted_df.columns, n_timesteps)
    X_train, X_test, y_train, y_test = train_test_split(supervised_values, n_timesteps, n_features, n_test)
    # Apply MinMaxScaler to both X and y data (each scaler is fitted on the training part only)
    X_scaler, X_train, X_test = scale(X_train, X_test)
    y_scaler, y_train, y_test = scale(y_train, y_test)
    # Reshape X data to 3 dimensional as per LSTM input requirement: (samples, timesteps, features)
    X_train = X_train.reshape((X_train.shape[0], n_timesteps, n_features))
    X_test = X_test.reshape((X_test.shape[0], n_timesteps, n_features))
    # Ground truth for scoring: the last n_test values of the last column of the
    # unprocessed feature frame (no differencing, no scaling)
    ori_y_test_values = feature_extracted_df.values[-n_test:, -1]

    # Repeatedly train and test LSTM model to eliminate outlier results
    error_scores = list()
    prediction_list = list()
    for r in range(repeats):
        print(f'Stateless Repeat {r}. Initial Fit Model. neurons: {neurons}')
        # A fresh model per repeat; batch_size=1 matches the single-sample predictions below
        lstm_model = fit_lstm(X_train, y_train, batch_size=1, n_epochs=n_epochs, neurons=neurons)
        # Copy train data so tested data from walk-forward validation can be appended
        X_train_copy = np.copy(X_train)
        y_train_copy = np.copy(y_train)

        # Walk-Forward Validation
        predictions = list()
        for i in range(n_test):
            if i > 0:
                # From the second step on, train further on the history that now
                # includes the test samples already predicted
                print(f'Stateless Repeat {r}. Test Data: {i}. neurons: {neurons}')
                update_model(lstm_model, X_train_copy, y_train_copy, 1, updates)
            X_test_copy = X_test[i]
            y_pred = forecast_lstm(lstm_model, 1, X_test_copy)
            y_pred = inverse_scale(y_scaler, y_pred)
            y_pred = y_pred[0][0] # inverse_scale return a 2-d ndarray
            # Add the prediction to the raw value n_test+1-i positions from the end of the
            # raw target column to turn the predicted change back into a level
            y_pred = inverse_difference(raw_values[:, -1], y_pred, n_test+1-i)
            predictions.append(y_pred)
            # Append the sample just predicted (with its true scaled target) to the history
            X_train_copy = concatenate((X_train_copy, X_test[i].reshape(1, X_test.shape[1], X_test.shape[2])))
            y_train_copy = concatenate((y_train_copy, y_test[i].reshape(1, y_test.shape[1])))
        # Report Performance
        MAE = mean_absolute_error(ori_y_test_values, predictions)
        print(f'Stateless Repeat {r}. MAE = {MAE}. neurons: {neurons}')
        error_scores.append(MAE)
        prediction_list.append(predictions)
    return error_scores, prediction_list
    
def run_lstm(n_timesteps, n_test, data, n_epochs, neurons, repeats=10, updates=5):
    """Run the LSTM experiment and collect the per-repeat results in a DataFrame.

    The number of features is taken from the notebook-level ``feature_extracted_df``.

    Parameters
    ----------
    n_timesteps : int
        Number of lagged steps used as model input.
    n_test : int
        Number of trailing rows held out for walk-forward validation.
    data : array-like
        Series passed through to ``experiment_lstm``.
    n_epochs : int
        Epochs for the initial fit of each repeat.
    neurons : int
        Number of LSTM units.
    repeats : int, default 10
        Number of independent fit/evaluate runs (10 is recommended).
    updates : int, default 5
        Epochs of additional training per walk-forward step (5 or more is recommended).

    Returns
    -------
    pd.DataFrame
        One row per repeat with columns ``'Results'`` (the error score) and
        ``'prediction_list'`` (that repeat's predictions).
    """
    n_features = len(feature_extracted_df.columns) # 11
    error_scores, prediction_list = experiment_lstm(
        repeats,
        data,
        updates,
        n_timesteps,
        n_features,
        n_test,
        n_epochs,
        neurons
    )
    results = pd.DataFrame()
    results['Results'], results['prediction_list'] = error_scores, prediction_list
    return results

KeyboardInterrupt: 

In [66]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

# Random Forest
def forecast_random_forest(model, X_test):
    """Return the first element of ``model.predict(X_test)``."""
    return model.predict(X_test)[0]

def fit_random_forest(X_train, y_train, n_estimators=500):
    """Fit a random forest whose target is min-max scaled by a meta-estimator.

    The forest (all CPU cores, ``random_state=1``) is wrapped in a
    ``TransformedTargetRegressor`` with a ``MinMaxScaler`` transformer, so the target
    scaling is handled by the wrapper rather than by the caller.

    Returns the fitted wrapper.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1, random_state=1)
    scaled_target_forest = TransformedTargetRegressor(regressor=forest, transformer=MinMaxScaler())
    scaled_target_forest.fit(X_train, y_train)
    return scaled_target_forest

def experiment_random_forest(data, n_timesteps, n_features, n_test, n_estimators=500):
    """Evaluate a random forest with walk-forward validation.

    Relies on two notebook-level globals in addition to its arguments:
    ``feature_extracted_df`` (column names and the true, unprocessed target values)
    and ``raw_values`` (the undifferenced levels used to invert the differencing).

    Parameters
    ----------
    data : array-like
        Series to reframe into a supervised table.
        NOTE(review): presumably the differenced values (``diff_values``), since the
        predictions are passed through ``inverse_difference`` -- confirm against callers.
    n_timesteps : int
        Number of lagged steps used as model input.
    n_features : int
        Number of features per step.
    n_test : int
        Number of trailing rows held out and predicted one at a time.
    n_estimators : int
        Number of trees in each fitted forest.

    Returns
    -------
    tuple
        ``(MAE, ori_y_test_values, predictions)``.
    """
    supervised_values = timeseries_to_supervised(data, feature_extracted_df.columns, n_timesteps)
    X_train, X_test, y_train, y_test = train_test_split(supervised_values, n_timesteps, n_features, n_test)
    # Apply MinMaxScaler to X data. Scaling for y data will be handled by meta-estimator TransformedTargetRegressor
    X_scaler, X_train, X_test = scale(X_train, X_test)
    # Growing training history. np.concatenate below always builds new arrays, so the
    # original X_train / y_train are never modified and no defensive copy is needed.
    X_history, y_history = X_train, y_train
    # Ground truth for scoring: the last n_test values of the last column of the
    # unprocessed feature frame (no differencing, no scaling)
    ori_y_test_values = feature_extracted_df.values[-n_test:, -1]
    predictions = list()
    # Walk-Forward Validation: refit from scratch on all history before every prediction
    for i in range(n_test):
        model = fit_random_forest(X_history, y_history, n_estimators)
        sample = X_test[i].reshape(1, X_test.shape[1])
        y_pred = forecast_random_forest(model, sample)
        # Add the prediction to the raw value n_test+1-i positions from the end of the
        # raw target column to turn the predicted change back into a level
        y_pred = inverse_difference(raw_values[:, -1], y_pred, n_test+1-i)
        predictions.append(y_pred)
        # np.concatenate (not the bare `concatenate` name, which is only imported in the
        # LSTM cell) so this cell does not depend on another model's cell having run
        X_history = np.concatenate((X_history, sample))
        y_history = np.concatenate((y_history, y_test[i].reshape(1)))

    MAE = mean_absolute_error(ori_y_test_values, predictions)
    return MAE, ori_y_test_values, predictions

def run_random_forest(n_timesteps, n_test, data, n_estimators):
    """Run the random-forest experiment, print its MAE and return the outcome.

    The number of features is taken from the notebook-level ``feature_extracted_df``.

    Returns ``(error, original_values, predictions)`` exactly as produced by
    ``experiment_random_forest``.
    """
    feature_count = len(feature_extracted_df.columns) # 11
    outcome = experiment_random_forest(
        data,
        n_timesteps,
        feature_count,
        n_test,
        n_estimators=n_estimators,
    )
    error, original_values, predictions = outcome
    print(f'MAE {error}')
    return error, original_values, predictions

In [None]:
# Hyper-parameter grid: every look-back window is combined with every forest size
timesteps = [5, 10, 20, 50, 100]
n_estimators = [100, 200, 500, 1000, 2500, 5000]

# Results keyed by 't-<timesteps>/estimators-<n_estimators>'
random_forest_error = {}
random_forest_predictions = {}

grid = [(t, n) for t in timesteps for n in n_estimators]
for t, n in grid:
    run_key = f't-{t}/estimators-{n}'
    print(f'Current timestep = {t}\ncurrent n estimators used = {n}')
    # The last 30 rows are held out for walk-forward validation
    error, _, predictions = run_random_forest(n_timesteps=t, n_test=30, data=diff_values, n_estimators=n)
    random_forest_error[run_key] = error
    random_forest_predictions[run_key] = predictions
    print('\n\n\n')

Current timestep = 5
current n estimators used = 100
y_pred 0 = 43464.36467653192, expected = 47641.53689948156
y_pred 1 = 47859.87779751698, expected = 48617.28932090135
y_pred 2 = 48287.982828630214, expected = 50429.44901579067
y_pred 3 = 51194.15749519829, expected = 49398.2503985708
y_pred 4 = 49778.79971597322, expected = 47717.64330027977
y_pred 5 = 47765.293189726806, expected = 48371.54353797001
y_pred 6 = 48655.31335923888, expected = 50218.21680438112
y_pred 7 = 50547.89951688584, expected = 50798.35421240729
y_pred 8 = 50918.922743895375, expected = 53980.847261679024
y_pred 9 = 54229.36695779942, expected = 55375.36624125794
y_pred 10 = 55659.82943849309, expected = 56272.596972013926
y_pred 11 = 56328.51932007179, expected = 56826.56003304944
y_pred 12 = 57119.889798857315, expected = 58802.601567706326
y_pred 13 = 59311.14459233649, expected = 60455.84483148646
y_pred 14 = 60175.18637937987, expected = 57507.73450910979
y_pred 15 = 57355.27747542304, expected = 55339.804

y_pred 11 = 56286.48953598755, expected = 56826.56003304944
y_pred 12 = 57176.67589366843, expected = 58802.601567706326
y_pred 13 = 59344.822159243755, expected = 60455.84483148646
y_pred 14 = 60443.65335205834, expected = 57507.73450910979
y_pred 15 = 57371.820135269336, expected = 55339.80479183194
y_pred 16 = 55695.00582732066, expected = 56158.0173198603
y_pred 17 = 57371.21064624016, expected = 58472.11732155986
y_pred 18 = 59230.922183489165, expected = 58283.54799432517
y_pred 19 = 57954.23840416846, expected = 58779.76938655648
y_pred 20 = 58778.388914034906, expected = 57312.59853645818
y_pred 21 = 58001.30179911546, expected = 56787.14842211064
y_pred 22 = 56497.7789658702, expected = 54703.38774047086
y_pred 23 = 54997.81310739058, expected = 55123.99385491758
y_pred 24 = 55405.055525153475, expected = 52121.68411029948
y_pred 25 = 52432.182283196795, expected = 53241.03748850518
y_pred 26 = 53445.37291267015, expected = 55193.35725955907
y_pred 27 = 56240.086799543875, exp

y_pred 23 = 54263.693777556204, expected = 55123.99385491758
y_pred 24 = 55665.01342402055, expected = 52121.68411029948
y_pred 25 = 52334.977453848674, expected = 53241.03748850518
y_pred 26 = 53126.568368291635, expected = 55193.35725955907
y_pred 27 = 56043.17442865743, expected = 55832.958823907036
y_pred 28 = 55744.017031269876, expected = 56913.99381926178
y_pred 29 = 57223.7537784158, expected = 58346.912268295404
MAE 1419.857722555552




Current timestep = 10
current n estimators used = 1000
y_pred 0 = 43839.59900256177, expected = 47641.53689948156
y_pred 1 = 47047.04253774486, expected = 48617.28932090135
y_pred 2 = 49598.80367010381, expected = 50429.44901579067
y_pred 3 = 50304.103778138146, expected = 49398.2503985708
y_pred 4 = 50051.85951775318, expected = 47717.64330027977
y_pred 5 = 47806.87101526508, expected = 48371.54353797001
y_pred 6 = 48133.821785465945, expected = 50218.21680438112
y_pred 7 = 50476.29872691631, expected = 50798.35421240729
y_pred 8 = 50591.5777

In [None]:
# random_forest_error maps each run key to a single scalar MAE. A DataFrame built from a
# dict of scalars needs an explicit index (pandas raises ValueError otherwise), so the
# errors are stored as one row labelled 'MAE' with one column per run.
rf_error = pd.DataFrame(random_forest_error, index=['MAE'])
rf_error.to_csv('rf_error.csv')
# random_forest_predictions maps each run key to a list of predictions: one column per run,
# one row per test step.
rf_predictions = pd.DataFrame(random_forest_predictions)
rf_predictions.to_csv('rf_predictions.csv')