In [1]:
# In this notebook I include all relevant functions used in this model
# The goal is to predict the upward/downward movement of the Ethereum cryptocurrency on a 3-minute time horizon
# The use of each function is described thoroughly throughout the notebook
# I use Gemini exchange data available at: https://www.cryptodatadownload.com/data/gemini/

In [2]:
import os
import random

import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [3]:
# The datasets used for this model can be found in the /crypto_data/Gemini folder
# Each dataset includes minute-by-minute open, high, low, close prices of a particular crypto - BTC, ETH, LTC, ZEC 
# Datasets are loaded into pandas dataframes for further manipulation

# The below function creates a combined dataset of both ETH and BTC price data from a given year (YEAR)

def load_gemini_dateset(YEAR):
    """Build one minute-level frame holding both ETH and BTC prices for YEAR.

    Reads ``crypto_data/Gemini/BTC-USD_{YEAR}.csv`` and
    ``crypto_data/Gemini/ETH-USD_{YEAR}.csv`` (no header row is expected; the
    column names are supplied here), puts both in reverse file order, and
    left-joins the BTC columns onto the ETH rows by timestamp.

    Parameters
    ----------
    YEAR : int or str
        Year suffix used in the CSV file names.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``eth_open`` ... ``eth_volume``, ``btc_open`` ...
        ``btc_volume``. Missing values are forward-filled and the first merged
        row is dropped, so the index starts at 1.
    """

    def unix_to_seconds(time):
        # The files mix second and millisecond timestamps: anything above
        # 1_900_000_000 is treated as milliseconds and reduced to seconds.
        if time > 1900000000:
            return time // 1000
        return time

    def load_symbol(symbol):
        # Load one symbol's CSV, drop the unused columns, reverse the row
        # order and normalise the timestamps to seconds.
        prefix = symbol.lower()
        frame = pd.read_csv(
            f"crypto_data/Gemini/{symbol}-USD_{YEAR}.csv",
            names=[
                "time", "date", "symbol",
                f"{prefix}_open", f"{prefix}_high", f"{prefix}_low",
                f"{prefix}_close", f"{prefix}_volume",
            ],
        )
        frame = frame.drop(columns=["date", "symbol"]).iloc[::-1].reset_index(drop=True)
        frame["time"] = frame["time"].apply(unix_to_seconds)
        return frame

    btc_usd = load_symbol("BTC")
    eth_usd = load_symbol("ETH")

    # ETH is the left side of the join, so every ETH minute is kept and BTC
    # minutes without an ETH counterpart are discarded.
    df = pd.merge(eth_usd, btc_usd, how='left', on='time')

    # Gaps are filled with the value preceding them. `DataFrame.ffill()`
    # replaces the deprecated `fillna(method='ffill')` spelling.
    df = df.ffill()

    # Drop the first merged row.
    df = df.iloc[1:]

    return df

In [6]:
# A snapshot of the dataframe data is shown below:
# the combined ETH/BTC frame for one year is loaded and ten rows
# (positions 5300-5309) are displayed as a quick sanity check.

YEAR = 2018  # year suffix of the CSV files that get loaded

df = load_gemini_dateset(YEAR)
df.iloc[5300:5310]

Unnamed: 0,time,eth_open,eth_high,eth_low,eth_close,eth_volume,btc_open,btc_high,btc_low,btc_close,btc_volume
5301,1515082860,981.51,981.81,981.5,981.54,138.218681,14514.5,14521.0,14514.49,14515.81,5.469575
5302,1515082920,981.54,981.83,981.0,981.0,76.948456,14515.81,14521.0,14514.49,14514.68,4.081804
5303,1515082980,981.0,981.5,981.0,981.49,76.140935,14514.68,14530.49,14514.49,14520.98,27.435541
5304,1515083040,981.49,981.49,981.0,981.49,236.583591,14520.98,14520.98,14515.01,14515.01,0.454036
5305,1515083100,981.49,983.0,981.49,983.0,0.660766,14515.01,14520.98,14515.01,14520.98,1.569043
5306,1515083160,983.0,983.57,983.0,983.56,14.883745,14520.98,14558.96,14520.98,14534.55,5.380189
5307,1515083220,983.56,984.45,983.56,984.29,4.482341,14534.55,14551.75,14534.55,14536.59,0.152562
5308,1515083280,984.29,984.29,984.24,984.25,32.180429,14536.59,14536.59,14530.49,14536.59,2.374266
5309,1515083340,984.25,984.25,984.24,984.24,13.553575,14536.59,14559.84,14534.93,14546.45,1.582403
5310,1515083400,984.24,984.24,984.01,984.01,29.56784,14546.45,14546.47,14545.01,14545.01,4.083873


In [None]:
import numpy as np
from sklearn.preprocessing import scale

# The function below classifies each minute as an upward (1) or downward (0) ETH movement
# We specify how far in advance we want to predict - in our case this is 3 minutes
# Next, we specify how many minutes back the model is supposed to look - this will be 45 minutes
# The TARGET variable specifies the feature we are trying to predict - e.g. eth_close

def preprocessing(df, TIME_BACK, TIME_IN_ADVANCE, TARGET, YEAR):
    """Attach up/down labels to every minute and scale the feature columns.

    Three columns are added to ``df`` (the caller's frame is modified in
    place by these additions):

    * ``future_{TARGET}`` - value of TARGET ``TIME_IN_ADVANCE`` rows ahead,
      or NaN when the row could not be labelled,
    * ``future_change``   - ``future_{TARGET}`` divided by the ``*_open``
      column whose prefix is the first three characters of TARGET,
    * ``target``          - result of ``classify`` applied to ``future_change``.

    The frame is then passed through ``scale_feature_variables`` and returned
    with a fresh 0-based index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column in seconds plus the price columns.
    TIME_BACK : int
        How many minutes into the past the model is looking.
    TIME_IN_ADVANCE : int
        How many minutes into the future we are trying to predict.
    TARGET : str
        Column we are trying to predict, e.g. ``"eth_close"``.
    YEAR : int
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The labelled and scaled frame.
    """
    future_column = f"future_{TARGET}"
    total_rows = len(df)

    time_column = df['time']
    time_values = time_column.to_numpy()
    target_values = df[TARGET].to_numpy()

    # Labels are collected by row POSITION. The original code wrote them with
    # `df.at[n, ...]`, which addresses rows by index LABEL while every read
    # used `.iloc[n]`; on a frame whose index does not start at 0 the label
    # ended up on a neighbouring row.
    future_values = np.full(total_rows, np.nan)

    # Starting offset kept from the original implementation, whose author
    # attributes the extra 2 to the later preprocessing steps.
    n = TIME_BACK + 2

    while n + TIME_IN_ADVANCE < total_rows:

        # The row TIME_IN_ADVANCE positions ahead must really be that many
        # minutes later (timestamps are in seconds)
        if time_values[n + TIME_IN_ADVANCE] - TIME_IN_ADVANCE * 60 == time_values[n]:

            # All consecutive gaps in the look-back window must be identical,
            # i.e. exactly one distinct non-NaN difference
            if time_column.iloc[n - TIME_BACK : n].diff().nunique() == 1:
                future_values[n] = target_values[n + TIME_IN_ADVANCE]

        n += 1

        print(f"{n} out of {total_rows} rows", end='\r')

    print("")

    df[future_column] = future_values

    # Relative change from the present open price, then the 1 (up) / 0 (down) label
    df['future_change'] = df[future_column] / df[TARGET[:3] + '_open']
    df['target'] = df["future_change"].apply(classify)

    # Scaling and preprocessing feature variables
    df = scale_feature_variables(df, TARGET)
    df = df.reset_index(drop=True)

    return df

# Function used to fill the target column in the dataframe

def classify(future_change):
    """Return 1 when the relative change is above 1 (price went up), else 0.

    Anything that does not compare greater than 1 - a ratio of exactly 1,
    a smaller ratio, or NaN - yields 0.
    """
    return 1 if future_change > 1 else 0
    

# Each feature is scaled to improve the learning process of the network
    
    
def scale_feature_variables(df, TARGET):
    """Turn every feature column into scaled percentage changes.

    Works on a copy of ``df``. The columns ``time``, ``future_{TARGET}``,
    ``future_change`` and ``target`` are left as they are; every other column
    is replaced by its row-to-row percentage change, with positive infinity
    mapped to 1, and then passed through ``scale``.

    The first row is dropped from the returned frame.
    """
    untouched = ("time", f"future_{TARGET}", "future_change", "target")
    scaled_df = df.copy()

    feature_names = [name for name in scaled_df.columns if name not in untouched]
    for name in feature_names:
        relative_change = scaled_df[name].pct_change()
        relative_change = relative_change.replace([np.inf], 1)
        scaled_df[name] = scale(list(relative_change))

    return scaled_df.iloc[1:]

In [None]:
# Function that transforms the preprocessed dataframe into a 3-D feature tensor X of shape [no_samples, TIME_BACK, no_features]
# It also returns 1-D label tensor y

def prepare_features(df, TARGET, TIME_BACK, YEAR):
    """Cut the preprocessed frame into look-back windows and matching labels.

    For every row position ``i >= TIME_BACK`` one sample is produced from the
    ``TIME_BACK`` rows preceding ``i`` (row ``i`` itself is not part of the
    window). Feature columns are selected positionally as ``1:-3``, i.e.
    everything except the first column and the last three columns.

    A row is skipped when its ``future_change`` is NaN or exactly 1.0, or when
    its window contains any missing feature value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``future_change`` and ``target`` columns.
    TARGET : str
        Not used by this function; kept so existing calls keep working.
    TIME_BACK : int
        Number of rows in each look-back window.
    YEAR : int
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    (list of numpy.ndarray, list)
        ``X`` holds one ``[TIME_BACK, no_features]`` array per sample and
        ``y`` the corresponding ``target`` values.
    """
    X = []
    y = []

    # Pulled out of the loop once: slicing the DataFrame for every row is the
    # dominant cost on a year of minute data.
    total_rows = len(df)
    future_change = df['future_change'].to_numpy()
    targets = df['target'].to_numpy()
    features = df.iloc[:, 1:-3].to_numpy()

    # Iterating over every minute of the dataset
    for i in range(TIME_BACK, total_rows):

        # Check if the future value exists and represents an actual move
        change = future_change[i]
        if np.isnan(change) or change == 1.0:
            continue

        # Check if all feature variables are present
        window = features[i - TIME_BACK:i]
        if pd.isnull(window).any():
            continue

        # Copy so that each sample owns its data instead of viewing `features`
        X.append(window.copy())
        y.append(targets[i])

        print(f"{i} out of {total_rows} rows", end='\r')
    print("")

    return X, y

In [None]:
# The function below splits the data into train and test batches

def split_train_test_data(X, y, split_point=75000):
    """Split samples into a shuffled training part and an ordered test part.

    The first ``split_point`` samples become the training set, the rest the
    test set. Only the training samples are shuffled (with the ``random``
    module, so ``random.seed`` controls the order); the test samples keep
    their original order.

    Parameters
    ----------
    X : sequence of numpy.ndarray
        Samples of identical shape (2-D ``[TIME_BACK, no_features]`` windows
        in this notebook).
    y : sequence
        One label per sample.
    split_point : int, optional
        Number of leading samples used for training.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        Feature tensors of shape ``[no_samples, *sample_shape]`` and integer
        label vectors.

    Raises
    ------
    ValueError
        If either side of the split would be empty.
    """
    X_train, X_test = X[:split_point], X[split_point:]
    y_train, y_test = y[:split_point], y[split_point:]

    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            f"split_point={split_point} leaves an empty train or test set "
            f"for {len(X)} samples"
        )

    # Shuffle sample positions rather than (array, label) pairs: packing such
    # pairs into one NumPy array is a ragged construction that recent NumPy
    # versions reject. Shuffling a list of the same length consumes the RNG
    # identically, so the resulting order matches the pair-based shuffle.
    order = list(range(len(X_train)))
    random.shuffle(order)

    # Stack the samples along a new leading axis to fit the network input
    # and make sure y labels are of type int
    X_train_out = np.stack([X_train[i] for i in order])
    y_train_out = np.array([y_train[i] for i in order]).astype(int)
    X_test_out = np.stack(list(X_test))
    y_test_out = np.array(y_test).astype(int)

    return X_train_out, X_test_out, y_train_out, y_test_out

In [None]:
# In order for the training process to be effective, and not end up at a sub-optimal minimum,
# the training data need to be balanced 50-50 between upward (1) and downward (0) classes
# The below function balances the training sample

def balance_the_train_batch(X_train, y_train):
    """Down-sample the majority class so both classes are equally frequent.

    Samples labelled 1 and samples labelled 0 are collected separately, the
    larger group is truncated to the size of the smaller one (keeping its
    first occurrences), and the combined set is shuffled with the ``random``
    module. Samples with any other label are dropped.

    Parameters
    ----------
    X_train : sequence of numpy.ndarray or numpy.ndarray
        Training samples, indexable by position.
    y_train : sequence of int
        Label (0 or 1) for each sample.

    Returns
    -------
    X_train, y_train : numpy.ndarray
        Balanced feature tensor of shape ``[no_samples, *sample_shape]`` and
        the matching integer label vector.

    Raises
    ------
    ValueError
        If one of the two classes has no samples, so no balanced batch exists.
    """
    labels = np.asarray(y_train)

    # Dividing sample positions into the ones associated with different classes
    positive_idx = np.flatnonzero(labels == 1)
    negative_idx = np.flatnonzero(labels == 0)

    # Limiting the number of majority class instances to the number of minority occurrences
    per_class = min(len(positive_idx), len(negative_idx))
    if per_class == 0:
        raise ValueError("cannot balance: at least one sample of each class (0 and 1) is required")

    # Positives first, negatives second, then shuffled. Positions are shuffled
    # instead of (array, label) pairs because packing such pairs into a single
    # NumPy array is a ragged construction that recent NumPy versions reject.
    selected = list(positive_idx[:per_class]) + list(negative_idx[:per_class])
    random.shuffle(selected)

    # Changing the output shape to fit the network
    X_balanced = np.stack([X_train[i] for i in selected])
    y_balanced = labels[selected].astype(int)

    return X_balanced, y_balanced


In [None]:
# Create train and test data using function specified above
# Pipeline: load raw prices -> label and scale -> cut into look-back windows
# -> split into train/test -> balance the two classes in the training part

YEAR = 2018              # year of the Gemini CSV files to load
TARGET = "eth_close"     # column whose future movement is predicted
TIME_BACK = 45           # number of past minutes in every sample window
TIME_IN_ADVANCE = 3      # prediction horizon in minutes

df = load_gemini_dateset(YEAR)
df = preprocessing(df, TIME_BACK, TIME_IN_ADVANCE, TARGET, YEAR)
X, y = prepare_features(df, TARGET, TIME_BACK, YEAR)
# The first 75000 samples are used for training, the remainder for testing
X_train, X_test, y_train, y_test = split_train_test_data(X, y, split_point=75000)
# Only the training part is balanced; the test part keeps its class distribution
X_train, y_train = balance_the_train_batch(X_train, y_train)



In [None]:
# Below I implement the TensorFlow library to create a network architecture that will suit the problem

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint
import time

# Unique run name: used for the TensorBoard log folder and the checkpoint folder
NAME = f"Gemini-3-LSTM-3-Dense-TIMEBACK-{TIME_BACK}-TARGET-{TARGET}-{time.time()}"
BATCH_SIZE = 32
EPOCHS = 10   # Low epoch number is caused by a huge number of samples given in each epoch

# Below the architecture of the neural network is specified
# I mostly test a mixture of LSTM and Dense layers with using Dropout regularization

model = Sequential()

# Three stacked LSTM blocks; the first two return full sequences so the next
# LSTM layer receives one vector per time step
model.add(LSTM(128, input_shape=(X_train.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  # normalizes activation outputs, same reason you want to normalize your input data

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Dense head ending in a 2-way softmax: index 0 = downward, index 1 = upward
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

# Adam with the learning rate decayed as 0.001 / (1 + 1e-6 * step).
# This is the same rule the former `decay=1e-6` argument applied, written as
# an explicit schedule because `lr=` and `decay=` are deprecated/rejected by
# newer Keras optimizers.
learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=1,
    decay_rate=1e-6,
)
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

# I record the learning process using the TensorBoard library
# Checkpoints are written to the /models folder

# os.path.join replaces the hand-written "logs\{NAME}" style paths, whose
# "\{" is an invalid escape sequence and which only formed a folder path on Windows
tensorboard = TensorBoard(log_dir=os.path.join("logs", NAME))

# unique file name that will include the epoch and the validation acc for that epoch
filepath = os.path.join(NAME, "{epoch:02d}-{val_accuracy:.3f}")

# monitor / verbose / save_best_only / mode are now passed to ModelCheckpoint.
# Previously a misplaced parenthesis handed them to str.format(), where they
# were silently ignored, so the callback ran with its defaults.
# NOTE(review): some newer Keras releases only accept `.keras` / `.h5` file
# names here - confirm against the installed version before changing the suffix,
# because the evaluation cell looks for files ending in ".model".
checkpoint = ModelCheckpoint(
    os.path.join("models", filepath + ".model"),
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,  # saves only the best ones
    mode='max',
)



In [None]:
# Train model
# The test split is passed as validation data, so the `val_*` metrics reported
# after each epoch (and used in the checkpoint file name) are computed on it.
history = model.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    callbacks=[tensorboard, checkpoint],  # TensorBoard logging and model checkpointing
)

In [None]:
import os
from sklearn.metrics import confusion_matrix, accuracy_score

# Choose the model you want to evaluate, and print the performance of the model after each epoch

MODEL_NAME = "Gemini-4-LSTM-2-Dense-TIMEBACK-45-TARGET-eth_close-1581412893.36031"

# Folder holding the checkpoints of the chosen run
model_dir = os.path.join("models", MODEL_NAME)

# Sorted so checkpoints are reported in file-name order (the names start with
# the zero-padded epoch number); os.listdir alone returns an arbitrary order
for filename in sorted(os.listdir(model_dir)):

    if not filename.endswith(".model"):
        continue

    # Get probability distributions returned by the network and change them to class predictions
    # (class 0 only when its probability is strictly larger, otherwise class 1)

    model = tf.keras.models.load_model(os.path.join(model_dir, filename))
    distributions = model.predict(X_test)
    predictions = [0 if dist[0] > dist[1] else 1 for dist in distributions]

    # Print the loss, accuracy and confusion matrix of the model at each epoch

    score = model.evaluate(X_test, y_test, verbose=0)
    print(filename)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    print(confusion_matrix(y_test, predictions), "\n")