In [1]:
import pandas as pd
from collections import deque
import random
import numpy as np
from sklearn import preprocessing
import os

In [2]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "LTC-USD"

In [3]:
def classify(current, future):
    """Function takes in the current and future price of a crypto-currency or stock.
    Returns 1 of price increased in the future and 0 if price decrease."""
    if float(future) > float(current):
        return 1
    else:
        return 0

In [4]:
def preprocess_df(df):
    # Droppig label column
    df = df.drop("future", axis=1)

    for col in df.columns:
        if col != "target": # Normalize all columns except target (label)
            # Percent change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df[col] = df[col].pct_change()
            # Removing NaN values created by pct_change
            df.dropna(inplace=True)
            # Scale values to be between 0-1
            df[col] = preprocessing.scale(df[col].values)
    
    df.dropna(inplace=True) # Dropping NaN just in case!
    sequential_data = [] # List to hold the sequences
    prev_days = deque(maxlen=SEQ_LEN) # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:
        # Store all columns except target
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            # Append feature (a sequence of prices) and target (label 0 = Fall, 1 = Rise)
            sequential_data.append([np.array(prev_days), i[-1]])
    
    # Shuffling data
    random.shuffle(sequential_data)

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
    
    # Shuffling data
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))

    buys = buys[:lower]
    sells = sells[:lower]

    sequential_data = buys + sells
    random.shuffle(sequential_data)

    X = []
    y = []

    for seq, target in sequential_data:
        X.append(seq)     # X is the sequence of prices (feature data)
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y
#######################################FUNCTION END#######################################

In [5]:
DIRECTORY = "Data/crypto"

# Empty DataFrame
main_df = pd.DataFrame()
for i, file in enumerate(os.listdir(DIRECTORY)):
    ratio = file.split(".")[0]
    print("File", i+1, "|", file, "| Processing...")
    dataset = os.path.join(DIRECTORY, file)
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    
    # Rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)
    
    # Set time as index so we can join them on this shared time
    df.set_index("time", inplace=True)
    
    # Ignore the other columns besides price and volume
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]
    
    if len(main_df)==0:
        main_df = df
    else:
        main_df = main_df.join(df)
print("Process complete!")

File 1 | BCH-USD.csv | Processing...
File 2 | BTC-USD.csv | Processing...
File 3 | ETH-USD.csv | Processing...
File 4 | LTC-USD.csv | Processing...
Process complete!


In [6]:
# Forward filling any missing data
main_df.fillna(method="ffill", inplace=True)
main_df.dropna(inplace=True)

main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

main_df.dropna(inplace=True)

# Get the timestamps
times = sorted(main_df.index.values)
# Get the last 5% of the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]

# Make the validation data where the index is in the last 5%
validation_main_df = main_df[(main_df.index >= last_5pct)]
# Now the main_df is all the data up to the last 5%
main_df = main_df[(main_df.index < last_5pct)]

In [7]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 75002 validation: 3812
Dont buys: 37501, buys: 37501
VALIDATION Dont buys: 1906, buys: 1906


# Building LSTM Model

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint
import time

In [9]:
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [13]:
model = Sequential()

# Input layer
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Hidden layer 1
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Hidden layer 2
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Hidden layer 3
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# Output layer
model.add(Dense(2, activation='softmax'))


# Optimizer
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

# Tensorboard callback
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

# For finding and saving best model
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

In [14]:
# Train model
history = model.fit(
    train_x, np.array(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, np.array(validation_y)),
    callbacks=[tensorboard, checkpoint],
)

Train on 75002 samples, validate on 3812 samples


NotFoundError: Failed to create a directory: logs/60-SEQ-3-PRED-1574234463\train; No such file or directory [Op:CreateSummaryFileWriter]

In [None]:
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
if input("Save current model? (Y/N) ") == "Y":
    model.save("models/{}".format(NAME))
    print("Save successfully!")