In [1]:
import pandas as pd
import random 
from sklearn import preprocessing
from collections import deque
import numpy as np
import time

df = pd.read_csv("crypto_data/LTC-USD.csv",names =["time","low","high","open","close","volume"])

print(df.head())

         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978


In [50]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future trying to predict?
RATIO_TO_PREDICT = "BTC-USD"
EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # how many batches?
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [3]:
# to create target
def classify(current,future):
    if float(future)>float(current):
        return 1   # buy the stock
    
    else :
        return 0   # sell the stock
        

In [4]:
# funtion to process the training and validation data
# normalization  
#scaling  
#balancing the data
# creating SEQ_LEN long sequences

def preprocess_df(df):
    df = df.drop("future",1)    # value to be predicted so dropped
    
    for col in df.columns:
        if col != "target" :
            df[col]=df[col].pct_change()   # normalize the column
            df.dropna(inplace = True)
            df[col] = preprocessing.scale(df[col].values)   # scale the column values
    
    df.dropna(inplace = True)
    
    # creating sequences for model    
    sequential_data =[]
    prev_days = deque(maxlen=SEQ_LEN)
    
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days),i[-1]])
    
    
    random.shuffle(sequential_data)
    
    buys = []
    sells = []
    
    for seq , target in sequential_data:
        if target == 0:
            sells.append([seq,target])
        elif target == 1:
                    buys.append([seq,target])
    
    random.shuffle(buys)
    random.shuffle(sells)
    
    # balancing the data
    lower = min(len(buys),len(sells))
    
    buys = buys[:lower]
    sells = sells[:lower]
    
    sequential_data = buys +sells
    random.shuffle(sequential_data)
    
    X = []
    y = []
    
    for seq , target in sequential_data:
        X.append(seq)        # sequences  
        y.append(target)     # labels ie buy or sell
        
    return np.array(X), y

In [51]:
# declare an empty data frame
main_df = pd.DataFrame()

ratios = ["BTC-USD","LTC-USD","BCH-USD","ETH-USD"]

for ratio in ratios:
    print(ratio)
    dataset = f'crypto_data/{ratio}.csv'
    
    df = pd.read_csv(dataset,names =["time","low","high","open","close","volume"])
    df.rename(columns={"close":f"{ratio}_close","volume":f"{ratio}_volume"},inplace = True)
    
    df.set_index("time",inplace=True)               # common column for all 4 database
    df = df[[f"{ratio}_close",f"{ratio}_volume"]]   # ignore other columns
    
    if len(main_df)==0 :
        main_df = df
    else : 
        main_df = main_df.join(df)
    
main_df.fillna(method="ffill",inplace=True)
main_df.dropna(inplace=True)
print(main_df.head())

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000    

In [52]:
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify,main_df[f'{RATIO_TO_PREDICT}_close'],main_df['future']))

In [53]:
print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time                                                                       
1528968720     870.859985       26.856577      486.01001       26.019083   
1528968780     870.099976        1.124300      486.00000        8.449400   
1528968840     870.789978        1.749862      485.75000       26.994646   
1528968900     870.000000        1.680500      486.00000       77.355759   
1528968960 

In [54]:
# create a validation set 
times = sorted(main_df.index.values)  # get the times
last_5pct = times[-int(0.05*len(times))]
print(last_5pct)

1534922100


In [55]:
validation_main_df = main_df[(main_df.index >= last_5pct)]  # validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%

In [56]:
train_x ,train_y = preprocess_df(main_df)
validation_x ,validation_y = preprocess_df(validation_main_df)

In [57]:
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

train data: 83156 validation: 4472
Dont buys: 41578, buys: 41578
VALIDATION Dont buys: 2236, buys: 2236


In [29]:
# model 

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

In [44]:
model = Sequential()

model.add(CuDNNLSTM(128,input_shape=(train_x.shape[1:]),return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128,return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128))           # CuDNNLSTM uses tanh as activation
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2,activation='softmax'))


# optimizer
opt = tf.keras.optimizers.Adam(lr=0.001,decay=1e-6)

# compile the model
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = opt,
    metrics = ['accuracy']
)


In [58]:
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

In [59]:
filepath = "RNN_Final-{epoch:02d}--{val_acc:.3f}"  # file name include epoch and validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, 
                                                      save_best_only=True, mode='max'))       # saves only the best ones

In [60]:
# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint]
)

Train on 83156 samples, validate on 4472 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [61]:
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.8182466729595751
Test accuracy: 0.5357781752064318


In [62]:
model.save("models/{}".format(NAME))

### Epoch Accuracy

<img src ="epoch_acc.jpg">

### Epoch Loss

<img src = "epoch_loss.jpg">

### Epoch Validation Accuracy

<img src ="epoch_val_acc.jpg">

### Epoch Validation Loss

<img src ="epoch_val_loss.jpg">