<a href="https://colab.research.google.com/github/EhsanEs-hub/ML-practice-folder/blob/master/Cryptocurrency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import os
import random
import time
from collections import deque

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive, files
from keras.callbacks import  TensorBoard, ModelCheckpoint
from keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from keras.models import Sequential
from sklearn import preprocessing

Using TensorFlow backend.


In [None]:
# Upload the price CSVs through google.colab's `files.upload()`; later cells read
# BTCtoUSD.csv, BCHtoUSD.csv, ETHtoUSD.csv and LTC-USD.csv by relative path.
uploaded = files.upload()

Saving BCHtoUSD.csv to BCHtoUSD.csv
Saving BTCtoUSD.csv to BTCtoUSD.csv
Saving ETHtoUSD.csv to ETHtoUSD.csv
Saving LTC-USD.csv to LTC-USD.csv


In [None]:
# Quick look at one of the uploaded files.
# The file has no header row, so the column names are supplied explicitly
# (the same names the merge cell uses); without `names=` pandas would consume
# the first day of data as the header.
df = pd.read_csv(
    'BTCtoUSD.csv',
    names=["time", "open", "high", "low", "close", "adjclose", "volume"],
)
print(df.head())
print(df)

   2019-06-27  13017.125000  ...  11182.806641.1  39977475222
0  2019-06-28  11162.167969  ...    12407.332031  35087757765
1  2019-06-29  12400.763672  ...    11959.371094  29923961127
2  2019-06-30  11931.991211  ...    10817.155273  27256473494
3  2019-07-01  10796.930664  ...    10583.134766  29378589324
4  2019-07-02  10588.683594  ...    10801.677734  31015895222

[5 rows x 7 columns]
     2019-06-27  13017.125000  ...  11182.806641.1  39977475222
0    2019-06-28  11162.167969  ...    12407.332031  35087757765
1    2019-06-29  12400.763672  ...    11959.371094  29923961127
2    2019-06-30  11931.991211  ...    10817.155273  27256473494
3    2019-07-01  10796.930664  ...    10583.134766  29378589324
4    2019-07-02  10588.683594  ...    10801.677734  31015895222
..          ...           ...  ...             ...          ...
361  2020-06-23   9644.076172  ...     9629.658203  17006433272
362  2020-06-24   9632.149414  ...     9313.610352  18961716075
363  2020-06-25   9314.126953 

In [None]:
# Run configuration: every tunable value used by the cells below lives here.
SEQ_LEN = 60  # number of consecutive preceding rows collected into each input sequence for the RNN
FUTURE_PERIOD_PREDICT = 3  # how many rows ahead the "future" close price is taken from
# File/column prefix of the ratio whose close price is being predicted.
RATIO_TO_PREDICT = "BTCtoUSD"
EPOCHS = 10  # number of passes over the training data (passed to model.fit)
BATCH_SIZE = 64  # passed to model.fit as batch_size; try a smaller value if you hit OOM (out of memory) errors
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # run name (timestamped) used for the log dir and the saved model

In [None]:
def classify(current, future):
    """Label a price move: 1 ("buy") if `future` is strictly above `current`, else 0.

    Both arguments are coerced with float(), so numeric strings are accepted.
    Equal prices, and any comparison involving NaN, yield 0.
    """
    return int(float(future) > float(current))

In [None]:
def preprocess_df(df):
    """Turn a merged price/volume frame into balanced, shuffled RNN samples.

    Steps:
      1. Drop the helper "future" column.
      2. Convert every feature column (everything except "target") to
         percentage change, discard rows that are NaN/inf afterwards, and
         standardize each feature with `preprocessing.scale`.
      3. Slide a window of SEQ_LEN consecutive rows over the frame; each full
         window is paired with the target of its last row.
      4. Balance the classes by truncating the larger one, then shuffle.

    Args:
        df: DataFrame containing the feature columns plus "future" and
            "target" (0/1) columns. The caller's frame is not modified.

    Returns:
        (X, y): X is a numpy array stacking the selected windows (each window
        is SEQ_LEN rows by number-of-feature-columns); y is a plain list of
        int labels in the same order. Both are empty when fewer than SEQ_LEN
        usable rows remain or one class is absent.

    Raises:
        ValueError: if no rows survive the NaN/inf clean-up.
    """
    # `columns=` instead of a positional axis: the positional form was removed in pandas 2.0.
    df = df.drop(columns="future")

    feature_cols = [col for col in df.columns if col != "target"]

    # pct_change "normalizes" the different currencies (each coin trades at a very
    # different price level; what matters is the relative movement).
    df[feature_cols] = df[feature_cols].pct_change()

    # Clean up ONCE, after all columns were converted. Dropping inside the
    # per-column loop would throw away one additional leading row per column.
    # A zero previous value makes pct_change emit +/-inf, which scale() rejects.
    df = df.replace([np.inf, -np.inf], np.nan).dropna().copy()

    if df.empty:
        raise ValueError("preprocess_df: no rows left after pct_change / NaN removal")

    # Standardize each feature to zero mean and unit variance.
    for col in feature_cols:
        df[col] = preprocessing.scale(df[col].values)

    # Build the sliding windows. The deque keeps at most SEQ_LEN rows, silently
    # discarding the oldest row whenever a new one is appended.
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)
    feature_rows = df[feature_cols].values
    targets = df["target"].values  # selected by name, not by column position

    for row, target in zip(feature_rows, targets):
        prev_days.append(list(row))
        if len(prev_days) == SEQ_LEN:  # only emit complete windows
            sequential_data.append([np.array(prev_days), int(target)])

    buys = [sample for sample in sequential_data if sample[1] == 1]
    sells = [sample for sample in sequential_data if sample[1] == 0]

    # Shuffle before truncating so the samples dropped from the larger class
    # are a random subset rather than simply the most recent ones.
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))
    balanced = buys[:lower] + sells[:lower]
    random.shuffle(balanced)  # mix the classes so batches are not all one label

    X = [seq for seq, _ in balanced]
    y = [target for _, target in balanced]

    return np.array(X), y  # y stays a list: callers use y.count(...)

    

In [None]:
# Merge the close price and volume of every ratio into one frame indexed by time.
main_df = pd.DataFrame()

ratios = ['BTCtoUSD', 'BCHtoUSD', 'ETHtoUSD', 'LTC-USD']

for ratio in ratios:
    ratio = ratio.split('.csv')[0]  # tolerate entries written with a ".csv" suffix
    dataset = f'{ratio}.csv'  # file name, relative to the working directory

    # `names=` labels the columns explicitly; the files are read as having no header row.
    df = pd.read_csv(dataset, names=["time", "open", "high", "low", "close", "adjclose", "volume"])

    # Prefix close/volume with the ticker so the columns stay distinguishable after
    # the join, index by time so the frames can be joined on it, and keep only
    # those two columns.
    df = (
        df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
          .set_index("time")
    )
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]

    if len(main_df) == 0:  # first ratio: nothing to join against yet
        main_df = df
    else:  # DataFrame.join defaults to a left join on the index
        main_df = main_df.join(df)

# Fill gaps with the previously known value, then drop rows that still have
# no value (gaps at the very start). `.ffill()` replaces the deprecated
# `fillna(method="ffill")`.
main_df = main_df.ffill().dropna()
print(main_df.head())


            BTCtoUSD_close  BTCtoUSD_volume  ...  LTC-USD_close  LTC-USD_volume
time                                         ...                               
2019-07-05    10978.459961      23838480210  ...     118.352509    4.110868e+09
2019-07-06    11208.550781      21092024306  ...     117.823624    3.484378e+09
2019-07-07    11450.846680      19369044276  ...     120.320877    3.348662e+09
2019-07-08    12285.958008      23482551458  ...     123.285484    3.803750e+09
2019-07-09    12573.812500      28167921522  ...     119.432816    3.785641e+09

[5 rows x 8 columns]


In [None]:
# Attach the look-ahead price ("future") and the 0/1 label ("target"), then drop
# the trailing rows whose future price is unknown (NaN after the shift).
main_df = (
    main_df
    .assign(future=lambda d: d[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT))
    .assign(target=lambda d: [
        classify(now, later)
        for now, later in zip(d[f'{RATIO_TO_PREDICT}_close'], d['future'])
    ])
    .dropna()
)

In [None]:
# Pick the timestamp at which the validation slice starts: the most recent 5% of rows.
times = sorted(main_df.index.values)
# max(1, ...) guards short frames: with fewer than 20 rows int(0.05 * len) is 0,
# and times[-0] is times[0], which would push every row into the validation split.
last_5pct = times[-max(1, int(0.05 * len(times)))]


In [None]:
# Out-of-sample split: rows at or after `last_5pct` become the validation frame,
# everything before stays in `main_df`. Both selections are evaluated before
# either name is rebound.
# NOTE: this cell rebinds `main_df` to the training slice, so running it a second
# time without rebuilding `main_df` leaves the validation frame empty.
validation_main_df, main_df = (
    main_df.loc[main_df.index >= last_5pct],
    main_df.loc[main_df.index < last_5pct],
)

In [None]:
# Build the (sequences, labels) pairs for the training frame and for the
# held-out validation frame; each frame is preprocessed independently.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

ValueError: ignored

In [None]:
# Sanity check: sample counts and class balance of both splits.
print(
    f"train data: {len(train_x)} validation: {len(validation_x)}",
    f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}",
    f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}",
    sep="\n",
)

NameError: ignored

Now for the model. I tried a few variations (2 vs. 3 recurrent layers, 64 vs. 128 units per layer) and found that the following configuration starts to work:

In [None]:
# Create the LSTM model: three stacked recurrent layers followed by a small
# dense head that outputs two class probabilities.
# NOTE(review): CuDNNLSTM is the cuDNN-backed LSTM variant and is expected to
# need a GPU runtime - confirm one is selected before running this cell.

model = Sequential()

# First recurrent layer. Only this layer needs `input_shape`: everything after
# the sample axis of the training array built by preprocess_df.
model.add(CuDNNLSTM(128, return_sequences=True, input_shape=train_x.shape[1:]))
model.add(Dropout(0.2))  # randomly zero 20% of the activations during training
# Normalizes activation outputs, for the same reason the input data is normalized.
# Layers must be passed as instances, hence the call parentheses.
model.add(BatchNormalization())

# Second recurrent layer still returns the full sequence for the layer that follows.
model.add(CuDNNLSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Last recurrent layer returns only its final output, feeding the dense head.
model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# Output layer: one probability per class (0 = don't buy, 1 = buy).
# No Dropout after this layer - zeroing softmax outputs would corrupt the
# probabilities the loss is computed on.
model.add(Dense(2, activation='softmax'))



Model **compile** settings:

In [None]:
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
# NOTE(review): the `decay` argument and mixing a tf.keras optimizer with a
# standalone-keras model are both version-sensitive - confirm against the
# installed TensorFlow/Keras versions.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# Compile model
model.compile(
    # integer class labels (0/1) against the 2-unit softmax output
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    # Named 'acc' so the validation metric is logged as 'val_acc', the key the
    # ModelCheckpoint cell formats into its file name and monitors. Recent Keras
    # versions report a metric under exactly the name given here.
    metrics=['acc']
)

TensorBoard **callback**:

In [None]:
# Log this run under logs/<NAME>; NAME embeds a timestamp, so runs do not share a directory.
tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

Next, let's set up the **ModelCheckpoint** callback:

In [None]:
# File-name template: Keras fills in the epoch number and that epoch's validation accuracy.
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"

# Make sure the target directory exists before the first checkpoint is written.
os.makedirs("models", exist_ok=True)

# The monitor/verbose/save_best_only/mode options belong to ModelCheckpoint.
# Previously they were passed to str.format(), which ignored them, so every
# epoch was saved instead of only the best one.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,  # keep a checkpoint only when val_acc improves
    mode='max',
)

In [None]:
# Train model
# preprocess_df returns the labels as plain Python lists; Keras expects
# array-like targets, so they are converted at the call site (the lists
# themselves are left untouched for the summary cell above).
history = model.fit(
    train_x, np.asarray(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, np.asarray(validation_y)),
    callbacks=[tensorboard, checkpoint],
)

In [None]:
# Score model
# Labels are converted from a Python list to an array, as Keras expects.
# Note: the figures printed as "Test" are computed on the validation split.
score = model.evaluate(validation_x, np.asarray(validation_y), verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Save model (create the target directory first so the save cannot fail on a missing path)
os.makedirs("models", exist_ok=True)
model.save("models/{}".format(NAME))