# Crypto Predictor


In [1]:
import pandas as pd

df = pd.read_csv("crypto_data/LTC-USD.csv",names = ['time','low','high', 'open','close','volume'])

In [2]:
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [3]:
main_df = pd.DataFrame()


In [4]:
ratios = ['BTC-USD','LTC-USD','ETH-USD','BCH-USD']

for ratio in ratios:
    dataset = f"crypto_data/{ratio}.csv"
    
    df = pd.read_csv(dataset, names=["time","low","high","open","close","volume"])
    df.rename(columns={"close" : f"{ratio}_close","volume":f"{ratio}_volume"},inplace=True)
    
    df.set_index("time",inplace=True)
    df = df[[f"{ratio}_close",f"{ratio}_volume"]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)

In [5]:
main_df


Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968660,6489.549805,0.587100,96.580002,9.647200,,,871.719971,5.675361
1528968720,6487.379883,7.706374,96.660004,314.387024,486.010010,26.019083,870.859985,26.856577
1528968780,6479.410156,3.088252,96.570000,77.129799,486.000000,8.449400,870.099976,1.124300
1528968840,6479.410156,1.404100,96.500000,7.216067,485.750000,26.994646,870.789978,1.749862
1528968900,6479.979980,0.753000,96.389999,524.539978,486.000000,77.355759,870.000000,1.680500
...,...,...,...,...,...,...,...,...
1535214960,6713.140137,0.769891,58.020000,6.434783,279.359985,11.280577,531.479980,1.208560
1535215020,6714.520020,1.002652,58.009998,7.301921,279.359985,8.790519,531.479980,0.016868
1535215080,6714.520020,1.021925,58.020000,23.802017,279.369995,1.311763,531.469971,0.013854
1535215140,6715.000000,3.645508,58.020000,6.953497,279.660004,11.752819,531.479980,0.016900


In [6]:
SEQ_LEN = 60 #WE HAVE THE DATA OF PREVIOUS 60 MIN
FUTURE_PERIOD_PREDICT = 3 # TRY TO PREDICT 3 MIN IN THE FUTURE
RATIO_TO_PREDICT = "LTC-USD"

In [7]:
def classify(present, future):
    if float(future) >= float(present):
        return 1      # it implies the crypto yeilds profit
    else:
        return 0

In [8]:
main_df['future'] = main_df[f"{RATIO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT) # shift the row or column depend on axis specification by a certain number secified in the shift


In [9]:
main_df['future'].head()

time
1528968660    96.500000
1528968720    96.389999
1528968780    96.519997
1528968840    96.440002
1528968900    96.470001
Name: future, dtype: float64

In [10]:
main_df[[f"{RATIO_TO_PREDICT}_close","future"]]

Unnamed: 0_level_0,LTC-USD_close,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1528968660,96.580002,96.500000
1528968720,96.660004,96.389999
1528968780,96.570000,96.519997
1528968840,96.500000,96.440002
1528968900,96.389999,96.470001
...,...,...
1535214960,58.020000,58.020000
1535215020,58.009998,58.080002
1535215080,58.020000,
1535215140,58.020000,


In [11]:
# now create target variable

main_df['target'] = list(map(classify, main_df[f"{RATIO_TO_PREDICT}_close"], main_df['future'])) #map current and future as typed

In [12]:
main_df[[f"{RATIO_TO_PREDICT}_close","future","target"]]

Unnamed: 0_level_0,LTC-USD_close,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1528968660,96.580002,96.500000,0
1528968720,96.660004,96.389999,0
1528968780,96.570000,96.519997,0
1528968840,96.500000,96.440002,0
1528968900,96.389999,96.470001,1
...,...,...,...
1535214960,58.020000,58.020000,1
1535215020,58.009998,58.080002,1
1535215080,58.020000,,0
1535215140,58.020000,,0


In [13]:
times = sorted(main_df.index.values) #values convert it to numpy array

last_5pct = times[-int(0.05*len(times))]
print(last_5pct)

1534922100


In [14]:
# seperating data into validation and test
validation_main_df = main_df[(main_df.index >= last_5pct)]

main_df = main_df[(main_df.index < last_5pct)]

In [15]:
# define functon to perform preprocessing

from sklearn import preprocessing
from collections import deque
import numpy as np
import random

def preprocess_df(df):
    df = df.drop('future',1)
    
    #iterate over the column and scale the column
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change() #pct_change normalises the data
            df.dropna(inplace = True)
            df[col] = preprocessing.scale(df[col].values)
            
    df.dropna(inplace = True)
    
    sequential_data = []
    previous_days = deque(maxlen = SEQ_LEN) #previous_day is a list as soon the list reaches maxlen value it gets new items it pops the old items
    
    # to check the result of the function 
    # print(df.head())
    for i in df.values: # df.values convert ur datafraame to a list of list(here it is list of column) it wont contain time but it is in order but it will contain target
        previous_days.append([n for n in i[:-1]]) #we need to append a list, n for n is each value in that list of list(n=> each row element), -1 mean excluding the target column 
        if len(previous_days) == SEQ_LEN:
            sequential_data.append([np.array(previous_days), i[-1]]) #now append x and y i.e features and label
        
        
        
    random.shuffle(sequential_data)
    
    # balance the data => buys = sells 
    buys = []
    sells = []

    for seq, target in sequential_data:
        if target ==0: #i.e sell
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
            
    random.shuffle(buys)
    random.shuffle(sells)
    
    #know which is lower buys or sells
    lower = min(len(buys), len(sells))
    
    #to equalise the buys and sells
    buys = buys[:lower]
    sells = sells[:lower]
    
    sequential_data = buys + sells
    random.shuffle(sequential_data)
    
    # now to seperate the features and labels
    X = []
    y = []
    
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
        
    return np.array(X), y
            
    

In [16]:
preprocess_df(main_df) #things got converted to percent change and normalised

(array([[[ 1.98993787e-02, -6.53351351e-02, -7.41317091e-02, ...,
          -5.05482196e-02, -4.98993163e-03, -6.21331166e-03],
         [-8.30083053e-04, -6.15952077e-02,  8.22366276e-02, ...,
          -2.43234320e-02,  2.45620675e-01, -4.20153788e-03],
         [-8.30083053e-04, -6.39032399e-02,  5.51199623e-01, ...,
          -1.01321610e-02,  6.33477705e-01, -6.03986839e-03],
         ...,
         [ 8.45221872e-02, -5.20355903e-02,  8.22392505e-02, ...,
          -3.64955975e-02, -4.14424361e-01, -6.22434609e-03],
         [ 3.49053805e-01, -6.18255235e-02,  4.04838613e-03, ...,
          -4.41869565e-02,  1.13020993e-02, -4.76295985e-03],
         [-8.30083053e-04, -4.43714927e-02,  4.04838613e-03, ...,
          -5.03254856e-02,  3.10232629e-03, -5.63677635e-03]],
 
        [[ 1.74157749e+00,  3.83218620e-01,  1.84184342e-01, ...,
          -3.40659878e-02,  1.53908600e+00, -6.17728386e-03],
         [ 4.60459077e-01, -8.38470254e-02,  4.04838613e-03, ...,
          -5.07247846

In [17]:
train_x , train_y = preprocess_df(main_df)
validation_x , validation_y = preprocess_df(validation_main_df)

In [18]:
#converting it to array
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
validation_x = np.asarray(validation_x)
validation_y = np.asarray(validation_y)

In [19]:
print(f"train data: {len(train_x)} ,validation: {len(validation_x)}")
print(f"Dont buys: {np.count_nonzero(train_y == 0)}, Buys: {np.count_nonzero(train_y == 1)}")  #np.ndarray doesnt have the count function so we use array.count_nonzero() and can count the number of unique object in it by passing th value in the parenthesis 
print(f"Validation Dont buy: {np.count_nonzero(validation_y == 0)}, Validation buys: {np.count_nonzero(validation_y == 1)}")

train data: 71236 ,validation: 3072
Dont buys: 35618, Buys: 35618
Validation Dont buy: 1536, Validation buys: 1536


In [20]:
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization #BatchNormalization is normalisation between layers 
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint #ModelCheckpoint is  way to set parameters to when u want to save checkpoint ex like after 100 epochs so we track the model even before it breaks



EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [21]:
model  = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())


model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:])))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax")) #binary choise 2 2 dense layer enough


opt = tf.keras.optimizers.Adam(learning_rate = 0.001, decay = 1e-6)

model.compile(loss="sparse_categorical_crossentropy",
             optimizer = opt,
             metrics = ['accuracy'])


tensorboard = TensorBoard(log_dir = f'logs/{NAME}')

filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}" #unique file name that will include the epoch and validation accuracy for that epoch
checkpoint = ModelCheckpoint("model/{}.model".format(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')) #save only the best one

model.fit(train_x, train_y,
                   batch_size=BATCH_SIZE,
                   validation_data=(validation_x, validation_y),
                   callbacks=[tensorboard, checkpoint])

model.save("")





INFO:tensorflow:Assets written to: model\RNN_Final-01-0.500.model\assets


INFO:tensorflow:Assets written to: model\RNN_Final-01-0.500.model\assets






INFO:tensorflow:Assets written to: assets


INFO:tensorflow:Assets written to: assets


In [23]:
predicted_x = model.predict(validation_x)



In [28]:
print(predicted_x)
print(predicted_x.shape)
predicted_x[0]

[[0.44852006 0.5514799 ]
 [0.5188537  0.48114628]
 [0.5189359  0.48106408]
 ...
 [0.5187816  0.48121837]
 [0.4728357  0.52716434]
 [0.477568   0.5224321 ]]
(3072, 2)


array([0.44852006, 0.5514799 ], dtype=float32)

In [22]:
validation_y

array([1., 0., 1., ..., 0., 0., 1.])

In [32]:
val_loss, val_acc = model.evaluate(validation_x, validation_y)



In [33]:
print(val_loss, val_acc)

0.6940186619758606 0.5003255009651184
