In [1]:
# Initial imports
import numpy as np
import pandas as pd
from sklearn import preprocessing
from collections import deque
import random
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

%matplotlib inline

In [2]:
#def targets
SEQ_LEN = 9
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "ETH-USD"
EPOCHS = 10
BATCH_SIZE = 90
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [3]:
#def classification 

def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [4]:
def test_df(df):
     for col in df.columns:  # go through all of the columns
        df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)


In [20]:
def gen_datasets(df):
    X = df.to_numpy()
    future = df.shift(-FUTURE_PERIOD_PREDICT)
    result= X[:-3]
    future.dropna(inplace=True)
    print(result, future)
    return result, future[["Close"]]
    #profitable= future-x
    # "future, Close, Volume"

In [21]:
def preprocess_df(df):
    ##df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic.


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])# append those bad boys!
            
    

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X),np.array(y) # return X and y...and make X a numpy array!

In [22]:
#preping data
def build_data(currency_name, data_location):
    ##dataset = f"DATA/ETH/ETH-USD.csv"
    df = pd.read_csv(data_location)  # read in specific file
    df.set_index("Date", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"Close", f"Volume"]]  # ignore the other columns besides price and volume

    #if len(main_df)==0:  # if the dataframe is empty
    # main_df = df  # then it's just the current df
    # else:  # otherwise, join this data to the main one
    #  print(df)
    # print(main_df)
    # main_df = main_df.join(df)
    
        
    df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
    df.dropna(inplace=True)
    
    return df

In [23]:
#make futre column 
def add_future_column(df):
    main_df= df.copy()
    main_df['future'] = df[f"Close"].shift(-FUTURE_PERIOD_PREDICT)
    
    main_df['target'] = list(map(classify, main_df[f"Close"], main_df["future"]))

    main_df.dropna(inplace=True)

    main_df.head()
    
    return main_df

In [None]:
def build_linear_and_softmax_model():
    inputs = tf.keras.Input(shape=(52,))
    dense1 = tf.keras.layers.Dense(100, activation='relu')
    x = dense1(inputs)
    x = tf.keras.layers.Dense(10, activation='relu')(x)
    x = tf.keras.layers.Dense(20, activation='relu')(x)
    output_softmax = tf.keras.layers.Dense(2, activation="softmax", name="win_chance")(x)
    output_linear = tf.keras.layers.Dense(5, activation='linear', name="others")(x)
    model = tf.keras.Model(inputs = inputs, outputs = [output_softmax, output_linear])
    model.compile(loss = {"win_chance": "categorical_crossentropy", "others": "mean_squared_error"})
    return model

In [None]:
def build_linear_and_softmax_model():
    inputs = tf.keras.Input(shape=(52,))
    dense1 = tf.keras.layers.Dense(100, activation='relu')
    x = dense1(inputs)
    x = tf.keras.layers.Dense(10, activation='relu')(x)
    x = tf.keras.layers.Dense(20, activation='relu')(x)
    output_softmax = tf.keras.layers.Dense(2, activation="softmax", name="win_chance")(x)
    output_linear = tf.keras.layers.Dense(5, activation='linear', name="others")(x)
    model = tf.keras.Model(inputs = inputs, outputs = [output_softmax, output_linear])
    model.compile(loss = {"win_chance": "categorical_crossentropy", "others": "mean_squared_error"})
    return model

In [24]:
def get_model(train_x):
    #print(f"train data: {len(train_x)} validation: {len(validation_x)}")
    #print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
    #print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

    model = Sequential()
    
    model.add(LSTM(128, input_shape=(2,), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    #model.add(LSTM(128, return_sequences=True))
    #model.add(Dropout(0.1))
    #model.add(BatchNormalization())

    #model.add(LSTM(128))
    #model.add(Dropout(0.2))
    #model.add(BatchNormalization())

    #model.add(Dense(32, activation='relu'))
    #model.add(Dropout(0.2))

    model.add(Dense(2, activation='softmax'))
    
    return model

In [25]:
# Compile model
def compiling(model):
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer= tf.keras.optimizers.Adam(lr=0.001, decay=1e-6),
        metrics=['accuracy']
    )
    
    
    return model


In [26]:
## here, split away some slice of the future data from the main main_df.
times = sorted(main_df.index.values)
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]

NameError: name 'main_df' is not defined

In [27]:
#validation_main_df = main_df[(main_df.index >= last_5pct)]
main_pros_df = main_df[(main_df.index < last_5pct)]
test_df(main_pros_df)
#train_x, train_y = preprocess_df(main_df)
#validation_x, validation_y = preprocess_df(validation_main_df)


NameError: name 'main_df' is not defined

In [28]:
df1= build_data("ETH-USD",f"DATA/ETH/ETH-USD.csv")
X,y= gen_datasets(df1)
print(X.shape, y.shape)
model=get_model(X)
model=compiling(model)
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch

checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

history = model.fit(
    X, y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
    #validation_data=(validation_x, validation_y),
    #callbacks=[tensorboard, checkpoint],
)


[[2.77212000e+00 1.64329000e+05]
 [7.53325000e-01 6.74188000e+05]
 [7.01897000e-01 5.32170000e+05]
 ...
 [1.99097083e+03 2.12533598e+10]
 [2.02520276e+03 1.74954803e+10]
 [2.12477661e+03 1.62003925e+10]]                   Close        Volume
Date                                 
2015-08-07     0.708448  4.052830e+05
2015-08-08     1.067860  1.463100e+06
2015-08-09     1.217440  2.150620e+06
2015-08-10     1.827670  4.068680e+06
2015-08-11     1.827870  4.637030e+06
...                 ...           ...
2021-07-19  2025.202759  1.749548e+10
2021-07-20  2124.776611  1.620039e+10
2021-07-21  2189.218750  1.605745e+10
2021-07-22  2191.373779  1.456648e+10
2021-07-23  2233.366699  2.961432e+10

[2178 rows x 2 columns]
(2178, 2) (2178, 1)


ValueError: Input 0 of layer lstm_1 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 2)