In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from cryptobot.yahoo_market_data import get_yahoo_data
from cryptobot.data_engineering import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import set_config; set_config(display='diagram')
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import MAPE
from sklearn.preprocessing import OrdinalEncoder
from tensorflow.keras.models import load_model, save_model

2022-06-10 18:48:06.788915: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-10 18:48:06.788971: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
"""
Get BASE ETHEREUM data per hour from Binance (downloaded .csv file)
"""

BASE_DATA_PATH = "../cryptobot/data/candles-ETHUSDT.csv"
COLUMN_NAMES = ["open_time","open","high","low", "close",
                         "volume", "close_time" ,"quote_asset_volume",
                         "number_of_trades", "taker_buy_base_asset_volume",
                         "taker_buy_quote_asset_volume"]

def get_data_without_headers(path):
    df = pd.read_csv(path,
                 header="infer")
    df.open_time = df.open_time.apply(lambda x: datetime.utcfromtimestamp(x/1000))
    df.close_time = df.close_time.apply(lambda x: datetime.utcfromtimestamp(x/1000))
    df["close_time_min"] = df["close_time"].apply(lambda x: x.strftime("%Y-%m-%d-%H-%M"))
    df["close_time_day"] = df["close_time"].apply(lambda x: x.strftime("%Y-%m-%d"))
    
    return df

df = get_data_without_headers(BASE_DATA_PATH)
df = df.drop(columns= ["quote_asset_volume",
                         "number_of_trades",
                         "taker_buy_base_asset_volume",
                         "taker_buy_quote_asset_volume"
                         ])
df

Unnamed: 0,open_time,open,high,low,close,volume,close_time,close_time_min,close_time_day
0,2018-02-01 03:00:00,1150.51,1164.99,1125.01,1140.00,7622.31840,2018-02-01 03:59:59.999,2018-02-01-03-59,2018-02-01
1,2018-02-01 04:00:00,1140.00,1154.72,1130.18,1152.51,3899.48396,2018-02-01 04:59:59.999,2018-02-01-04-59,2018-02-01
2,2018-02-01 05:00:00,1152.51,1164.42,1146.50,1159.13,3462.28586,2018-02-01 05:59:59.999,2018-02-01-05-59,2018-02-01
3,2018-02-01 06:00:00,1159.02,1159.02,1140.01,1142.99,3400.69303,2018-02-01 06:59:59.999,2018-02-01-06-59,2018-02-01
4,2018-02-01 07:00:00,1142.99,1151.00,1135.00,1137.10,2589.27888,2018-02-01 07:59:59.999,2018-02-01-07-59,2018-02-01
...,...,...,...,...,...,...,...,...,...
37798,2022-05-30 23:00:00,1999.41,2013.00,1984.03,1998.78,50617.55410,2022-05-30 23:59:59.999,2022-05-30-23-59,2022-05-30
37799,2022-05-31 00:00:00,1998.78,2016.45,1980.01,1992.86,44752.71670,2022-05-31 00:59:59.999,2022-05-31-00-59,2022-05-31
37800,2022-05-31 01:00:00,1992.85,1993.48,1976.55,1986.59,21968.64670,2022-05-31 01:59:59.999,2022-05-31-01-59,2022-05-31
37801,2022-05-31 02:00:00,1986.59,1999.90,1981.71,1996.26,18712.98890,2022-05-31 02:59:59.999,2022-05-31-02-59,2022-05-31


In [3]:
"""
Get FEAR AND GREED data from Binance (downloaded .csv file)
"""

FEAR_GREED_PATH = "../cryptobot/data/fear_greed_index.csv"

def get_data_fg_with_headers(path):
    fg = pd.read_csv(path)
    fg["timestamp"] = fg.timestamp.apply(lambda x: datetime.utcfromtimestamp(int(x)))
    fg["close_time_day"] = fg["timestamp"].apply(lambda x: x.strftime("%Y-%m-%d"))
    fg.drop(columns=["Unnamed: 0","timestamp"],inplace=True)
    fg.columns = ['FG_value', 'FG_val_clasif','close_time_day']
    return fg

fg = get_data_fg_with_headers(FEAR_GREED_PATH)
fg.head(3)

Unnamed: 0,FG_value,FG_val_clasif,close_time_day
0,13,Extreme Fear,2022-06-06
1,10,Extreme Fear,2022-06-05
2,14,Extreme Fear,2022-06-04


In [4]:
"""
Get BITCOIN DOMINANCE data from Binance (downloaded .csv file)
"""

BTC_DOM_PATH = "../cryptobot/data/BTC Dominance - Trading View - 1day.csv"

def get_data_btc_with_headers(path):
    btc_dom = pd.read_csv(path)
    btc_dom["time"] = btc_dom.time.apply(lambda x: datetime.utcfromtimestamp(int(x)))
    btc_dom["close_time_day"] = btc_dom["time"].apply(lambda x: x.strftime("%Y-%m-%d"))
    btc_dom["btc_avg"]= btc_dom["open"]+btc_dom["close"]+btc_dom["high"]+btc_dom["low"]/4
    btc_dom.drop(columns=["time","open","high","low","close","Volume","Volume MA"], inplace=True)

    return btc_dom

btc_dom = get_data_btc_with_headers(BTC_DOM_PATH)
btc_dom


Unnamed: 0,close_time_day,btc_avg
0,2014-04-01,323.016114
1,2014-04-02,322.943635
2,2014-04-03,323.063574
3,2014-04-04,323.113947
4,2014-04-05,323.200215
...,...,...
2981,2022-06-05,152.360945
2982,2022-06-06,153.657384
2983,2022-06-07,154.868802
2984,2022-06-08,154.631329


In [5]:
"""
Merging BASE ETHEREUM data from Binance ===> with FEAR AND GREED data
"""

def merge_df_fg(df,fg):
    df = df.merge(fg, on = "close_time_day", how="left")
    return df

df_fg =  merge_df_fg(df,fg)
df_fg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37803 entries, 0 to 37802
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   open_time       37803 non-null  datetime64[ns]
 1   open            37803 non-null  float64       
 2   high            37803 non-null  float64       
 3   low             37803 non-null  float64       
 4   close           37803 non-null  float64       
 5   volume          37803 non-null  float64       
 6   close_time      37803 non-null  datetime64[ns]
 7   close_time_min  37803 non-null  object        
 8   close_time_day  37803 non-null  object        
 9   FG_value        37731 non-null  float64       
 10  FG_val_clasif   37731 non-null  object        
dtypes: datetime64[ns](2), float64(6), object(3)
memory usage: 3.5+ MB


In [6]:
"""
Merging BASE ETHEREUM data from Binance + FEAR AND GREED data ===> with BITCOIN DOMINANCE data
"""

def merge_df_btc(df,btc):
    df = df.merge(btc, on = "close_time_day", how="left" )
    return df
                  
df_fg =  merge_df_btc(df_fg,btc_dom)
df_fg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37803 entries, 0 to 37802
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   open_time       37803 non-null  datetime64[ns]
 1   open            37803 non-null  float64       
 2   high            37803 non-null  float64       
 3   low             37803 non-null  float64       
 4   close           37803 non-null  float64       
 5   volume          37803 non-null  float64       
 6   close_time      37803 non-null  datetime64[ns]
 7   close_time_min  37803 non-null  object        
 8   close_time_day  37803 non-null  object        
 9   FG_value        37731 non-null  float64       
 10  FG_val_clasif   37731 non-null  object        
 11  btc_avg         37803 non-null  float64       
dtypes: datetime64[ns](2), float64(7), object(3)
memory usage: 3.7+ MB


In [7]:
"""
Get and Merge PERCENT SUPPLY IN PROFIT data from Glassnode (downloaded .json file)
"""

JSON_PATH = "../cryptobot/data/percent-supply-in-profit-eth-24h.json"

def merge_perc_supp(df, path):
    data = pd.read_json(path)
    data["close_time_day"]=data["t"].apply(lambda x: x[0:10])
    data.drop(columns="t",inplace=True)
    data = data.rename(columns= {"close_time_day": "close_time_day", "v": "%_supply_in_profit"})
    data = df.merge(data, on = "close_time_day", how="left")
    return data

df_fg = merge_perc_supp(df_fg,JSON_PATH)
df_fg

Unnamed: 0,open_time,open,high,low,close,volume,close_time,close_time_min,close_time_day,FG_value,FG_val_clasif,btc_avg,%_supply_in_profit
0,2018-02-01 03:00:00,1150.51,1164.99,1125.01,1140.00,7622.31840,2018-02-01 03:59:59.999,2018-02-01-03-59,2018-02-01,30.0,Fear,123.645560,0.875434
1,2018-02-01 04:00:00,1140.00,1154.72,1130.18,1152.51,3899.48396,2018-02-01 04:59:59.999,2018-02-01-04-59,2018-02-01,30.0,Fear,123.645560,0.875434
2,2018-02-01 05:00:00,1152.51,1164.42,1146.50,1159.13,3462.28586,2018-02-01 05:59:59.999,2018-02-01-05-59,2018-02-01,30.0,Fear,123.645560,0.875434
3,2018-02-01 06:00:00,1159.02,1159.02,1140.01,1142.99,3400.69303,2018-02-01 06:59:59.999,2018-02-01-06-59,2018-02-01,30.0,Fear,123.645560,0.875434
4,2018-02-01 07:00:00,1142.99,1151.00,1135.00,1137.10,2589.27888,2018-02-01 07:59:59.999,2018-02-01-07-59,2018-02-01,30.0,Fear,123.645560,0.875434
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37798,2022-05-30 23:00:00,1999.41,2013.00,1984.03,1998.78,50617.55410,2022-05-30 23:59:59.999,2022-05-30-23-59,2022-05-30,10.0,Extreme Fear,151.034615,
37799,2022-05-31 00:00:00,1998.78,2016.45,1980.01,1992.86,44752.71670,2022-05-31 00:59:59.999,2022-05-31-00-59,2022-05-31,16.0,Extreme Fear,151.648366,
37800,2022-05-31 01:00:00,1992.85,1993.48,1976.55,1986.59,21968.64670,2022-05-31 01:59:59.999,2022-05-31-01-59,2022-05-31,16.0,Extreme Fear,151.648366,
37801,2022-05-31 02:00:00,1986.59,1999.90,1981.71,1996.26,18712.98890,2022-05-31 02:59:59.999,2022-05-31-02-59,2022-05-31,16.0,Extreme Fear,151.648366,


In [8]:
""" 
Merging ===> with YAHOO FINANCIAL INDICES data with get_yahoo_data() fuction that feeds from Yahoo API

Loop for importing datasets with various financial indices   
    
    '^IXIC' - Nasdaq compound index
    '^GSPC' - S&P 500 index
    'GC=F' - Gold
    '^DJI' - Dow Jones Industrial Average
    '^TNX' = S&P 500 US T-bills (10 year)
    'DX-Y.NYB'= US Dollar/USDX - Index - Cash

"""

# Index '^TNX' with "Volume" 0 value in all rows

SYMBOL_LIST = ['^IXIC','^GSPC','GC=F','^DJI', '^TNX','DX-Y.NYB']
START_DATE = "2018-02-01"
END_DATE = "2022-06-07"

for symbol in SYMBOL_LIST:
    yd = get_yahoo_data(symbol, START_DATE, END_DATE)
    yd[f'{symbol}_Date'] = yd[f'{symbol}_Date'].apply(lambda x: x.strftime("%Y-%m-%d"))
    yd.rename(columns = {f'{symbol}_Date':"close_time_day"}, inplace = True)
    yd[f'{symbol}_avg']= yd[f'{symbol}_Open']+yd[f'{symbol}_Close']+yd[f'{symbol}_High']+yd[f'{symbol}_Low']/4
    df_fg = df_fg.merge(yd, on = "close_time_day", how="left" )
    df_fg.drop(columns=[f'{symbol}_timestamp',f'{symbol}_Open',f'{symbol}_Close',f'{symbol}_High',f'{symbol}_Low'], inplace=True)
df_fg


Unnamed: 0,open_time,open,high,low,close,volume,close_time,close_time_min,close_time_day,FG_value,FG_val_clasif,btc_avg,%_supply_in_profit,^IXIC_avg,^GSPC_avg,GC=F_avg,^DJI_avg,^TNX_avg,DX-Y.NYB_avg
0,2018-02-01 03:00:00,1150.51,1164.99,1125.01,1140.00,7622.31840,2018-02-01 03:59:59.999,2018-02-01-03-59,2018-02-01,30.0,Fear,123.645560,0.875434,24044.689575,9177.564880,4372.900085,85080.059082,8.9725,289.180000
1,2018-02-01 04:00:00,1140.00,1154.72,1130.18,1152.51,3899.48396,2018-02-01 04:59:59.999,2018-02-01-04-59,2018-02-01,30.0,Fear,123.645560,0.875434,24044.689575,9177.564880,4372.900085,85080.059082,8.9725,289.180000
2,2018-02-01 05:00:00,1152.51,1164.42,1146.50,1159.13,3462.28586,2018-02-01 05:59:59.999,2018-02-01-05-59,2018-02-01,30.0,Fear,123.645560,0.875434,24044.689575,9177.564880,4372.900085,85080.059082,8.9725,289.180000
3,2018-02-01 06:00:00,1159.02,1159.02,1140.01,1142.99,3400.69303,2018-02-01 06:59:59.999,2018-02-01-06-59,2018-02-01,30.0,Fear,123.645560,0.875434,24044.689575,9177.564880,4372.900085,85080.059082,8.9725,289.180000
4,2018-02-01 07:00:00,1142.99,1151.00,1135.00,1137.10,2589.27888,2018-02-01 07:59:59.999,2018-02-01-07-59,2018-02-01,30.0,Fear,123.645560,0.875434,24044.689575,9177.564880,4372.900085,85080.059082,8.9725,289.180000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37798,2022-05-30 23:00:00,1999.41,2013.00,1984.03,1998.78,50617.55410,2022-05-30 23:59:59.999,2022-05-30-23-59,2022-05-30,10.0,Extreme Fear,151.034615,,,,,,,
37799,2022-05-31 00:00:00,1998.78,2016.45,1980.01,1992.86,44752.71670,2022-05-31 00:59:59.999,2022-05-31-00-59,2022-05-31,16.0,Extreme Fear,151.648366,,39394.634766,13477.799561,6013.299988,107579.014648,9.2580,330.712502
37800,2022-05-31 01:00:00,1992.85,1993.48,1976.55,1986.59,21968.64670,2022-05-31 01:59:59.999,2022-05-31-01-59,2022-05-31,16.0,Extreme Fear,151.648366,,39394.634766,13477.799561,6013.299988,107579.014648,9.2580,330.712502
37801,2022-05-31 02:00:00,1986.59,1999.90,1981.71,1996.26,18712.98890,2022-05-31 02:59:59.999,2022-05-31-02-59,2022-05-31,16.0,Extreme Fear,151.648366,,39394.634766,13477.799561,6013.299988,107579.014648,9.2580,330.712502


In [9]:
df_fg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37803 entries, 0 to 37802
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           37803 non-null  datetime64[ns]
 1   open                37803 non-null  float64       
 2   high                37803 non-null  float64       
 3   low                 37803 non-null  float64       
 4   close               37803 non-null  float64       
 5   volume              37803 non-null  float64       
 6   close_time          37803 non-null  datetime64[ns]
 7   close_time_min      37803 non-null  object        
 8   close_time_day      37803 non-null  object        
 9   FG_value            37731 non-null  float64       
 10  FG_val_clasif       37731 non-null  object        
 11  btc_avg             37803 non-null  float64       
 12  %_supply_in_profit  29213 non-null  float64       
 13  ^IXIC_avg           26036 non-null  float64   

In [10]:
"""
Creating new features using data_engineering.py defined fuctions
"""

df_fg = ema_metric(df_fg,df_fg["close"],10)
df_fg = ema_metric(df_fg,df_fg["close"],50)
df_fg = ema_metric(df_fg,df_fg["close"],200)
df_fg = rsi_metric(df_fg,df_fg["close"],14)
df_fg = adx_dmp_dmn_metric(df_fg,df_fg["high"],df_fg["low"],df_fg["close"],14)
df_fg = atr_metric(df_fg,df_fg["high"],df_fg["low"],df_fg["close"],14)
df_fg


Unnamed: 0,open_time,open,high,low,close,volume,close_time,close_time_min,close_time_day,FG_value,...,^TNX_avg,DX-Y.NYB_avg,EMA_10,EMA_50,EMA_200,RSI_14,ADX_14,DMP_14,DMN_14,ATR_14
0,2018-02-01 03:00:00,1150.51,1164.99,1125.01,1140.00,7622.31840,2018-02-01 03:59:59.999,2018-02-01-03-59,2018-02-01,30.0,...,8.9725,289.180000,,,,,,,,
1,2018-02-01 04:00:00,1140.00,1154.72,1130.18,1152.51,3899.48396,2018-02-01 04:59:59.999,2018-02-01-04-59,2018-02-01,30.0,...,8.9725,289.180000,,,,,,,,
2,2018-02-01 05:00:00,1152.51,1164.42,1146.50,1159.13,3462.28586,2018-02-01 05:59:59.999,2018-02-01-05-59,2018-02-01,30.0,...,8.9725,289.180000,,,,,,,,
3,2018-02-01 06:00:00,1159.02,1159.02,1140.01,1142.99,3400.69303,2018-02-01 06:59:59.999,2018-02-01-06-59,2018-02-01,30.0,...,8.9725,289.180000,,,,,,,,
4,2018-02-01 07:00:00,1142.99,1151.00,1135.00,1137.10,2589.27888,2018-02-01 07:59:59.999,2018-02-01-07-59,2018-02-01,30.0,...,8.9725,289.180000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37798,2022-05-30 23:00:00,1999.41,2013.00,1984.03,1998.78,50617.55410,2022-05-30 23:59:59.999,2022-05-30-23-59,2022-05-30,10.0,...,,,1953.190456,1871.405020,1885.043501,81.600631,49.306548,38.204494,6.974402,24.620486
37799,2022-05-31 00:00:00,1998.78,2016.45,1980.01,1992.86,44752.71670,2022-05-31 00:59:59.999,2022-05-31-00-59,2022-05-31,16.0,...,9.2580,330.712502,1960.403100,1876.167961,1886.116302,78.448296,50.395425,34.299453,7.389129,25.464737
37800,2022-05-31 01:00:00,1992.85,1993.48,1976.55,1986.59,21968.64670,2022-05-31 01:59:59.999,2022-05-31-01-59,2022-05-31,16.0,...,9.2580,330.712502,1965.164355,1880.498237,1887.116041,75.137547,51.119054,32.630668,8.023957,24.855113
37801,2022-05-31 02:00:00,1986.59,1999.90,1981.71,1996.26,18712.98890,2022-05-31 02:59:59.999,2022-05-31-02-59,2022-05-31,16.0,...,9.2580,330.712502,1970.818108,1885.037914,1888.202050,76.766123,51.922373,32.772617,7.596319,24.379033


In [11]:
not_fillna = ["%_supply_in_profit", 'open_time', "close_time", 'close_time_min', 'close_time_day']
aux_df = df_fg.drop(columns= not_fillna)
aux_df.interpolate(method='linear', limit_direction='forward', axis=0, inplace=True)

aux_df.isnull().sum()

open               0
high               0
low                0
close              0
volume             0
FG_value           0
FG_val_clasif     72
btc_avg            0
^IXIC_avg          0
^GSPC_avg          0
GC=F_avg           0
^DJI_avg           0
^TNX_avg           0
DX-Y.NYB_avg       0
EMA_10             9
EMA_50            49
EMA_200          199
RSI_14            14
ADX_14            27
DMP_14            14
DMN_14            14
ATR_14            14
dtype: int64

In [12]:
df_fg[list(aux_df.columns)] = aux_df
df_fg.isnull().sum()

open_time                0
open                     0
high                     0
low                      0
close                    0
volume                   0
close_time               0
close_time_min           0
close_time_day           0
FG_value                 0
FG_val_clasif           72
btc_avg                  0
%_supply_in_profit    8590
^IXIC_avg                0
^GSPC_avg                0
GC=F_avg                 0
^DJI_avg                 0
^TNX_avg                 0
DX-Y.NYB_avg             0
EMA_10                   9
EMA_50                  49
EMA_200                199
RSI_14                  14
ADX_14                  27
DMP_14                  14
DMN_14                  14
ATR_14                  14
dtype: int64

In [13]:
'''
"""
Filling NAN values:

df.fillna(method='ffill') 
       
    This method propagate[s] last valid observation forward to next valid
        
df.fillna(method="bfill")
     
     This method propagate[s] last valid observation backward to last valid
"""



df_fg=df_fg.fillna(method="bfill").fillna(method="ffill")
df_fg.isna().sum()

'''

'\n"""\nFilling NAN values:\n\ndf.fillna(method=\'ffill\') \n       \n    This method propagate[s] last valid observation forward to next valid\n        \ndf.fillna(method="bfill")\n     \n     This method propagate[s] last valid observation backward to last valid\n"""\n\n\n\ndf_fg=df_fg.fillna(method="bfill").fillna(method="ffill")\ndf_fg.isna().sum()\n\n'

In [14]:
"""
Creating Target Feature  ===> Difference between open ETH value and close ETH value
""" 

def define_target(df):
    df["target"] = (df.close - df.open).apply(lambda x: 0 if x <0 else 1)
    return df

df = define_target(df_fg)
df_fg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37803 entries, 0 to 37802
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           37803 non-null  datetime64[ns]
 1   open                37803 non-null  float64       
 2   high                37803 non-null  float64       
 3   low                 37803 non-null  float64       
 4   close               37803 non-null  float64       
 5   volume              37803 non-null  float64       
 6   close_time          37803 non-null  datetime64[ns]
 7   close_time_min      37803 non-null  object        
 8   close_time_day      37803 non-null  object        
 9   FG_value            37803 non-null  float64       
 10  FG_val_clasif       37731 non-null  object        
 11  btc_avg             37803 non-null  float64       
 12  %_supply_in_profit  29213 non-null  float64       
 13  ^IXIC_avg           37803 non-null  float64   

In [15]:
"""
Some Minor changes before preprocessing:
     2 columns from timestamp to int
     Dropping columns created to merge
"""

#df_fg["open_time"]= pd.to_datetime(df_fg["open_time"]).astype(np.int64)
#df_fg["close_time"] = pd.to_datetime(df_fg["close_time"]).astype(np.int64)
df_fg.drop(columns=['open_time', "close_time", 'close_time_min', 'close_time_day'], inplace=True)
df_fg.head()

Unnamed: 0,open,high,low,close,volume,FG_value,FG_val_clasif,btc_avg,%_supply_in_profit,^IXIC_avg,...,DX-Y.NYB_avg,EMA_10,EMA_50,EMA_200,RSI_14,ADX_14,DMP_14,DMN_14,ATR_14,target
0,1150.51,1164.99,1125.01,1140.0,7622.3184,30.0,Fear,123.64556,0.875434,24044.689575,...,289.18,,,,,,,,,0
1,1140.0,1154.72,1130.18,1152.51,3899.48396,30.0,Fear,123.64556,0.875434,24044.689575,...,289.18,,,,,,,,,1
2,1152.51,1164.42,1146.5,1159.13,3462.28586,30.0,Fear,123.64556,0.875434,24044.689575,...,289.18,,,,,,,,,1
3,1159.02,1159.02,1140.01,1142.99,3400.69303,30.0,Fear,123.64556,0.875434,24044.689575,...,289.18,,,,,,,,,0
4,1142.99,1151.0,1135.0,1137.1,2589.27888,30.0,Fear,123.64556,0.875434,24044.689575,...,289.18,,,,,,,,,0


In [16]:
df_fg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37803 entries, 0 to 37802
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   open                37803 non-null  float64
 1   high                37803 non-null  float64
 2   low                 37803 non-null  float64
 3   close               37803 non-null  float64
 4   volume              37803 non-null  float64
 5   FG_value            37803 non-null  float64
 6   FG_val_clasif       37731 non-null  object 
 7   btc_avg             37803 non-null  float64
 8   %_supply_in_profit  29213 non-null  float64
 9   ^IXIC_avg           37803 non-null  float64
 10  ^GSPC_avg           37803 non-null  float64
 11  GC=F_avg            37803 non-null  float64
 12  ^DJI_avg            37803 non-null  float64
 13  ^TNX_avg            37803 non-null  float64
 14  DX-Y.NYB_avg        37803 non-null  float64
 15  EMA_10              37794 non-null  float64
 16  EMA_

In [17]:
df_fg.isnull().sum()

open                     0
high                     0
low                      0
close                    0
volume                   0
FG_value                 0
FG_val_clasif           72
btc_avg                  0
%_supply_in_profit    8590
^IXIC_avg                0
^GSPC_avg                0
GC=F_avg                 0
^DJI_avg                 0
^TNX_avg                 0
DX-Y.NYB_avg             0
EMA_10                   9
EMA_50                  49
EMA_200                199
RSI_14                  14
ADX_14                  27
DMP_14                  14
DMN_14                  14
ATR_14                  14
target                   0
dtype: int64

In [18]:
df_fg.tail(20)

Unnamed: 0,open,high,low,close,volume,FG_value,FG_val_clasif,btc_avg,%_supply_in_profit,^IXIC_avg,...,DX-Y.NYB_avg,EMA_10,EMA_50,EMA_200,RSI_14,ADX_14,DMP_14,DMN_14,ATR_14,target
37783,1907.83,1910.86,1900.4,1908.18,16918.9482,10.0,Extreme Fear,151.034615,,39329.330827,...,330.707022,1876.630992,1820.096597,1877.870568,79.005592,38.790121,41.47788,8.139369,18.54247,1
37784,1908.19,1910.19,1899.0,1904.43,25196.8911,10.0,Extreme Fear,151.034615,,39333.412323,...,330.707365,1881.685357,1823.403789,1878.13484,76.637891,40.680611,39.637832,8.333311,18.017293,0
37785,1904.43,1908.72,1891.26,1893.2,22261.6384,10.0,Extreme Fear,151.034615,,39337.49382,...,330.707707,1883.778929,1826.140896,1878.284743,69.883632,41.675339,36.888057,10.830484,17.977486,0
37786,1893.2,1907.72,1892.12,1902.64,16754.6409,10.0,Extreme Fear,151.034615,,39341.575316,...,330.70805,1887.208214,1829.140861,1878.527083,72.108873,42.599016,34.579847,10.152784,17.807666,1
37787,1902.65,1914.58,1872.71,1888.1,45339.3672,10.0,Extreme Fear,151.034615,,39345.656812,...,330.708392,1887.370357,1831.452984,1878.622336,64.23605,41.713538,29.283509,15.698033,19.526404,0
37788,1888.1,1893.68,1874.64,1892.78,26122.9935,10.0,Extreme Fear,151.034615,,39349.738308,...,330.708734,1888.353929,1833.857965,1878.763209,65.54018,40.89131,27.240298,14.602728,19.491661,1
37789,1892.69,1924.0,1890.14,1896.73,47970.2575,10.0,Extreme Fear,151.034615,,39353.819804,...,330.709077,1889.876851,1836.323535,1878.941983,66.645692,41.236469,34.584534,12.88142,20.517971,1
37790,1896.73,1915.65,1894.8,1912.44,27495.4667,10.0,Extreme Fear,151.034615,,39357.9013,...,330.709419,1893.979241,1839.308494,1879.275296,70.675176,41.556974,32.077133,11.947509,20.541687,1
37791,1912.43,1935.44,1906.0,1918.9,33908.0414,10.0,Extreme Fear,151.034615,,39361.982796,...,330.709762,1898.510288,1842.42973,1879.669572,72.164332,42.413168,35.566888,10.761148,21.177281,1
37792,1918.74,1934.96,1912.54,1927.87,19558.3681,10.0,Extreme Fear,151.034615,,39366.064293,...,330.710104,1903.848418,1845.780329,1880.149178,74.128888,43.208205,32.888542,9.950785,21.266047,1


In [19]:
"""
Creating Preproccesing Pipeline:

    Numerical Features (excluding "target") scaled with RobustScaler
    Categorical Feature (only one: "value_classification") encoded with OneHotEncoder

Note: Pending to implement Ordinal Encoder with 5 categories: "Extreme Fear", "Fear", etc

"""
#Select num columns to scale: 

NUM_COL_LIST = df_fg.drop(columns=["target","%_supply_in_profit"]).select_dtypes(include=["float64","int64"]).columns.values.tolist()
CAT_COL_LIST = ["FG_val_clasif"]

# Impute then Scale for numerical variables: 

num_scaler = RobustScaler()

# Encode categorical variables

# cat_encoder = OneHotEncoder()

features = ["Extreme Fear", "Fear", "Neutral","Greed", "Extreme Greed"]

cat_encoder = OrdinalEncoder(categories=[features],
                            dtype= np.int64,
                            handle_unknown="use_encoded_value",
                            unknown_value=-999
                            )



# Paralellize "num_transformer" and "cat_transformer"

preprocessor = ColumnTransformer([
                                ('num_tr', num_scaler, NUM_COL_LIST),
                                ('cat_tr', cat_encoder, CAT_COL_LIST)
                                ],
                                remainder='passthrough')

preprocessor

In [20]:
"""
Fitting and transforming dataframe (df_fg) in preprocessing pipeline
Creating preprocessed dataframe (df_fg_prep)
"""

df_fg_prep = pd.DataFrame(preprocessor.fit_transform(df_fg), columns=preprocessor.get_feature_names_out())
df_fg_prep

Unnamed: 0,num_tr__open,num_tr__high,num_tr__low,num_tr__close,num_tr__volume,num_tr__FG_value,num_tr__btc_avg,num_tr__^IXIC_avg,num_tr__^GSPC_avg,num_tr__GC=F_avg,...,num_tr__EMA_50,num_tr__EMA_200,num_tr__RSI_14,num_tr__ADX_14,num_tr__DMP_14,num_tr__DMN_14,num_tr__ATR_14,cat_tr__FG_val_clasif,remainder__%_supply_in_profit,remainder__target
0,0.419480,0.422938,0.410434,0.413702,-0.407670,-0.277778,-0.962740,-0.321715,-0.246727,-0.651556,...,,,,,,,,1.0,0.875434,0.0
1,0.413700,0.417334,0.413303,0.420582,-0.577712,-0.277778,-0.962740,-0.321715,-0.246727,-0.651556,...,,,,,,,,1.0,0.875434,1.0
2,0.420580,0.422627,0.422363,0.424223,-0.597681,-0.277778,-0.962740,-0.321715,-0.246727,-0.651556,...,,,,,,,,1.0,0.875434,1.0
3,0.424161,0.419680,0.418760,0.415346,-0.600494,-0.277778,-0.962740,-0.321715,-0.246727,-0.651556,...,,,,,,,,1.0,0.875434,0.0
4,0.415344,0.415303,0.415979,0.412107,-0.637556,-0.277778,-0.962740,-0.321715,-0.246727,-0.651556,...,,,,,,,,1.0,0.875434,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37798,0.886358,0.885726,0.887274,0.886009,1.556156,-0.833333,-0.526131,0.519831,0.780164,0.361776,...,0.810013,0.808227,1.963501,1.497644,1.919978,-1.334586,0.746919,0.0,,0.0
37799,0.886011,0.887609,0.885042,0.882754,1.288277,-0.666667,-0.516347,0.520054,0.780374,0.361704,...,0.812614,0.808807,1.764188,1.565320,1.526022,-1.294226,0.779072,0.0,,0.0
37800,0.882750,0.875073,0.883121,0.879305,0.247605,-0.666667,-0.516347,0.520054,0.780374,0.361704,...,0.814979,0.809348,1.554858,1.610295,1.357669,-1.232447,0.755855,0.0,,0.0
37801,0.879307,0.878577,0.885986,0.884624,0.098901,-0.666667,-0.516347,0.520054,0.780374,0.361704,...,0.817458,0.809935,1.657829,1.660223,1.371989,-1.274063,0.737723,0.0,,1.0


In [21]:
df_fg_prep.isnull().sum()

num_tr__open                        0
num_tr__high                        0
num_tr__low                         0
num_tr__close                       0
num_tr__volume                      0
num_tr__FG_value                    0
num_tr__btc_avg                     0
num_tr__^IXIC_avg                   0
num_tr__^GSPC_avg                   0
num_tr__GC=F_avg                    0
num_tr__^DJI_avg                    0
num_tr__^TNX_avg                    0
num_tr__DX-Y.NYB_avg                0
num_tr__EMA_10                      9
num_tr__EMA_50                     49
num_tr__EMA_200                   199
num_tr__RSI_14                     14
num_tr__ADX_14                     27
num_tr__DMP_14                     14
num_tr__DMN_14                     14
num_tr__ATR_14                     14
cat_tr__FG_val_clasif               0
remainder__%_supply_in_profit    8590
remainder__target                   0
dtype: int64

In [22]:
df_fg_prep= df_fg_prep.drop(columns='remainder__%_supply_in_profit')
df_fg_prep = df_fg_prep[200:]
df_fg_prep.isnull().sum()

num_tr__open             0
num_tr__high             0
num_tr__low              0
num_tr__close            0
num_tr__volume           0
num_tr__FG_value         0
num_tr__btc_avg          0
num_tr__^IXIC_avg        0
num_tr__^GSPC_avg        0
num_tr__GC=F_avg         0
num_tr__^DJI_avg         0
num_tr__^TNX_avg         0
num_tr__DX-Y.NYB_avg     0
num_tr__EMA_10           0
num_tr__EMA_50           0
num_tr__EMA_200          0
num_tr__RSI_14           0
num_tr__ADX_14           0
num_tr__DMP_14           0
num_tr__DMN_14           0
num_tr__ATR_14           0
cat_tr__FG_val_clasif    0
remainder__target        0
dtype: int64

In [23]:
"""
Defining Subsampling fuction
"""

def subsample_sequence(df, length):
    
    last_possible = df.shape[0] - length - 1
    
    random_start = np.random.randint(0, last_possible)
    X = df[random_start: random_start+length].values
    y = df.iloc[random_start+length+1]['remainder__target']
   
    return X, y


In [24]:
"""
Defining fuction to create X and y using Subsampling function
"""

def get_X_y(df, length_of_observations):
    X, y = [], []
    
    for length in length_of_observations:
        xi, yi = subsample_sequence(df, length)
        X.append(xi)
        y.append(yi)
    
    return X, y

# length_of_observations = np.random.randint(10, 15, 10000)
# X, y = get_X_y(df_fg_prep, length_of_observations)

In [25]:
"""
Splitting data into train and test dataframes
"""

horizon = 1
gap = horizon - 1

len_ = int(0.8*df_fg_prep.shape[0])

df_train = df_fg_prep[:len_]
df_test = df_fg_prep[len_+gap:]

In [26]:
"""
Splitting data into train and test sequences
"""

length_of_observations = np.random.randint(120, 168, 20000)
X_train, y_train = get_X_y(df_train, length_of_observations)

length_of_observations = np.random.randint(120, 168, 2000)
X_test, y_test = get_X_y(df_test, length_of_observations)

In [27]:
"""
Padding train set sequences
"""

X_train_pad = pad_sequences(X_train, dtype='float32', value=-999)

X_train_pad.shape

(20000, 167, 23)

In [28]:
"""
Building RNN Model
"""

normalizer = Normalization()
normalizer.adapt(X_train_pad)
model = Sequential()
model.add(normalizer)
model.add(layers.Masking(mask_value=-999))
model.add(layers.LSTM(20, activation='tanh', return_sequences=True))
model.add(layers.LSTM(10, activation='tanh'))    
model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

2022-06-10 18:48:23.715110: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:925] could not open file to read NUMA node: /sys/bus/pci/devices/0000:02:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-06-10 18:48:23.715586: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-10 18:48:23.715792: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-06-10 18:48:23.715918: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-06-10 18:48:23.716056: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, None, 23)         47        
 n)                                                              
                                                                 
 masking (Masking)           (None, None, 23)          0         
                                                                 
 lstm (LSTM)                 (None, None, 20)          3520      
                                                                 
 lstm_1 (LSTM)               (None, 10)                1240      
                                                                 
 dense (Dense)               (None, 20)                220       
                                                                 
 dropout (Dropout)           (None, 20)                0         
                                                        

In [29]:
"""
Compiling model
"""
model.compile(loss='binary_crossentropy', optimizer=RMSprop(learning_rate=0.01), metrics="accuracy")

In [None]:
"""
Fitting model
"""

model.fit(X_train_pad, np.array(y_train), epochs=50, batch_size=128, validation_split=0.3)

Epoch 1/50


2022-06-10 18:48:27.433466: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 215096000 exceeds 10% of free system memory.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

bucket = 'cryptobot-889'
symbol = 'ETHUSDT'
version = datetime.now().strftime("%Y%m%d-%H%M%S")
latest_path = f'gs://{bucket}/trained_models/{symbol}/latest/model'
version_path = f'gs://{bucket}/trained_models/{symbol}/{version}/model'

save_model(model, version_path, overwrite=True, save_format='tf')
save_model(model, latest_path, overwrite=True, save_format='tf')
