Use all available tokens as targets
 

# Install Libraries and imports

In [1]:
!pip install fastai -Uqq

[K     |████████████████████████████████| 189 kB 27.8 MB/s 
[K     |████████████████████████████████| 56 kB 5.5 MB/s 
[?25h

In [6]:
from fastai.tabular.all import *
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import r2_score
from torch.utils.data import Dataset

# Google Drive

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
path = Path('/content/drive/MyDrive/colab_notebooks/algovera/defi')

# Preparing the data

In [9]:
df = pd.read_csv(path/'compound.csv')
df["Date"] = pd.to_datetime(df["Timestamp"], unit='s', origin='unix')

tokens = df["Token"].unique()
tokens

array(['DAI', 'USDC', 'USDT', 'ETH'], dtype=object)

In [10]:
df = df.drop_duplicates(['Timestamp', 'Token'])
counts = pd.DataFrame(df['Timestamp'].value_counts()).reset_index()
counts.columns = ['Timestamp', 'Counts']
df = df.merge(counts, on='Timestamp')

In [11]:
df = df[df['Counts'] == 4].reset_index(drop=True).drop('Counts', axis=1)

In [12]:
df1 = pd.DataFrame()
for tok in tokens:
    df_tok = df[df['Token']==tok]
    df_tok = df_tok.drop(['Token', 'Date'], axis=1)

    col_names = []
    for col in df_tok.columns:
        if col == 'Timestamp':
            col_names.append(f'{col}')
        else:
            col_names.append(f'{tok}_{col}')
        
    df_tok.columns = col_names
    #df_tok = df_tok.set_index('Timestamp', drop=True)
    
    if df1.empty:
        df1 = df_tok
    else:
        df1 = pd.merge(df1, df_tok, on='Timestamp')

In [13]:
df1.sort_values('Timestamp', inplace=True)
df1["Date"] = pd.to_datetime(df1["Timestamp"], unit='s', origin='unix')

In [14]:
df1.head()

Unnamed: 0,Timestamp,DAI_Borrowing Rate,DAI_Deposit Rate,DAI_Borrow Volume,DAI_Supply Volume,USDC_Borrowing Rate,USDC_Deposit Rate,USDC_Borrow Volume,USDC_Supply Volume,USDT_Borrowing Rate,USDT_Deposit Rate,USDT_Borrow Volume,USDT_Supply Volume,ETH_Borrowing Rate,ETH_Deposit Rate,ETH_Borrow Volume,ETH_Supply Volume,Date
0,1609471800,0.073195,0.050982,1069964000.0,61964810000.0,0.087046,0.066993,728543000.0,40630420000.0,0.099588,0.077548,64305360.0,3696225000.0,0.022952,0.000489,30553.654354,56632570.0,2021-01-01 03:30:00
1,1609473600,0.073101,0.050912,1069961000.0,61970500000.0,0.087053,0.066998,728546900.0,40630190000.0,0.09489,0.073569,64078770.0,3700299000.0,0.022952,0.000489,30553.703955,56633530.0,2021-01-01 04:00:00
2,1609475400,0.073061,0.050882,1069972000.0,61973540000.0,0.087058,0.067003,728552800.0,40630190000.0,0.085767,0.065933,63994180.0,3729213000.0,0.022951,0.000489,30553.830472,56645340.0,2021-01-01 04:30:00
3,1609477200,0.073436,0.051161,1070496000.0,61979660000.0,0.086921,0.066889,728571300.0,40636600000.0,0.072946,0.0555,63171620.0,3730028000.0,0.022952,0.000489,30553.78627,56638600.0,2021-01-01 05:00:00
4,1609479000,0.067829,0.047015,1070566000.0,62345800000.0,0.086312,0.066383,728575500.0,40661620000.0,0.057764,0.043239,62560950.0,3752927000.0,0.022952,0.000489,30553.841412,56634400.0,2021-01-01 05:30:00


# Single-timestep input

In [None]:
def get_target(row, target_column, target_window):

    try:
        target = df1[df1['Timestamp'] == row['Timestamp'] + 1800.0*target_window][target_column].values[0]
    except:
        target = np.NaN
    
    return target


def get_tabpandas_singletimestep(df, tokens, target_window):

    y_names = []
    for tok in tokens:
        target = f'{tok}_Target'
        y_names.append(target)
        target_column = f'{tok}_Borrowing Rate'
        df[target] =  df.apply(lambda x: get_target(x, target_column, target_window), axis=1)

    df = df.dropna()
    df = df.drop(['Timestamp', 'Date'], axis=1)
    
    df['Train'] = None
    train_index = int(len(df)*0.8)
    df.loc[:train_index, 'Train'] = True
    df.loc[train_index:, 'Train'] = False
    
    df = df.reset_index(drop=True)
    splits = (list(df[df['Train']==True].index), list(df[df['Train']==False].index))
    
    df = df.drop(['Train'], axis=1)

    cont_names = list(df.columns[:len(tokens)])

    procs = [Categorify, FillMissing, Normalize]
    y_block = RegressionBlock()

    to = TabularPandas(df, procs=procs, cont_names=cont_names,
                       y_names=y_names, y_block=y_block, splits=splits)
    dls = to.dataloaders(bs=128)

    return to, dls

## fastai tabular model

### target_window = 3

In [None]:
target_window = 3
to, dls = get_tabpandas_singletimestep(df1, tokens, target_window)

In [None]:
len(to.train), len(to.valid) 

(8965, 2953)

In [None]:
to.cont_names

(#4) ['DAI_Borrowing Rate','DAI_Deposit Rate','DAI_Borrow Volume','DAI_Supply Volume']

In [None]:
sm = SaveModelCallback(fname=f'scope1_fastaitabmodel_{target_window}')

In [None]:
learn = tabular_learner(dls, 
                        [200,100], 
                        n_out=4, 
                        metrics=rmse,
                        cbs=sm)

In [None]:
learn.loss_func

FlattenedLoss of MSELoss()

In [None]:
learn.model

TabularModel(
  (embeds): ModuleList()
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=4, out_features=200, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=4, bias=True)
    )
  )
)

In [None]:
learn.fit_one_cycle(100, 0.03)

epoch,train_loss,valid_loss,_rmse,time
0,0.008734,0.001401,0.037429,00:00
1,0.002483,0.001087,0.032963,00:00
2,0.001201,0.000786,0.028043,00:00
3,0.000939,0.001132,0.03365,00:00
4,0.000917,0.000738,0.027159,00:00
5,0.000967,0.000596,0.024409,00:00
6,0.001009,0.002374,0.048725,00:00
7,0.001143,0.001454,0.038131,00:00
8,0.001217,0.001592,0.039896,00:00
9,0.001276,0.008222,0.090674,00:00


Better model found at epoch 0 with valid_loss value: 0.001400893903337419.
Better model found at epoch 1 with valid_loss value: 0.001086575211957097.
Better model found at epoch 2 with valid_loss value: 0.0007864049402996898.
Better model found at epoch 4 with valid_loss value: 0.0007375930435955524.
Better model found at epoch 5 with valid_loss value: 0.0005958020337857306.
Better model found at epoch 18 with valid_loss value: 0.0003962700429838151.
Better model found at epoch 22 with valid_loss value: 0.0003815468808170408.
Better model found at epoch 27 with valid_loss value: 0.00037628994323313236.
Better model found at epoch 28 with valid_loss value: 0.0003643343225121498.
Better model found at epoch 31 with valid_loss value: 0.0003458160499576479.
Better model found at epoch 65 with valid_loss value: 0.0003456440463196486.


In [None]:
preds, targs = learn.get_preds(dl=dls.valid)

In [None]:
print("fastai TabModel",":",r2_score(preds.flatten().numpy(), targs.flatten().numpy()))

fastai TabModel : -0.5966149482781236


### target_window = 10

In [None]:
target_window = 10
to, dls = get_tabpandas_singletimestep(df1, tokens, target_window)

In [None]:
sm = SaveModelCallback(fname=f'scope1_fastaitabmodel_{target_window}')

In [None]:
learn = tabular_learner(dls, 
                        [200,100], 
                        n_out=4, 
                        metrics=rmse,
                        cbs=sm)

In [None]:
learn.fit_one_cycle(100, 0.03)

epoch,train_loss,valid_loss,_rmse,time
0,0.013502,0.011085,0.105283,00:00
1,0.004054,0.008451,0.091928,00:00
2,0.001849,0.003599,0.059989,00:00
3,0.001155,0.005447,0.073804,00:00
4,0.000931,0.002516,0.050163,00:00
5,0.000826,0.002873,0.053604,00:00
6,0.00084,0.001557,0.039463,00:00
7,0.000748,0.000922,0.030364,00:00
8,0.000727,0.001576,0.039704,00:00
9,0.000773,0.000728,0.026982,00:00


Better model found at epoch 0 with valid_loss value: 0.011084610596299171.
Better model found at epoch 1 with valid_loss value: 0.00845071580260992.
Better model found at epoch 2 with valid_loss value: 0.0035986495204269886.
Better model found at epoch 4 with valid_loss value: 0.0025163106620311737.
Better model found at epoch 6 with valid_loss value: 0.001557298586703837.
Better model found at epoch 7 with valid_loss value: 0.0009219924104399979.
Better model found at epoch 9 with valid_loss value: 0.0007280073477886617.
Better model found at epoch 11 with valid_loss value: 0.000582766835577786.
Better model found at epoch 12 with valid_loss value: 0.00040296459337696433.
Better model found at epoch 16 with valid_loss value: 0.00017723618657328188.
Better model found at epoch 18 with valid_loss value: 0.0001515439071226865.
Better model found at epoch 20 with valid_loss value: 0.00013342064630705863.
Better model found at epoch 38 with valid_loss value: 0.00013324922474566847.
Better 

In [None]:
preds, targs = learn.get_preds(dl=dls.valid)

In [None]:
print("fastai TabModel",":",r2_score(preds.flatten().numpy(), targs.flatten().numpy()))

fastai TabModel : 0.6545063822638751


### target_window = 20

In [None]:
target_window = 20
to, dls = get_tabpandas_singletimestep(df1, tokens, target_window)

In [None]:
sm = SaveModelCallback(fname=f'scope1_fastaitabmodel_{target_window}')

In [None]:
learn = tabular_learner(dls, 
                        [200,100], 
                        n_out=4, 
                        metrics=rmse,
                        cbs=sm)

In [None]:
learn.fit_one_cycle(100, 0.03)

epoch,train_loss,valid_loss,_rmse,time
0,0.010546,0.008105,0.09003,00:00
1,0.003075,0.003942,0.062784,00:00
2,0.001341,0.0036,0.059997,00:00
3,0.000908,0.004304,0.065608,00:00
4,0.000829,0.002753,0.052466,00:00
5,0.000693,0.002801,0.052925,00:00
6,0.000769,0.002628,0.051262,00:00
7,0.000742,0.001156,0.034,00:00
8,0.000685,0.000853,0.029201,00:00
9,0.000939,0.002086,0.045677,00:00


Better model found at epoch 0 with valid_loss value: 0.008105489425361156.
Better model found at epoch 1 with valid_loss value: 0.003941881004720926.
Better model found at epoch 2 with valid_loss value: 0.003599595744162798.
Better model found at epoch 4 with valid_loss value: 0.0027527296915650368.
Better model found at epoch 6 with valid_loss value: 0.0026277813594788313.
Better model found at epoch 7 with valid_loss value: 0.001156000653281808.
Better model found at epoch 8 with valid_loss value: 0.0008527027675881982.
Better model found at epoch 10 with valid_loss value: 0.0007592022302560508.
Better model found at epoch 11 with valid_loss value: 0.0005842227838002145.
Better model found at epoch 16 with valid_loss value: 0.0004373863048385829.
Better model found at epoch 17 with valid_loss value: 0.00039892943459562957.
Better model found at epoch 18 with valid_loss value: 0.00034226180287078023.
Better model found at epoch 19 with valid_loss value: 0.00032955134520307183.
Better 

In [None]:
preds, targs = learn.get_preds(dl=dls.valid)

In [None]:
print("fastai TabModel",":",r2_score(preds.flatten().numpy(), targs.flatten().numpy()))

fastai TabModel : 0.3160684393490881


In [None]:
analysis = pd.DataFrame([[3,10,20], [0.8545908591578281, 0.5323255441613118, 0.43878372442256885]]).T

In [None]:
analysis.columns = ['target_window', 'r2']

In [None]:
analysis

Unnamed: 0,target_window,r2
0,3.0,0.854591
1,10.0,0.532326
2,20.0,0.438784


# Multi-timestep input

## fastai tabular model

In [15]:
def get_tabpandas_multi(df, token, target_window, n_timepoint_inp):

    df = df.reset_index(drop=True)
    feature_cols = ['DAI_Borrowing Rate', 'DAI_Deposit Rate', 'DAI_Borrow Volume', 'DAI_Supply Volume', 
                    'USDC_Borrowing Rate', 'USDC_Deposit Rate', 'USDC_Borrow Volume', 'USDC_Supply Volume', 
                    'USDT_Borrowing Rate', 'USDT_Deposit Rate', 'USDT_Borrow Volume', 'USDT_Supply Volume',
                    'ETH_Borrowing Rate', 'ETH_Deposit Rate', 'ETH_Borrow Volume', 'ETH_Supply Volume']

    #target_columns = ['DAI_Borrowing Rate', 'USDC_Borrowing Rate', 'USDT_Borrowing Rate', 'ETH_Borrowing Rate']
    target_column = [f'{token}_Borrowing Rate']
    target_column_name = [f'{token}_Target_{target_window}']
    
    cols_names = []
    for j in range(n_timepoint_inp):
        for col in feature_cols:
            cols_names.append(f'{col}_t-{n_timepoint_inp -j-1}')
    cols_names += target_column_name
    #cols_names += [f'DAI_Target{target_window}', f'USDC_Target{target_window}', f'USDT_Target{target_window}', f'ETH_Target{target_window}']


    pairs = []
    for i, row in tqdm(df.iterrows()):
        if i < (len(df)-target_window-n_timepoint_inp-1):
            features = df.loc[i:i+n_timepoint_inp-1, feature_cols].values
            features = [item for sublist in features for item in sublist]
            targ = list(df.loc[i+n_timepoint_inp-1+target_window, target_column].values)
            features += targ
            pairs.append(features)

    df = pd.DataFrame(pairs, columns=cols_names)
    df = df.dropna()
    df = df.reset_index(drop=True)

    #train_test_split
    df['Train'] = None
    train_index = int(len(df)*0.8)
    df.loc[:train_index, 'Train'] = True
    df.loc[train_index:, 'Train'] = False

    splits = (list(df[df['Train']==True].index), list(df[df['Train']==False].index))

    df = df.drop(['Train'], axis=1)

    cont_names = list(df.columns[:-1])

    procs = [Categorify, FillMissing, Normalize]
    y_block = RegressionBlock()

    to = TabularPandas(df, procs=procs, cont_names=cont_names, y_names=target_column_name, y_block=y_block, splits=splits)
    dls = to.dataloaders(bs=128)

    return df, to, dls

In [16]:
def get_learner_train(token, n_timepoint, target_window):
    
    df, to, dls = get_tabpandas_multi(df1, token, target_window, n_timepoint)
    sm = SaveModelCallback(fname=f'multitimepoint_fastaitabmodel_{token}_{n_timepoint}_{target_window}')

    learn = tabular_learner(dls, 
                        [200,100], 
                        metrics=rmse,
                        n_out=1,
                        cbs=sm)
    
    learn.fit_one_cycle(100, 0.03)

    return learn

def get_preds(learner):
    preds, targs = learn.get_preds(dl=learn.dls.valid)
    targs, preds = targs.flatten().numpy(), preds.flatten().numpy()
    r2 = r2_score(preds, targs)
    print("fastai TabModel",":",r2_score(preds, targs))

    return targs, preds, r2

def plot_results(model_type, token, n_timepoint, target_window, targs, preds, r2):
    plt.figure(figsize=(10,10))
    plt.plot(range(len(targs)), targs)
    plt.plot(range(len(targs)), preds)
    plt.title(f'{model_type} {token} n_timepoint: {n_timepoint} target_window: {target_window} r2: {r2:.4f}')
    plt.legend(['Target', 'Prediction'])
    plt.savefig(f'{path}/results/{model_type} {token} n_timepoint {n_timepoint} target_window {target_window}')


In [17]:
#for tok in ['USDC', 'USDT', 'ETH']:
#    for ntp in [1, 3, 10, 20]:
#        for tw in [3, 5, 10]:
#            learn = get_learner_train(tok, ntp, tw)
#            targs, preds, r2 = get_preds(learn)
#            plot_results(tok, ntp, tw, targs, preds, r2)

## LSTM

In [18]:
class LSTMDataset(Dataset):
    def __init__(self, df, 
                 n_timepoint_input, 
                 numfeatpertimepoint):
        self.df = df.items.reset_index(drop=True)
        self.n_sequence = n_timepoint_input
        self.numfeatures = numfeatpertimepoint

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.loc[index,:]
        target = tensor(row.iloc[-4:])
        features = [tensor(list(row[i*self.numfeatures:(i*self.numfeatures+self.numfeatures)].values)) for i in range(self.n_sequence)]
        features = torch.stack(features)
        return (features, target)


class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, num_classes=len(tokens)):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)


    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()

        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = out[:, -1, :]
        out = self.fc(out)

        return out

In [19]:
def get_lstm_learner_train(token, n_timepoint, target_window):
    
    df, to, dls = get_tabpandas_multi(df1, token, target_window, n_timepoint)
    train_dset = LSTMDataset(to.train, n_timepoint, 16)
    valid_dset = LSTMDataset(to.valid, n_timepoint, 16)
    dls = DataLoaders.from_dsets(train_dset, valid_dset, bs=128)

    sm = SaveModelCallback(fname=f'{lstm}_{token}_{n_timepoint}_{target_window}')

    model = LSTMModel(16, 128)
    learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=rmse, cbs=sm)
    
    learn.fit_one_cycle(50, 0.03)

    return learn

In [23]:
df, to, dls = get_tabpandas_multi(df1, 'DAI', 3, 1)
train_dset = LSTMDataset(to.train, 1, 16)
valid_dset = LSTMDataset(to.valid, 1, 16)
dls = DataLoaders.from_dsets(train_dset, valid_dset, bs=128)

13766it [00:20, 669.20it/s]


In [24]:
x, y = dls.one_batch()

In [25]:
x.shape

tensor([[-6.9214e-01,  1.6767e+00,  3.1783e+00,  9.4763e-02],
        [ 3.8766e-01,  5.9651e-01,  3.6119e-01,  4.2233e-02],
        [ 3.3026e-03,  3.1150e-01,  4.9108e-01,  7.5920e-02],
        [ 1.6776e+00,  1.1418e+00, -2.1510e-01,  4.1742e-02],
        [-9.1767e-01,  1.1003e+00,  2.9655e+00,  6.7749e-02],
        [-6.9242e-01,  1.6763e+00,  3.1786e+00,  9.4750e-02],
        [-8.5198e-01, -8.0265e-01,  1.7499e-01,  6.0674e-02],
        [ 1.3112e+00,  5.1373e-01, -5.5314e-01,  3.9272e-02],
        [-2.3400e-01, -8.2750e-01, -7.7134e-01,  4.1840e-02],
        [-5.0799e-01, -4.1404e-01, -2.4036e-01,  1.5830e-01],
        [ 1.2602e-01, -5.9195e-01, -8.2377e-01,  4.0702e-02],
        [-3.1725e-02,  7.3857e-01,  1.1084e+00,  7.0586e-02],
        [ 1.8073e+00,  1.3926e+00, -7.8451e-02,  4.2111e-02],
        [ 5.3173e-01,  1.0190e+00,  2.0672e-01,  9.7765e-02],
        [ 1.5156e+00,  1.1286e+00, -1.1249e-01,  4.1938e-02],
        [ 1.5281e+00,  1.4294e+00,  1.5986e-01,  4.1900e-02],
        

In [None]:
sm = SaveModelCallback(fname=f'{lstm}_{token}_{n_timepoint}_{target_window}')

model = LSTMModel(16, 128)
learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=rmse, cbs=sm)

learn.fit_one_cycle(50, 0.03)

In [20]:
model_type = 'LSTM'
for tok in ['DAI', 'USDC', 'USDT', 'ETH']:
    for ntp in [1, 3, 10, 20]:
        for tw in [3, 5, 10]:
            learn = get_lstm_learner_train(tok, ntp, tw)
            targs, preds, r2 = get_preds(learn)
            plot_results(model_type, tok, ntp, tw, targs, preds, r2)

AttributeError: ignored