Use all available tokens as targets
 

# Install Libraries and imports

In [1]:
!pip install fastai -Uqq

[K     |████████████████████████████████| 189 kB 5.6 MB/s 
[K     |████████████████████████████████| 55 kB 3.9 MB/s 
[?25h

In [2]:
from fastai.tabular.all import *
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import r2_score
from torch.utils.data import Dataset

# Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = Path('/content/drive/MyDrive/colab_notebooks/algovera/defi')

In [5]:
%cd /content/drive/MyDrive/colab_notebooks/algovera/defi

/content/drive/MyDrive/colab_notebooks/algovera/defi


# Preparing the data

In [6]:
# Load the raw rates file and add a datetime column derived from the unix-second `Timestamp`
df = pd.read_csv(path/'compound.csv')
df = df.assign(Date=pd.to_datetime(df["Timestamp"], unit='s', origin='unix'))

# Distinct tokens present in the file (shown as the cell output)
tokens = df["Token"].unique()
tokens

array(['DAI', 'USDC', 'USDT', 'ETH'], dtype=object)

In [7]:
# One row per (Timestamp, Token) pair
df = df.drop_duplicates(['Timestamp', 'Token'])
# Number of rows recorded for each timestamp, as a two-column frame ready to merge
counts = df['Timestamp'].value_counts().rename_axis('Timestamp').reset_index(name='Counts')
# Attach the per-timestamp row count - used to eliminate timestamps with data missing
df = df.merge(counts, on='Timestamp')

In [8]:
# Keep only timestamps that have one row for every token (4 in this file), then drop the helper column
df = df[df['Counts'] == len(tokens)].reset_index(drop=True).drop('Counts', axis=1)

In [9]:
# Reshape the long table into a wide one: one row per timestamp, columns prefixed by token
df1 = pd.DataFrame()
for tok in tokens:
    # Rows of this token only, without the columns that are constant or re-derived later
    df_tok = df[df['Token'] == tok].drop(['Token', 'Date'], axis=1)

    # `Timestamp` stays unprefixed because it is the join key
    col_names = [f'{col}' if col == 'Timestamp' else f'{tok}_{col}' for col in df_tok.columns]
    df_tok.columns = col_names

    # First token seeds the wide frame; later tokens are joined on the shared timestamp
    df1 = df_tok if df1.empty else pd.merge(df1, df_tok, on='Timestamp')

In [10]:
# Order chronologically, then add a datetime column derived from the unix-second `Timestamp`
df1 = df1.sort_values('Timestamp')
df1 = df1.assign(Date=pd.to_datetime(df1["Timestamp"], unit='s', origin='unix'))

In [11]:
df1.head()

Unnamed: 0,Timestamp,DAI_Borrowing Rate,DAI_Deposit Rate,DAI_Borrow Volume,DAI_Supply Volume,USDC_Borrowing Rate,USDC_Deposit Rate,USDC_Borrow Volume,USDC_Supply Volume,USDT_Borrowing Rate,USDT_Deposit Rate,USDT_Borrow Volume,USDT_Supply Volume,ETH_Borrowing Rate,ETH_Deposit Rate,ETH_Borrow Volume,ETH_Supply Volume,Date
0,1609471800,0.073195,0.050982,1069964000.0,61964810000.0,0.087046,0.066993,728543000.0,40630420000.0,0.099588,0.077548,64305360.0,3696225000.0,0.022952,0.000489,30553.654354,56632570.0,2021-01-01 03:30:00
1,1609473600,0.073101,0.050912,1069961000.0,61970500000.0,0.087053,0.066998,728546900.0,40630190000.0,0.09489,0.073569,64078770.0,3700299000.0,0.022952,0.000489,30553.703955,56633530.0,2021-01-01 04:00:00
2,1609475400,0.073061,0.050882,1069972000.0,61973540000.0,0.087058,0.067003,728552800.0,40630190000.0,0.085767,0.065933,63994180.0,3729213000.0,0.022951,0.000489,30553.830472,56645340.0,2021-01-01 04:30:00
3,1609477200,0.073436,0.051161,1070496000.0,61979660000.0,0.086921,0.066889,728571300.0,40636600000.0,0.072946,0.0555,63171620.0,3730028000.0,0.022952,0.000489,30553.78627,56638600.0,2021-01-01 05:00:00
4,1609479000,0.067829,0.047015,1070566000.0,62345800000.0,0.086312,0.066383,728575500.0,40661620000.0,0.057764,0.043239,62560950.0,3752927000.0,0.022952,0.000489,30553.841412,56634400.0,2021-01-01 05:30:00


# Multi-timestep input

## fastai tabular model

In [12]:
def get_target(
    row:pd.Series, # Row to infer current timestamp 
    target_column:str, # Name of target column
    target_window:int, # Number of timepoints into the future to make the prediction for
):
    'Value of `target_column` in the global `df1`, `target_window` steps after `row`; NaN if that timestamp is absent'
    # Consecutive timepoints are 1800 s (30 minutes) apart
    wanted = row['Timestamp'] + 1800.0*target_window
    matches = df1.loc[df1['Timestamp'] == wanted, target_column].values

    # A missing future timestamp is expected (end of data, filtered rows) and maps to NaN.
    # Anything else, e.g. an unknown `target_column`, is a caller error and is allowed to raise.
    return matches[0] if len(matches) else np.nan


def get_tabpandas_multi(
    df:pd.DataFrame, # Dataframe of the raw data (one row per timestamp, all `<TOKEN>_<metric>` columns present)
    token:str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    target_window:int, # Number of timepoints in the future to predict 
    n_timepoint:int, # Number of previous timepoints to be used as features   
    inference:bool=False, # Flag True for inference
):
    '''Flattens sliding windows of `n_timepoint` consecutive rows into one feature row each.

    Training mode (`inference=False`) appends the `{token}_Borrowing Rate` found `target_window`
    rows after the end of the window as `{token}_Target_{target_window}` and returns
    `(df, to, dls)`: the windowed dataframe, its `TabularPandas` and dataloaders (first 80% of
    rows for training, the rest for validation, in row order).
    Inference mode returns only the windowed dataframe (no target column); fewer than
    `n_timepoint` input rows give an empty dataframe.

    NOTE(review): windows are built from consecutive rows only; `Timestamp` is never read here,
    so gaps between rows are not detected.
    '''
    df = df.reset_index(drop=True)
    feature_cols = [f'{tok}_{metric}'
                    for tok in ('DAI', 'USDC', 'USDT', 'ETH')
                    for metric in ('Borrowing Rate', 'Deposit Rate', 'Borrow Volume', 'Supply Volume')]

    target_column = f'{token}_Borrowing Rate'
    target_column_name = f'{token}_Target_{target_window}'

    # Oldest timepoint first: `..._t-{n_timepoint-1}` down to `..._t-0` (the most recent row)
    cols_names = [f'{col}_t-{n_timepoint - j - 1}' for j in range(n_timepoint) for col in feature_cols]

    if inference:
        # Every full window of `n_timepoint` rows
        n_samples = len(df) - n_timepoint + 1
    else:
        # Same bound as the previous row-by-row loop. NOTE(review): this stops two windows
        # earlier than strictly necessary; kept so the training set is unchanged.
        n_samples = len(df) - target_window - n_timepoint - 1
    n_samples = max(n_samples, 0)

    # Row-major flattening of each window: all features of the oldest row, then the next, ...
    values = df[feature_cols].to_numpy()
    width = n_timepoint * len(feature_cols)
    windows = np.array([values[i:i + n_timepoint].ravel() for i in range(n_samples)]).reshape(n_samples, width)
    df_out = pd.DataFrame(windows, columns=cols_names)

    if not inference:
        # Target sits `target_window` rows after the last row of each window
        first_target = n_timepoint - 1 + target_window
        df_out[target_column_name] = df[target_column].to_numpy()[first_target:first_target + n_samples]

    df_out = df_out.dropna().reset_index(drop=True)

    if inference:
        return df_out

    #sanity check: the target of row i must equal the current rate `target_window` rows later
    assert 0 == sum(df_out[f'{token}_Borrowing Rate_t-0'].shift(-target_window).dropna() != df_out[f'{token}_Target_{target_window}'].iloc[:-target_window])

    #train_test_split: chronological, first 80% train / remainder valid
    train_index = int(len(df_out)*0.8)
    splits = (list(range(train_index)), list(range(train_index, len(df_out))))

    # Every column except the trailing target is a continuous feature
    cont_names = list(df_out.columns[:-1])

    procs = [Categorify, FillMissing, Normalize]
    y_block = RegressionBlock()

    to = TabularPandas(df_out, procs=procs, cont_names=cont_names, y_names=[target_column_name], y_block=y_block, splits=splits)
    dls = to.dataloaders(bs=128)

    return df_out, to, dls

In [13]:
def get_learner_train(
    df_p:pd.DataFrame, # Prepared df
    token:str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
):
    'Builds the windowed dataloaders, fits a fastai tabular learner and returns it; the best epoch is checkpointed'
    # Only the dataloaders are needed here; the windowed dataframe and TabularPandas are discarded
    _, _, dls = get_tabpandas_multi(df_p, token, target_window, n_timepoint)

    # Checkpoint name encodes the token and both window sizes
    checkpoint = SaveModelCallback(fname=f'multitimepoint_fastaitabmodel_{token}_{n_timepoint}_{target_window}')
    learn = tabular_learner(dls, [200,100], metrics=rmse, n_out=1, cbs=checkpoint)

    learn.fit_one_cycle(100, 0.03)
    return learn

def get_preds(
    learner, # fastai's learner 
):
    'Gets validation-set predictions from `learner`, prints the r2 score and returns `(targs, preds, r2)`'
    # Use the learner that was passed in, not whatever `learn` happens to be in the notebook namespace
    preds, targs = learner.get_preds(dl=learner.dls.valid)
    targs, preds = targs.flatten().numpy(), preds.flatten().numpy()
    # sklearn's signature is r2_score(y_true, y_pred); R² is not symmetric, so the order matters
    r2 = r2_score(targs, preds)
    print("fastai TabModel",":",r2)

    return targs, preds, r2

def plot_results(
    model_type:str, # Model type - fastai tabular model or LSTM model
    token:Str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
    targs, # Real y_values 
    preds, # Predicted y_values
    r2
):
    'Plots targets and predictions on one figure and saves it under `{path}/results/` (global `path`)'
    # Both curves share the same x axis: the sample position within `targs`
    positions = range(len(targs))
    plt.figure(figsize=(10,10))
    for series in (targs, preds):
        plt.plot(positions, series)

    heading = f'{model_type} {token} n_timepoint: {n_timepoint} target_window: {target_window} r2: {r2:.4f}'
    plt.title(heading)
    plt.legend(['Target', 'Prediction'])

    # File name carries the same run description as the title, minus the score
    destination = f'{path}/results/{model_type} {token} n_timepoint {n_timepoint} target_window {target_window}'
    plt.savefig(destination)


# Inference - fastai tabular model

Let's train a model with `SaveModelCallback` that saves the best model.

In [14]:
learn = get_learner_train(df1, 'DAI', 3, 5)

13766it [00:19, 718.47it/s]


epoch,train_loss,valid_loss,_rmse,time
0,0.010918,0.003039,0.055131,00:00
1,0.002201,0.002166,0.046542,00:00
2,0.000705,0.001673,0.0409,00:00
3,0.000647,0.001367,0.036968,00:00
4,0.000696,0.001112,0.033354,00:00
5,0.000504,0.000345,0.018564,00:00
6,0.000416,0.000336,0.01833,00:00
7,0.000328,0.0006,0.024501,00:00
8,0.000565,0.000468,0.021641,00:00
9,0.000614,0.000578,0.024032,00:00


Better model found at epoch 0 with valid_loss value: 0.0030394529458135366.
Better model found at epoch 1 with valid_loss value: 0.0021661301143467426.
Better model found at epoch 2 with valid_loss value: 0.0016727710608392954.
Better model found at epoch 3 with valid_loss value: 0.0013666019076481462.
Better model found at epoch 4 with valid_loss value: 0.0011124670272693038.
Better model found at epoch 5 with valid_loss value: 0.00034463833435438573.
Better model found at epoch 6 with valid_loss value: 0.00033599388552829623.
Better model found at epoch 10 with valid_loss value: 0.00021707243286073208.
Better model found at epoch 11 with valid_loss value: 0.0001760545274009928.
Better model found at epoch 12 with valid_loss value: 0.0001146372887887992.
Better model found at epoch 14 with valid_loss value: 4.929805436404422e-05.
Better model found at epoch 18 with valid_loss value: 1.0464445949764922e-05.
Better model found at epoch 19 with valid_loss value: 7.890777851571329e-06.
Be

After training, use the `export` method from fastai learner to save model plus the dataloader

In [15]:
learn.export('inference/testing.pkl')

To carry out inference, load the exported learner with `load_learner`

In [16]:
inference = load_learner('inference/testing.pkl')

In [17]:
inference.model

TabularModel(
  (embeds): ModuleList()
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=48, out_features=200, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=1, bias=True)
    )
  )
)

Assume this is the dataset we would want to carry out our inference on 

In [18]:
test = df1.iloc[-10:, ]

In [19]:
test

Unnamed: 0,Timestamp,DAI_Borrowing Rate,DAI_Deposit Rate,DAI_Borrow Volume,DAI_Supply Volume,USDC_Borrowing Rate,USDC_Deposit Rate,USDC_Borrow Volume,USDC_Supply Volume,USDT_Borrowing Rate,USDT_Deposit Rate,USDT_Borrow Volume,USDT_Supply Volume,ETH_Borrowing Rate,ETH_Deposit Rate,ETH_Borrow Volume,ETH_Supply Volume,Date
13756,1639485000,0.04665,0.031471,3669882000.0,210905400000.0,0.040323,0.025672,2588018000.0,166645800000.0,0.039179,0.024246,606286300.0,41621290000.0,0.027231,0.000765,53139.698365,74488130.0,2021-12-14 12:30:00
13757,1639486800,0.046646,0.031465,3669895000.0,210925300000.0,0.040317,0.025664,2587664000.0,166648200000.0,0.039127,0.024181,605485300.0,41621080000.0,0.027231,0.000765,53139.770109,74488170.0,2021-12-14 13:00:00
13758,1639488600,0.046644,0.031464,3669904000.0,210931200000.0,0.040301,0.025643,2587190000.0,166682600000.0,0.039243,0.024324,607248500.0,41620990000.0,0.027232,0.000765,53143.491452,74471490.0,2021-12-14 13:30:00
13759,1639490400,0.046637,0.031453,3669909000.0,210964900000.0,0.040298,0.025641,2587184000.0,166690800000.0,0.039241,0.024322,607250000.0,41622840000.0,0.027234,0.000765,53161.922065,74468130.0,2021-12-14 14:00:00
13760,1639492200,0.046636,0.031453,3669920000.0,210966700000.0,0.040307,0.025651,2587711000.0,166690800000.0,0.039234,0.024314,607252000.0,41629900000.0,0.027234,0.000765,53161.994342,74472440.0,2021-12-14 14:30:00
13761,1639494000,0.046639,0.031456,3669928000.0,210955900000.0,0.040312,0.025658,2588043000.0,166690800000.0,0.039243,0.024325,607252600.0,41620810000.0,0.027233,0.000765,53162.061654,74487590.0,2021-12-14 15:00:00
13762,1639497600,0.046638,0.031456,3669944000.0,210957500000.0,0.040311,0.025657,2587365000.0,166649500000.0,0.039302,0.024397,607536400.0,41578850000.0,0.027234,0.000765,53173.698145,74487810.0,2021-12-14 16:00:00
13763,1639501200,0.046638,0.031456,3669965000.0,210958000000.0,0.04032,0.025668,2585915000.0,166518700000.0,0.039309,0.024405,607639300.0,41578750000.0,0.02724,0.000767,53163.371786,74352580.0,2021-12-14 17:00:00
13764,1639504800,0.046639,0.031457,3669982000.0,210953500000.0,0.040532,0.025937,2586157000.0,165679500000.0,0.039307,0.024403,607609100.0,41578630000.0,0.027247,0.000768,53163.546809,74232260.0,2021-12-14 18:00:00
13765,1639506600,0.046691,0.031337,3506399000.0,202630200000.0,0.040563,0.025976,2585726000.0,165530700000.0,0.039305,0.0244,607577400.0,41578620000.0,0.027247,0.000768,53163.653493,74228780.0,2021-12-14 18:30:00


Build a `test_df` with `get_tabpandas_multi` with `inference` flag

In [20]:
test_df = get_tabpandas_multi(test, 'DAI', 5, 3, inference=True)

10it [00:00, 849.41it/s]


Pass the resulting `test_df` to `test_dl` that comes with `learner`

In [21]:
test_dl = inference.dls.test_dl(test_df)

In [22]:
test_dl.items

Unnamed: 0,DAI_Borrowing Rate_t-2,DAI_Deposit Rate_t-2,DAI_Borrow Volume_t-2,DAI_Supply Volume_t-2,USDC_Borrowing Rate_t-2,USDC_Deposit Rate_t-2,USDC_Borrow Volume_t-2,USDC_Supply Volume_t-2,USDT_Borrowing Rate_t-2,USDT_Deposit Rate_t-2,USDT_Borrow Volume_t-2,USDT_Supply Volume_t-2,ETH_Borrowing Rate_t-2,ETH_Deposit Rate_t-2,ETH_Borrow Volume_t-2,ETH_Supply Volume_t-2,DAI_Borrowing Rate_t-1,DAI_Deposit Rate_t-1,DAI_Borrow Volume_t-1,DAI_Supply Volume_t-1,USDC_Borrowing Rate_t-1,USDC_Deposit Rate_t-1,USDC_Borrow Volume_t-1,USDC_Supply Volume_t-1,USDT_Borrowing Rate_t-1,USDT_Deposit Rate_t-1,USDT_Borrow Volume_t-1,USDT_Supply Volume_t-1,ETH_Borrowing Rate_t-1,ETH_Deposit Rate_t-1,ETH_Borrow Volume_t-1,ETH_Supply Volume_t-1,DAI_Borrowing Rate_t-0,DAI_Deposit Rate_t-0,DAI_Borrow Volume_t-0,DAI_Supply Volume_t-0,USDC_Borrowing Rate_t-0,USDC_Deposit Rate_t-0,USDC_Borrow Volume_t-0,USDC_Supply Volume_t-0,USDT_Borrowing Rate_t-0,USDT_Deposit Rate_t-0,USDT_Borrow Volume_t-0,USDT_Supply Volume_t-0,ETH_Borrowing Rate_t-0,ETH_Deposit Rate_t-0,ETH_Borrow Volume_t-0,ETH_Supply Volume_t-0
0,-0.747125,-0.673791,1.697127,1.419722,-0.81124,-0.803626,0.385764,0.455823,-0.79077,-0.794242,1.664384,1.915274,-0.63356,-1.374001,-1.400982,0.141309,-0.747158,-0.67392,1.696852,1.419807,-0.811377,-0.803836,0.385034,0.455582,-0.791778,-0.795644,1.658436,1.9151,-0.633928,-1.374344,-1.401512,0.141106,-0.747093,-0.673889,1.696562,1.419624,-0.811777,-0.804423,0.384181,0.455855,-0.789429,-0.79243,1.67033,1.914935,-0.633837,-1.374215,-1.401917,0.139919
1,-0.74726,-0.67402,1.697143,1.420101,-0.811407,-0.803866,0.385399,0.455862,-0.791817,-0.795684,1.658811,1.915252,-0.633559,-1.374,-1.40098,0.141311,-0.747197,-0.673987,1.696862,1.41992,-0.811808,-0.804452,0.384546,0.456133,-0.789464,-0.792464,1.670704,1.91509,-0.633469,-1.37387,-1.40138,0.140124,-0.747325,-0.67428,1.696569,1.420264,-0.811835,-0.804506,0.384175,0.455985,-0.789464,-0.792478,1.67034,1.915131,-0.633224,-1.373582,-1.401267,0.139722
2,-0.7473,-0.674086,1.697153,1.420213,-0.811838,-0.804481,0.384911,0.456413,-0.789504,-0.792503,1.671079,1.915242,-0.6331,-1.373525,-1.400848,0.140329,-0.747429,-0.674378,1.696868,1.42056,-0.811866,-0.804535,0.384539,0.456263,-0.789499,-0.792511,1.670714,1.915286,-0.632856,-1.373237,-1.40073,0.139927,-0.747336,-0.674299,1.696582,1.420297,-0.811612,-0.804188,0.384718,0.455985,-0.789598,-0.792662,1.670355,1.915874,-0.633313,-1.373673,-1.401264,0.139975
3,-0.747532,-0.674478,1.697159,1.420854,-0.811896,-0.804564,0.384904,0.456543,-0.789539,-0.792551,1.671089,1.915438,-0.632488,-1.372892,-1.400199,0.140132,-0.74744,-0.674397,1.696882,1.420594,-0.811643,-0.804216,0.385083,0.456263,-0.789633,-0.792696,1.670728,1.916029,-0.632945,-1.373328,-1.400728,0.140181,-0.747262,-0.674173,1.696592,1.420094,-0.811472,-0.803988,0.38506,0.455985,-0.789423,-0.792421,1.670358,1.914917,-0.633629,-1.374,-1.401262,0.140867
4,-0.747543,-0.674497,1.697173,1.420887,-0.811673,-0.804246,0.385448,0.456543,-0.789673,-0.792735,1.671104,1.916182,-0.632576,-1.372984,-1.400196,0.140386,-0.747366,-0.674271,1.696892,1.42039,-0.811503,-0.804016,0.385425,0.456263,-0.789458,-0.792455,1.670732,1.915072,-0.633261,-1.373655,-1.400725,0.141072,-0.747271,-0.674188,1.696612,1.420123,-0.811492,-0.804016,0.384361,0.455325,-0.78825,-0.790804,1.672333,1.910491,-0.633291,-1.373651,-1.400851,0.14088
5,-0.747469,-0.674371,1.697183,1.420683,-0.811533,-0.804046,0.38579,0.456543,-0.789498,-0.792494,1.671108,1.915224,-0.632892,-1.37331,-1.400194,0.141277,-0.747375,-0.674286,1.696912,1.42042,-0.811523,-0.804044,0.384726,0.455603,-0.788285,-0.790838,1.672707,1.910646,-0.632923,-1.373306,-1.400315,0.141085,-0.747272,-0.67419,1.696637,1.420134,-0.811248,-0.803666,0.382867,0.453233,-0.788115,-0.790618,1.673049,1.91048,-0.630749,-1.371021,-1.401216,0.13292
6,-0.747477,-0.674386,1.697203,1.420713,-0.811553,-0.804074,0.385091,0.455883,-0.788325,-0.790877,1.673082,1.910799,-0.632555,-1.372962,-1.399783,0.14129,-0.747376,-0.674288,1.696936,1.42043,-0.811279,-0.803695,0.383232,0.45351,-0.78815,-0.790652,1.673423,1.910635,-0.630381,-1.370676,-1.400679,0.133125,-0.747239,-0.674135,1.696657,1.420047,-0.805571,-0.795527,0.383117,0.439804,-0.788154,-0.790672,1.672839,1.910467,-0.628211,-1.368395,-1.401209,0.125837
7,-0.747479,-0.674388,1.697227,1.420723,-0.811309,-0.803724,0.383597,0.45379,-0.78819,-0.790691,1.673798,1.910788,-0.630013,-1.370332,-1.400147,0.133329,-0.747343,-0.674233,1.696957,1.420344,-0.805602,-0.795556,0.383481,0.440082,-0.788189,-0.790706,1.673213,1.910623,-0.627844,-1.368051,-1.400673,0.126042,-0.745666,-0.678755,1.498338,1.261819,-0.804763,-0.794365,0.382672,0.437423,-0.788196,-0.790731,1.672619,1.910467,-0.628134,-1.368316,-1.401206,0.125633


Make the inference

In [23]:
inference.get_preds(dl=test_dl)[0]

tensor([[0.0470],
        [0.0469],
        [0.0469],
        [0.0469],
        [0.0469],
        [0.0469],
        [0.0469],
        [0.0468]])

## LSTM

In [24]:
class LSTMDataset(Dataset):
    'Map-style dataset that reshapes each flat row into `(n_timepoint, numfeatpertimepoint)` features plus a target'
    def __init__(self, 
        df, # `to` from fastai tabular 
        n_timepoint:int, # Number of previous timepoints to be used as features   
        numfeatpertimepoint:int, # Number of features per timepoint 
    ):
        self.n_sequence = n_timepoint
        self.numfeatures = numfeatpertimepoint
        # Re-index from 0 so `__getitem__` can look rows up by position
        self.df = df.items.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, 
        index
    ):
        row = self.df.loc[index,:]
        # Consecutive chunks of `numfeatures` values, one chunk per timepoint
        steps = []
        for step in range(self.n_sequence):
            start = step*self.numfeatures
            stop = start + self.numfeatures
            steps.append(tensor(list(row[start:stop].values)))
        # The last column of the row is the target
        return (torch.stack(steps), tensor(row.iloc[-1]))

class LSTMTestDataset(Dataset):
    'Target-less counterpart of the training dataset: yields `(features,)` shaped `(n_timepoint, numfeatpertimepoint)`'
    def __init__(self, 
        df, # `to` from fastai tabular 
        n_timepoint:int, # Number of previous timepoints to be used as features   
        numfeatpertimepoint:int, # Number of features per timepoint 
    ):
        self.n_sequence = n_timepoint
        self.numfeatures = numfeatpertimepoint
        # Re-index from 0 so `__getitem__` can look rows up by position
        self.df = df.items.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.loc[index,:]
        # Consecutive chunks of `numfeatures` values, one chunk per timepoint
        steps = []
        for step in range(self.n_sequence):
            start = step*self.numfeatures
            stop = start + self.numfeatures
            steps.append(tensor(list(row[start:stop].values)))
        # One-element tuple so batches have the same structure as `(features, target)` minus the target
        return (torch.stack(steps),)


class LSTMModel(nn.Module):
    'Stacked LSTM whose output at the last timestep is passed through a linear head'
    def __init__(self, 
        input_size:int, # Number of features per timepoint
        hidden_size:int, # Hidden size to be used 
        num_layers:int=2, # Number of LSTM layers to use
        num_classes:int=1, # Output size
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs are (batch, sequence, features)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)


    def forward(self, x):
        # No explicit (h0, c0): nn.LSTM then starts from zero states created on x's own
        # device and dtype, so the model also works on GPU and with non-float32 weights
        out, _ = self.lstm(x)
        # Keep only the last timestep of the top layer's output
        out = out[:, -1, :]
        return self.fc(out)

In [25]:
def get_lstm_learner_train(
    df_p:pd.DataFrame, # Prepared df
    token:str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
):
    'Trains an `LSTMModel` on the windowed data and returns `(learn, dls_pre)`, `dls_pre` being the tabular dataloaders'
    # 16 features per timepoint (4 tokens x 4 metrics) and a hidden size of 128
    n_features, hidden = 16, 128

    # The tabular pipeline does the windowing and preprocessing; its splits feed the LSTM datasets
    _, to, dls_pre = get_tabpandas_multi(df_p, token, target_window, n_timepoint)
    train_dset, valid_dset = (LSTMDataset(part, n_timepoint, n_features) for part in (to.train, to.valid))
    dls = DataLoaders.from_dsets(train_dset, valid_dset, bs=128)

    learn = Learner(dls,
                    LSTMModel(n_features, hidden),
                    loss_func=MSELossFlat(),
                    metrics=rmse,
                    cbs=SaveModelCallback(fname=f'{token}_{n_timepoint}_{target_window}'))

    learn.fit_one_cycle(10, 0.03)

    return learn, dls_pre

For the LSTM, inference is slightly different

In [26]:
learn, dls_pre = get_lstm_learner_train(df1, 'DAI', 3, 5)

13766it [00:18, 727.24it/s]


epoch,train_loss,valid_loss,_rmse,time
0,8.4e-05,4e-06,0.002032,00:06
1,5.7e-05,2e-05,0.004517,00:06
2,7.1e-05,1e-05,0.003144,00:06
3,4.7e-05,4e-06,0.002013,00:06
4,4.6e-05,4e-06,0.001993,00:06
5,3.5e-05,8e-06,0.002812,00:06
6,3.2e-05,4e-06,0.002054,00:06
7,3.7e-05,6e-06,0.002476,00:06
8,3.3e-05,4e-06,0.001968,00:06
9,3.2e-05,4e-06,0.001896,00:06


Better model found at epoch 0 with valid_loss value: 4.12702092944528e-06.
Better model found at epoch 3 with valid_loss value: 4.053277734783478e-06.
Better model found at epoch 4 with valid_loss value: 3.972825652454048e-06.
Better model found at epoch 8 with valid_loss value: 3.874508365697693e-06.
Better model found at epoch 9 with valid_loss value: 3.5957139061792986e-06.


Save the model

In [27]:
learn.save('lstm_testing')

Path('models/lstm_testing.pth')

Save the fastai `dls` that we used for preprocessing, so the same transforms can be applied to new data at inference time

In [28]:
save_pickle('inference/lstm_pre_dls_testing.pkl', dls_pre)

Instantiate the LSTM model and load the state dict

In [29]:
# Same architecture as in training (16 features per timepoint, hidden size 128)
inf_model = LSTMModel(16, 128)
# `learn.save` wrote a dict whose 'model' entry is the state dict; map_location='cpu' lets the
# checkpoint load on a CPU-only machine even if it was saved from a GPU run
inf_model.load_state_dict(torch.load('models/lstm_testing.pth', map_location='cpu')['model'])

<All keys matched successfully>

Generate `test_df`

In [30]:
test_df = get_tabpandas_multi(test, 'DAI', 5, 3, inference=True)

10it [00:00, 1059.57it/s]


Load `pre_dls` that we saved  

In [31]:
inf_dls = load_pickle('inference/lstm_pre_dls_testing.pkl')

Use the loaded `dls` (`inf_dls`) to build `test_dl`, then wrap it in `LSTMTestDataset`

In [32]:
test_dl = inf_dls.test_dl(test_df)
valid_dset = LSTMTestDataset(test_dl, 3, 16)

Put the model in eval mode

In [33]:
inf_model.eval()

LSTMModel(
  (lstm): LSTM(16, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

Build dataloader and make prediction

In [34]:
from torch.utils.data import DataLoader
dl = DataLoader(valid_dset, batch_size=8)

In [35]:
# Inference only: disable autograd so no graph is built and the outputs carry no grad_fn
with torch.no_grad():
    for x in dl:
        # Each batch is a one-element sequence holding the features tensor
        print(inf_model(x[0]))

tensor([[0.0463],
        [0.0463],
        [0.0463],
        [0.0463],
        [0.0463],
        [0.0463],
        [0.0463],
        [0.0463]], grad_fn=<AddmmBackward0>)
