# Install Libraries and imports

In [1]:
!pip install fastai -Uqq

[K     |████████████████████████████████| 189 kB 8.4 MB/s 
[K     |████████████████████████████████| 55 kB 4.2 MB/s 
[?25h

In [2]:
from fastai.tabular.all import *
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import r2_score
from torch.utils.data import Dataset

# Google Drive

In [3]:
# Mount Google Drive so that its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Project folder on the mounted drive: the input CSV is read from here and the
# `models/` and `results/` outputs are written beneath it.
path = Path('/content/drive/MyDrive/colab_notebooks/algovera/defi')

# Preparing the data

In [5]:
# Load the raw long-format table and add a datetime column derived from the
# unix-seconds `Timestamp` column.
df = pd.read_csv(path/'compound.csv').assign(
    Date=lambda frame: pd.to_datetime(frame["Timestamp"], unit='s', origin='unix')
)

# Distinct token symbols present in the data; displayed as the cell output.
tokens = df["Token"].unique()
tokens

array(['DAI', 'USDC', 'USDT', 'ETH'], dtype=object)

In [6]:
# Keep a single row per (Timestamp, Token) pair, then annotate every row with
# the number of rows that share its timestamp.
df = df.drop_duplicates(subset=['Timestamp', 'Token'])
counts = (
    df['Timestamp']
    .value_counts()
    .rename_axis('Timestamp')
    .reset_index(name='Counts')
)
df = df.merge(counts, on='Timestamp')

In [7]:
# Keep only the timestamps at which every token has a row (after de-duplication
# each token contributes at most one row per timestamp), then drop the helper
# column. Comparing against `len(tokens)` instead of a literal 4 keeps the
# filter correct if the set of tokens in the CSV changes.
df = df[df['Counts'] == len(tokens)].reset_index(drop=True).drop('Counts', axis=1)

In [8]:
# Reshape the long table (one row per timestamp and token) into a wide table
# (one row per timestamp): every per-token column is prefixed with its token
# symbol and the per-token frames are joined on `Timestamp`.
df1 = pd.DataFrame()
for tok in tokens:
    df_tok = (
        df[df['Token'] == tok]
        .drop(['Token', 'Date'], axis=1)
        .rename(columns=lambda name: name if name == 'Timestamp' else f'{tok}_{name}')
    )

    # While the accumulated frame is still empty the current token's frame
    # replaces it; afterwards frames are merged on the shared timestamp.
    df1 = df_tok if df1.empty else pd.merge(df1, df_tok, on='Timestamp')

In [9]:
# Put the wide table in chronological order and add a datetime column derived
# from the unix-seconds timestamp.
df1 = df1.sort_values('Timestamp').assign(
    Date=lambda frame: pd.to_datetime(frame["Timestamp"], unit='s', origin='unix')
)

In [10]:
# Preview the first rows of the wide, one-row-per-timestamp table.
df1.head()

Unnamed: 0,Timestamp,DAI_Borrowing Rate,DAI_Deposit Rate,DAI_Borrow Volume,DAI_Supply Volume,USDC_Borrowing Rate,USDC_Deposit Rate,USDC_Borrow Volume,USDC_Supply Volume,USDT_Borrowing Rate,USDT_Deposit Rate,USDT_Borrow Volume,USDT_Supply Volume,ETH_Borrowing Rate,ETH_Deposit Rate,ETH_Borrow Volume,ETH_Supply Volume,Date
0,1609471800,0.073195,0.050982,1069964000.0,61964810000.0,0.087046,0.066993,728543000.0,40630420000.0,0.099588,0.077548,64305360.0,3696225000.0,0.022952,0.000489,30553.654354,56632570.0,2021-01-01 03:30:00
1,1609473600,0.073101,0.050912,1069961000.0,61970500000.0,0.087053,0.066998,728546900.0,40630190000.0,0.09489,0.073569,64078770.0,3700299000.0,0.022952,0.000489,30553.703955,56633530.0,2021-01-01 04:00:00
2,1609475400,0.073061,0.050882,1069972000.0,61973540000.0,0.087058,0.067003,728552800.0,40630190000.0,0.085767,0.065933,63994180.0,3729213000.0,0.022951,0.000489,30553.830472,56645340.0,2021-01-01 04:30:00
3,1609477200,0.073436,0.051161,1070496000.0,61979660000.0,0.086921,0.066889,728571300.0,40636600000.0,0.072946,0.0555,63171620.0,3730028000.0,0.022952,0.000489,30553.78627,56638600.0,2021-01-01 05:00:00
4,1609479000,0.067829,0.047015,1070566000.0,62345800000.0,0.086312,0.066383,728575500.0,40661620000.0,0.057764,0.043239,62560950.0,3752927000.0,0.022952,0.000489,30553.841412,56634400.0,2021-01-01 05:30:00


# Utility Functions

In [25]:
# Target classes (v is the relative change of the rate):
# 0 - v stays within [-threshold, +threshold]
# 1 - v drops below -threshold
# 2 - v rises above +threshold

def set_target(
    v, 
    threshold
):
    """Turn a change `v` into a class label using a symmetric `threshold`.

    Returns 2 when `v > threshold`, 0 when `-threshold <= v <= threshold` and
    1 when `v < -threshold`. When none of the comparisons holds (for example
    when `v` is NaN) the result is `None`.
    """
    if v > threshold:
        return 2
    if -threshold <= v <= threshold:
        return 0
    if v < -threshold:
        return 1
    # No comparison matched (e.g. NaN): callers receive None.
    return None


def get_tabpandas_multi(
    df:pd.DataFrame, # Dataframe of the raw data 
    token:Str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    threshold:float, # Threshold for target 
    target_window:int, # Number of timepoints in the future to predict 
    n_timepoint:int, # Number of previous timepoints to be used as features   
    inference:bool=False, # Flag True for inference
):
    """Build sliding-window samples from `df` and, unless `inference`, wrap them for fastai.

    Every sample holds the 16 feature columns of `n_timepoint` consecutive rows
    (oldest first, columns named `<feature>_t-<lag>`) followed by a class label
    in `<token>_Target`. The label is `set_target` applied to the relative
    change of `<token>_Borrowing Rate` between the last feature row and the row
    `target_window` steps after it. Samples containing missing values
    (including a missing label) are dropped.

    Returns
    -------
    inference=True : the samples DataFrame.
    inference=False: `(df, to, dls)` - the class-0-undersampled DataFrame, the
        `TabularPandas` built from it and its dataloaders (batch size 128).

    Raises
    ------
    ValueError : `n_timepoint < 1` or `target_window < 0`.
    KeyError   : training mode and no sample is labelled 2 (the undersampling
        reference count is looked up by that label).
    """
    if n_timepoint < 1 or target_window < 0:
        raise ValueError('n_timepoint must be >= 1 and target_window must be >= 0')

    df = df.reset_index(drop=True)
    feature_cols = ['DAI_Borrowing Rate', 'DAI_Deposit Rate', 'DAI_Borrow Volume', 'DAI_Supply Volume', 
                    'USDC_Borrowing Rate', 'USDC_Deposit Rate', 'USDC_Borrow Volume', 'USDC_Supply Volume', 
                    'USDT_Borrowing Rate', 'USDT_Deposit Rate', 'USDT_Borrow Volume', 'USDT_Supply Volume',
                    'ETH_Borrowing Rate', 'ETH_Deposit Rate', 'ETH_Borrow Volume', 'ETH_Supply Volume']

    target_column = f'{token}_Borrowing Rate'
    target = f'{token}_Target'

    # Oldest timepoint first: lags run from n_timepoint-1 down to 0.
    cols_names = [f'{col}_t-{n_timepoint -j-1}' for j in range(n_timepoint) for col in feature_cols]
    cols_names.append(target)

    # Pull the data out once; slicing arrays per window is far cheaper than
    # label-based DataFrame slicing on every iteration.
    feature_values = df[feature_cols].to_numpy()
    rates = df[target_column].to_numpy()

    # NOTE(review): this bound is kept from the original code; it appears to stop
    # two windows before the last one that would still fit - confirm before changing.
    n_windows = max(len(df) - target_window - n_timepoint - 1, 0)

    pairs = []
    for i in tqdm(range(n_windows)):
        last = i + n_timepoint - 1  # row of the most recent feature timepoint
        features = feature_values[i:last + 1].ravel().tolist()

        # Relative change of the rate over the prediction horizon.
        val = rates[last + target_window] / rates[last] - 1
        features.append(set_target(val, threshold))
        pairs.append(features)

    df = pd.DataFrame(pairs, columns=cols_names).dropna().reset_index(drop=True)

    if inference:
        return df

    # Undersample class 0 down to the size of class 2; classes 1 and 2 are kept whole.
    n_min = df[target].value_counts()[2]
    df_a = df[df[target] == 0]
    # Cap at the rows available: sampling more than exist (without replacement) raises.
    df_a = df_a.sample(n=min(int(n_min), len(df_a)))
    df_b = df[df[target] != 0]
    df = pd.concat([df_a, df_b]).reset_index(drop=True)

    splits = RandomSplitter(seed=101)(range_of(df))

    cont_names = list(df.columns[:-1])  # every column except the trailing target
    y_names = target
    procs = [Categorify, FillMissing, Normalize]
    y_block = CategoryBlock()

    to = TabularPandas(df, procs=procs, cont_names=cont_names, y_names=y_names, y_block=y_block, splits=splits)
    dls = to.dataloaders(bs=128)

    return df, to, dls


@patch
def plot_confusion_matrix(
    self:ClassificationInterpretation, 
    normalize=False, 
    title='Confusion matrix', 
    cmap="Blues", 
    norm_dec=2,
    plot_txt=True, 
    **kwargs
):
    """Draw the confusion matrix on a new figure and return that figure.

    `normalize` divides every row by its sum, `norm_dec` is the number of
    decimals printed for normalized cells, `plot_txt` toggles the per-cell
    text and `kwargs` are forwarded to `plt.figure`.
    """
    matrix = self.confusion_matrix()
    if normalize:
        row_totals = matrix.sum(axis=1)[:, np.newaxis]
        matrix = matrix.astype('float') / row_totals

    fig = plt.figure(**kwargs)
    plt.imshow(matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)

    n_classes = len(self.vocab)
    positions = np.arange(n_classes)
    plt.xticks(positions, self.vocab, rotation=90)
    plt.yticks(positions, self.vocab, rotation=0)

    if plot_txt:
        # Cells above half of the maximum get white text, the rest black.
        cutoff = matrix.max() / 2.
        for r in range(matrix.shape[0]):
            for c in range(matrix.shape[1]):
                cell = matrix[r, c]
                if normalize:
                    label = f'{cell:.{norm_dec}f}'
                else:
                    label = f'{cell}'
                colour = "white" if cell > cutoff else "black"
                plt.text(c, r, label, horizontalalignment="center", verticalalignment="center", color=colour)

    # Explicit y-limits with the first class at the top.
    fig.gca().set_ylim(n_classes-.5, -.5)

    plt.tight_layout()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.grid(False)

    return fig


def get_preds_save_classification(
    learn, # fastai learner
    model_type, # LSTM model or fastai TabularModel
    token:Str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
    threshold:float, # Threshold for target 
):
    """Score `learn` on its validation set and save the confusion matrix as a PNG.

    The accuracy (in percent) is shown in the figure title. The figure is
    written to `{path}/results/`, with the run settings encoded in the file
    name; the folder is created when it does not exist yet. Returns nothing.
    """
    preds, targs = learn.get_preds(dl=learn.dls.valid)
    targs = targs.squeeze()
    preds = torch.argmax(preds, 1)
    # Convert to a plain float so the title reads "accuracy 61.25" instead of
    # embedding a tensor repr such as "tensor(61.2500)".
    acc = (preds == targs).float().mean().item() * 100
    interp = ClassificationInterpretation.from_learner(learn)
    p = interp.plot_confusion_matrix(title=f'{model_type} {token} n_timepoint {n_timepoint} target_window {target_window} accuracy {acc:.2f}')
    # Make sure the output folder exists so a finished training run is not lost at save time.
    Path(f'{path}/results').mkdir(parents=True, exist_ok=True)
    p.savefig(f'{path}/results/{model_type} {token} n_timepoint {n_timepoint} target_window {target_window} threshold {threshold}.png')


def get_tab_learner_train(
    model_type, # LSTM model or fastai TabularModel
    token:Str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
    threshold:float, # Threshold for target 
):
    """Train a fastai tabular classifier for one token/window setting.

    Builds the dataset from the notebook-level `df1`, fits a `tabular_learner`
    with layers [200, 100] and 3 outputs for 100 one-cycle epochs at a maximum
    learning rate of 0.03, saves the validation confusion matrix and returns
    the learner. A `SaveModelCallback` is attached with a file name under
    `{path}/models/`.
    """
    df, to, dls = get_tabpandas_multi(df1, token, threshold, target_window, n_timepoint)
    sm = SaveModelCallback(fname=f'{path}/models/{model_type}_{token}_{n_timepoint}_{target_window}')

    learn = tabular_learner(dls, 
                        [200,100], 
                        metrics=accuracy,
                        n_out=3,
                        cbs=sm)
    
    learn.fit_one_cycle(100, 0.03)
    
    # Arguments must follow the callee's signature:
    # (learn, model_type, token, n_timepoint, target_window, threshold).
    get_preds_save_classification(learn, model_type, token, n_timepoint, target_window, threshold)
    
    return learn

class LSTMDataset(Dataset):
    """Dataset that serves each flat row of `df.items` as a `(sequence, label)` pair.

    The first `n_timepoint * numfeatpertimepoint` values of a row are cut into
    `n_timepoint` consecutive chunks of `numfeatpertimepoint` values and
    stacked into one 2-D tensor; the last value of the row is the label.
    """
    def __init__(self, 
        df, # `to` from fastai tabular 
        n_timepoint:int, # Number of previous timepoints to be used as features   
        numfeatpertimepoint:int, # Number of features per timepoint 
    ):
        # Re-number the rows from 0 so that `__getitem__` can look them up by label.
        self.df = df.items.reset_index(drop=True)
        self.n_sequence = n_timepoint
        self.numfeatures = numfeatpertimepoint
        # The three class labels, wrapped in a one-element tuple.
        self.vocab = ([0, 1, 2], )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.loc[index,:]
        label = tensor(record.iloc[-1]).long()

        width = self.numfeatures
        steps = []
        for step in range(self.n_sequence):
            start = step * width
            chunk = record[start:start + width].values
            steps.append(tensor(list(chunk)))

        return (torch.stack(steps), label)


class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Input is batch-first, `(batch, seq_len, input_size)`; output is
    `(batch, num_classes)`.
    """
    def __init__(self, 
        input_size:int, # Number of features per timepoint
        hidden_size:int, # Hidden size to be used 
        num_layers:int=2, # Number of LSTM layers to use
        num_classes:int=3, # Output size 3 for 0=stay within threshold, 1=go below threshold, 2=go above threshold
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Zero initial hidden/cell state, allocated with the input's device and
        # dtype so the model also runs when the batch lives on a GPU.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the final time step feeds the classifier head.
        out = out[:, -1, :]
        out = self.fc(out)

        return out


def get_lstm_learner_train(
    model_type, # LSTM model or fastai TabularModel
    token:Str, # Token to predict `DAI`, `USDC`, `USDT`,  `ETH`
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
    threshold:float, # Threshold for target 
):
    """Train the LSTM classifier for one token/window setting.

    Builds the dataset from the notebook-level `df1`, wraps its train/valid
    parts in `LSTMDataset`s, fits an `LSTMModel` for 50 one-cycle epochs at a
    maximum learning rate of 0.03 and saves the validation confusion matrix.
    Returns `(df, dls, dls2, learn)`: the samples frame, the tabular
    dataloaders, the sequence dataloaders and the trained learner.
    """
    # 16 equals the number of entries in `feature_cols` of `get_tabpandas_multi`.
    n_features = 16
    hidden_size = 128

    df, to, dls = get_tabpandas_multi(df1, token, threshold, target_window, n_timepoint)

    train_dset, valid_dset = [
        LSTMDataset(part, n_timepoint, n_features) for part in (to.train, to.valid)
    ]
    dls2 = DataLoaders.from_dsets(train_dset, valid_dset, bs=128)

    checkpoint_name = f'{path}/models/{model_type}_{token}_{n_timepoint}_{target_window}'
    sm = SaveModelCallback(fname=checkpoint_name)
    learn = Learner(
        dls2,
        LSTMModel(n_features, hidden_size),
        metrics=accuracy,
        loss_func=CrossEntropyLossFlat(),
        cbs=sm,
    )
    learn.fit_one_cycle(50, 0.03)

    get_preds_save_classification(learn, model_type, token, n_timepoint, target_window, threshold)

    return df, dls, dls2, learn


## TabModel Classification

In [12]:
# target_window = 5
# n_timepoint = 1
# threshold = 0.05
# token = 'DAI'
# model_type = 'TabClassification'

In [22]:
# learn = get_tab_learner_train(model_type, token, n_timepoint, target_window, threshold)

## LSTM

In [27]:
# threshold = 0.05
# model_type = 'LSTMClassi'

# for tok in ['USDT']:
#    for ntp in [48]:
#        for tw in [48]:
#            df, dls, dls2, learn = get_lstm_learner_train(model_type, tok, ntp, tw, threshold)