In this notebook, we:
- Convert the data to daily frequency
- Use VG's strategy: predict which token will have the lowest borrow rate over a given time window
- Start with basic models

# Install Libraries and imports

In [1]:
!pip install fastai -Uqq

In [2]:
from fastai.tabular.all import *
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import r2_score
from torch.utils.data import Dataset

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [3]:
RANDOM_STATE = 42

# IPFS Functions to upload models, scalers, label encoders, and data

* Because of feature filtering, some model and hyperparameter combinations require a scaler fit on the top 20 features, while others require a scaler fit on all 44 features

In [4]:

from pinataapi import pin,unpin,edit_hash,get_pinned_files,get_pinned_jobs
from nftstorage import NFTStorage
import json

def createMetadata(ntp_tw,add_metadata):
    """Build the ``{"keyvalues": {...}}`` metadata payload for an uploaded artifact.

    Args:
        ntp_tw: Identifier stored (stringified) under the ``"ntp_tw"`` key.
        add_metadata: Mapping of extra key/values; applied last, so it can
            override the fixed ``dao``/``protocol``/``network``/``ntp_tw`` entries.

    Returns:
        dict with a single ``"keyvalues"`` entry holding the merged key/values.
    """
    keyvalues = {
        "dao": "Defi_Squad",
        "protocol": "Compound",
        "network": "Mainnet",
        "ntp_tw": str(ntp_tw),
    }
    keyvalues.update(add_metadata)

    return {"keyvalues": keyvalues}


# NFT.Storage client and the credentials it resolves for itself.
NS = NFTStorage()
ns_creds = NS.get_creds()

# The Pinata JWT is read from creds.json rather than being hard-coded here.
with open("creds.json") as f:
    p_creds = json.load(f)["Pinata"]["JWT"]


def upload_ipfs(fpath,NS,ns_creds,p_creds,p_metadata):
    """Upload a file through ``NS``, pin the resulting CID and attach metadata to it.

    Args:
        fpath: Path of the local file to upload.
        NS: Client object exposing ``upload_file(creds, fpath)``.
        ns_creds: Credentials passed to ``NS.upload_file``.
        p_creds: Credentials passed to ``pin`` and ``edit_hash``.
        p_metadata: Metadata payload passed to ``edit_hash``.

    Returns:
        The CID read from ``["value"]["cid"]`` of the upload response.
    """
    upload_json, _upload_status = NS.upload_file(ns_creds, fpath)
    cid = upload_json["value"]["cid"]
    print(cid)

    # Pin first, then attach the metadata to the pinned hash.
    _pin_response, _pin_json = pin(p_creds, cid)
    meta_response, _meta_status = edit_hash(p_creds, cid, p_metadata)

    # NOTE(review): the reply is only printed; an error body does not stop
    # the function from returning the CID.
    print(meta_response.text)

    return cid

# Google Drive

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [58]:
from pathlib import Path
path = Path('/content/drive/MyDrive/colab_notebooks/algovera/defi')

# Preparing the data

In [59]:
# The raw data lives in the data lake, outside `path`. Joining an absolute path
# onto `path` silently discards `path`, so reference the file directly.
df_init = pd.read_csv(Path('/content/drive/MyDrive/Algovera_DataLake/30_compoundV2_mainnet_2021_2022.csv'))
tokens = df_init["underlyingSymbol"].unique()
tokens

array(['USDC', 'ETH', 'DAI', 'USDT'], dtype=object)

In [8]:
dataset_dict = {"keyvalues":{}}
dataset_dict["keyvalues"].update({"version":"V1"})
dataset_dict["keyvalues"].update({"overall":str({"dao":"DefiSquad","protocol":"Compound","network":"Mainnet"})})
dataset_dict["keyvalues"].update({"type":"Sample_Dataset"})
dataset_dict["keyvalues"].update({"condition":"Raw"})
dataset_dict["keyvalues"].update({"dateRange":"Jan2021-May102021"})

In [9]:
dataset_dict

{'keyvalues': {'condition': 'Raw',
  'dateRange': 'Jan2021-May102021',
  'overall': "{'dao': 'DefiSquad', 'protocol': 'Compound', 'network': 'Mainnet'}",
  'type': 'Sample_Dataset',
  'version': 'V1'}}

In [10]:
df_init.head(5).to_csv("sample.csv",index=False)

In [11]:
upload_ipfs('sample.csv',NS,ns_creds,p_creds,dataset_dict)

bafkreihthqliqh4mjrfatyfkclm3joccne3hlwor74rxplzb6ylyu6rz2i
OK


'bafkreihthqliqh4mjrfatyfkclm3joccne3hlwor74rxplzb6ylyu6rz2i'

In [12]:
df_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188772 entries, 0 to 188771
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        188772 non-null  object 
 1   borrowRate                188772 non-null  float64
 2   cash                      188772 non-null  float64
 3   collateralFactor          188772 non-null  float64
 4   exchangeRate              188772 non-null  float64
 5   interestRateModelAddress  188772 non-null  object 
 6   name                      188772 non-null  object 
 7   reserves                  188772 non-null  float64
 8   supplyRate                188772 non-null  float64
 9   symbol                    188772 non-null  object 
 10  totalBorrows              188772 non-null  float64
 11  totalSupply               188772 non-null  float64
 12  underlyingAddress         188772 non-null  object 
 13  underlyingName            188772 non-null  o

In [13]:
list(df_init.columns)

['id',
 'borrowRate',
 'cash',
 'collateralFactor',
 'exchangeRate',
 'interestRateModelAddress',
 'name',
 'reserves',
 'supplyRate',
 'symbol',
 'totalBorrows',
 'totalSupply',
 'underlyingAddress',
 'underlyingName',
 'underlyingPrice',
 'underlyingSymbol',
 'reserveFactor',
 'underlyingPriceUSD',
 'timestamp',
 'Date']

In [14]:
df_init["underlyingSymbol"].value_counts()

USDC    47193
ETH     47193
DAI     47193
USDT    47193
Name: underlyingSymbol, dtype: int64

In [15]:
# Keep one row per (timestamp, token), then attach to every row the number of
# tokens that reported at that timestamp.
df = df_init.drop_duplicates(subset=['timestamp', 'underlyingSymbol'])
counts = (
    df['timestamp']
    .value_counts()
    .rename_axis('timestamp')
    .reset_index(name='Counts')
)
df = df.merge(counts, on='timestamp')

In [16]:
df = df[df['Counts'] == 4].drop('Counts', axis=1).reset_index(drop=True)

In [17]:
df

Unnamed: 0,id,borrowRate,cash,collateralFactor,exchangeRate,interestRateModelAddress,name,reserves,supplyRate,symbol,totalBorrows,totalSupply,underlyingAddress,underlyingName,underlyingPrice,underlyingSymbol,reserveFactor,underlyingPriceUSD,timestamp,Date
0,0x39aa39c021dfbae8fac545936693ac917d5e7563,0.094380,1.090111e+08,0.750,0.021379,0xd8ec56013ea119e7181d231e5048f90fbbe753c0,Compound USD Coin,4.897860e+05,0.074197,cUSDC,6.144258e+08,3.381572e+10,0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48,USD//C,0.001336,USDC,75000000000000000,1.000000,1609471800,2021-01-01 03:30:00
1,0x4ddc2d193948926d02f9b1fe9e1daa0718270ed5,0.023622,1.040458e+06,0.750,0.020030,0x0c3f8df27e1a00b47653fde878d68d35f00714c0,Compound Ether,1.118526e+02,0.000685,cETH,3.910705e+04,5.389069e+07,0x0000000000000000000000000000000000000000,Ether,1.000000,ETH,200000000000000000,748.490000,1609471800,2021-01-01 03:30:00
2,0x5d3a536e4d6dbd6114cc1ead35777bab948e3643,0.107785,1.630969e+08,0.750,0.020877,0xfb564da37b41b2f6b6edcc3e56fbf523bd9f2012,Compound Dai,1.724288e+06,0.078992,cDAI,1.009593e+09,5.608807e+10,0x6b175474e89094c44da98b954eedeac495271d0f,Dai Stablecoin,0.001341,DAI,150000000000000000,1.003733,1609471800,2021-01-01 03:30:00
3,0xf650c3d88d12db855b8bf7d11be6c55a4e07dcc9,0.133510,1.051888e+07,0.000,0.020484,0xfb564da37b41b2f6b6edcc3e56fbf523bd9f2012,Compound USDT,1.731669e+05,0.109392,cUSDT,8.023823e+07,4.422259e+09,0xdac17f958d2ee523a2206206994597c13d831ec7,Tether USD,0.001336,USDT,75000000000000000,1.000000,1609471800,2021-01-01 03:30:00
4,0xf650c3d88d12db855b8bf7d11be6c55a4e07dcc9,0.132843,1.058156e+07,0.000,0.020484,0xfb564da37b41b2f6b6edcc3e56fbf523bd9f2012,Compound USDT,1.732111e+05,0.108771,cUSDT,8.023882e+07,4.425319e+09,0xdac17f958d2ee523a2206206994597c13d831ec7,Tether USD,0.001336,USDT,75000000000000000,1.000000,1609473600,2021-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188767,0x39aa39c021dfbae8fac545936693ac917d5e7563,0.023084,7.629603e+08,0.800,0.022600,0xd8ec56013ea119e7181d231e5048f90fbbe753c0,Compound USD Coin,1.320519e+07,0.009858,cUSDC,6.430360e+08,6.162774e+10,0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48,USD//C,0.000418,USDC,75000000000000000,1.000000,1652207400,2022-05-10 18:30:00
188768,0xf650c3d88d12db855b8bf7d11be6c55a4e07dcc9,0.023990,3.972378e+08,0.000,0.021830,0xfb564da37b41b2f6b6edcc3e56fbf523bd9f2012,Compound USDT,2.992297e+06,0.010647,cUSDT,3.636335e+08,3.471759e+10,0xdac17f958d2ee523a2206206994597c13d831ec7,Tether USD,0.000423,USDT,75000000000000000,1.000000,1652208200,2022-05-10 18:43:20
188769,0x5d3a536e4d6dbd6114cc1ead35777bab948e3643,0.030936,3.997360e+08,0.800,0.021993,0xfb564da37b41b2f6b6edcc3e56fbf523bd9f2012,Compound Dai,2.079132e+07,0.016269,cDAI,6.149108e+08,4.519029e+10,0x6b175474e89094c44da98b954eedeac495271d0f,Dai Stablecoin,0.000419,DAI,150000000000000000,0.999569,1652208200,2022-05-10 18:43:20
188770,0x4ddc2d193948926d02f9b1fe9e1daa0718270ed5,0.022284,8.175917e+05,0.825,0.020063,0x0c3f8df27e1a00b47653fde878d68d35f00714c0,Compound Ether,6.887464e+02,0.000407,cETH,1.910877e+04,4.166977e+07,0x0000000000000000000000000000000000000000,Ether,1.000000,ETH,200000000000000000,2386.928252,1652208200,2022-05-10 18:43:20


In [18]:
df["timestamp"].max()

1652208200

## Show that these two methods are the same

In [19]:
def transform_timeseries(df, symbols=None):
    """Reshape the long per-token frame into one row per timestamp.

    For every symbol, the rows whose ``underlyingSymbol`` equals it are taken,
    every column except ``timestamp`` is renamed to ``<symbol>_<column>``, and
    the per-symbol frames are inner-joined on ``timestamp``.

    Args:
        df: Long-format frame with ``timestamp`` and ``underlyingSymbol`` columns.
        symbols: Iterable of symbols to include, in output column order.
            Defaults to the notebook-level ``tokens`` array so existing
            single-argument calls behave as before.

    Returns:
        DataFrame with ``timestamp`` as a regular column followed by the
        prefixed columns of each symbol.
    """
    if symbols is None:
        symbols = tokens

    # A None sentinel (rather than `.empty`) marks "nothing merged yet": an empty
    # intermediate join must stay empty instead of restarting from the next token.
    merged = None
    for tok in symbols:
        df_tok = df[df['underlyingSymbol'] == tok].drop(['underlyingSymbol'], axis=1)
        df_tok.columns = [
            col if col == 'timestamp' else f'{tok}_{col}'
            for col in df_tok.columns
        ]
        df_tok = df_tok.set_index('timestamp', drop=True)

        if merged is None:
            merged = df_tok
        else:
            merged = pd.merge(merged, df_tok, on='timestamp')

    if merged is None:
        # No symbols at all: same result the original produced for this case.
        return pd.DataFrame().reset_index()

    return merged.reset_index()

In [20]:
#Transform Dataset to columnar
def columnar(df):
    """Pivot the long per-token frame to one row per ``(timestamp, Date)``.

    Every remaining column is spread across the values of ``underlyingSymbol``
    and the resulting two-level labels are flattened to ``<symbol>_<column>``.

    Returns:
        The pivoted frame with ``timestamp`` and ``Date`` restored as columns.
    """
    wide = df.pivot(index=["timestamp", "Date"], columns=["underlyingSymbol"])

    # Pivoted labels are (column, symbol); flip them so the symbol comes first.
    flat_names = []
    for label in wide.columns:
        flat_names.append("_".join(label[::-1]))
    wide.columns = flat_names

    return wide.reset_index()

In [21]:
df1 = transform_timeseries(df).sort_values('timestamp')

df1["Date"] = pd.to_datetime(df1["timestamp"], unit='s', origin='unix')

# Resample to daily data using the mean value. numeric_only=True restricts the
# aggregation to numeric columns explicitly: the string columns (addresses,
# names, ...) cannot be averaged and make a plain .mean() raise on pandas >= 2.0.
df1_day = df1.resample('D', on='Date').mean(numeric_only=True).reset_index()

# Sort the columns alphabetically so both reshaping approaches can be compared.
df1_comp = df1_day[sorted(df1_day.columns)]

In [22]:
df1_comp.shape

(495, 46)

In [23]:
%%timeit
df1 = transform_timeseries(df)

1 loop, best of 5: 346 ms per loop


In [24]:
%%timeit
df_columnar = columnar(df)

1 loop, best of 5: 201 ms per loop


In [25]:
df_columnar = columnar(df).sort_values('timestamp')

df_columnar["Date"] = pd.to_datetime(df_columnar["timestamp"], unit='s', origin='unix')

# Resample to daily data using the mean value. numeric_only=True restricts the
# aggregation to numeric columns explicitly: the string columns (addresses,
# names, ...) cannot be averaged and make a plain .mean() raise on pandas >= 2.0.
df_columnar_day = df_columnar.resample('D', on='Date').mean(numeric_only=True).reset_index()

# Sort the columns alphabetically so both reshaping approaches can be compared.
df_columnar_comp = df_columnar_day[sorted(df_columnar_day.columns)]

In [26]:
df_columnar_comp.shape

(495, 46)

### Illustrate the difference between the two approaches

In [27]:
(df1_comp - df_columnar_comp).sum()

DAI_borrowRate                         0.0
DAI_cash                               0.0
DAI_collateralFactor                   0.0
DAI_exchangeRate                       0.0
DAI_reserveFactor                      0.0
DAI_reserves                           0.0
DAI_supplyRate                         0.0
DAI_totalBorrows                       0.0
DAI_totalSupply                        0.0
DAI_underlyingPrice                    0.0
DAI_underlyingPriceUSD                 0.0
Date                       0 days 00:00:00
ETH_borrowRate                         0.0
ETH_cash                               0.0
ETH_collateralFactor                   0.0
ETH_exchangeRate                       0.0
ETH_reserveFactor                      0.0
ETH_reserves                           0.0
ETH_supplyRate                         0.0
ETH_totalBorrows                       0.0
ETH_totalSupply                        0.0
ETH_underlyingPrice                    0.0
ETH_underlyingPriceUSD                 0.0
USDC_borrow

In [28]:
(df_columnar_comp - df1_comp).sum()

DAI_borrowRate                         0.0
DAI_cash                               0.0
DAI_collateralFactor                   0.0
DAI_exchangeRate                       0.0
DAI_reserveFactor                      0.0
DAI_reserves                           0.0
DAI_supplyRate                         0.0
DAI_totalBorrows                       0.0
DAI_totalSupply                        0.0
DAI_underlyingPrice                    0.0
DAI_underlyingPriceUSD                 0.0
Date                       0 days 00:00:00
ETH_borrowRate                         0.0
ETH_cash                               0.0
ETH_collateralFactor                   0.0
ETH_exchangeRate                       0.0
ETH_reserveFactor                      0.0
ETH_reserves                           0.0
ETH_supplyRate                         0.0
ETH_totalBorrows                       0.0
ETH_totalSupply                        0.0
ETH_underlyingPrice                    0.0
ETH_underlyingPriceUSD                 0.0
USDC_borrow

In [29]:
# check if there are missing dates
daydiff = (df_columnar_comp['Date'].shift(-1) - df_columnar_comp['Date']).apply(lambda x: x.days)

In [30]:
daydiff.value_counts()

1.0    494
Name: Date, dtype: int64

# Timeseries Formatting, loading, saving, and plotting functions

In [31]:
def get_synth_sample(x,y):
    """Resample ``(x, y)`` with imbalanced-learn's ``SMOTETomek``.

    Args:
        x: Feature matrix.
        y: Class labels aligned with ``x``.

    Returns:
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # Imported lazily so imbalanced-learn is only needed when resampling is requested.
    from imblearn.combine import SMOTETomek

    X_resampled, y_resampled = SMOTETomek(random_state=RANDOM_STATE).fit_resample(x, y)

    return X_resampled,y_resampled



def get_tabpandas_multi(
    df:pd.DataFrame, # Dataframe of the raw data 
    n_timepoint:int, # Number of previous timepoints to be used as features   
    target_window:int, # Number of timepoints in the future to predict 
    stratify_sample:bool=False, #Make test train split stratified by class
    filter_cols:list=None,
    inference:bool=False, # Flag True for inference
    synth_sample:bool=False
):
    """Turn a time-ordered frame into sliding-window features and a class target.

    Each sample flattens ``n_timepoint`` consecutive rows (oldest first) into
    columns named ``<col>_t-<lag>``. Its ``Target`` is the name of the column
    among DAI/USDC/USDT ``*_borrowRate`` with the lowest mean over the
    following ``target_window`` rows.

    Args:
        df: Source frame; must contain the three ``*_borrowRate`` target columns.
        n_timepoint: Number of consecutive rows flattened into one sample.
        target_window: Number of subsequent rows averaged to pick the target.
        stratify_sample: Stratify the train/test split on the target.
        filter_cols: Optional positional column indices selecting the features.
        inference: When True, return the unscaled ``(x, y)`` and skip splitting.
        synth_sample: When True, resample ``(x, y)`` via ``get_synth_sample``
            before splitting.

    Returns:
        ``(x, y)`` when ``inference`` is True. Otherwise
        ``(df, x_train, x_test, y_train, y_test, ss, le)``: the windowed frame,
        the scaled feature arrays, the label-encoded targets, the fitted
        ``StandardScaler`` and the fitted ``LabelEncoder``.
    """
    df = df.reset_index(drop=True)
    feature_cols = df.columns

    target_columns = ['DAI_borrowRate', 'USDC_borrowRate', 'USDT_borrowRate']
    target = 'Target'

    # Oldest lag first: suffixes run t-(n_timepoint-1) ... t-0, matching row order.
    cols_names = [
        f'{col}_t-{lag}'
        for lag in range(n_timepoint - 1, -1, -1)
        for col in feature_cols
    ]
    cols_names.append(target)

    # Number of window starts that still leave a full target window after them.
    n_windows = max(len(df) - target_window - n_timepoint + 1, 0)

    pairs = []
    for start in tqdm(range(n_windows)):
        last_feature_row = start + n_timepoint - 1  # .loc slicing is inclusive
        window = df.loc[start:last_feature_row, feature_cols].values
        features = [item for step in window for item in step]

        future = df.loc[last_feature_row + 1:last_feature_row + target_window, target_columns]
        features.append(future.mean().idxmin())

        pairs.append(features)

    df = pd.DataFrame(pairs, columns=cols_names).dropna().reset_index(drop=True)

    y = df.iloc[:, -1]
    x = df.iloc[:, filter_cols] if filter_cols else df.iloc[:, :-1]

    if inference:
        return x,y

    # NOTE(review): resampling happens before the split, so synthetic rows can
    # end up in the test fold — confirm this is intended.
    if synth_sample:
        x,y = get_synth_sample(x,y)

    # NOTE(review): a shuffled split over overlapping windows puts near-identical
    # samples on both sides — confirm this is acceptable for the reported scores.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        test_size=0.2,
        random_state=RANDOM_STATE,
        shuffle=True,
        stratify=y if stratify_sample else None,
    )

    # Scaler and encoder are fit on the training part only.
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return df,x_train, x_test, y_train, y_test, ss, le


def plot_confusion_matrix(
    targs,
    preds,
    title
):
    """Show the un-normalized confusion matrix of ``targs`` vs ``preds`` as a heatmap.

    Rows and columns are labelled DAI, USDC, USDT in that order.
    """
    labels = ['DAI', 'USDC', 'USDT']
    counts = confusion_matrix(targs, preds, normalize=None)
    df_cm = pd.DataFrame(counts, columns=labels, index=labels)

    heatmap = sns.heatmap(
        df_cm,
        annot=True,
        annot_kws={"size": 16},
        cmap="Blues",
    )
    heatmap.set_title(title)

    plt.show()


def save_models(
    ntp,
    tw,
    ss,
    le,
    clf,
    dataset,
):
    """Pickle the scaler, label encoder and classifier of one trained model.

    Files go to ``models/V1_all_features/<ntp>-<tw>/`` under the notebook-level
    ``path`` and are named ``<dataset>_ss_clf_...``, ``<dataset>_le_clf_...``
    and ``<dataset>_clf_...``.
    """
    folder = f"{ntp}-{tw}"

    artifacts = (("ss_clf", ss), ("le_clf", le), ("clf", clf))
    for tag, obj in artifacts:
        save_pickle(path/f'models/V1_all_features/{folder}/{dataset}_{tag}_{ntp}_{tw}.pkl', obj)


def load_models(
    ntp,
    tw,
    dataset,  
):
    """Load the scaler, label encoder and classifier written by ``save_models``.

    Returns:
        ``(ss, le, clf)`` read from ``models/V1_all_features/<ntp>-<tw>/`` under
        the notebook-level ``path``.
    """
    folder = f"{ntp}-{tw}"

    # Generator unpacking keeps the original load order: scaler, encoder, classifier.
    ss, le, clf = (
        load_pickle(path/f'models/V1_all_features/{folder}/{dataset}_{tag}_{ntp}_{tw}.pkl')
        for tag in ("ss_clf", "le_clf", "clf")
    )

    return ss, le, clf

# Load Models

In [32]:
df_top_models = pd.read_csv(path/"compoundV2_model_results_2021.csv")
df_top_models["train_test_diff"] = df_top_models["mean_train_score"] - df_top_models["mean_test_score"]
df_top_models.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,...,ntp_tw,dataset,classes,train_class_size,test_class_size,param_max_depth,param_min_samples_leaf,param_n_estimators,param_learning_rate,train_test_diff
0,0.008097,0.000443,0.000931,6.6e-05,42,{'random_state': 42},0.755319,0.723404,0.795699,0.758141,...,5-7,"[False, None, False]","DAI_borrowRate,USDC_borrowRate,USDT_borrowRate","[129, 107, 45]","[26, 27, 18]",,,,,0.241859
1,0.019566,0.002312,0.002827,0.000624,42,"{'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 10, 'random_state': 42}",0.787234,0.723404,0.752688,0.754442,...,5-7,"[False, None, False]","DAI_borrowRate,USDC_borrowRate,USDT_borrowRate","[129, 107, 45]","[26, 27, 18]",3.0,1.0,10.0,,0.089027
2,0.031551,0.000651,0.003516,2.2e-05,42,"{'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 20, 'random_state': 42}",0.776596,0.744681,0.731183,0.75082,...,5-7,"[False, None, False]","DAI_borrowRate,USDC_borrowRate,USDT_borrowRate","[129, 107, 45]","[26, 27, 18]",3.0,1.0,20.0,,0.078342
3,0.046227,0.000853,0.004323,0.000154,42,"{'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 30, 'random_state': 42}",0.776596,0.734043,0.731183,0.747274,...,5-7,"[False, None, False]","DAI_borrowRate,USDC_borrowRate,USDT_borrowRate","[129, 107, 45]","[26, 27, 18]",3.0,1.0,30.0,,0.092573
4,0.060076,0.000851,0.005304,3.9e-05,42,"{'max_depth': 3, 'min_samples_leaf': 1, 'n_estimators': 40, 'random_state': 42}",0.776596,0.712766,0.741935,0.743766,...,5-7,"[False, None, False]","DAI_borrowRate,USDC_borrowRate,USDT_borrowRate","[129, 107, 45]","[26, 27, 18]",3.0,1.0,40.0,,0.083623


# Select Models

In [33]:
# Sort by train/test gap, then test score (both ascending), keep the first row
# per (Model, params, ntp_tw), and retain only the columns needed to rebuild
# each model.
model_to_ipfs = (
    df_top_models
    .loc[:, ["Model", "mean_train_score", "mean_test_score", "train_test_diff",
             "params", "ntp_tw", "dataset", 'classes', 'train_class_size', 'test_class_size']]
    .sort_values(["train_test_diff", "mean_test_score"], ascending=True)
    .drop_duplicates(["Model", "params", "ntp_tw"])
    .loc[:, ["Model", "params", "ntp_tw", "dataset"]]
)

model_to_ipfs.shape

(1707, 4)

## Get Model Metadata

In [34]:
def get_model_metadata(model_metadata):
    """Resolve the estimator class and its parameter string from a results row.

    Args:
        model_metadata: Row-like sequence (list, tuple or pandas Series) whose
            first element is the estimator class name and whose second element
            is the parameter string.

    Returns:
        ``(model, params)``: the object the class name evaluates to, and the
        parameter string unchanged.
    """
    # list() gives purely positional access for plain sequences and Series alike;
    # integer indexing on a label-indexed Series (`row[0]`) is deprecated in pandas.
    fields = list(model_metadata)

    # SECURITY: eval() executes whatever expression the row contains. Only use
    # this on results files you produced yourself.
    model = eval(fields[0])
    params = fields[1]

    return model, params


def get_data_metadata(data_metadata):
    """Parse the windowing and dataset options from a results row.

    Args:
        data_metadata: Row-like sequence (list, tuple or pandas Series) whose
            third element is ``"<ntp>-<tw>"`` and whose fourth element is the
            string form of ``[stratify, filter_cols, synth]``.

    Returns:
        ``(ntp, tw, stratify, fc, synth)`` with ``ntp`` and ``tw`` as ints.

    Raises:
        ValueError: If the fourth element is not a plain Python literal.
    """
    import ast

    # list() gives purely positional access for plain sequences and Series alike;
    # integer indexing on a label-indexed Series (`row[2]`) is deprecated in pandas.
    fields = list(data_metadata)

    window_parts = fields[2].split("-")
    ntp = int(window_parts[0])
    tw = int(window_parts[1])

    # literal_eval only accepts literals, so the stored list is parsed without
    # executing arbitrary code from the results file.
    dataset_mutation = ast.literal_eval(fields[3])
    stratify = dataset_mutation[0]
    fc = dataset_mutation[1]
    synth = dataset_mutation[2]

    return ntp,tw,stratify,fc,synth


## Create Train Test and Predict datasets

In [35]:
def train_test(df):
    """Split the rate/volume columns into a 2021 training frame and a 2022 prediction frame.

    Only columns containing ``borrowRate``, ``supplyRate``, ``totalBorrows`` or
    ``totalSupply``, plus columns starting with ``Date``, are kept.

    Returns:
        ``(df_train, df_predict)``: rows dated 2021-01-01..2021-12-31 with their
        original index, and rows dated 2022-01-01..2022-05-30 with a fresh index.
    """
    wanted = ("borrowRate", "supplyRate", "totalBorrows", "totalSupply")

    keep_cols = []
    for col in df.columns:
        if col.startswith("Date") or any(key in col for key in wanted):
            keep_cols.append(col)

    subset = df[keep_cols]
    df_train = subset.query('Date >= "2021-01-01" & Date <= "2021-12-31"')
    df_predict = subset.query('Date >= "2022-01-01" & Date <= "2022-05-30"').reset_index(drop=True)

    return df_train,df_predict


In [36]:
columnar_train, columnar_test  = train_test(df_columnar_comp)
df1_train, df1_test  = train_test(df1_comp)

In [37]:
(columnar_train - df1_train).sum()

DAI_borrowRate                   0.0
DAI_supplyRate                   0.0
DAI_totalBorrows                 0.0
DAI_totalSupply                  0.0
Date                 0 days 00:00:00
ETH_borrowRate                   0.0
ETH_supplyRate                   0.0
ETH_totalBorrows                 0.0
ETH_totalSupply                  0.0
USDC_borrowRate                  0.0
USDC_supplyRate                  0.0
USDC_totalBorrows                0.0
USDC_totalSupply                 0.0
USDT_borrowRate                  0.0
USDT_supplyRate                  0.0
USDT_totalBorrows                0.0
USDT_totalSupply                 0.0
dtype: object

In [38]:
(columnar_test - df1_test).sum()

DAI_borrowRate                   0.0
DAI_supplyRate                   0.0
DAI_totalBorrows                 0.0
DAI_totalSupply                  0.0
Date                 0 days 00:00:00
ETH_borrowRate                   0.0
ETH_supplyRate                   0.0
ETH_totalBorrows                 0.0
ETH_totalSupply                  0.0
USDC_borrowRate                  0.0
USDC_supplyRate                  0.0
USDC_totalBorrows                0.0
USDC_totalSupply                 0.0
USDT_borrowRate                  0.0
USDT_supplyRate                  0.0
USDT_totalBorrows                0.0
USDT_totalSupply                 0.0
dtype: object

In [39]:
# Exact-name exclusions.
# NOTE(review): the pivoted columns carry a token prefix (e.g. "DAI_underlyingPrice"),
# so of these names only "timestamp" can actually match a column here.
_excluded_cols = {
    "name", "symbol", "underlyingAddress", "underlyingName", "underlyingSymbol",
    "underlyingPrice", "underlyingPriceUSD", "timestamp", "interestRateModelAddress",
}
keep_cols = [col for col in df_columnar_comp.columns if col not in _excluded_cols]

# 2021 is used for training; 2022 (to the end of the data) is held out for prediction.
_df_kept = df_columnar_comp[keep_cols]
df_train = _df_kept.query('Date >= "2021-01-01" & Date <= "2021-12-31"')
df_predict = _df_kept.query('Date >= "2022-01-01" & Date <= "2022-05-30"').reset_index(drop=True)


In [40]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 0 to 364
Data columns (total 45 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   DAI_borrowRate           365 non-null    float64       
 1   DAI_cash                 365 non-null    float64       
 2   DAI_collateralFactor     365 non-null    float64       
 3   DAI_exchangeRate         365 non-null    float64       
 4   DAI_reserveFactor        365 non-null    float64       
 5   DAI_reserves             365 non-null    float64       
 6   DAI_supplyRate           365 non-null    float64       
 7   DAI_totalBorrows         365 non-null    float64       
 8   DAI_totalSupply          365 non-null    float64       
 9   DAI_underlyingPrice      365 non-null    float64       
 10  DAI_underlyingPriceUSD   365 non-null    float64       
 11  Date                     365 non-null    datetime64[ns]
 12  ETH_borrowRate           365 non-nul

In [41]:
df_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 45 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   DAI_borrowRate           130 non-null    float64       
 1   DAI_cash                 130 non-null    float64       
 2   DAI_collateralFactor     130 non-null    float64       
 3   DAI_exchangeRate         130 non-null    float64       
 4   DAI_reserveFactor        130 non-null    float64       
 5   DAI_reserves             130 non-null    float64       
 6   DAI_supplyRate           130 non-null    float64       
 7   DAI_totalBorrows         130 non-null    float64       
 8   DAI_totalSupply          130 non-null    float64       
 9   DAI_underlyingPrice      130 non-null    float64       
 10  DAI_underlyingPriceUSD   130 non-null    float64       
 11  Date                     130 non-null    datetime64[ns]
 12  ETH_borrowRate           130 non-nul

In [42]:
df_predict.to_csv("30_compoundV2_mainnet_2022_prediction.csv",index=False)

In [43]:
dataset_dict = {"keyvalues":{}}
dataset_dict["keyvalues"].update({"version":"V1"})
dataset_dict["keyvalues"].update({"overall":str({"dao":"DefiSquad","protocol":"Compound","network":"Mainnet"})})
dataset_dict["keyvalues"].update({"type":"Dataset"})
dataset_dict["keyvalues"].update({"condition":"Prediction"})
dataset_dict["keyvalues"].update({"dateRange":"Jan2022-May102022"})

In [44]:
upload_ipfs('30_compoundV2_mainnet_2022_prediction.csv',NS,ns_creds,p_creds,dataset_dict)

bafkreie6bxnljf6dxlr3wro6fu6adwj2vctdwgy4eccgnwntdnee42tlqe
{"error":"Cannot destructure property 'keyvalues' of '(intermediate value)' as it is null."}


'bafkreie6bxnljf6dxlr3wro6fu6adwj2vctdwgy4eccgnwntdnee42tlqe'

# Train and Test Models

## Save Top models to disk

In [45]:
import pandas as pd

In [46]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
def calc_f1(y,preds):
    """Return the micro-averaged F1 score of ``preds`` against the true labels ``y``."""
    return f1_score(y, preds, average="micro")

In [1]:
#model_to_ipfs.shape[0]

# Retrain every selected (model, params, dataset) combination on the 2021 data
# for each target window, record train/test F1, and pickle the fitted
# scaler / label encoder / classifier.
# Requires model_to_ipfs, df_train and path from the cells above.
df_train_results = []


for tw in [7,14,21]:

    print("______")
    print(tw)

    # Only the ntp=5 configurations are retrained here.
    save_modeldf = model_to_ipfs.query(f'ntp_tw == "5-{tw}"').reset_index(drop=True)

    for i in range(save_modeldf.shape[0]):

        train_results = dict()

        model,params = get_model_metadata(save_modeldf.iloc[i])

        # Rebinds the loop variable `tw`; the value parsed from ntp_tw equals the
        # one the rows were filtered on, so the outer loop is unaffected.
        ntp,tw,stratify,fc,synth = get_data_metadata(save_modeldf.iloc[i])

        # SECURITY: eval() executes the params string read from the results CSV;
        # only run this on a results file you produced yourself.
        clf = model(**eval(params))

        print(clf)

        model_name = type(clf).__name__

        df_test, x_train, x_test, y_train, y_test, ss, le  = get_tabpandas_multi(df_train.drop(columns="Date"), ntp, tw,stratify_sample=stratify,filter_cols=fc,synth_sample=synth)

        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        clf.fit(x_train,y_train)

        train_preds = clf.predict(x_train)

        test_preds = clf.predict(x_test)

        tr_f1  = calc_f1(y_train,train_preds)
        te_f1  = calc_f1(y_test,test_preds)

        train_results["model_name"] = model_name
        train_results["params"] = params
        train_results["dataset"] = [stratify,fc,synth]
        train_results["ntp_tw"] = "-".join([str(ntp),str(tw)])
        # `index` is the row position within this window's frame; together with
        # ntp_tw it identifies the pickles written by save_models below.
        train_results["index"] = i
        train_results["model"] = str(clf)
        train_results["train_f1"] = tr_f1
        train_results["test_f1"] = te_f1

        df_train_results.append(train_results)

        save_models(ntp,tw,ss,le,clf,i)

pd.DataFrame(df_train_results).to_csv(path/"ModelResults/modelresults_newdata_compoundV2_updated.csv",index=False)

______
7


NameError: ignored

# Dollar Impact

In [60]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
def run_test(
    ntp,
    tw,
    X,
    y,
    dataset,
    filter_cols
):
    """Score one saved model on ``(X, y)``.

    Loads the scaler, label encoder and classifier for ``(ntp, tw, dataset)``,
    optionally restricts ``X`` to the positional columns in ``filter_cols``,
    and compares the decoded predictions with ``y``.

    Returns:
        ``(accuracy, f1, precision, recall)``; the last three are micro-averaged.
    """
    ss, le, clf = load_models(ntp, tw, dataset)

    if filter_cols:
        X = X.iloc[:,filter_cols]

    print(X.shape)
    scaled = ss.transform(X)

    # Map the encoded class ids back to the original labels before scoring.
    pred = le.classes_[clf.predict(scaled)]

    acc = accuracy_score(y, pred)
    print(f"Accuracy score: {acc}")

    f1 = f1_score(y, pred, average="micro")
    precision = precision_score(y, pred, average="micro")
    recall = recall_score(y, pred, average="micro")

    return acc, f1, precision, recall



def strategy_cost(row):
    """Borrow cost for one row at the rate of the column named in ``row.Predict``.

    The selected rate is multiplied by ``row['Borrow Amount']`` and divided by
    365 (presumably turning an annual rate into a daily cost — TODO confirm).
    """
    predicted_rate = row[row.Predict]
    full_cost = predicted_rate * row['Borrow Amount']
    return full_cost / 365

def compare_strategy(ntp, tw, initial_borrow, df,i,filter_cols):
    """Compare the cost of always borrowing one token with following the model.

    ``df`` is cut into chunks of ``ntp + tw`` rows starting every ``tw`` rows.
    For each full-length chunk the saved model predicts from the first ``ntp``
    rows, and the first prediction is assigned to the remaining ``tw`` rows.

    Args:
        ntp: Number of leading rows of each chunk fed to the model.
        tw: Chunk stride and number of rows each prediction is applied to.
        initial_borrow: Borrowed amount used for every row.
        df: Frame containing ``DAI_borrowRate_t-0``, ``USDC_borrowRate_t-0``
            and ``USDT_borrowRate_t-0``; integer row labels are assumed.
        i: Dataset identifier passed to ``load_models``.
        filter_cols: Optional positional column indices selecting model inputs.

    Returns:
        DataFrame with ``Borrow Amount``, the per-token ``*_br_cost`` columns,
        ``Strategy_br_cost`` and ``Predict``.

    Raises:
        ValueError: If ``df`` has no chunk of ``ntp + tw`` rows.
    """
    ss, le, clf = load_models(ntp, tw, i)

    window_len = ntp + tw
    parts = []
    for start in range(0, len(df), tw):
        chu = df.loc[start:start + window_len - 1].copy().reset_index(drop=True)
        if len(chu) != window_len:
            # Trailing chunks without a full prediction window are skipped.
            continue

        x = chu.iloc[:, filter_cols] if filter_cols else chu
        x = x.iloc[:ntp, :]
        pred = le.classes_[clf.predict(ss.transform(x))]

        # Only rows after the feature rows receive a prediction; dropna() then
        # removes the feature rows (and any other row containing NaN).
        chu.loc[ntp:, 'Predict'] = pred[0]
        parts.append(chu.dropna())

    if not parts:
        raise ValueError(
            f"df has {len(df)} rows; at least ntp + tw = {window_len} are needed"
        )

    # DataFrame.append was removed in pandas 2.0: concatenate once instead.
    # A single chunk keeps its own index, as before.
    if len(parts) == 1:
        final = parts[0]
    else:
        final = pd.concat(parts).reset_index(drop=True)

    final = (final[['DAI_borrowRate_t-0','USDC_borrowRate_t-0','USDT_borrowRate_t-0','Predict']]
             .rename(columns={"DAI_borrowRate_t-0":"DAI_borrowRate",
                      'USDC_borrowRate_t-0':'USDC_borrowRate',
                      'USDT_borrowRate_t-0':'USDT_borrowRate'})
    )

    final['Borrow Amount'] = initial_borrow

    final['DAI_br_cost'] = final['DAI_borrowRate'] * final['Borrow Amount']/365
    final['USDC_br_cost'] = final['USDC_borrowRate'] * final['Borrow Amount']/365
    final['USDT_br_cost'] = final['USDT_borrowRate'] * final['Borrow Amount']/365

    final['Strategy_br_cost'] = final.apply(strategy_cost, axis=1)

    return final[['Borrow Amount','DAI_br_cost','USDC_br_cost','USDT_br_cost','Strategy_br_cost',"Predict"]]

In [3]:
df_results = pd.read_csv(path/"ModelResults/modelresults_newdata_compoundV2_updated.csv")

df_results

NameError: ignored

# Test Trained models on Unseen data

In [2]:
#model_to_ipfs.shape[0]

# Evaluate every saved "5-{tw}" model on the unseen data (df_predict) and collect
# one metrics record per model in `metrics`.
dollar_results = pd.DataFrame()
metrics = []

ss_list = list()
le_list = list()

initial_amount = 1000000


# The outer loop variable is kept separate from `tw`, which is (re)assigned
# from each model's own metadata inside the inner loop.
for tw_target in [7,14,21]:

    print("______")
    print(tw_target)

    save_modeldf = (df_results.query(f'ntp_tw == "5-{tw_target}"')
                    .reset_index(drop=True)
                    [["model_name","params","ntp_tw","dataset","index"]]
)

    for i in range(save_modeldf.shape[0]):

        model_row = save_modeldf.iloc[i]
        model_index = model_row["index"]
        print(model_index)

        # Read ntp/tw from the row BEFORE loading the model: load_models needs
        # them, and calling it first used an undefined (or stale) `ntp`.
        ntp,tw,stratify,fc,synth = get_data_metadata(model_row)

        ss,le,clf = load_models(ntp,tw,model_index)

        print(clf,ss)

        # Underscore form, e.g. "5_7"; the metrics record below uses the dash
        # form ("5-7") so it can be merged with df_results on `ntp_tw`.
        ntp_tw = "_".join([str(ntp),str(tw)])

        df_test,y = get_tabpandas_multi(df_predict.drop(columns="Date"), ntp, tw,inference=True)

        print(df_test.shape,y.shape,fc)

        acc,f1,precision,recall = run_test(ntp,tw,df_test,y,model_index,fc)

        metrics_dict = {
            "index": model_index,
            "ntp_tw": "-".join([str(ntp),str(tw)]),
            "u_acc": acc,
            "u_f1": f1,
            "u_precision": precision,
            "u_recall": recall,
        }

        metrics.append(metrics_dict)

NameError: ignored

In [51]:
# Attach the unseen-data metrics to each model's stored results, matching rows on ntp_tw and index.
df_final_predictions = pd.merge(pd.DataFrame(metrics), df_results, on=["ntp_tw","index"])

In [57]:
# Models for the 5-21 window, best unseen accuracy first.
df_final_predictions.loc[df_final_predictions["ntp_tw"] == "5-21"].sort_values(by='u_acc', ascending=False)

Unnamed: 0,index,ntp_tw,u_acc,u_f1,u_precision,u_recall,model_name,params,dataset,model,train_f1,test_f1
1606,468,5-21,0.857143,0.857143,0.857143,0.857143,GradientBoostingClassifier,"{'learning_rate': 0.05, 'n_estimators': 60, 'random_state': 42}","[False, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], True]","GradientBoostingClassifier(learning_rate=0.05, n_estimators=60, random_state=42)",0.992424,0.910000
1596,458,5-21,0.847619,0.847619,0.847619,0.847619,GradientBoostingClassifier,"{'learning_rate': 0.05, 'n_estimators': 50, 'random_state': 42}","[False, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], True]","GradientBoostingClassifier(learning_rate=0.05, n_estimators=50, random_state=42)",0.987374,0.910000
1657,519,5-21,0.828571,0.828571,0.828571,0.828571,GradientBoostingClassifier,"{'learning_rate': 0.05, 'n_estimators': 70, 'random_state': 42}","[False, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], True]","GradientBoostingClassifier(learning_rate=0.05, n_estimators=70, random_state=42)",0.994949,0.930000
1538,400,5-21,0.809524,0.809524,0.809524,0.809524,GradientBoostingClassifier,"{'learning_rate': 0.05, 'n_estimators': 40, 'random_state': 42}","[False, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], True]","GradientBoostingClassifier(learning_rate=0.05, n_estimators=40, random_state=42)",0.979798,0.900000
1148,10,5-21,0.800000,0.800000,0.800000,0.800000,RandomForestClassifier,"{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 120, 'random_state': 42}","[False, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], True]","RandomForestClassifier(max_depth=3, min_samples_leaf=9, n_estimators=120,\n random_state=42)",0.893939,0.780000
...,...,...,...,...,...,...,...,...,...,...,...,...
1700,562,5-21,0.228571,0.228571,0.228571,0.228571,GradientBoostingClassifier,"{'learning_rate': 0.1, 'n_estimators': 40, 'random_state': 42}","[False, None, False]","GradientBoostingClassifier(n_estimators=40, random_state=42)",1.000000,0.941176
1701,563,5-21,0.228571,0.228571,0.228571,0.228571,GradientBoostingClassifier,"{'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 42}","[False, None, False]","GradientBoostingClassifier(n_estimators=50, random_state=42)",1.000000,0.941176
1694,556,5-21,0.219048,0.219048,0.219048,0.219048,RandomForestClassifier,"{'max_depth': 9, 'min_samples_leaf': 3, 'n_estimators': 10, 'random_state': 42}","[True, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], False]","RandomForestClassifier(max_depth=9, min_samples_leaf=3, n_estimators=10,\n random_state=42)",0.988971,0.970588
1695,557,5-21,0.219048,0.219048,0.219048,0.219048,RandomForestClassifier,"{'max_depth': 12, 'min_samples_leaf': 3, 'n_estimators': 10, 'random_state': 42}","[True, [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], False]","RandomForestClassifier(max_depth=12, min_samples_leaf=3, n_estimators=10,\n random_state=42)",0.988971,0.970588


## Save Models to IPFS

In [4]:

# Upload the best models per time window to IPFS and pin them on Pinata,
# each together with its scaler, its label encoder and descriptive metadata.
ss_list = list()
le_list = list()

initial_amount = 1000000


# The outer loop variable is kept separate from `tw`, which is (re)assigned
# from each model's own metadata inside the inner loop.
for tw_target in [7,14,21]:

    print("______")
    print(tw_target)

    # Full rows (including the u_* unseen-data metrics), best unseen accuracy first.
    ranked_models = (df_final_predictions.query(f'ntp_tw == "5-{tw_target}"')
                     .sort_values('u_acc',ascending=False)
                     .reset_index(drop=True)
)
    save_modeldf = ranked_models[["model_name","params","ntp_tw","dataset","index"]]

    # Top three models; guard against windows that have fewer than three.
    for i in range(min(3, save_modeldf.shape[0])):

        model_row = ranked_models.iloc[i]
        model_index = model_row["index"]
        print(model_index)

        # Resolve ntp/tw before anything that needs them (load_models, paths, metadata).
        ntp,tw,stratify,fc,synth = get_data_metadata(save_modeldf.iloc[i])
        ntp_tw = "_".join([str(ntp),str(tw)])

        ss,le,clf = load_models(ntp,tw,model_index)

        print(clf,ss)

        # Take the metrics and model description from this model's own row
        # instead of whatever values an earlier cell left in the kernel.
        acc = float(model_row["u_acc"])
        f1 = float(model_row["u_f1"])
        precision = float(model_row["u_precision"])
        recall = float(model_row["u_recall"])
        model_name = model_row["model_name"]
        params = model_row["params"]

        # Describe the features the scaler was fit on: the filtered column
        # indices when present, otherwise every column of the prepared frame.
        if fc:
            feature_desc = str(fc)
            feature_count = str(len(fc))
        else:
            df_test,y = get_tabpandas_multi(df_predict.drop(columns="Date"), ntp, tw,inference=True)
            feature_desc = str(df_test.columns)
            feature_count = str(df_test.shape[1])  # number of columns, not rows


        scaler_dict = {"keyvalues":{}}
        scaler_dict["keyvalues"].update({"Version":"V02"})
        scaler_dict["keyvalues"].update({"dao":"DefiSquad","protocol":"Compound","network":"Mainnet"})
        scaler_dict["keyvalues"].update({"type":"StandardScaler"})
        scaler_dict["keyvalues"].update({"ntp_tw":str(ntp_tw)})
        scaler_dict["keyvalues"].update({"features":feature_desc})
        scaler_dict["keyvalues"].update({"size":feature_count})

        label_encoder_dict = {"keyvalues":{}}
        label_encoder_dict["keyvalues"].update({"Version":"V02"})
        label_encoder_dict["keyvalues"].update({"dao":"DefiSquad","protocol":"Compound","network":"Mainnet"})
        label_encoder_dict["keyvalues"].update({"type":"Label_Encoder"})
        label_encoder_dict["keyvalues"].update({"ntp_tw":str(ntp_tw)})
        label_encoder_dict["keyvalues"].update({"classes":str(list(le.classes_))})


        # NOTE(review): files are addressed by the model's `index` value, the
        # same identifier passed to load_models above; the rank `i` was used
        # before, which pointed at different artifacts. Confirm the on-disk
        # naming scheme matches load_models.
        ss_path = str(path/f'models/{ntp_tw}/{model_index}_ss_clf_{ntp}_{tw}.pkl')
        le_path = str(path/f'models/{ntp_tw}/{model_index}_le_clf_{ntp}_{tw}.pkl')
        clf_path = str(path/f'models/{ntp_tw}/{model_index}_clf_{ntp}_{tw}.pkl')


        ss_cid = upload_ipfs(ss_path,NS,ns_creds,p_creds,scaler_dict)
        le_cid = upload_ipfs(le_path,NS,ns_creds,p_creds,label_encoder_dict)

        # metrics_dict["ipfs_cid"] = str({"cidV1":cidV1,"cidV2":cidV2})
        model_dict = {"keyvalues":{}}
        model_dict["keyvalues"].update({"Version":"V02"})
        model_dict["keyvalues"].update({"projectData":str({"dao":"DefiSquad","protocol":"Compound","network":"Mainnet"})})
        model_dict["keyvalues"].update({"type":"BorrowPrediction"})
        model_dict["keyvalues"].update({"ntp_tw":str(ntp_tw)})
        model_dict["keyvalues"].update({"UnseenAccuracy":acc})
        model_dict["keyvalues"].update({"UnseenDataMetrics":str({"f1_score":f1,"precision":precision,"recall":recall})})
        model_dict["keyvalues"].update({"datasetTraining":str({"stratify":stratify,"filter_cols":fc,"smote":synth})})
        model_dict["keyvalues"].update({"scalerLabelEncoder":str({"ss":ss_cid,"le":le_cid})})
        model_dict["keyvalues"].update({"model":str({"model_name":model_name,"params":str(params)})})



        model_cid = upload_ipfs(clf_path,NS,ns_creds,p_creds,model_dict)
        model_dict["cid"] = model_cid
        metrics.append(model_dict)
        ss_list.append(ss_cid)
        le_list.append(le_cid)


    #     dollar_preds = compare_strategy(ntp,tw,initial_amount,df_test,i,fc)
    
    #     dollar_results[str(model_cid)] = dollar_preds["Strategy_br_cost"]

    #     dollar_results["Date"] = df_predict[["Date"]][:len(dollar_results)]
    #     dollar_results["ntp_tw"] = ntp_tw


    # dollar_results["DAI_br_cost"] = dollar_preds["DAI_br_cost"]
    # dollar_results["USDC_br_cost"] = dollar_preds["USDC_br_cost"]
    # dollar_results["USDT_br_cost"] = dollar_preds["USDT_br_cost"]


______
7


NameError: ignored

#Stop Here

In [None]:
#         df_temp = pd.DataFrame()
#         df_temp["Strategy_br_cost"] = dollar_preds["Strategy_br_cost"]
#         df_temp["Predict"] = dollar_preds["Predict"]

#         df_temp["Date"] = df_predict["Date"]
#         df_temp["ntp_tw"] = ntp_tw
#         df_temp["Strategy"] = str(model_cid)
#         df_temp["Model_Name"] = str(clf)

#         dollar_results = pd.concat([dollar_results,df_temp],axis=0)

# for hold_position in ["DAI","USDC","USDT_br_cost"]:

#     df_temp["Strategy_br_cost"] = dollar_preds[f"{hold_position}_br_cost"]
#     df_temp["Predict"] = dollar_preds["Predict"]

#     df_temp["Date"] = df_predict["Date"]
#     df_temp["ntp_tw"] = ntp_tw
#     df_temp["Strategy"] = str(model_cid)
#     df_temp["Model_Name"] = str(clf)

In [67]:
# Distinct scaler CIDs and distinct label-encoder CIDs collected by the upload loop.
({*ss_list}, {*le_list})

({'bafkreia7zazf3kgy4tki4gheeve4b7kfdablkhsxmbvxcpcdo3aeturjzy',
  'bafkreibh4otv3mfpicn5kdsqwdlfw7qmr7hqussepchivxeutdmlvskyya',
  'bafkreicyevoea22ifxgpvdq7rayevzcjsopq5ckwtzuupehlyb75fbx5da',
  'bafkreidq7vsjbv5affqeyf3elvnokdefxqzzfvnsrsd4qyfiauwa4askni',
  'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y',
  'bafkreigk2wyls7liuicm5ohup67ndij6bvacpx4ms32flnotpat7lo2pgy',
  'bafkreignw4jcp7qjbckd6zwtyspa7jsgjmedjdthsyhcvq4mvok52m44t4'},
 {'bafkreia7cnu4gjorb6pvutiugh33uxw74zoh4wfv7ut7ihyht6upyvfq5a',
  'bafkreiagzsunf4l2oy3f7bak3uvn723i7frfpuaqpnce5oxy22n2yjjwny',
  'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy',
  'bafkreielx2nvilkrixh32a4p4zbpfeyqt7n74zqw4i6t5wcjczc2algiru',
  'bafkreienikmd42ooxie3a3bl2ajp3fdnighplgopuvln53omdybbxfgrxm',
  'bafkreieuvvvasey2pfie52ot53p2e5ps224skgesi7ronv4g73lbfhf5hm',
  'bafkreifle6qy2urzzg46hpmdlfdw5p4gjq7hnjfykyl2mfytegczntno4m'})

In [None]:
# Show dollar_results. It is created empty in the unseen-data test cell; the
# code that would fill it is currently commented out in the upload cell.
dollar_results

In [None]:
# Reshape to long format: every non-id column becomes rows of (strategy, borrow_cost).
# `var_name` is passed as a scalar, as pd.melt documents; a list is rejected by
# newer pandas when the columns are not a MultiIndex.
# NOTE(review): the third id column embeds one specific model CID + "Predict" and
# is not produced by any cell visible here -- confirm it exists in dollar_results.
dollar_results_plot = pd.melt(
    dollar_results.copy(),
    id_vars=['ntp_tw','Date','bafkreiekzz6h566pmwexkgautmfbbjiralxnw7uyulmzgmrkm6yyptzijqPredict'],
    var_name="strategy",
    value_name="borrow_cost",
)

In [None]:
# Display the long-format (strategy, borrow_cost) table built by the melt above.
dollar_results_plot

In [68]:
# Index the collected records by CID so they can be joined to the per-strategy totals.
metrics_df = pd.DataFrame(metrics).set_index("cid")


# Total borrow cost per strategy (rounded, cheapest first), joined on the
# strategy name to the record with the same CID.
(dollar_results_plot[["strategy","borrow_cost"]]
 .groupby("strategy")
 .sum()
 .sort_values("borrow_cost",ascending=True)
 .round()
 .merge(metrics_df,left_index=True,right_index=True)
)

NameError: ignored

In [None]:
import plotly.express as px

# Borrow cost over time, one line per strategy.
px.line(dollar_results_plot,x="Date",y="borrow_cost",color="strategy")

In [69]:
# List Pinata pin jobs using the Pinata JWT in p_creds (helper imported from
# pinataapi at the top of the notebook); the cell shows the returned pair.
get_pinned_jobs(p_creds)

(<Response [200]>, {'count': 0, 'rows': []})

In [76]:
import pandas as pd
# Fetch pinned files from Pinata (page limit 300) and tabulate the response rows.
rs, response = get_pinned_files(p_creds,params={"pageLimit":300})

df_pinata = pd.DataFrame(response["rows"])

# Flatten each pin's metadata into keyvalues_* columns, keep one row per pin hash,
# and show the 50 best "BorrowPrediction" pins for ntp_tw "5_21" by unseen accuracy.
df_pinatastorage = (
    df_pinata
    .merge(pd.json_normalize(df_pinata["metadata"],sep="_"),left_index=True,right_index=True)
    [["ipfs_pin_hash","keyvalues_UnseenAccuracy","keyvalues_type","keyvalues_ntp_tw","keyvalues_scalerLabelEncoder","keyvalues_datasetTraining","keyvalues_model"]]
    .drop_duplicates("ipfs_pin_hash")
    .loc[lambda pins: (pins["keyvalues_type"] == "BorrowPrediction") & (pins["keyvalues_ntp_tw"] == "5_21")]
    .sort_values(["keyvalues_UnseenAccuracy"],ascending=False)
    .head(50)
)
df_pinatastorage

Unnamed: 0,ipfs_pin_hash,keyvalues_UnseenAccuracy,keyvalues_type,keyvalues_ntp_tw,keyvalues_scalerLabelEncoder,keyvalues_datasetTraining,keyvalues_model
46,bafybeieuujacw7ptxq4zjhbinqeekglyahjmauijtqxismmqt7k57z3r6m,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 190, 'random_state': 42}""}"
41,bafybeiaaox25axn47wd736c22ea52e6apkquhbej4huxjb3yzad2gla7ou,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 220, 'random_state': 42}""}"
58,bafkreigqyfur44zxcnnrjqcikevs7schagxmnjcopg4zdsq5e6oq2rnxse,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 150, 'random_state': 42}""}"
57,bafkreiakockm6nh5g24r46ytlbv6z6zwiljwoybdm3h7yfstgqde55nsxq,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 140, 'random_state': 42}""}"
55,bafkreihmtxs52i3obakaft7xaktu5smsueen4wp3yflefwrlcuruh6sunu,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 130, 'random_state': 42}""}"
54,bafkreif3ylquaj3b77rfx55a335t654tjgoqbosn2gvymwizkfk5347sby,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 6, 'n_estimators': 130, 'random_state': 42}""}"
50,bafkreidzxnyfk3loanbsws5gfte5s32iw4m6qrw5pdx4e2yzjwsbviuv44,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 3, 'n_estimators': 120, 'random_state': 42}""}"
48,bafkreiese7ohm5kf622ebjrw2wayva2p2ng5725d5t77otfnyypnpjdkvm,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 3, 'n_estimators': 130, 'random_state': 42}""}"
59,bafkreifvlrh4rqjeg5ikkpugfndqfk46vmk2fltzx4etzwnv565mekxpi4,0.857143,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 9, 'n_estimators': 160, 'random_state': 42}""}"
42,bafybeib3hmpjbjmkixfy7qmzm6lj75xim3gmgifyk7uz4law67benfeiky,0.847619,BorrowPrediction,5_21,"{'ss': 'bafkreiezar4apmprshcdmvefrfcfrhxg7scf76nj2kc37amowke7x7rb5y', 'le': 'bafkreibzscgshq42slwii4ney54sqv75mixfzzmq3hyhonqzfzbauwu4hy'}","{'stratify': False, 'filter_cols': [49, 64, 65, 48, 33, 32, 16, 34, 1, 39, 9, 17, 0, 18, 22, 2, 30, 78, 56, 8], 'smote': True}","{'model_name': 'RandomForestClassifier', 'params': ""{'max_depth': 3, 'min_samples_leaf': 3, 'n_estimators': 200, 'random_state': 42}""}"


In [None]:
# Plain-Python rows of [pin hash, scaler/label-encoder CIDs, dataset-training attributes].
w = df_pinatastorage[["ipfs_pin_hash","keyvalues_scalerLabelEncoder","keyvalues_datasetTraining"]].values.tolist()

In [None]:
# Print each pinned model's hash with its scaler/label-encoder CIDs and dataset attributes.
# The middle loop variable is deliberately not called `le`: that name holds the
# LabelEncoder returned by load_models in earlier cells, and rebinding it here
# would silently replace it with a string.
for model, scaler_le_cids, dataattr in w:

    print(model,scaler_le_cids,dataattr)

In [None]:
# Name the export after the ntp_tw the frame actually contains, so the file name
# cannot drift from the filter used to build df_pinatastorage (the name was
# hard-coded to "5_14" while the query selected "5_21").
export_ntp_tw = df_pinatastorage["keyvalues_ntp_tw"].unique()
if len(export_ntp_tw) != 1:
    raise ValueError(f"expected exactly one ntp_tw in df_pinatastorage, found {list(export_ntp_tw)}")
df_pinatastorage.to_csv(f"{export_ntp_tw[0]}_models.csv",index=False)

## Trained On Dates

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Class balance of the target in the training data (left column) versus the
# unseen data (right column), one row of panels per time window.

# Must be set BEFORE the figure is created; set afterwards it only affects later figures.
plt.rcParams["figure.figsize"] = (10,10)
fig, ax = plt.subplots(nrows=3,ncols=2,constrained_layout = True)

for i,tw in enumerate([7,14,21]):

    # NOTE(review): the frames are passed with their "Date" column here, whereas
    # the unseen-data test cell drops it first -- confirm which is intended.
    for j,(label,frame) in enumerate([("Train",df_train),("Unseen",df_predict)]):

        X,y = get_tabpandas_multi(frame, 5, tw, inference=True)

        classes,count = np.unique(y,return_counts=True)

        _ = ax[i][j].bar(x=classes,height=count)

        ax[i][j].title.set_text(f"{label} Classification 5-{tw}")

    
    


## Save Models