In [None]:
import pyreadr
import pandas as pd

In [None]:
# Location of the Aave V2 mainnet transactions export (an R .rds file).
TRANSACTIONS_RDS_PATH = "/data/IDEA_DeFi_Research/Data/Lending_Protocols/Aave/V2/Mainnet/transactions.rds"

# Read the file; the returned mapping is indexed by object name in a later cell.
result = pyreadr.read_r(TRANSACTIONS_RDS_PATH)

In [None]:
# An .rds file holds a single unnamed object, which pyreadr exposes under the key None.
df = result[None]  # the loaded table as a pandas DataFrame

In [None]:
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
# NOTE(review): none of the visible cells performs chained assignment — confirm this
# global switch is still needed, since it can hide real bugs.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Show the loaded frame as the cell output.
df

In [None]:
# Persist `df` with IPython's %store magic so another session can restore it via `%store -r df`.
%store df

In [None]:
# Boolean mask with the same shape as `df`: True wherever a value is missing.
df.isna()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Plot the boolean null mask of `df` as a heatmap (rows = samples, columns = features).
missing_matrix = df.isnull()
fig, missing_ax = plt.subplots(figsize=(20, 10))
missing_heat_map = sns.heatmap(data=missing_matrix, ax=missing_ax)
missing_heat_map.set(
    xlabel='Features',
    ylabel='Samples',
    title='Missingness Heatmap',
)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only.
# The frame also carries non-numeric columns (e.g. `type`, and the columns that are
# one-hot encoded later), and on pandas >= 2.0 DataFrame.corr() raises on those
# because `numeric_only` now defaults to False. Selecting the numeric dtypes
# explicitly works on every pandas version.
corr = df.select_dtypes(include='number').corr()

# Draw on an explicitly created axes so the title is attached to the heatmap itself.
fig, corr_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, ax=corr_ax)
corr_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Distinct values of the `type` column, in order of first appearance in `df`.
# This order matters: later cells build and unpack per-type frames positionally.
types = df['type'].unique()
print(f'Different types: {types}')

In [None]:
# One sub-frame per distinct `type` value, kept in the same order as `types`.
subframes = [df.loc[df['type'] == tx_type] for tx_type in types]

In [None]:
# Persist the list of per-type frames with IPython's %store magic.
%store subframes

In [None]:
def clean_data(df):
    """Return a date-indexed copy of ``df`` without incomplete columns.

    Steps, in order:
      1. drop every column that contains at least one missing value;
      2. add a ``date`` column parsed from the input's ``timestamp`` column
         and make it the index;
      3. remove the ``id`` and ``timestamp`` columns;
      4. sort the rows by the new index.

    The input frame is not modified.

    Raises:
        KeyError: if ``timestamp`` is absent from the input, or if ``id`` or
            ``timestamp`` is no longer present after step 1.
    """
    complete_columns = df.dropna(axis=1)
    # Dates come from the *input* frame's timestamp column, as in the original chain.
    parsed_dates = pd.to_datetime(df['timestamp'])
    date_indexed = complete_columns.assign(date=parsed_dates).set_index('date')
    trimmed = date_indexed.drop(columns=['id', 'timestamp'])
    return trimmed.sort_index()

In [None]:
# Clean every per-type frame; the result keeps the order of `subframes`.
cleaned = list(map(clean_data, subframes))

In [None]:
# Persist the list of cleaned per-type frames with IPython's %store magic.
%store cleaned

In [None]:
# Positional unpacking: `cleaned` follows the order of `df['type'].unique()`, i.e. the
# order in which each type first appears in the data. These names therefore only match
# their contents if the types appear in exactly this order.
# NOTE(review): check against the printed `types`; a different data order would silently
# mislabel the frames, and a type count other than eight raises ValueError here.
borrow, collateral, deposit, liquidation, withdraw, repay, swap, flashLoan = cleaned

In [None]:
# Persist each unpacked per-type frame with IPython's %store magic.
%store borrow
%store collateral
%store deposit
%store liquidation
%store withdraw
%store repay
%store swap
%store flashLoan

In [None]:
# Preview the first rows of `borrow`. A bare last expression gets the notebook's rich
# table rendering, matching the other per-type preview cells.
borrow.head()

In [None]:
# Preview the first rows of the `collateral` frame.
collateral.head()

In [None]:
# Preview the first rows of the `deposit` frame.
deposit.head()

In [None]:
# Preview the first rows of the `liquidation` frame.
liquidation.head()

In [None]:
# Preview the first rows of the `withdraw` frame.
withdraw.head()

In [None]:
# Preview the first rows of the `repay` frame.
repay.head()

In [None]:
# Preview the first rows of the `swap` frame.
swap.head()

In [None]:
# Preview the first rows of the `flashLoan` frame.
flashLoan.head()

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the first 10,000 borrow rows (this rebinds `borrow` to the smaller frame).
borrow = borrow.iloc[:10000]
# The prediction target is `amountUSD`; every other column is kept as a feature.
target = borrow['amountUSD']
features = borrow.drop(columns=['amountUSD'])

In [None]:
# Preprocess categorical features with One-Hot Encoding.
# Columns that are expanded into one-hot indicator columns:
categorical_features = ['userAlias', 'pool', 'reserve']

# Ask the encoder for a dense array. scikit-learn 1.2 renamed the `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so try the current keyword first and
# fall back to the legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 does not accept `sparse_output`.
    encoder = OneHotEncoder(sparse=False)


In [None]:
# Fit the encoder on the categorical columns and encode them in one step.
# NOTE(review): `userAlias` may have many distinct values, which would make this
# matrix very wide — confirm the resulting shape is acceptable.
encoded_features = encoder.fit_transform(features[categorical_features])

In [None]:
# Join the non-categorical columns and the one-hot encoded columns side by side
# into a single feature matrix (apply normalization/scaling beforehand if needed).
remaining_features = features.drop(columns=categorical_features)
processed_features = np.concatenate((remaining_features, encoded_features), axis=1)

# Hyperparameters for the model cell (currently commented out); all are starting
# points to experiment with.
embedding_dim = 16
lstm_units = 32
epochs = 10
batch_size = 64

In [None]:
# encoded_features = encoder.fit_transform(features[categorical_features])
# categorical_feature_names = encoder.get_feature_names_out()  # Get encoded feature names

In [None]:
# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(processed_features, target, test_size=0.2, random_state=42)

# # Define the model
# model = keras.Sequential()

# # Embedding layer for categorical features (adjust input_dim based on one-hot encoded vocabulary size)
# model.add(keras.layers.Embedding(input_dim=encoder.get_feature_names_out().shape[0], output_dim=embedding_dim, input_length=processed_features.shape[1]))

# # LSTM layers
# model.add(keras.layers.LSTM(units=lstm_units, return_sequences=True))
# model.add(keras.layers.LSTM(units=lstm_units))

# # Output layer for predicting amountUSD (adjust based on your target variable)
# model.add(keras.layers.Dense(units=1, activation='linear'))

# # Compile the model
# model.compile(loss='mse', optimizer='adam')

# # Train the model
# model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

# # Make predictions on testing set
# y_pred = model.predict(X_test)

# # Evaluate model performance (e.g., calculate mean squared error)
# mse = tf.keras.losses.MeanSquaredError()
# loss = mse(y_test, y_pred)
# print(f"Mean Squared Error on Testing Set: {loss.numpy()}")