# Credit Scoring for Aave V2 Wallets

This notebook loads Aave V2 transaction data, engineers features per wallet, applies KMeans clustering to derive credit score bands (0-1000), and visualizes results.

In [None]:
# Imports and setup
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

%matplotlib inline

In [None]:
# Load and normalize JSON data
# json_normalize flattens nested objects into dotted column names (for example
# 'actionData.amount'), which the later cells rely on.
# JSON is UTF-8 by specification, so the encoding is pinned here instead of
# being left to the platform default, which is not UTF-8 on every OS.
with open('datasets/user-wallet-transactions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.json_normalize(data)
print('Total records:', len(df))
df.head(3)

In [None]:
# Check columns and sample
# Print every flattened column name, then preview only the fields that the
# following cells read.
preview_cols = [
    'userWallet', 'action', 'timestamp',
    'actionData.amount', 'actionData.assetSymbol', 'actionData.assetPriceUSD',
]
print(df.columns.tolist())
df[preview_cols].head()

In [None]:
# Convert amounts and compute USD value
# Non-numeric strings become NaN rather than raising.
df['amount'] = pd.to_numeric(df['actionData.amount'], errors='coerce')
df['price_usd'] = pd.to_numeric(df['actionData.assetPriceUSD'], errors='coerce')

# Raw amounts are expressed in the token's smallest unit, so they are divided by
# 10**decimals. USDC/USDT use 6 decimals and WBTC uses 8; every other symbol
# (including a missing one) falls back to 18.
# NOTE(review): extend this table if the dataset contains further tokens whose
# decimals differ from 18.
ASSET_SCALE = {'USDC': 1e6, 'USDT': 1e6, 'WBTC': 1e8}
DEFAULT_SCALE = 1e18


def convert_amount(row):
    """Return one row's raw 'amount' converted to whole-token units.

    Kept for per-row use; the column below is computed with the same table in
    vectorized form.
    """
    return row['amount'] / ASSET_SCALE.get(row['actionData.assetSymbol'], DEFAULT_SCALE)


# Series.map leaves NaN for symbols absent from the table; those take the default.
amount_scale = df['actionData.assetSymbol'].map(ASSET_SCALE).fillna(DEFAULT_SCALE)
df['amount_converted'] = df['amount'] / amount_scale
df['usd_value'] = df['amount_converted'] * df['price_usd']
df[['amount_converted','usd_value']].head()

In [None]:
# Feature engineering per wallet
# Pass 1 accumulates raw per-wallet totals; pass 2 turns them into one feature
# row per wallet.

# The code divides timestamp gaps by this value, i.e. it treats timestamps as
# Unix seconds -- TODO confirm against the dataset.
SECONDS_PER_DAY = 86400

# action (lower-cased) -> (USD total key, count key or None when not counted)
USD_ACTIONS = {
    'deposit': ('deposit_usd', 'num_deposits'),
    'borrow': ('borrow_usd', 'num_borrows'),
    'repay': ('repay_usd', 'num_repays'),
    'redeemunderlying': ('redeem_usd', None),
}


def _new_wallet_stats():
    """Return a zeroed accumulator for a wallet seen for the first time."""
    return {'deposit_usd':0,'borrow_usd':0,'repay_usd':0,
            'redeem_usd':0,'num_deposits':0,'num_borrows':0,'num_repays':0,
            'num_liquidations':0,'num_txns':0,'unique_assets':set(),
            'timestamps':[]}


wallet_stats = defaultdict(_new_wallet_stats)

# Iterate only over the columns that are needed instead of building a Series
# per row with iterrows().
for w, action, usd, symbol, ts in zip(df['userWallet'], df['action'], df['usd_value'],
                                      df['actionData.assetSymbol'], df['timestamp']):
    stats = wallet_stats[w]
    stats['num_txns'] += 1
    stats['timestamps'].append(ts)
    # A missing symbol is not an asset; adding NaN would inflate num_assets.
    if pd.notna(symbol):
        stats['unique_assets'].add(symbol)
    # A single NaN would otherwise turn the wallet's whole running total into
    # NaN; an unpriced transaction contributes 0 USD instead.
    if pd.isna(usd):
        usd = 0.0
    # str() keeps a missing/non-string action from crashing .lower().
    act = str(action).lower()
    if act in USD_ACTIONS:
        usd_key, count_key = USD_ACTIONS[act]
        stats[usd_key] += usd
        if count_key is not None:
            stats[count_key] += 1
    elif act == 'liquidationcall':
        stats['num_liquidations'] += 1

rows=[]
for w, s in wallet_stats.items():
    ts = sorted(s['timestamps'])
    # A wallet with a single transaction has no gaps; report 0 days.
    gaps = np.diff(ts)/SECONDS_PER_DAY if len(ts)>1 else [0]
    borrowed = s['borrow_usd']
    repaid = s['repay_usd']
    rows.append({
        'wallet': w,
        'deposit_usd': s['deposit_usd'],
        'borrow_usd': borrowed,
        'repay_usd': repaid,
        'net_borrow_usd': borrowed - repaid,
        'num_liquidations': s['num_liquidations'],
        'num_txns': s['num_txns'],
        'num_assets': len(s['unique_assets']),
        'avg_days_between_txns': np.mean(gaps),
        # Defined as 0 for wallets that never borrowed (avoids division by zero).
        'repay_to_borrow_ratio': (repaid/borrowed if borrowed>0 else 0),
    })
wallet_df = pd.DataFrame(rows)
wallet_df.head()

In [None]:
# Visualization: Feature Distributions
# One histogram (with KDE overlay) per USD feature, laid out on a 2x2 grid.
features = ['deposit_usd','repay_usd','borrow_usd','net_borrow_usd']
fig, axes = plt.subplots(2, 2, figsize=(12,8))
for ax, col in zip(axes.flat, features):
    sns.histplot(wallet_df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Prepare for clustering
# Columns fed to KMeans; this list is reused by the cluster-profile cell.
features = [
    'deposit_usd', 'borrow_usd', 'repay_usd', 'net_borrow_usd',
    'num_liquidations', 'num_txns', 'repay_to_borrow_ratio', 'num_assets',
]
# Missing values are replaced with 0 before standardising each column.
X = wallet_df.loc[:, features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
X_scaled.shape

In [None]:
# Elbow & Silhouette Analysis
# Fit KMeans for k = 2..10 and record inertia and silhouette score for each k.
ks = range(2, 11)
inertias = []
sils = []
for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init='auto')
    km.fit(X_scaled)
    inertias.append(km.inertia_)
    sils.append(silhouette_score(X_scaled, km.labels_))

# Left panel: inertia vs k (elbow). Right panel: silhouette score vs k.
fig, (ax_elbow, ax_sil) = plt.subplots(1, 2, figsize=(12,4))
ax_elbow.plot(ks, inertias, '-o')
ax_elbow.set_title('Elbow')
ax_elbow.set_xlabel('k')
ax_elbow.set_ylabel('Inertia')
ax_sil.plot(ks, sils, '-o')
ax_sil.set_title('Silhouette')
ax_sil.set_xlabel('k')
ax_sil.set_ylabel('Score')
plt.tight_layout()
plt.show()

In [None]:
# Fit KMeans (k=5)
# Cluster the standardised features and store each wallet's label on wallet_df.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
cluster_labels = kmeans.fit_predict(X_scaled)
wallet_df['cluster'] = cluster_labels
# Number of wallets assigned to each cluster.
print(wallet_df['cluster'].value_counts())

In [None]:
# Cluster Profiles and Heatmap
# Mean of every clustering feature per cluster, rounded to 2 decimals.
per_cluster = wallet_df.groupby('cluster')[features]
cluster_profiles = per_cluster.mean().round(2)
display(cluster_profiles)

fig, ax = plt.subplots(figsize=(10,5))
sns.heatmap(cluster_profiles, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Cluster Profiles')
plt.show()

In [None]:
# Map clusters to credit scores and visualize
# Hand-assigned score band per KMeans label.
# NOTE(review): KMeans label ids are arbitrary, so this table is only valid for
# the specific fit it was written against -- re-check it against the cluster
# profiles whenever the data, feature set, k, seed or library version changes.
cluster_to_score={2:950,0:800,1:600,3:400,4:100}

# Fail loudly instead of silently writing NaN scores for an unmapped label.
unmapped = sorted(set(wallet_df['cluster'].unique()) - set(cluster_to_score))
if unmapped:
    raise ValueError(f'No credit score defined for cluster label(s): {unmapped}')

wallet_df['credit_score_ml']=wallet_df['cluster'].map(cluster_to_score)
# Linearly rescale so the lowest band present maps to 0 and the highest to 1000
# (the raw 100-950 bands are stretched, not kept as-is). fit_transform returns
# an (n, 1) array, which is flattened before being stored as a column.
wallet_df['credit_score_ml']=(
    MinMaxScaler(feature_range=(0,1000))
    .fit_transform(wallet_df[['credit_score_ml']])
    .ravel()
)

plt.figure(figsize=(8,4)); sns.histplot(wallet_df['credit_score_ml'], bins=20, kde=True)
plt.title('Credit Score Distribution'); plt.show()
plt.figure(figsize=(6,4)); wallet_df.groupby('cluster')['credit_score_ml'].mean().plot(kind='bar')
plt.title('Avg Credit Score per Cluster'); plt.show()

In [None]:
# Save final scores to CSV
# Only the wallet address and its final score are exported; the index is omitted.
score_columns = ['wallet', 'credit_score_ml']
final_scores = wallet_df[score_columns]
final_scores.to_csv('wallet_credit_scores_ml.csv', index=False)
print('Saved wallet_credit_scores_ml.csv')