In [1]:
!pip install torch torch_geometric torch_scatter torch_sparse torch_cluster torch_spline_conv pandas numpy requests scikit-learn imbalanced-learn xgboost tensorflow shap lime matplotlib

Collecting torch_geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Collecting torch_scatter
  Using cached torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl
Collecting torch_sparse
  Using cached torch_sparse-0.6.18-cp311-cp311-linux_x86_64.whl
Collecting torch_cluster
  Using cached torch_cluster-1.6.3.tar.gz (54 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch_spline_conv
  Using cached torch_spline_conv-1.2.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadat

# Step 1: Data Acquisition

In [2]:
import requests
import pandas as pd
import numpy as np

ETHERSCAN_API_KEY = "V8RHS7P2YNSAHUY92CXVANVQK8MIYK95UQ"
BASE_URL = "https://api.etherscan.io/api"

def get_transactions(address, start_block=0, end_block=99999999):
    params = {
        'module': 'account',
        'action': 'txlist',
        'address': address,
        'startblock': start_block,
        'endblock': end_block,
        'sort': 'asc',
        'apikey': ETHERSCAN_API_KEY
    }
    response = requests.get(BASE_URL, params=params)
    return response.json()['result']

normal_addresses = [
    "0x742d35Cc6634C0532925a3b844Bc454e4438f44e",
    "0xDC76CD25977E0a5Ae17155770273aD58648900D3",
    "0x267be1c1d684f78cb4f6a176c4911b741e4ffdc0",
]

fraud_addresses = [
    "0x283aa3c6e0cf2c2d8f2c1c3b7603e7b4c8a9f2a6",
    "0x6f46cf5569aefa1acc1009290c8e043747172d89",
]

normal_txns = [get_transactions(addr) for addr in normal_addresses]
fraud_txns = [get_transactions(addr) for addr in fraud_addresses]

normal_df = pd.DataFrame([tx for sublist in normal_txns for tx in sublist])
normal_df['is_fraud'] = 0
fraud_df = pd.DataFrame([tx for sublist in fraud_txns for tx in sublist])
fraud_df['is_fraud'] = 1
df = pd.concat([normal_df, fraud_df], axis=0)


# Step 2: Data Preprocessing & Feature Engineering

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datetime import datetime

def preprocess_data(df):
    # Remove non-numeric columns for ML
    non_numeric = ['hash', 'nonce', 'blockHash', 'from', 'to', 'input', 'contractAddress', 'cumulativeGasUsed', 'blockNumber', 'timeStamp', 'transactionIndex']
    for col in non_numeric:
        if col in df.columns:
            df = df.drop(columns=[col])
    # Convert timestamp first for features
    if 'timeStamp' in df.columns:
        df['timestamp'] = df['timeStamp'].apply(lambda x: datetime.fromtimestamp(int(x)))
    else:
        df['timestamp'] = pd.to_datetime('now')
    # Feature engineering
    df['value_eth'] = df['value'].astype(float) / 1e18
    df['gas_price_gwei'] = df['gasPrice'].astype(float) / 1e9
    df['gas_used'] = df['gasUsed'].astype(float)
    df['gas_cost'] = df['gas_price_gwei'] * df['gas_used']
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['value_gas_ratio'] = df['value_eth'] / (df['gas_cost'] + 1e-9)
    # Add sender/receiver txn count (optional, but requires original from/to columns)
    df['sender_txn_count'] = 1  # Dummy if dropped
    df['receiver_txn_count'] = 1
    # Select features
    features = [
        'value_eth', 'gas_price_gwei', 'gas_used', 'gas_cost',
        'hour_of_day', 'day_of_week', 'is_weekend', 'value_gas_ratio',
        'sender_txn_count', 'receiver_txn_count'
    ]
    X = df[features]
    y = df['is_fraud']
    return X, y

X, y = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Step 3: Class Balancing

In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
resample_pipeline = Pipeline([
    ('o', over),
    ('u', under)
])
X_train_res, y_train_res = resample_pipeline.fit_resample(X_train, y_train)


# Step 4: Model Building

## XGBoost Model


In [5]:
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    scale_pos_weight=(len(y_train_res) - sum(y_train_res)) / sum(y_train_res)
)
xgb_model.fit(X_train_scaled, y_train_res)
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_proba_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]
print("XGBoost Performance:")
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb))


XGBoost Performance:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      4159
           1       0.36      0.96      0.52       100

    accuracy                           0.96      4259
   macro avg       0.68      0.96      0.75      4259
weighted avg       0.98      0.96      0.97      4259

ROC-AUC: 0.9846585717720606


## LSTM Model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam

# We create sequences using only numeric features
def create_sequences(X, y, sequence_length=10):
    sequences, labels = [], []
    for i in range(len(X) - sequence_length):
        seq = X.iloc[i:i+sequence_length].values
        label = y.iloc[i+sequence_length-1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

X_sequences, y_sequences = create_sequences(X, y)
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(
    X_sequences, y_sequences, test_size=0.2, random_state=42, stratify=y_sequences
)
# Resample: flatten to 2D, resample, then reshape
X_seq_train_2d = X_seq_train.reshape(X_seq_train.shape[0], -1)
X_seq_train_res, y_seq_train_res = resample_pipeline.fit_resample(X_seq_train_2d, y_seq_train)
X_seq_train_res = X_seq_train_res.reshape(-1, X_seq_train.shape[1], X_seq_train.shape[2])

# Scale
seq_scaler = StandardScaler()
X_seq_train_res_flat = X_seq_train_res.reshape(-1, X_seq_train_res.shape[2])
X_seq_train_scaled = seq_scaler.fit_transform(X_seq_train_res_flat).reshape(X_seq_train_res.shape)
X_seq_test_flat = X_seq_test.reshape(-1, X_seq_test.shape[2])
X_seq_test_scaled = seq_scaler.transform(X_seq_test_flat).reshape(X_seq_test.shape)

# LSTM Model
lstm_model = Sequential([
    Masking(mask_value=0., input_shape=(X_seq_train_scaled.shape[1], X_seq_train_scaled.shape[2])),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
history_lstm = lstm_model.fit(
    X_seq_train_scaled, y_seq_train_res,
    validation_data=(X_seq_test_scaled, y_seq_test),
    epochs=10,
    batch_size=64,
    class_weight={0: 1., 1: 5.}
)
y_pred_lstm = (lstm_model.predict(X_seq_test_scaled) > 0.5).astype(int)
y_proba_lstm = lstm_model.predict(X_seq_test_scaled)
print("LSTM Performance:")
print(classification_report(y_seq_test, y_pred_lstm))
print("ROC-AUC:", roc_auc_score(y_seq_test, y_proba_lstm))



  super().__init__(**kwargs)


Epoch 1/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 67ms/step - accuracy: 0.5311 - loss: 1.2405 - val_accuracy: 0.8612 - val_loss: 0.4739
Epoch 2/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 72ms/step - accuracy: 0.8889 - loss: 0.5771 - val_accuracy: 0.8868 - val_loss: 0.4097
Epoch 3/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 60ms/step - accuracy: 0.9021 - loss: 0.5041 - val_accuracy: 0.8959 - val_loss: 0.3096
Epoch 4/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.9033 - loss: 0.4391 - val_accuracy: 0.8760 - val_loss: 0.3347
Epoch 5/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 93ms/step - accuracy: 0.9239 - loss: 0.3440 - val_accuracy: 0.9009 - val_loss: 0.2959
Epoch 6/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.9339 - loss: 0.3348 - val_accuracy: 0.9302 - val_loss: 0.2119
Epoch 7/10
[1m78/78[0m [32m━━━

## CNN Model

In [7]:
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_seq_train_scaled.shape[1], X_seq_train_scaled.shape[2])),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
history_cnn = cnn_model.fit(
    X_seq_train_scaled, y_seq_train_res,
    validation_data=(X_seq_test_scaled, y_seq_test),
    epochs=10,
    batch_size=64,
    class_weight={0: 1., 1: 5.}
)
y_pred_cnn = (cnn_model.predict(X_seq_test_scaled) > 0.5).astype(int)
y_proba_cnn = cnn_model.predict(X_seq_test_scaled)
print("CNN Performance:")
print(classification_report(y_seq_test, y_pred_cnn))
print("ROC-AUC:", roc_auc_score(y_seq_test, y_proba_cnn))



Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5299 - loss: 1.3783 - val_accuracy: 0.4141 - val_loss: 0.7242
Epoch 2/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7109 - loss: 0.8471 - val_accuracy: 0.8710 - val_loss: 0.3767
Epoch 3/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8970 - loss: 0.5151 - val_accuracy: 0.9204 - val_loss: 0.2795
Epoch 4/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9449 - loss: 0.3504 - val_accuracy: 0.9563 - val_loss: 0.1715
Epoch 5/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9564 - loss: 0.2677 - val_accuracy: 0.9542 - val_loss: 0.1641
Epoch 6/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9582 - loss: 0.2435 - val_accuracy: 0.9295 - val_loss: 0.2256
Epoch 7/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━

## GCN Model

In [8]:
# Cell: Build address graph (add before GNN model code)
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import LabelEncoder

# Build mapping for addresses
addresses = pd.concat([df['from'], df['to']]).unique()
addr2idx = {addr: idx for idx, addr in enumerate(addresses)}

# Create edge index (source and target address indices)
edges = torch.tensor([
    [addr2idx[f], addr2idx[t]]
    for f, t in zip(df['from'], df['to'])
    if f in addr2idx and t in addr2idx
], dtype=torch.long).t().contiguous()

# Node features: e.g. total sent, received, and count of txns per address
feat_df = pd.DataFrame({'address': addresses})
feat_df['sent_count'] = feat_df['address'].map(df['from'].value_counts()).fillna(0)
feat_df['recv_count'] = feat_df['address'].map(df['to'].value_counts()).fillna(0)
feat_df['sent_value'] = feat_df['address'].map(df.groupby('from')['value_eth'].sum()).fillna(0)
feat_df['recv_value'] = feat_df['address'].map(df.groupby('to')['value_eth'].sum()).fillna(0)
x = torch.tensor(feat_df[['sent_count','recv_count','sent_value','recv_value']].values, dtype=torch.float)

# Node labels: fraud = 1 if in fraud list, else 0
feat_df['label'] = feat_df['address'].apply(lambda x: 1 if x in fraud_addresses else 0)
y = torch.tensor(feat_df['label'].values, dtype=torch.long)

# Cell: Define and train GCN
class SimpleGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Prepare data object
data = Data(x=x, edge_index=edges, y=y)

# Train/test split: use known frauds/non-frauds as train, rest as test
mask = feat_df['address'].isin(normal_addresses + fraud_addresses)
train_mask = torch.tensor(mask, dtype=torch.bool)
test_mask = ~train_mask

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleGCN(x.shape[1], 16, 2).to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Inference
model.eval()
with torch.no_grad():
    logits = model(data.x, data.edge_index)
    pred = logits.argmax(dim=1)
    from sklearn.metrics import classification_report
    print("GCN on known addresses:")
    print(classification_report(data.y[test_mask].cpu(), pred[test_mask].cpu()))
    gcn_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

KeyError: 'Column not found: value_eth'

## Ensembling Your Models

In [None]:
# Cell: Ensemble model predictions
from sklearn.ensemble import RandomForestClassifier

# Prepare ensemble features (align test sets)
# Note: You must align the indices between test sets. Here, we use only addresses present in all outputs.
test_addresses = set(df.iloc[y_test.index]['from']) & set(feat_df.loc[test_mask, 'address'])
ensemble_idx = [feat_df.index[feat_df['address'] == addr][0] for addr in test_addresses]

# Get probabilities (adjust as needed for your data alignment)
ensemble_X = np.vstack([
    y_proba_xgb[:len(ensemble_idx)],
    y_proba_lstm[:len(ensemble_idx)],
    y_proba_cnn[:len(ensemble_idx)],
    gcn_probs[ensemble_idx]
]).T

# Use known labels for these addresses
ensemble_y = y.iloc[y_test.index][:len(ensemble_idx)].values

# Train/test split for stacking (or do cross-validation)
X_ens_train, X_ens_test, y_ens_train, y_ens_test = train_test_split(ensemble_X, ensemble_y, test_size=0.2, random_state=42)

rf_ensemble = RandomForestClassifier(n_estimators=50, random_state=42)
rf_ensemble.fit(X_ens_train, y_ens_train)
y_ensemble_pred = rf_ensemble.predict(X_ens_test)
y_ensemble_proba = rf_ensemble.predict_proba(X_ens_test)[:, 1]

print("Ensemble Performance:")
print(classification_report(y_ens_test, y_ensemble_pred))
print("Ensemble ROC-AUC:", roc_auc_score(y_ens_test, y_ensemble_proba))

# Step 5: Model Evaluation

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

def plot_roc_curve(y_true, y_proba, model_name, color=None):
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})', color=color)
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')

def plot_pr_curve(y_true, y_proba, model_name, color=None):
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    ap = average_precision_score(y_true, y_proba)
    plt.plot(recall, precision, label=f'{model_name} (AP = {ap:.2f})', color=color)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='upper right')

# === Main plotting section ===

plt.figure(figsize=(10, 8))
plot_roc_curve(y_test, y_proba_xgb, 'XGBoost', color='C0')
plot_roc_curve(y_seq_test, y_proba_lstm, 'LSTM', color='C1')
plot_roc_curve(y_seq_test, y_proba_cnn, 'CNN', color='C2')
# Add GCN if available
if 'gcn_probs' in globals() and 'test_mask' in globals():
    # Use only test_mask indices for evaluation
    gcn_y_true = y[test_mask].cpu().numpy() if hasattr(y[test_mask], 'cpu') else y[test_mask].values
    gcn_y_proba = gcn_probs[test_mask] if hasattr(gcn_probs, '__getitem__') else gcn_probs
    plot_roc_curve(gcn_y_true, gcn_y_proba, 'GCN (GNN)', color='C3')
plt.show()

plt.figure(figsize=(10, 8))
plot_pr_curve(y_test, y_proba_xgb, 'XGBoost', color='C0')
plot_pr_curve(y_seq_test, y_proba_lstm, 'LSTM', color='C1')
plot_pr_curve(y_seq_test, y_proba_cnn, 'CNN', color='C2')
if 'gcn_probs' in globals() and 'test_mask' in globals():
    plot_pr_curve(gcn_y_true, gcn_y_proba, 'GCN (GNN)', color='C3')
plt.show()

# Step 6: Explainability with SHAP and LIME

# SHAP Analysis

In [None]:
import shap
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb.shap_values(X_test_scaled)
shap.summary_plot(shap_values_xgb, X_test_scaled, feature_names=X.columns)
shap.force_plot(explainer_xgb.expected_value, shap_values_xgb[0, :], X_test_scaled[0, :], feature_names=X.columns)


## LIME Analysis



In [None]:
import lime
import lime.lime_tabular
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    X_train_scaled,
    feature_names=X.columns,
    class_names=['Normal', 'Fraud'],
    mode='classification'
)
exp = explainer_lime.explain_instance(
    X_test_scaled[0],
    xgb_model.predict_proba,
    num_features=10
)
exp.show_in_notebook()

# Step 7: Comparison & Reporting

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

def evaluate_model(y_true, y_pred, y_proba, model_name):
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1-Score': f1_score(y_true, y_pred, zero_division=0),
        'ROC-AUC': roc_auc_score(y_true, y_proba)
    }

# Gather results
results = [
    evaluate_model(y_test, y_pred_xgb, y_proba_xgb, 'XGBoost'),
    evaluate_model(y_seq_test, y_pred_lstm, y_proba_lstm, 'LSTM'),
    evaluate_model(y_seq_test, y_pred_cnn, y_proba_cnn, 'CNN')
]

# Add GCN/GNN if available
if 'gcn_probs' in globals() and 'test_mask' in globals():
    gcn_y_true = y[test_mask].cpu().numpy() if hasattr(y[test_mask], 'cpu') else y[test_mask].values
    gcn_y_pred = (gcn_probs[test_mask] > 0.5).astype(int) if hasattr(gcn_probs, '__getitem__') else (gcn_probs > 0.5).astype(int)
    gcn_y_proba = gcn_probs[test_mask] if hasattr(gcn_probs, '__getitem__') else gcn_probs
    results.append(evaluate_model(gcn_y_true, gcn_y_pred, gcn_y_proba, 'GCN (GNN)'))

results_df = pd.DataFrame(results)

print("Model Performance Comparison:")
print(results_df)

# Visual comparison
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 6))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
for i, metric in enumerate(metrics):
    plt.subplot(1, 5, i+1)
    plt.bar(results_df['Model'], results_df[metric])
    plt.title(metric)
    plt.ylim(0, 1)
    plt.xticks(rotation=20)
plt.tight_layout()
plt.show()