In [1]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
from torch_geometric.utils import to_networkx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
print("Imports completed!")

Imports completed!


In [2]:
# Cell 2: Data Loading and Preprocessing
print("Loading dataset...")
df = pd.read_csv('transaction_dataset.csv')
print(f"Dataset shape: {df.shape}")

# Basic cleaning
df = df.dropna().reset_index(drop=True)
print(f"After dropna shape: {df.shape}")

# Separate features and target
X = df.drop(columns=['Address', 'FLAG', 'Index'] if 'Index' in df.columns else ['Address', 'FLAG'])
y = df['FLAG'].values

# Handle non-numeric columns
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Fill remaining NaN values
X = X.fillna(0)

# Scale features
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)

print(f"Feature shape: {X_scaled.shape}")
print(f"Fraud cases: {y.sum()} ({y.sum()/len(y)*100:.2f}%)")

# Convert to numpy
X_np = X_scaled.values.astype(np.float32)
y_np = y.astype(np.int64)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_np, y_np, test_size=0.3, random_state=42, stratify=y_np
)

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


Loading dataset...
Dataset shape: (9841, 51)
After dropna shape: (7121, 51)
Feature shape: (7121, 48)
Fraud cases: 1350 (18.96%)

Train set: 4984 samples
Test set: 2137 samples


In [3]:
# Cell 3: Graph Construction Function
def build_knn_graph(features, k=10):
    """
    Build a graph using K-nearest neighbors based on feature similarity
    """
    from sklearn.neighbors import NearestNeighbors
    
    nbrs = NearestNeighbors(n_neighbors=k+1, metric='cosine').fit(features)
    distances, indices = nbrs.kneighbors(features)
    
    # Create edge list (excluding self-loops)
    edge_list = []
    edge_weights = []
    
    for i in range(len(features)):
        for j, idx in enumerate(indices[i][1:]):  # Skip first (self)
            edge_list.append([i, idx])
            # Convert distance to similarity weight
            weight = 1 - distances[i][j+1]
            edge_weights.append(max(0, weight))
    
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_weights, dtype=torch.float32)
    
    return edge_index, edge_attr

print("Graph construction function defined!")


Graph construction function defined!


In [4]:
# Cell 4: Build Training Graph
print("\nBuilding graph structure...")
edge_index, edge_weights = build_knn_graph(X_train, k=15)
print(f"Number of edges: {edge_index.shape[1]}")

# Create PyTorch Geometric Data object
x = torch.tensor(X_train, dtype=torch.float32)
y_tensor = torch.tensor(y_train, dtype=torch.long)

data = Data(x=x, edge_index=edge_index, edge_attr=edge_weights, y=y_tensor)

print(f"\nGraph Data:")
print(f"  Nodes: {data.num_nodes}")
print(f"  Edges: {data.num_edges}")
print(f"  Node features: {data.num_node_features}")
print(f"  Classes: {data.y.unique()}")



Building graph structure...


Number of edges: 74760

Graph Data:
  Nodes: 4984
  Edges: 74760
  Node features: 48
  Classes: tensor([0, 1])


In [5]:
# Cell 5: Define GNN Model
class FraudGNN(nn.Module):
    def __init__(self, num_features, hidden_dim=64, num_classes=2, dropout=0.3):
        super(FraudGNN, self).__init__()
        
        # Graph Convolutional Layers
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim // 2)
        
        # Batch normalization
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.bn3 = nn.BatchNorm1d(hidden_dim // 2)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 4, num_classes)
        )
        
    def forward(self, x, edge_index, edge_attr=None):
        # Graph convolutions
        x = self.conv1(x, edge_index, edge_attr)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        x = self.conv2(x, edge_index, edge_attr)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        x = self.conv3(x, edge_index, edge_attr)
        x = self.bn3(x)
        x = F.relu(x)
        
        # Node-level predictions
        x = self.classifier(x)
        
        return x

print("GNN Model class defined!")


GNN Model class defined!


In [6]:
# Cell 6: Initialize Model and Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = FraudGNN(
    num_features=data.num_node_features,
    hidden_dim=128,
    num_classes=2,
    dropout=0.3
).to(device)

data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")


Using device: cpu
Model parameters: 33,826


In [7]:
# Cell 7: Training and Evaluation Functions
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, data.edge_attr)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(features, labels, edge_index, edge_attr):
    """Fixed evaluation function that accepts edge_index and edge_attr"""
    model.eval()
    with torch.no_grad():
        out = model(features, edge_index, edge_attr)
        pred = out.argmax(dim=1)
        acc = (pred == labels).float().mean().item()
        loss = criterion(out, labels).item()
    return acc, loss, pred.cpu().numpy(), F.softmax(out, dim=1)[:, 1].cpu().numpy()

print("Training and evaluation functions defined!")


Training and evaluation functions defined!


In [8]:
# Cell 8: Training Loop
print("\nTraining GNN model...")
num_epochs = 200
best_acc = 0
patience = 20
patience_counter = 0

train_losses = []
train_accs = []

for epoch in range(num_epochs):
    loss = train()
    acc, val_loss, _, _ = evaluate(data.x, data.y, data.edge_index, data.edge_attr)
    
    train_losses.append(loss)
    train_accs.append(acc)
    
    if acc > best_acc:
        best_acc = acc
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), 'best_gnn_model.pt')
    else:
        patience_counter += 1
    
    if (epoch + 1) % 20 == 0:
        print(f'Epoch {epoch+1:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')
    
    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Load best model
model.load_state_dict(torch.load('best_gnn_model.pt'))
print(f"\nBest training accuracy: {best_acc:.4f}")



Training GNN model...


Epoch 020, Loss: 0.2661, Acc: 0.8104


Early stopping at epoch 25

Best training accuracy: 0.8790


In [9]:
# Cell 9: Evaluate on Training Set
train_acc, train_loss, train_pred, train_proba = evaluate(data.x, data.y, data.edge_index, data.edge_attr)
print(f"\nTraining Results:")
print(f"  Accuracy: {train_acc:.4f}")
print(f"  Loss: {train_loss:.4f}")



Training Results:
  Accuracy: 0.8790
  Loss: 0.6847


In [10]:
# Cell 10: Build Test Graph and Evaluate
print("\nBuilding test graph...")
edge_index_test, edge_weights_test = build_knn_graph(X_test, k=15)
x_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)
data_test = Data(x=x_test, edge_index=edge_index_test.to(device), 
                 edge_attr=edge_weights_test.to(device), y=y_test_tensor).to(device)

# Evaluate on test set
test_acc, test_loss, test_pred, test_proba = evaluate(data_test.x, data_test.y, 
                                                       data_test.edge_index, data_test.edge_attr)
print(f"\nTest Results:")
print(f"  Accuracy: {test_acc:.4f}")
print(f"  Loss: {test_loss:.4f}")



Building test graph...



Test Results:
  Accuracy: 0.8685
  Loss: 0.6846


In [11]:
# Cell 11: Classification Metrics
print("\nClassification Report (Test Set):")
print(classification_report(y_test, test_pred, target_names=['Legitimate', 'Fraud']))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, test_pred))

# ROC AUC Score
try:
    roc_auc = roc_auc_score(y_test, test_proba)
    print(f"\nROC AUC Score: {roc_auc:.4f}")
except:
    print("\nROC AUC Score: Could not calculate (possibly only one class present)")



Classification Report (Test Set):
              precision    recall  f1-score   support

  Legitimate       0.92      0.91      0.92      1732
       Fraud       0.65      0.68      0.66       405

    accuracy                           0.87      2137
   macro avg       0.78      0.79      0.79      2137
weighted avg       0.87      0.87      0.87      2137


Confusion Matrix (Test Set):
[[1582  150]
 [ 131  274]]

ROC AUC Score: 0.9244


In [12]:
# Cell 12: Save Model Pipeline
print("\nSaving model and preprocessing objects...")

# Create a dictionary with all necessary components
gnn_pipeline = {
    'model': model,
    'model_state_dict': model.state_dict(),
    'scaler': scaler,
    'num_features': data.num_node_features,
    'hidden_dim': 128,
    'num_classes': 2,
    'k_neighbors': 15,
    'feature_names': X.columns.tolist()
}

# Save as pickle
with open('gnn_model.pkl', 'wb') as f:
    pickle.dump(gnn_pipeline, f)

print("Model saved as 'gnn_model.pkl'")



Saving model and preprocessing objects...
Model saved as 'gnn_model.pkl'


In [13]:
# Cell 13: Prediction Function
def predict_fraud(address_features, model_pipeline, k_neighbors=15):
    """
    Predict fraud for new address features
    
    Parameters:
    -----------
    address_features : array-like, shape (n_samples, n_features)
        Feature matrix for addresses to predict
    model_pipeline : dict
        Loaded GNN pipeline dictionary
    k_neighbors : int
        Number of neighbors for graph construction
    
    Returns:
    --------
    predictions : array, shape (n_samples,)
        Binary predictions (0: Legitimate, 1: Fraud)
    probabilities : array, shape (n_samples,)
        Fraud probabilities
    """
    model = model_pipeline['model']
    scaler = model_pipeline['scaler']
    
    # Preprocess
    features_df = pd.DataFrame(address_features, columns=model_pipeline['feature_names'])
    for col in features_df.columns:
        features_df[col] = pd.to_numeric(features_df[col], errors='coerce')
    features_df = features_df.fillna(0)
    
    features_scaled = scaler.transform(features_df)
    features_tensor = torch.tensor(features_scaled, dtype=torch.float32).to(device)
    
    # Build graph
    edge_index, edge_weights = build_knn_graph(features_scaled, k=k_neighbors)
    edge_index = edge_index.to(device)
    edge_weights = edge_weights.to(device)
    
    # Predict
    model.eval()
    with torch.no_grad():
        out = model(features_tensor, edge_index, edge_weights)
        probs = F.softmax(out, dim=1)
        preds = out.argmax(dim=1).cpu().numpy()
        fraud_probs = probs[:, 1].cpu().numpy()
    
    return preds, fraud_probs

print("Prediction function defined!")


Prediction function defined!


In [14]:
# Cell 14: Final Summary
print("\n" + "="*50)
print("GNN Model Training Complete!")
print("="*50)
print(f"\nModel file: gnn_model.pkl")
print(f"Model architecture: 3-layer GCN with hidden_dim=128")
print(f"Training accuracy: {train_acc:.4f}")
print(f"Test accuracy: {test_acc:.4f}")



GNN Model Training Complete!

Model file: gnn_model.pkl
Model architecture: 3-layer GCN with hidden_dim=128
Training accuracy: 0.8790
Test accuracy: 0.8685
