In [None]:
import torch
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score, precision_score, recall_score

from torch_geometric.loader import DataLoader

In [None]:
# Load the pickled transactions table.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
data = pd.read_pickle("../data/transactions.pkl")

# 'fraud' is the prediction target; every other column is kept as a feature.
y = data['fraud']
X = data.drop(columns=['fraud'])

In [None]:
# Single seed shared by both splits and by SMOTE so the cell is reproducible.
RANDOM_STATE = 42

# Hold out 20% of the data for testing. `stratify` keeps the proportion of
# each `fraud` class the same in every split (presumably fraud is the rare
# class, where an unstratified split can leave a split with very few
# positives -- TODO confirm class balance).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
# 25% of the remaining 80% -> 20% of the full data for validation,
# leaving 60% for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=RANDOM_STATE, stratify=y_temp
)

# Resample the training split only; validation and test data are left as
# they were split so that evaluation is done on un-resampled samples.
sm = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = sm.fit_resample(X_train, y_train)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Resampled set size: {X_res.shape[0]} samples")
print(f"Validation set size: {X_valid.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

In [None]:
# `device` is first needed in this cell, so it is defined here; without this
# a fresh "Restart & Run All" fails with NameError because the only other
# assignment lives in a later cell (which re-assigns the same value).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _feature_tensor(array):
    """Convert a feature array to a float32 tensor on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)


def _label_tensor(array):
    """Convert a label array to a float32 tensor with a trailing axis of size 1 on `device`."""
    return torch.tensor(array, dtype=torch.float32).unsqueeze(1).to(device)


# Raw NumPy views of each split (training uses the resampled data).
X_train_np = X_res.values
X_valid_np = X_valid.values
X_test_np = X_test.values
y_train_np = y_res.values
y_valid_np = y_valid.values
y_test_np = y_test.values

X_train_tensor = _feature_tensor(X_train_np)
X_valid_tensor = _feature_tensor(X_valid_np)
X_test_tensor = _feature_tensor(X_test_np)

# NOTE(review): labels are stored as float32 with an added axis. The model
# defined later in this notebook outputs two class scores, whose loss
# expects integer class indices without the extra axis -- confirm how these
# label tensors are consumed before reusing them for that model.
y_train_tensor = _label_tensor(y_train_np)
y_valid_tensor = _label_tensor(y_valid_np)
y_test_tensor = _label_tensor(y_test_np)

In [None]:
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F

class GNN(torch.nn.Module):
    """Two GCNConv layers, global mean pooling, then a linear classifier.

    Args:
        input_dim: Number of input features per node.
        output_dim: Number of output classes.
        hidden_dim: Output width of the first GCN layer (default 16).
        pooled_dim: Output width of the second GCN layer, i.e. the size of
            the pooled representation fed to the linear layer (default 32).
        dropout: Dropout probability used after the first GCN layer and
            after pooling (default 0.5, the ``F.dropout`` default).
    """

    def __init__(self, input_dim, output_dim, hidden_dim=16, pooled_dim=32, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, pooled_dim)
        self.out = torch.nn.Linear(pooled_dim, output_dim)
        self.dropout = dropout

    def forward(self, data):
        """Return log-probabilities over classes for the graphs in ``data``.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``batch``
                attributes (presumably a PyG ``Data``/``Batch`` -- the
                caller must pass the whole object, not ``x``/``edge_index``
                separately).

        Returns:
            Tensor of log-softmax scores with ``output_dim`` columns.
        """
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        # Dropout is only active in training mode (model.train()).
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.conv2(x, edge_index))

        # Pool node embeddings into one vector per graph, grouped by
        # data.batch.
        # NOTE(review): for a single un-batched graph data.batch may be
        # None; confirm the installed PyG version accepts that here.
        x = global_mean_pool(x, batch=data.batch)

        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.out(x)

        # Log-probabilities: pair with NLLLoss (CrossEntropyLoss would apply
        # log_softmax again, which is redundant).
        return F.log_softmax(x, dim=1)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input width is the number of feature columns in the resampled training set;
# two outputs, one per class.
model = GNN(input_dim=X_res.shape[1], output_dim=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# GNN.forward already returns log_softmax output, so NLLLoss is the matching
# criterion. CrossEntropyLoss applies log_softmax internally and would
# normalise the scores a second time (same value, redundant work).
criterion = torch.nn.NLLLoss()

In [None]:
def train(batch_size=32):
    """Run one training epoch over ``train_data`` and return the mean loss per graph.

    Relies on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``device`` and ``train_data``. NOTE(review): ``train_data`` is not
    defined in the cells visible here -- it must be built (as a collection
    of PyG graph objects with ``x``, ``edge_index`` and ``y``) before this
    is called.

    Args:
        batch_size: Number of graphs per mini-batch (default 32).

    Returns:
        The loss averaged over every graph seen in the epoch.

    Raises:
        ValueError: If ``train_data`` yields no batches.
    """
    model.train()
    loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    total_loss = 0.0
    total_graphs = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        # GNN.forward takes the whole batch object; it reads x, edge_index
        # and the batch assignment vector itself.
        output = model(batch)
        loss = criterion(output, batch.y)
        loss.backward()
        optimizer.step()
        # The criterion returns the mean over the batch, so weight it by the
        # batch size; dividing the sum of batch means by the sample count
        # would under-report the loss by roughly a factor of batch_size.
        total_loss += loss.item() * batch.num_graphs
        total_graphs += batch.num_graphs

    if total_graphs == 0:
        raise ValueError("train_data is empty; nothing to train on")
    return total_loss / total_graphs

In [None]:
def evaluate(data, batch_size=32):
    """Score the notebook-level ``model`` on ``data``.

    Args:
        data: Dataset accepted by ``DataLoader`` (presumably a collection of
            PyG graph objects carrying ``x``, ``edge_index`` and ``y``).
        batch_size: Number of graphs per evaluation batch (default 32).

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and
        F1 are support-weighted averages over the classes.

    Raises:
        ValueError: If ``data`` yields no samples.
    """
    model.eval()
    # Keep the loader under its own name: reusing `data` as the loop variable
    # would shadow the argument, so any use of it after the loop would refer
    # to the last batch rather than the dataset.
    loader = DataLoader(data, batch_size=batch_size)

    correct = 0
    predictions, targets = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            # GNN.forward takes the whole batch object, not x/edge_index.
            output = model(batch)
            pred = output.argmax(dim=1)
            correct += (pred == batch.y).sum().item()
            predictions.extend(pred.tolist())
            targets.extend(batch.y.tolist())

    if not targets:
        raise ValueError("evaluate() received an empty dataset")

    accuracy = correct / len(targets)
    # zero_division=0 yields the same score as the default but without a
    # warning each time a class receives no predictions.
    precision = precision_score(targets, predictions, average='weighted', zero_division=0)
    recall = recall_score(targets, predictions, average='weighted', zero_division=0)
    f1 = f1_score(targets, predictions, average='weighted', zero_division=0)
    return accuracy, precision, recall, f1

In [None]:
num_epochs = 100

for epoch in range(num_epochs):
    # One optimisation pass, then score both splits with the updated weights.
    loss = train()
    train_metrics = evaluate(train_data)
    valid_metrics = evaluate(valid_data)

    # Each metrics tuple is (accuracy, precision, recall, f1), in the order
    # evaluate() returns them. The three fragments form one output line.
    print(''.join((
        f'Epoch: {epoch+1}, Loss: {loss:.4f}, Train Acc: {train_metrics[0]:.4f}, Valid Acc: {valid_metrics[0]:.4f}, ',
        f'Train Precision: {train_metrics[1]:.4f}, Train Recall: {train_metrics[2]:.4f}, Train F1: {train_metrics[3]:.4f}, ',
        f'Valid Precision: {valid_metrics[1]:.4f}, Valid Recall: {valid_metrics[2]:.4f}, Valid F1: {valid_metrics[3]:.4f}',
    )))