In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import sys

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

import torch.utils.data as data
import torch
from torch_geometric.data import Data
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from torch.utils.data import Dataset

import os
import torch
import torch.nn.functional as F
from tqdm import tqdm
from torch_geometric.loader import NeighborLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch_geometric.nn import MessagePassing, SAGEConv
from ogb.nodeproppred import Evaluator #PygNodePropPredDatase

from IPython.display import clear_output
import wandb
import pickle

In [25]:
# Set path
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'
# Load train and test set
with open(path + 'sequential_train.pkl', 'rb') as f:
    train_set = pickle.load(f)
with open(path + 'sequential_test.pkl', 'rb') as f:
    test_set = pickle.load(f)
with open(path + 'edges.pkl', 'rb') as f:
    edges = pickle.load(f)

In [26]:
train_set = train_set.loc[train_set['class'].isin([0, 1])] # Drop unknown
test_set = test_set.loc[test_set['class'].isin([0, 1])] # Drop unknown
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

# Split edges
edges_train = edges.loc[((edges['node1'].isin(train_set['node'])))]
edges_test = edges.loc[((edges['node1'].isin(test_set['node'])))]
edges_train = edges_train.reset_index(drop=True)
edges_test= edges_test.reset_index(drop=True)

In [27]:
def map_idx(feats: pd.DataFrame, edges: pd.DataFrame):
    mapping_txid = dict(zip(feats['node'], list(feats.index)))      
    df_edges_mapped = edges.replace({'node1': mapping_txid, 'node2': mapping_txid})
    df_feats = feats.replace({'node': mapping_txid})
    return df_feats, df_edges_mapped

train_set, edges_train = map_idx(feats = train_set, edges = edges_train)
test_set, edges_test = map_idx(feats = test_set, edges = edges_test) 

In [29]:
def get_data(feats: pd.DataFrame, edges:pd.DataFrame):
    x = torch.tensor(feats.drop(columns=['class', 'time step', 'node']).values, dtype=torch.float)
    edge_index = torch.tensor(edges.values, dtype=torch.long).T    
    y = torch.tensor(np.array(feats['class'].values, np.int64))
    time = torch.tensor(feats['time step'].values)
    data = Data(x=x, edge_index=edge_index, y=y, time=time)
    return data


train_data = get_data(train_set, edges_train)
test_data = get_data(test_set, edges_test)

In [36]:
train_data.x.shape[1]

165

In [30]:
class SAGE(torch.nn.Module):
    def __init__(self, in_channels,
                 hidden_channels, out_channels,
                 n_layers=2):
        
        super(SAGE, self).__init__()
        self.n_layers = n_layers
        self.layers = torch.nn.ModuleList()
        self.layers_bn = torch.nn.ModuleList()
        if n_layers == 1:
            self.layers.append(SAGEConv(in_channels, out_channels,   normalize=False))
        elif n_layers == 2:
            self.layers.append(SAGEConv(in_channels, hidden_channels, normalize=False))
            self.layers_bn.append(torch.nn.BatchNorm1d(hidden_channels))
            # self.layers.append(SAGEConv(hidden_channels, out_channels, normalize=False))
        else:
            self.layers.append(SAGEConv(in_channels, hidden_channels, normalize=False))
            # self.layers_bn.append(torch.nn.BatchNorm1d(hidden_channels))
        for _ in range(n_layers - 2):
            self.layers.append(SAGEConv(hidden_channels,  hidden_channels, normalize=False))
            self.layers_bn.append(torch.nn.BatchNorm1d(hidden_channels))
                    
                
        if n_layers != 1:
            self.layers.append(SAGEConv(hidden_channels, out_channels, normalize=False))
        
        for layer in self.layers:
            layer.reset_parameters()
            
            
            
    def forward(self, x, edge_index):
        if len(self.layers) > 1:
            looper = self.layers[:-1]
        else:
            looper = self.layers
        
        for i, layer in enumerate(looper):
            x = layer(x, edge_index)
            # print(f"SHAPE: {x.shape}, step: {i}")
            # print(f"Step: {i}")
            try:
                x = self.layers_bn[i](x)
            except Exception as e:
                abs(1)
            finally:
                x = F.relu(x)
                x = F.dropout(x, p=0.5, training=self.training)
        
        if len(self.layers) > 1:
            x = self.layers[-1](x, edge_index)
        return F.log_softmax(x, dim=-1), torch.var(x)
    
    def inference(self, total_loader, device):
        xs = []
        var_ = []
        for batch in total_loader:
            out, var = self.forward(batch.x.to(device), batch.edge_index.to(device))
            out = out[:batch.batch_size]
            xs.append(out.cpu())
            var_.append(var.item())
        
        out_all = torch.cat(xs, dim=0)
        
        return out_all, var_

Entrenar models

In [31]:
EPOCHS = 101
layers_list = [2]
# layers_list = [4]
wb = True
for LAYERS in layers_list:
    model = SAGE(train_data.x.shape[1], 256, torch.unique(train_data.y).size(0), n_layers=LAYERS)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.03)
    scheduler = ReduceLROnPlateau(optimizer, 'max', patience=7)
    criterion = torch.nn.CrossEntropyLoss()
    if wb:
        wandb.init(
            # set the wandb project where this run will be logged
            project="GraphAnomaly",
            name = f"2_SAGE - n_layers: {LAYERS}",
            # track hyperparameters and run metadata
            config={
            "architecture": "SAGE_3",
            "dataset": "Time Steps elliptic",
            "epochs": EPOCHS,
            "layers": LAYERS
            }
        )

    def train(data, epoch):
        optimizer.zero_grad()  # Clear gradients.
        out, h = model(data.x, data.edge_index)  # Perform a single forward pass.
        pred = out.argmax(dim=1)
        loss = criterion(out, data.y)  # Compute the loss solely based on the training nodes.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        
        # Calculate accuracy
        correct = pred.eq(data.y).sum().item()
        total = len(data.y)
        accuracy = correct / total
        
        if wb:
            wandb.log({
                "epoch": epoch,
                "train_loss": loss,
                "train_accuracy": accuracy,
            })
        if epoch%5 == 0:
            max_value = torch.max(pred)
            counts = torch.bincount(pred, minlength=max_value.item() + 1)
            print(f"""Nodes classified as fraud: {counts[0]}, 
                nodes classified as licit: {counts[1]}""")
            
            
        return loss, accuracy

    def test(data, epoch):
        model.eval()  # Set the model to evaluation mode.
        with torch.no_grad():
            out, _ = model(data.x, data.edge_index)  # Perform a forward pass.
            loss = criterion(out, data.y)
            
            pred = out.argmax(dim=1)  # Get the predicted labels by selecting the class with the highest probability.
            # Calculate accuracy
            correct = pred.eq(data.y).sum().item()
            total = len(data.y)
            accuracy = correct / total
            
            #precision, recall, f1 score
            precision = precision_score(data.y.cpu().numpy(), pred.cpu().numpy(), average=None)
            recall = recall_score(data.y.cpu().numpy(), pred.cpu().numpy(), average=None)
            f1 = f1_score(data.y.cpu().numpy(), pred.cpu().numpy(), average=None)
            
            other_metrics = [precision, recall, f1]
            if wb:
                wandb.log({
                "epoch": epoch,
                "test_loss": loss,
                "test_accuracy": accuracy,
                "test_precision_class_0": precision[0],
                "test_precision_class_1": precision[1],
                "test_recall_class_0": recall[0],
                "test_recall_class_1": recall[1],
                "test_f1_score_class_0": f1[0],
                "test_f1_score_class_1": f1[1]
            })
            
        return loss, accuracy, other_metrics
            
        
        
    train_loss_history = []
    test_loss_history = []
    train_acc_hist = []
    test_acc_hist = []

    for epoch in range(EPOCHS):
        train_loss, train_acc = train(train_data, epoch)
        train_loss_history.append(train_loss.detach().numpy())
        train_acc_hist.append(train_acc)
        
        test_loss, test_acc, metrics = test(test_data, epoch)
        test_loss_history.append(test_loss.detach().numpy())
        test_acc_hist.append(test_acc)
        
        if epoch%5 == 0:
            print(f"""Epoch: {epoch}, 
            Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}
            Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}
            """)
            
            print(f"Precision: {metrics[0]}, Recall: {metrics[1]}, F1 Score: {metrics[2]} \n")
            print("---------------------------------------------------------------------------------")
    
    
        

VBox(children=(Label(value='0.001 MB of 0.030 MB uploaded\r'), FloatProgress(value=0.04431723162024828, max=1.…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

ValueError: Encountered invalid 'dim_size' (got '29894' but expected >= '402916477')

In [11]:
save_model = False
if save_model:
    torch.save(model.state_dict(), 'final_model_2_layers.pth')

Load model

In [23]:
unk_data.x.shape[1]

165

In [35]:
# Instantiate the model
model = SAGE(unk_data.x.shape[1], 256, 2, n_layers=2)

# Load the model's state dict
model.load_state_dict(torch.load('C:/Users/gsamp/OneDrive/Documents/AI-3/2n Semestre/Projecte de Síntesi 2/GraphAnomaly/GNN_models/trained_models/final_model_2_layers.pth'))

# Set the model to evaluation mode
model.eval()

with torch.no_grad():
    # Forward pass through the model
    output = model(unk_data.x, unk_data.edge_index)

# 'output' now contains the model's predictions for the unseen data
output[0]
predicted_classes = torch.argmax(output[0], dim=1)

# Count the occurrences of each value
value_counts = torch.bincount(predicted_classes)

# Print the distribution
for value, count in enumerate(value_counts):
    print(f"Value {value}: {count.item()} occurrences")

Value 0: 7341 occurrences
Value 1: 115940 occurrences
