# Graph Neural Network

## Preparing dataset

In [1]:
# Import packages
import pandas as pd
from sklearn.metrics import *
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from torch.optim.lr_scheduler import ReduceLROnPlateau
import wandb

In [2]:
# Read files
import pickle
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'

with open(path + 'sequential_train.pkl', 'rb') as f:
    df_train_init = pickle.load(f)
with open(path + 'sequential_test.pkl', 'rb') as f:
    df_test_init = pickle.load(f)
with open(path + 'edges.pkl', 'rb') as f:
    df_edges_init = pickle.load(f)

In [3]:
def prep_df(df_feats: pd.DataFrame, edges: pd.DataFrame):
    #ens quedem només amb els edges que apareixen en el nodes d'entrenament
    df_edges = edges.loc[((edges['node1'].isin(df_feats['node'])) & (df_edges_init['node2'].isin(df_feats['node'])))]
    df_edges = df_edges.reset_index(drop=True)
    print(f"Comptador de valors per classe: \n {df_feats['class'].value_counts()}\n")
    return  df_feats, df_edges

df_train, df_edges_train = prep_df(df_train_init, df_edges_init)
df_test, df_edges_test = prep_df(df_test_init, df_edges_init)

Comptador de valors per classe: 
 class
2    110537
1     27591
0      3644
Name: count, dtype: int64

Comptador de valors per classe: 
 class
2    46668
1    14428
0      901
Name: count, dtype: int64



In [4]:
def map_idx(feats: pd.DataFrame, edges: pd.DataFrame, save = True, loading_dir = "a"):
    mapping_txid = dict(zip(feats['node'], list(feats.index)))
    dir = 'dades_guillem/' + str(loading_dir) + '.pkl'
    if save:
        df_edges_mapped = edges.replace({'txId1': mapping_txid, 'txId2': mapping_txid})
        
        df_edges_mapped.to_pickle(loading_dir)
    else:
        df_edges_mapped = pd.read_pickle(loading_dir)
    return df_edges_mapped

df_edges_mapped_train = map_idx(feats = df_train, edges = df_edges_train, save = True, loading_dir='train')
df_edges_mapped_test = map_idx(feats = df_test, edges = df_edges_test, save = True, loading_dir='test') 

In [5]:
def get_data(feats: pd.DataFrame, edges:pd.DataFrame):
    x = torch.tensor(feats.drop(columns=['class', 'time step', 'node']).values, dtype=torch.float)
    edge_index = torch.tensor(edges.values, dtype=torch.long).T
    y = torch.tensor(list(feats['class'].values))
    time = torch.tensor(feats['time step'].values)
    data = Data(x=x, edge_index=edge_index, y=y, time=time)
    return data


train_data = get_data(df_train, df_edges_mapped_train)
test_data = get_data(df_test, df_edges_mapped_test)

## Defining, training and saving model

In [6]:
class SAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, n_layers=2):
        super(SAGE, self).__init__()
        self.n_layers = n_layers
        self.layers = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        # Define the first layer
        self.layers.append(SAGEConv(in_channels, hidden_channels if n_layers > 1 else out_channels, normalize=False))
        if n_layers > 1:
            self.batch_norms.append(torch.nn.BatchNorm1d(hidden_channels))

        # Define the middle layers
        for _ in range(n_layers - 2):
            self.layers.append(SAGEConv(hidden_channels, hidden_channels, normalize=False))
            self.batch_norms.append(torch.nn.BatchNorm1d(hidden_channels))

        # Define the last layer if more than one layer
        if n_layers > 1:
            self.layers.append(SAGEConv(hidden_channels, out_channels, normalize=False))

        # Reset parameters
        for layer in self.layers:
            layer.reset_parameters()

    def forward(self, x, edge_index):
        for i, layer in enumerate(self.layers[:-1]):
            x = layer(x, edge_index)
            x = self.batch_norms[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=0.5, training=self.training)  # Dropout for regularization

        x = self.layers[-1](x, edge_index)
        return F.log_softmax(x, dim=-1), torch.var(x)

    def inference(self, total_loader, device):
        xs, var_ = [], []
        for batch in total_loader:
            out, var = self.forward(batch.x.to(device), batch.edge_index.to(device))
            out = out[:batch.batch_size]
            xs.append(out.cpu())
            var_.append(var.item())
        
        out_all = torch.cat(xs, dim=0)
        return out_all, var_


In [None]:
EPOCHS = 101
layers_list = [2]
wb = True
for LAYERS in layers_list:
    model = SAGE(train_data.x.shape[1], 1024, torch.unique(train_data.y).size(0), n_layers=LAYERS)    
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.05, weight_decay=5e-4)  # Using AdamW optimizer
    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=7, min_lr=1e-5)  # Learning rate scheduler

    scheduler = ReduceLROnPlateau(optimizer, 'max', patience=7)
    criterion = torch.nn.CrossEntropyLoss()
    if wb:
        wandb.init(
            # set the wandb project where this run will be logged
            project="GraphAnomaly",
            name = f"2_SAGE - n_layers: {LAYERS}",
            # track hyperparameters and run metadata
            config={
            "architecture": "SAGE_3",
            "dataset": "Time Steps elliptic",
            "epochs": EPOCHS,
            "layers": LAYERS
            }
        )

    def train(data, epoch):
        optimizer.zero_grad()  # Clear gradients.
        out, h = model(data.x, data.edge_index)  # Perform a single forward pass.
        pred = out.argmax(dim=1)
        loss = criterion(out, data.y)  # Compute the loss solely based on the training nodes.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        
        # Calculate accuracy
        correct = pred.eq(data.y).sum().item()
        total = len(data.y)
        accuracy = correct / total
        
        if wb:
            wandb.log({
                "epoch": epoch,
                "train_loss": loss,
                "train_accuracy": accuracy,
            })
        if epoch%5 == 0:
            max_value = torch.max(pred)
            counts = torch.bincount(pred, minlength=max_value.item() + 1)
            print(f"""Nodes classified as fraud: {counts[0]}, 
                nodes classified as licit: {counts[1]}""")
            
            
        return loss, accuracy

    def test(data, epoch):
        model.eval()  # Set the model to evaluation mode.
        with torch.no_grad():
            out, _ = model(data.x, data.edge_index)  # Perform a forward pass.
            loss = criterion(out, data.y)
            
            pred = out.argmax(dim=1)  # Get the predicted labels by selecting the class with the highest probability.
            # Calculate accuracy
            correct = pred.eq(data.y).sum().item()
            total = len(data.y)
            accuracy = correct / total
            
            #precision, recall, f1 score
            precision = precision_score(data.y.cpu().numpy(), pred.cpu().numpy(), average=None)
            recall = recall_score(data.y.cpu().numpy(), pred.cpu().numpy(), average=None)
            f1 = f1_score(data.y.cpu().numpy(), pred.cpu().numpy(), average=None)
            
            other_metrics = [precision, recall, f1]
            if wb:
                wandb.log({
                "epoch": epoch,
                "test_loss": loss,
                "test_accuracy": accuracy,
                "test_precision_class_0": precision[0],
                "test_precision_class_1": precision[1],
                "test_recall_class_0": recall[0],
                "test_recall_class_1": recall[1],
                "test_f1_score_class_0": f1[0],
                "test_f1_score_class_1": f1[1]
            })
            
        return loss, accuracy, other_metrics
            
        
        
    train_loss_history = []
    test_loss_history = []
    train_acc_hist = []
    test_acc_hist = []

    for epoch in range(EPOCHS):
        train_loss, train_acc = train(train_data, epoch)
        train_loss_history.append(train_loss.detach().numpy())
        train_acc_hist.append(train_acc)
        
        test_loss, test_acc, metrics = test(test_data, epoch)
        test_loss_history.append(test_loss.detach().numpy())
        test_acc_hist.append(test_acc)
        
        if epoch%5 == 0:
            print(f"""Epoch: {epoch}, 
            Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}
            Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}
            """)
            
            print(f"Precision: {metrics[0]}, Recall: {metrics[1]}, F1 Score: {metrics[2]} \n")
            print("---------------------------------------------------------------------------------")
    
    
        

VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded\r'), FloatProgress(value=0.0686186634797331, max=1.0…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test_accuracy,▂▁▂▄▆▆▆▇▇▇▇▇▇███████████████████████████
test_f1_score_class_0,▁▁▂▄▅▇▇▇██████████████▇▇████████████████
test_f1_score_class_1,▂▁▃▅▆▇▇▇▇▇██████████████████████████████
test_loss,▅█▆▅▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_precision_class_0,▁▁▁▂▃▄▄▄▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███████████████
test_precision_class_1,▄█▆▆▄▅▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_recall_class_0,▇█▇▆▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_recall_class_1,▂▁▂▄▆▆▆▆▇▇▇▇▇▇██████████████████████████
train_accuracy,▁▆▇▇████████████████████████████████████

0,1
epoch,100.0
test_accuracy,0.82924
test_f1_score_class_0,0.56283
test_f1_score_class_1,0.8939
test_loss,0.44619
test_precision_class_0,0.70493
test_precision_class_1,0.8522
test_recall_class_0,0.4684
test_recall_class_1,0.93988
train_accuracy,0.90354


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

Nodes classified as fraud: 85504, 
                nodes classified as licit: 47299


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 0, 
            Training Loss: 0.8545, Training Accuracy: 0.3794
            Test Loss: 46.0407, Test Accuracy: 0.7653
            
Precision: [0.         0.76533024], Recall: [0. 1.], F1 Score: [0.        0.8670675] 

---------------------------------------------------------------------------------
Nodes classified as fraud: 20694, 
                nodes classified as licit: 112109
Epoch: 5, 
            Training Loss: 2.9865, Training Accuracy: 0.8464
            Test Loss: 16.0341, Test Accuracy: 0.7961
            
Precision: [0.60491416 0.82885314], Recall: [0.37749407 0.92440099], F1 Score: [0.46488109 0.87402351] 

---------------------------------------------------------------------------------
Nodes classified as fraud: 19968, 
                nodes classified as licit: 112835
Epoch: 10, 
            Training Loss: 3.1078, Training Accuracy: 0.8592
            Test Loss: 34.7099, Test Accuracy: 0.5374
            
Precision: [0.29785018 0.84697681], Recall: [0.71553217 

In [8]:
save_model = False
if save_model:
    torch.save(model.state_dict(), 'final_model_2_layers.pth')