In [1]:
!pip install torch_geometric



In [2]:
%cd Desktop/DeepHW/

C:\Users\fede6\Desktop\DeepHW


In [15]:
import os
import torch
import pandas as pd
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split
# Load utility functions from cloned repository
from loadData import GraphDataset

from torch_geometric.nn import GCNConv, global_mean_pool, ResGatedGraphConv

# from src.utils import set_seed
# from src.models import GNN
import argparse

import numpy as np

In [4]:
import random
import tarfile

def set_seed(seed=777):
    """Seed every random number generator used by the notebook.

    Seeds PyTorch (CPU and, when available, all CUDA devices), Python's
    ``random`` module and NumPy's legacy global generator, and configures
    cuDNN for deterministic behaviour.

    Args:
        seed (int): Seed value applied to all generators.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different convolution algorithm on each
    # run, which defeats the deterministic flag above, so it is switched off.
    torch.backends.cudnn.benchmark = False


# Seed once with the default value as soon as the helper is defined.
set_seed()

def gzip_folder(folder_path, output_file):
    """
    Compresses an entire folder into a single .tar.gz file.

    The folder is stored in the archive under its own name (the last path
    component of ``folder_path``), not under its full path.

    Args:
        folder_path (str): Path to the folder to compress.
        output_file (str): Path to the output .tar.gz file.
    """
    # Normalise first: for a path ending in a separator (e.g. "logs/")
    # os.path.basename returns "", which would drop the top-level folder
    # from the archive and store its files at the archive root.
    archive_root = os.path.basename(os.path.normpath(folder_path))
    with tarfile.open(output_file, "w:gz") as tar:
        tar.add(folder_path, arcname=archive_root)
    print(f"Folder '{folder_path}' has been compressed into '{output_file}'")

In [4]:
def add_zeros(data):
    """Attach an all-zero integer node feature vector to a graph.

    Overwrites ``data.x`` with a 1-D ``torch.long`` tensor holding one zero per
    node (length ``data.num_nodes``) and returns the same, mutated object.
    """
    node_count = data.num_nodes
    data.x = torch.zeros(node_count, dtype=torch.long)
    return data

In [5]:
def train(data_loader, model, optimizer, criterion, device, save_checkpoints, checkpoint_path, current_epoch):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches; each batch must support ``.to(device)``
            and expose its labels as ``.y``.
        model: Module called as ``model(batch)``; its output is argmax-ed over
            dim 1 to obtain predictions.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(output, batch.y)``.
        device: Device every batch is moved to.
        save_checkpoints, checkpoint_path, current_epoch: Accepted for interface
            compatibility only; per-epoch checkpointing is not performed in
            this function, so these arguments are currently unused.

    Returns:
        tuple: ``(mean loss per batch, accuracy over all seen samples)``.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    progress = tqdm(data_loader, desc="Iterating training graphs", unit="batch")
    for batch in progress:
        batch = batch.to(device)

        optimizer.zero_grad()
        logits = model(batch)
        batch_loss = criterion(logits, batch.y)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        n_correct += (logits.argmax(dim=1) == batch.y).sum().item()
        n_seen += batch.y.size(0)

    # The loss is averaged over batches, the accuracy over individual samples.
    mean_loss = running_loss / len(data_loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

In [6]:
from sklearn.metrics import f1_score

def evaluate(data_loader, model, device, calculate_accuracy=False):
    """Run the model over ``data_loader`` without gradient tracking.

    Args:
        data_loader: Iterable of batches; each batch must support ``.to(device)``
            and, when ``calculate_accuracy`` is True, expose labels as ``.y``.
        model: Module called as ``model(batch)``; predictions are the argmax of
            its output over dim 1.
        device: Device every batch is moved to.
        calculate_accuracy (bool): When True, labels are read and metrics are
            returned; when False only the predictions are returned.

    Returns:
        If ``calculate_accuracy`` is True, a tuple
        ``(mean loss per batch, accuracy, macro F1)``. The loss is always a
        plain ``torch.nn.CrossEntropyLoss``, independent of the criterion used
        for training. Otherwise, the list of predicted class indices (NumPy
        integer scalars) in loader order.
    """
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    ground_truths = []
    predictions = []

    criterion = torch.nn.CrossEntropyLoss()
    with torch.no_grad():
        for data in tqdm(data_loader, desc="Iterating eval graphs", unit="batch"):
            data = data.to(device)
            output = model(data)
            pred = output.argmax(dim=1)
            predictions.extend(pred.cpu().numpy())

            if calculate_accuracy:
                correct += (pred == data.y).sum().item()
                total += data.y.size(0)
                total_loss += criterion(output, data.y).item()
                ground_truths.extend(data.y.cpu().numpy())

    if not calculate_accuracy:
        return predictions

    accuracy = correct / total
    # Computed once over the full set: recomputing it inside the loop only
    # ever used the last value and cost quadratic time in the dataset size.
    f1 = f1_score(ground_truths, predictions, average='macro')
    return total_loss / len(data_loader), accuracy, f1

In [7]:
def save_predictions(predictions, test_path):
    """Write predictions to ``<cwd>/submission/testset_<name>.csv``.

    ``<name>`` is the name of the directory that contains ``test_path``. The
    CSV has two columns: ``id`` (0-based position of each prediction) and
    ``pred`` (the prediction itself). The ``submission`` folder is created if
    it does not exist, and the output path is printed when done.

    Args:
        predictions: Sequence of predicted labels, one per test graph.
        test_path (str): Path to the test file; only its parent directory name
            is used.
    """
    dataset_tag = os.path.basename(os.path.dirname(test_path))

    submission_folder = os.path.join(os.getcwd(), "submission")
    os.makedirs(submission_folder, exist_ok=True)
    output_csv_path = os.path.join(submission_folder, f"testset_{dataset_tag}.csv")

    rows = {
        "id": [position for position, _ in enumerate(predictions)],
        "pred": predictions,
    }
    pd.DataFrame(rows).to_csv(output_csv_path, index=False)

    print(f"Predictions saved to {output_csv_path}")

In [8]:
def plot_training_progress(train_losses, train_accuracies, output_dir, f1_score=None):
    """Save per-epoch metric curves to ``<output_dir>/training_progress.png``.

    Draws a loss panel and an accuracy panel side by side and, when
    ``f1_score`` is given, a third F1 panel. The x axis is the epoch number
    starting at 1. ``output_dir`` is created if missing and the figure is
    closed after saving.

    Args:
        train_losses: Per-epoch loss values; its length sets the x axis.
        train_accuracies: Per-epoch accuracy values.
        output_dir (str): Folder the PNG is written to.
        f1_score: Optional per-epoch F1 values. Inside this function the name
            shadows the ``f1_score`` function imported from sklearn.

    Note:
        Titles and legend labels always say "Training", whichever series the
        caller passes in.
    """
    epochs = range(1, len(train_losses) + 1)

    # (series, line label, colour, y-axis label, panel title), left to right.
    panels = [
        (train_losses, "Training Loss", 'blue', 'Loss', 'Training Loss per Epoch'),
        (train_accuracies, "Training Accuracy", 'green', 'Accuracy', 'Training Accuracy per Epoch'),
    ]
    if f1_score is not None:
        panels.append((f1_score, "Training F1-score", 'red', 'F1-score', 'Training F1-score per Epoch'))

    plt.figure(figsize=(12, 6))
    n_panels = len(panels)
    for position, (series, line_label, colour, y_label, title) in enumerate(panels, start=1):
        plt.subplot(1, n_panels, position)
        plt.plot(epochs, series, label=line_label, color=colour)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(title)

    # Save the figure under output_dir, creating the folder on first use.
    os.makedirs(output_dir, exist_ok=True)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "training_progress.png"))
    plt.close()

In [9]:
def get_user_input(prompt, default=None, required=False, type_cast=str):
    """Prompt on stdin until a usable value is obtained.

    Args:
        prompt (str): Text shown to the user; the default is appended in
            square brackets.
        default: Value returned as-is (not passed through ``type_cast``) when
            the user enters nothing and the field is not required.
        required (bool): When True, empty input is rejected and the prompt is
            repeated, even if a default is set.
        type_cast (callable): Converter applied to non-empty input; a
            ``ValueError`` from it triggers a re-prompt.

    Returns:
        The converted input, or ``default`` (possibly ``None``) on empty input.
    """
    while True:
        raw = input(f"{prompt} [{default}]: ")

        if raw != "":
            try:
                return type_cast(raw)
            except ValueError:
                print(f"Invalid input. Please enter a valid {type_cast.__name__}.")
                continue

        # Empty input from here on.
        if required:
            print("This field is required. Please enter a value.")
            continue

        # Optional field: fall back to the default, which may be None.
        return default

In [10]:
def get_arguments():
    """Interactively collect the run configuration.

    Asks one question per setting, in the order listed below, via
    ``get_user_input`` and returns the answers as an ``argparse.Namespace``
    whose attribute names are the keys in the table.
    """
    # (attribute name, prompt text, keyword options for get_user_input)
    questions = (
        ('train_path', "Path to the training dataset (optional)", {}),
        ('test_path', "Path to the test dataset", {'required': True}),
        ('num_checkpoints', "Number of checkpoints to save during training", {'type_cast': int}),
        ('device', "Which GPU to use if any", {'default': 1, 'type_cast': int}),
        ('gnn', "GNN type (gin, gin-virtual, gcn, gcn-virtual)", {'default': 'gin'}),
        ('drop_ratio', "Dropout ratio", {'default': 0.0, 'type_cast': float}),
        ('num_layer', "Number of GNN message passing layers", {'default': 5, 'type_cast': int}),
        ('emb_dim', "Dimensionality of hidden units in GNNs", {'default': 300, 'type_cast': int}),
        ('batch_size', "Input batch size for training", {'default': 32, 'type_cast': int}),
        ('epochs', "Number of epochs to train", {'default': 10, 'type_cast': int}),
        ('baseline_mode', "Baseline mode: 1 (CE), 2 (Noisy CE)", {'default': 1, 'type_cast': int}),
        ('noise_prob', "Noise probability p (used if baseline_mode=2)", {'default': 0.2, 'type_cast': float}),
    )

    answers = {}
    for name, prompt_text, options in questions:
        answers[name] = get_user_input(prompt_text, **options)

    return argparse.Namespace(**answers)


In [11]:
class NoisyCrossEntropyLoss(torch.nn.Module):
    """Cross-entropy loss scaled by a noise-dependent weight.

    Args:
        p_noisy (float): Assumed label-noise probability ``p``.

    Note:
        The per-sample weight is ``(1 - p) + p * (1 - sum(one_hot(target)))``.
        Every one-hot row sums to exactly 1, so the weight is ``1 - p`` for all
        samples and the result equals ``(1 - p) * mean cross-entropy``.
    """

    def __init__(self, p_noisy):
        super().__init__()
        self.p = p_noisy
        # Per-sample losses are required so the per-sample weights in forward()
        # are applied element-wise instead of being broadcast against a scalar.
        self.ce = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, logits, targets):
        """Return the weighted mean loss as a 0-dim tensor.

        Args:
            logits: Tensor of shape ``[batch, num_classes]`` (the class count is
                read from ``logits.size(1)``).
            targets: Integer class indices of shape ``[batch]``.
        """
        losses = self.ce(logits, targets)
        one_hot_targets = torch.nn.functional.one_hot(targets, num_classes=logits.size(1)).float()
        weights = (1 - self.p) + self.p * (1 - one_hot_targets.sum(dim=1))
        return (losses * weights).mean()

In [12]:
def populate_args(args):
    """Print every attribute of ``args`` as ``name: value``, one per line.

    Despite its name, this helper only reports the arguments; it does not
    modify ``args`` and returns nothing.
    """
    print("Arguments received:")
    settings = vars(args)
    for name in settings:
        print(f"{name}: {settings[name]}")
# Collect the run configuration interactively (blocks on stdin), then echo it
# so the chosen settings are recorded in the cell output.
args = get_arguments()
populate_args(args)

Arguments received:
train_path: C:\Users\fede6\Desktop\DeepHW\data\A\train.json.gz
test_path: C:\Users\fede6\Desktop\DeepHW\data\A\test.json.gz
num_checkpoints: 5
device: 1
gnn: gin-virtual
drop_ratio: 0.5
num_layer: 5
emb_dim: 300
batch_size: 32
epochs: 50
baseline_mode: 2
noise_prob: 0.5


Arguments received:
train_path: C:\Users\fede6\Desktop\DeepHW\data\A\train.json.gz
test_path: C:\Users\fede6\Desktop\DeepHW\data\A\test.json.gz
num_checkpoints: 5
device: 1
gnn: gin
drop_ratio: 0.5
num_layer: 5
emb_dim: 300
batch_size: 32
epochs: 50
baseline_mode: 2
noise_prob: 0.5

In [13]:
from torch_geometric.utils import dropout_edge


def DE_addZero(data, drop_prob=0.2):
    """Randomly drop edges from a graph, then attach zero node features.

    The graph is modified in place and returned.

    Args:
        data: Graph object exposing ``edge_index``, ``edge_attr`` (may be
            ``None``) and ``num_nodes``.
        drop_prob (float): Probability of dropping each edge.

    Returns:
        The same ``data`` object with ``edge_index`` / ``edge_attr`` reduced to
        the retained edges and ``x`` set to a zero ``torch.long`` vector of
        length ``num_nodes``.
    """
    # DropEdge. dropout_edge takes only the edge index (plus options) and
    # returns the retained edges together with a boolean mask over the
    # original edges; the mask keeps edge_attr aligned with the new edge_index.
    # training=True forces dropping regardless of any model train/eval mode.
    edge_index, edge_mask = dropout_edge(
        data.edge_index,
        p=drop_prob,
        force_undirected=False,
        training=True
    )
    data.edge_index = edge_index
    if data.edge_attr is not None:
        data.edge_attr = data.edge_attr[edge_mask]

    # Add zero node features
    data.x = torch.zeros(data.num_nodes, dtype=torch.long)

    return data

def normalize_edge_attr(data):
    """Standardise edge attributes column-wise, in place.

    Each column of ``data.edge_attr`` is shifted by its mean and divided by its
    standard deviation (plus ``1e-6`` to avoid division by zero). Graphs whose
    ``edge_attr`` is ``None`` are returned untouched.

    Args:
        data: Graph object with an ``edge_attr`` attribute (floating-point
            tensor with edges along dim 0, or ``None``).

    Returns:
        The same ``data`` object.
    """
    edge_attr = data.edge_attr
    if edge_attr is None:
        return data

    mean = edge_attr.mean(dim=0)
    if edge_attr.size(0) > 1:
        std = edge_attr.std(dim=0)
    else:
        # The sample standard deviation of a single row is NaN, which would
        # turn every attribute into NaN; treat the spread as zero instead.
        std = torch.zeros_like(mean)

    data.edge_attr = (edge_attr - mean) / (std + 1e-6)
    return data

In [16]:
# Load the training graphs; add_zeros is handed to the dataset as its transform.
# NOTE(review): args.train_path is optional at the prompt, so it may be None
# here — confirm how GraphDataset handles that.
full_dataset = GraphDataset(args.train_path, transform=add_zeros, pre_transform=None)
# Hold out 20% of the graphs (rounded down) for validation.
val_size = int(0.2 * len(full_dataset))
train_size = len(full_dataset) - val_size

# A dedicated, fixed-seed generator keeps the train/validation split identical
# across runs, independently of the global seed.
generator = torch.Generator().manual_seed(12)
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size], generator=generator)

Loading graphs from C:\Users\fede6\Desktop\DeepHW\data\A\train.json.gz...
This may take a few minutes, please wait...


Processing graphs: 100%|██████████████████████████████████████████| 11280/11280 [00:35<00:00, 318.32graph/s]


In [17]:
# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution."""

    def __init__(self, classes=6, smoothing=0.1, dim=-1):
        """
        classes: number of classes
        smoothing: epsilon used for label smoothing
        dim: axis along which the softmax is computed
        """
        super().__init__()
        self.cls = classes
        self.dim = dim
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, pred, target):
        """
        pred: logits (softmax not yet applied), shape [batch_size, num_classes]
        target: integer labels (not one-hot), shape [batch_size]

        Returns the mean over the batch of the per-sample smoothed loss.
        """
        log_probs = pred.log_softmax(dim=self.dim)

        with torch.no_grad():
            # Every wrong class gets an equal share of the smoothing mass;
            # the true class (scattered along dim 1) gets the confidence.
            off_target_mass = self.smoothing / (self.cls - 1)
            true_dist = torch.full_like(log_probs, off_target_mass)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)

        per_sample_loss = (-true_dist * log_probs).sum(dim=self.dim)
        return per_sample_loss.mean()

In [28]:
import kaggle_models as km
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Base directory for logs, checkpoints and submissions: the current working directory.
script_dir = os.getcwd()
# device = torch.device(f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")
# Use the default CUDA device when available; args.device is not consulted here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Fall back to 3 checkpoints when the user gave none (None) or 0.
num_checkpoints = args.num_checkpoints if args.num_checkpoints else 3

# Build the model variant requested through args.gnn (6 output classes);
# an unknown name raises ValueError.
if args.gnn == 'gin':
    model = km.GNN(gnn_type='gin', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=False, graph_pooling='mean').to(device)
elif args.gnn == 'gin-virtual':
    model = km.GNN(gnn_type='gin', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True, graph_pooling='mean').to(device)
elif args.gnn == 'gcn':
    model = km.GNN(gnn_type='gcn', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=False, graph_pooling='mean').to(device)
elif args.gnn == 'gcn-virtual':
    # Unlike the other branches, graph_pooling is not passed here, so km.GNN's own default applies.
    model = km.GNN(gnn_type='gcn', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True).to(device)
else:
    raise ValueError('Invalid GNN type')

# NOTE(review): this unconditionally replaces the model built above, so
# args.gnn / num_layer / emb_dim / drop_ratio only affect a discarded model
# (they still validate args.gnn). The hard-coded configuration below is the
# one that is actually trained and checkpointed — confirm this is intended.
model = km.GNN(gnn_type='gin', num_class=6, num_layer=5, emb_dim=150, drop_ratio=0.5, virtual_node=True, residual=True, graph_pooling='attention').to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=1e-10)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',   # The monitored metric should increase (it is stepped with validation F1)
            factor=0.7,   # Multiply the LR by 0.7 (a 30% reduction) on plateau
            patience=3,  # Number of epochs with no improvement
            min_lr=1e-6,
            verbose=True
        )

# NOTE(review): this first assignment is redundant; the if/else below always
# rebinds criterion.
criterion = torch.nn.CrossEntropyLoss()
# baseline_mode 2 selects the noisy cross-entropy; any other value uses plain CE.
if args.baseline_mode == 2:
    criterion = NoisyCrossEntropyLoss(args.noise_prob)
else:
    criterion = torch.nn.CrossEntropyLoss()



In [36]:
# The dataset is identified by the name of the folder that contains the test file.
test_dir_name = os.path.basename(os.path.dirname(args.test_path))
logs_folder = os.path.join(script_dir, "logs", test_dir_name)
log_file = os.path.join(logs_folder, "training.log")
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# Note: logging.basicConfig has no effect if the root logger already has
# handlers, e.g. when this cell is re-run in the same kernel.
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(message)s')
# logging.getLogger().addHandler(logging.StreamHandler())

# checkpoint_path: single "best model" file, stored directly under checkpoints/.
# checkpoints_folder: per-dataset folder under checkpoints/, created here.
checkpoint_path = os.path.join(script_dir, "checkpoints", f"model_{test_dir_name}_best.pth")
checkpoints_folder = os.path.join(script_dir, "checkpoints", test_dir_name)
os.makedirs(checkpoints_folder, exist_ok=True)

In [54]:
# Resume state used when no best-model checkpoint is found on disk.
# NOTE(review): with no checkpoint, training would start at epoch index 63
# with an untrained model — confirm this fallback is intended.
restart_epoch = 63
best_f1_score = 0.0
# Path components are joined instead of embedding "A\model...": "\m" is an
# invalid escape sequence and a literal backslash is not portable.
checkpoint_fn = os.path.join(script_dir, "checkpoints", "A", f"model_A_epoch_{restart_epoch}.pth")
start = 0
if os.path.exists(checkpoint_path): # and not args.train_path:
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine. torch.load unpickles the file, so only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    # Continue from the epoch and best validation F1 recorded in the checkpoint.
    restart_epoch = checkpoint['epoch']
    best_f1_score = checkpoint['best_f1_score']
    print(f"Loaded best model from {checkpoint_path}")

Loaded best model from C:\Users\fede6\Desktop\DeepHW\checkpoints\model_A_best.pth


  checkpoint = torch.load(checkpoint_path)


In [55]:
import gc

# Release cached GPU memory held by PyTorch, then run a full garbage
# collection; as the cell's last expression, gc.collect()'s return value
# is displayed as the output.
torch.cuda.empty_cache()
gc.collect()

2587

In [None]:
if args.train_path:
    num_epochs = args.epochs
    # Last epoch index (exclusive) of the run. The loop, the console output and
    # the log all use this one value so the reported "epoch/total" is consistent.
    total_epochs = 100
    best_val_accuracy = 0.0 # max(checkpoint['val_accuracy'])
    # best_f1_score is intentionally not re-read from `checkpoint` here: it is
    # already set by the checkpoint-restore cell (0.0, or the stored value when
    # a checkpoint exists), and `checkpoint` is undefined when none was loaded.

    # Per-epoch history for this session only (not restored from the checkpoint).
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    val_f1_scores = []

    if num_checkpoints > 1:
        checkpoint_intervals = [int((i + 1) * num_epochs / num_checkpoints) for i in range(num_checkpoints)]
    else:
        checkpoint_intervals = [num_epochs]

    for epoch in range(restart_epoch, total_epochs):
        train_loss, train_acc = train(
            train_loader, model, optimizer, criterion, device,
            save_checkpoints=(epoch + 1 in checkpoint_intervals),
            checkpoint_path=os.path.join(checkpoints_folder, f"model_{test_dir_name}"),
            current_epoch=epoch
        )

        val_loss, val_acc, val_f1 = evaluate(val_loader, model, device, calculate_accuracy=True)

        # The scheduler runs in 'max' mode and is driven by validation macro-F1.
        scheduler.step(val_f1)

        progress_message = (
            f"Epoch {epoch + 1}/{total_epochs}, Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
            f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}"
        )
        print(progress_message)
        logging.info(progress_message)

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)
        val_f1_scores.append(val_f1)

        # Update the running best BEFORE building the checkpoint, so the saved
        # 'best_f1_score' matches the model it is stored with. Otherwise a
        # resumed run starts from a stale (lower) threshold and can overwrite
        # the best model with a worse one.
        is_new_best = val_f1 > best_f1_score
        if is_new_best:
            best_f1_score = val_f1

        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'best_f1_score': best_f1_score,
            'train_loss': train_losses,
            'train_accuracy': train_accuracies,
            'val_loss': val_losses,
            'val_accuracy': val_accuracies,
            'val_f1_score': val_f1_scores
        }

        # Periodic snapshot every 5 epochs, in the per-dataset checkpoint folder.
        if (epoch + 1) % 5 == 0:
            torch.save(checkpoint, os.path.join(checkpoints_folder, f"model_{test_dir_name}_epoch_{epoch + 1}.pth"))
            print(f"Model saved at epoch {epoch+1}")

        if is_new_best:
            torch.save(checkpoint, checkpoint_path)
            print(f"Best model updated and saved at {checkpoint_path}")

        # Refresh the training and validation curves after every epoch.
        plot_training_progress(train_losses, train_accuracies, os.path.join(logs_folder, "plots"))
        plot_training_progress(val_losses, val_accuracies, os.path.join(logs_folder, "plotsVal"), val_f1_scores)

        gc.collect()

Iterating training graphs: 100%|███████████████████████████████████████| 282/282 [00:39<00:00,  7.09batch/s]
Iterating eval graphs: 100%|█████████████████████████████████████████████| 71/71 [00:05<00:00, 13.46batch/s]


NotImplementedError: 

In [None]:
import gc
# Drop every reference to the training/validation data before the test set is
# loaded, then collect garbage so the memory can be reclaimed.
del train_dataset
del train_loader
del full_dataset
del val_dataset
del val_loader
gc.collect()

In [None]:
# Load the test graphs from the path given at the prompt (the same path
# save_predictions uses to name the output file) rather than a machine-specific
# absolute path. Test order is preserved so prediction ids match graph positions.
test_dataset = GraphDataset(args.test_path, transform=add_zeros)
test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)


In [None]:
# The best-model file is a dict holding training metadata alongside the
# weights, so the state dict has to be taken from its 'model_state_dict' entry;
# passing the whole dict to load_state_dict fails on the extra keys.
# map_location lets a GPU-saved checkpoint be restored on a CPU-only machine.
best_checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(best_checkpoint['model_state_dict'])
predictions = evaluate(test_loader, model, device, calculate_accuracy=False)
save_predictions(predictions, args.test_path)