# Library Inport

In [1]:
import copy
import json
import numpy as np

import torch
import torch.nn as nn

import networkx as nx
from networkx.readwrite import json_graph

import torch_geometric
from torch_geometric.utils.convert import from_networkx,to_networkx
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn.models import GIN,GCN
from torch_geometric.explain.metric import groundtruth_metrics

import matplotlib.pyplot as plt

# Classi ausiliarie di utilità
Classe che permette di estrarre i grafi salvati su file e rappresentarli in formato Data in manniera opportuna, oltre a permettere di estrarre le maschere per training e test set.

In [2]:
class DatasetCreator():
    def __init__(self,paths,node_attrs,edge_attrs,label):
        '''
        Args:
            paths (list[str]): Paths ai file contenenti i grafi
            node_attrs (list[str]): Attributi dei nodi
            edge_attrs (list[str]): Attributi degli archi
            label (str): Etichette ground truth
        '''
        self.node_attrs=node_attrs
        self.edge_attrs=edge_attrs
        self.label=label
        self.data=self._read_from_json_(paths)
        self._set_masks_()
    
    def _read_from_json_(self,paths):
        '''
        Estrai i grafi contenuti in una lista di file.
        Args:
            paths (list[str]): Paths ai file contenenti i grafi
        Returns:
            GS (list[torch_geometric.Data]): Grafi contenuti nei files
        '''
        GS = []
        for path in paths:
            json_gs= None
            with open(path,'r') as file:
                json_gs = json.load(file)
                file.close()
            for g in json_gs:
                GS.append(json_graph.node_link_graph(g))
        return self._prepair_data_(GS)
    
    def _prepair_data_(self,GS):
        '''
        Trasforma i grafi in formato pytorch_geometric andando a specificare le feature dei nodi,
        degli archi e le etichette ground truth.
        Args:
            GS (list[nx.Graph]): Grafi formato Networkx
        Returns:
            datas (list[torch_geometric.Data]): Grafi in formato Data pytorch_geometric
        '''
        datas = []
        for g in GS:
            tmp = from_networkx(g,group_node_attrs=self.node_attrs)
            tmp.y = tmp[self.label]
            tmp[self.label] = None
            self._normalize_features_(tmp)
            datas.append(tmp)
        return datas
    
    def _normalize_features_(self,G):
        '''
        Normalizza le feature di un grafo in un range [0,1]
        Args:
            G (torch_geometric.Data)
        '''
        x = G.x
        min_vals = x.min(dim=0, keepdim=True).values
        max_vals = x.max(dim=0, keepdim=True).values
        G.x = (x - min_vals) / (max_vals - min_vals)
    
    def _set_masks_alt_(self,train_ratio=0.7): #problema qui sulla selezione delle maschere
        '''
        Assegna le maschere per i nodi associati al training set e al test set
        Args:
            train_ratio (float): frazione dei nodi da incorporare nel training set
        '''
        for g in self.data:
            num_nodes = g.x.shape[0]
            num_train = int(num_nodes * train_ratio)
            idx = [i for i in range(num_nodes)]

            np.random.shuffle(idx)
            train_mask = torch.full_like(g.y, False, dtype=bool)
            train_mask[idx[:num_train]] = True
            test_mask = torch.full_like(g.y, False, dtype=bool)
            test_mask[idx[num_train:]] = True
            g.train_mask = train_mask
            g.test_mask = test_mask

    def _set_masks_(self,train_ratio=0.7):
        """
        Assegna le maschere per i nodi associati al training set e al test set,
        bilanciandole rispetto alle classi.

        Args:
            train_ratio (float): Frazione dei nodi da incorporare nel training set (0 < train_ratio < 1).
        """
        for g in self.data:
            num_nodes = g.x.shape[0]  # Numero totale di nodi

            # Creazione delle maschere inizializzate a False
            train_mask = torch.zeros(num_nodes, dtype=torch.bool)
            test_mask = torch.zeros(num_nodes, dtype=torch.bool)

            # Trova le classi presenti nei dati
            classes = torch.unique(g.y)  # `g.y` contiene le etichette delle classi

            for cls in classes:
                # Indici dei nodi appartenenti alla classe corrente
                class_indices = torch.nonzero(g.y == cls, as_tuple=True)[0]

                # Mescola casualmente gli indici di questa classe
                shuffled_indices = class_indices[torch.randperm(len(class_indices))]

                # Numero di nodi da assegnare al training set per questa classe
                num_train = int(len(class_indices) * train_ratio)

                # Assegna i nodi ai set di training e test
                train_mask[shuffled_indices[:num_train]] = True
                test_mask[shuffled_indices[num_train:]] = True

            # Assegna le maschere al grafo
            g.train_mask = train_mask
            g.test_mask = test_mask
        
    def get_masks(self,g):
        '''
        Restituisci le maschere per i nodi associati al training set e al test set
        Args:
            g (int): index of the graph in data list
        Returns:
            masks (tuple(Tensor,Tensor)): tupla contenente le maschere per esempi del train e test set
        '''
        G = self.data[g]
        return G.train_mask,G.test_mask

    def get_graph_info(self,idx):
        '''
        Restituisce informazioni relative al grafo in formato pytorch_geometric
        Args:
            G (torch_geometric.Data): un grafo formato pythorch_geometric
        '''
        print(f'Number of nodes: {self.data[idx].num_nodes}') #Number of nodes in the graph
        print(f'Number of edges: {self.data[idx].num_edges}') #Number of edges in the graph
        print(f'Average node degree: {self.data[idx].num_edges / self.data[idx].num_nodes:.2f}') # Average number of nodes in the graph
        print(f'Contains isolated nodes: {self.data[idx].has_isolated_nodes()}') #Does the graph contains nodes that are not connected
        print(f'Contains self-loops: {self.data[idx].has_self_loops()}') #Does the graph contains nodes that are linked to themselves
        print(f'Is undirected: {self.data[idx].is_undirected()}') #Is the graph an undirected graph

    def get_data(self):
        '''
        Restituisce i grafi formato pytorch_geometric.
        Returns:
            _ (torch_geometric.Data): grafo
        '''
        return self.data

Classe ausiliaria per il training e l'evaluation del modello.

In [3]:
class Trainer:
    def __init__(self,model,train_set,test_set,criterion,optimizer,metrics):
        '''
        Args:
            model (torch_geometric.nn.model): modello da addestrare
            train_set (torch_geometric.Data): dataset su cui viene effettuato il training 
            test_set (torch_geometric.Data): dataset su cui viene effettuato il testing
            criterion (torch._Loss) : loss
            optimizer (torch.optim): ottimizzatore
            metrics (list[str]): lista delle stringhe delle metriche da valutare nella fase di testing
        '''
        self.model = model
        self.train_set = train_set
        self.test_set = test_set
        self.criterion = criterion
        self.optimizer = optimizer
        self.metrics = metrics
    
    def train(self,num_epocs):
        """
        Main loop per l'addestramento del modello.
        Args:
            num_epocs (int): numero di epoche totali
        """
        self.optimizer.zero_grad()
        for epoc in range(num_epocs):
            print(f'---- EPOCH {epoc} ----')
            losses = []
            for g in self.train_set:
                graph_losses = []
                train_loader = NeighborLoader(g,input_nodes=g.train_mask,num_neighbors=[8],batch_size=16,directed=False)
                for batch in train_loader:
                    loss = self._train_step_(batch)
                    graph_losses.append(loss.detach().numpy())
                mean=np.mean(graph_losses)
                losses.append(mean)
            print(f'Epoc {epoc} mean loss {np.mean(losses)}')
    
    def test(self):
        '''
        Loop per la valutazione del modello sul test set.
        '''
        print("----- Evaluation -----")
        i=0
        losses = []
        metric_values = []
        for g in self.test_set:
            test_loader = NeighborLoader(g,input_nodes=g.test_mask,num_neighbors=[8],batch_size=16,directed=False)
            j = 0
            for batch in test_loader:
                loss,value = self._evaluation_step_(batch,i,j)
                losses.append(loss.detach().numpy())
                metric_values.append(value)
                j+=1
            i+=1
        print(f'loss {np.mean(losses)}')
        print(f'{self.metrics}: {np.mean(metric_values)}')

    def _train_step_(self,batch):
        """
        Step di addestramento su un batch.
        Args:
            batch (torch_geometric.Data): un grafo del training set
        Returns:
            loss (torch.Tensor): loss del modello sul batch
        """
        self.model.train()
        out = self.model(batch.x, batch.edge_index,batch.Weight)
        out = nn.LogSoftmax(dim=1)(out)
        loss = self.criterion(out[batch.train_mask], batch.y[batch.train_mask])
        loss.backward()
        self.optimizer.step()
        return loss
    
    def _evaluation_step_(self,batch,i,j):
        """
        Step della valutazione del modello su un batch.
        Args:
            batch (torch_geometric.Data): un grafo del test set
            i (int): indice del grafo di cui fa parte il batch
            j (int): indice del batch nell'insieme dei batchs
        Returns:
            loss(torch.Tensor): loss del modello sul batch
            metric(Union[float, Tuple[float, ...]]): valori delle metriche per il batch
        """
        self.model.eval()
        out = self.model(batch.x, batch.edge_index,batch.Weight)
        out = nn.LogSoftmax(dim=1)(out)
        predictions = torch.argmax(out, dim=1)
        #Trainer.draw_original_vs_predicted(batch,predictions,i,j)
        loss = self.criterion(out,batch.y)
        metric = groundtruth_metrics(predictions, batch.y, metrics=self.metrics)
        return loss, metric
    
    def draw_original_vs_predicted(G,predictions,i,j):
        """
        Rappresenta graficamente il grafo con le etichette originali e quelle predette.
        Args:
            G (torch_geometric.Data): grafo da rappresentare
            predictions (list(int)): etichette predette
            i (int): indice del grafo di cui fa parte il batch
            j (int): indice del batch nell'insieme dei batchs
        """
        graph1 = to_networkx(G,node_attrs=['y'],to_undirected=True)
        graph2 = to_networkx(G,to_undirected=True)
        nx.set_node_attributes(graph2, {i: int(pred) for i, pred in enumerate(predictions.tolist())}, 'y')
        
        g1_labels=nx.get_node_attributes(graph1,'y')
        g2_labels = nx.get_node_attributes(graph2,'y')

        pos = nx.spring_layout(graph1)
        _, axes = plt.subplots(1, 2, figsize=(10, 5))
        nx.draw(graph1,pos=pos,ax=axes[0],edgecolors = 'black',node_color='lightblue',with_labels=False)
        nx.draw_networkx_labels(graph1, pos,ax=axes[0], labels=g1_labels, font_size=8, font_color='black')
        axes[0].set_title("Original labels")

        nx.draw(graph2,pos=pos,ax=axes[1],edgecolors = 'black',node_color='lightblue',with_labels=False)
        nx.draw_networkx_labels(graph2, pos,ax=axes[1], labels=g2_labels, font_size=8, font_color='black')
        axes[1].set_title("Predicted labels")
        
        plt.suptitle(f'Original vs Predicted \n Graph {i}, batch {j}',fontsize=10)
        plt.tight_layout()
        plt.subplots_adjust(top=0.85)
        plt.show()

# Caricamento dataset

In [4]:
train_set = DatasetCreator(['advanced mixed train.json'],['Score','Likes','Shares','Comments','Visuals','Dislikes'],['Interactions'],'Misinformative')
test_set = DatasetCreator(['advanced mixed test.json'],['Score','Likes','Shares','Comments','Visuals','Dislikes'],['Interactions'],'Misinformative')

in_channels = train_set.get_data()[0].num_features

The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


In [8]:
model = GCN(in_channels=in_channels,hidden_channels=2,num_layers=2,out_channels=2,dropout=0.8,norm="layer",act_first=True,aggr="mean")
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)  # Initialize the Adam optimizer.
optimizer.zero_grad() # Clear gradients.
trainer = Trainer(model,train_set.get_data(),test_set.get_data(),criterion,optimizer,['f1_score'])

trainer.train(15)

---- EPOCH 0 ----
Epoc 0 mean loss 0.6081470251083374
---- EPOCH 1 ----
Epoc 1 mean loss 0.5140830874443054
---- EPOCH 2 ----
Epoc 2 mean loss 0.4832112193107605
---- EPOCH 3 ----
Epoc 3 mean loss 0.4948475658893585
---- EPOCH 4 ----
Epoc 4 mean loss 0.5346401333808899
---- EPOCH 5 ----
Epoc 5 mean loss 0.5624955892562866
---- EPOCH 6 ----
Epoc 6 mean loss 0.559241771697998
---- EPOCH 7 ----
Epoc 7 mean loss 0.514923095703125
---- EPOCH 8 ----
Epoc 8 mean loss 0.4885455369949341
---- EPOCH 9 ----
Epoc 9 mean loss 0.49069541692733765
---- EPOCH 10 ----
Epoc 10 mean loss 0.5203326940536499
---- EPOCH 11 ----
Epoc 11 mean loss 0.5223050117492676
---- EPOCH 12 ----
Epoc 12 mean loss 0.48140841722488403
---- EPOCH 13 ----
Epoc 13 mean loss 0.4830014109611511
---- EPOCH 14 ----
Epoc 14 mean loss 0.5094398856163025


In [9]:
trainer.test()

----- Evaluation -----
loss 0.18653768301010132
['f1_score']: 0.8942841838132467
