In [105]:
import numpy as np
import csv
import pandas as pd
pd.set_option('display.max_columns', None)
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import networkx.algorithms.community as community
import torch

# Import packages
import dgl
import torch
import torch.nn.functional as F
import numpy as np
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
from IPython.display import Latex
from sklearn.model_selection import train_test_split

In [106]:
test_set = pd.read_csv("../data/test_set_final.csv")
train_set = pd.read_csv("../data/train_set_final.csv")

In [110]:
# Concatenate all the 1 to 932 column named number_source into a single column as an array of values
train_set['node_info_source'] = train_set[train_set.columns[18:950]].values.tolist()
train_set.drop(train_set.columns[18:950], axis=1, inplace=True)




In [111]:
train_set['node_info_target'] = train_set[train_set.columns[18:950]].values.tolist()
train_set.drop(train_set.columns[18:950], axis=1, inplace=True)


In [112]:
import dgl
# Create a graph
# Create a graph from the training set
graph = dgl.graph((train_set['source'], train_set['target']), num_nodes=len(train_set))
graph = dgl.add_self_loop(graph)

# Convert 'node_info_source' and 'node_info_target' to PyTorch tensors and stack them
node_info_source_tensor = torch.stack(train_set['node_info_source'].apply(lambda x: torch.tensor(x)).tolist())
node_info_target_tensor = torch.stack(train_set['node_info_target'].apply(lambda x: torch.tensor(x)).tolist())

# Drop the 'source', 'target', 'label', 'node_info_source', and 'node_info_target' columns and convert the rest to a PyTorch tensor
features_tensor = torch.tensor(train_set.drop(['source', 'target', 'label', 'node_info_source', 'node_info_target'], axis=1).values.astype(float))

# Concatenate the tensors along the second dimension
features = torch.cat([features_tensor, node_info_source_tensor, node_info_target_tensor], dim=1)

# Assign the features to the graph
graph.ndata['features'] = features

# Set the edge labels
graph.edata['label'] = torch.tensor(train_set['label'])

# Print the graph information
print(graph)



KeyError: 'node_info_source'

In [None]:
print('Node labels and features')
print(graph.edata)

Node labels and features
{'label': tensor([1, 1, 1,  ..., 1, 0, 0])}


# Split dataset

In [None]:
# Define key graph variables
X = graph.ndata['features']
y = graph.edata['label']
num_classes = 2
num_feat = X.shape[1]
N = graph.number_of_nodes()

print('Number of features: ', num_feat)
print('Number of Nodes: ', N)

Number of features:  1879
Number of Nodes:  10496


In [113]:
def split_dataset(N, train_ratio, seed=4):
    """ Creates train/val/test masks

    Args:
        N (int): dataset size
        train_ratio (float): proportion of the training set
        seed (int, optional): Fixes random. Defaults to 10

    Return: 
        [tensors]: returns boolean tensors for train/val/test set
        True indicates that a node belong to this set, False otherwise
    """

    train_size = int(train_ratio * N)
    val_size = int((N - train_size)/2)
    test_size = N - train_size - val_size

    # split dataset
    subsets = torch.utils.data.random_split(range(N), lengths = [train_size, val_size, test_size], generator=torch.Generator().manual_seed(seed))
    train_inds, val_inds, test_inds = [torch.Tensor(subset.indices) for subset in subsets]

    # create tensors of masks for each subset
    dataset_inds = torch.arange(N)
    train_mask = torch.isin(dataset_inds, train_inds)
    val_mask = torch.isin(dataset_inds, val_inds)
    test_mask = torch.isin(dataset_inds, test_inds)

    return train_mask, val_mask, test_mask

train_mask, val_mask, test_mask = split_dataset(N, train_ratio=0.8)


In [114]:
class GNN_model(torch.nn.Module):
    """
    Define a Graph Convolution Network 
    """
    def __init__(self, num_layers, input_size, hidden_size, output_size, dropout):
        super(GNN_model, self).__init__()

        # Define GNN components
        self.convs = torch.nn.ModuleList() # holds GraphConv layers in a list
        self.convs.append(
            GraphConv(input_size, hidden_size, activation=F.relu)) # You can either define the activation at the layer level or call it inside the forward
        for i in range(num_layers-2):
            self.convs.append(
                GraphConv(hidden_size, hidden_size, activation=F.relu))
        self.convs.append(GraphConv(hidden_size, output_size))

        self.dropout = dropout

    def forward(self, graph, x):
        # Implement the forward function that takes the graph,
        # the features tensor x and returns the output tensor as shown in figure 1
        for conv in self.convs:
            x = conv(graph, x)
    
        output = F.log_softmax(x, dim=1) # Log_softmax is more stable numerically in comparison to softmax
        return output

In [115]:
def train(model, graph, x, labels, num_epochs, optimizer, train_mask, val_mask, test_mask):
    """ Train the GNN model 

    Args:
        model: GNN model defined in pytorch
        graph (dgl.graph): dataset on which the task is performed
        x (tensor): node feature matrix 
        labels (tensor): node labels
        num_epochs (int): number of epochs
        train_mask (tensor): boolean mask for training nodes
        val_mask (tensor): boolean mask for validation set
    """
    
    # Train the model (pytorch specific)
    best_val_acc = 0
    best_test_acc = 0
    nll_loss = torch.nn.NLLLoss()

    model.train()
    for epoch in range(num_epochs):
        # Forward
        pred = model(graph, x)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = nll_loss(pred[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        pred = torch.argmax(pred, dim=1)
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print('Epoch {}: loss {:.3f}, train Acc: {:.3f}, val acc: {:.3f}, test acc: {:.3f}'.format(
                epoch, loss, train_acc, val_acc, test_acc))

In [116]:
# Instanciate model
num_layers=3
hidden_size=16
dropout=0.3
num_epochs=300
lr=0.01
weight_decay=0.005
train_ratio=0.8
seed=4

model = GNN_model(num_layers, num_feat, hidden_size, num_classes, dropout)

In [121]:
# Define an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=weight_decay)
X = X.float()
y = y.long()
train_mask = train_mask.long()
val_mask = val_mask.long()
test_mask = test_mask.long()

# Train model
train(model, graph, X, y, num_epochs, optimizer, train_mask, val_mask, test_mask)

Epoch 0: loss 0.778, train Acc: 0.200, val acc: 0.900, test acc: 0.900
Epoch 10: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 20: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 30: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 40: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 50: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 60: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 70: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 80: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 90: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 100: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 110: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 120: loss 0.000, train Acc: 1.000, val acc: 1.000, test acc: 1.000
Epoch 130: loss 0.000, train Acc: 1.000, val acc: 1.000, test 

In [122]:
# test Model sur le test set 
model.eval()
pred = model(graph, X)
pred = torch.argmax(pred, dim=1)
test_acc = (pred[test_mask] == y[test_mask]).float().mean()
print('Test accuracy:', test_acc.item())


Test accuracy: 1.0
