# Import Librairies

In [1]:
import numpy as np
import csv
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import networkx.algorithms.community as community
import torch
import dgl
import torch.nn.functional as F
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
from IPython.display import Latex
from sklearn.model_selection import train_test_split
import os.path as osp
from sklearn.metrics import roc_auc_score
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling
from torch_geometric.data import DataLoader, Data
from torch.optim import Adam
import torch.nn as nn
from torch_geometric.transforms import NormalizeFeatures, RandomLinkSplit
import networkx as nx

import networkx as nx
import torch
from torch_geometric.data import Data

# Import Data

In [2]:
test_set = pd.read_csv("../data/test_set_final.csv")
train_set = pd.read_csv("../data/train_set_final.csv")
true_test = pd.read_csv("../data/test.txt", sep=" ", header=None, names=['source', 'target'])

# Feature Engineering

In [3]:
# Concatenate all the 1 to 932 column named number_source into a single column as an array of values
train_set['node_info_source'] = train_set[train_set.columns[20:952]].values.tolist()
train_set.drop(train_set.columns[20:952], axis=1, inplace=True)
test_set['node_info_source'] = test_set[test_set.columns[17:949]].values.tolist()
test_set.drop(test_set.columns[19:951], axis=1, inplace=True)


In [4]:
train_set['node_info_target'] = train_set[train_set.columns[20:952]].values.tolist()
train_set.drop(train_set.columns[20:952], axis=1, inplace=True)
test_set['node_info_target'] = test_set[test_set.columns[17:949]].values.tolist()
test_set.drop(test_set.columns[19:951], axis=1, inplace=True)



# Model

In [5]:
"""
This code snippet creates a graph using NetworkX and converts it to PyG format. It then prepares the data for training a Graph Neural Network (GNN) model.

The steps involved are as follows:
1. Create a NetworkX graph from a pandas DataFrame called 'train_set', using the columns 'source', 'target', and 'label'.
2. Convert the graph to PyG format by creating a tensor 'edge_index' from the graph's edges.
3. Create tensors for binary labels using the 'label' column of 'train_set'.
4. Extract additional node features from 'train_set' and concatenate them with existing features.
5. Create tensors for node information from 'train_set' and concatenate them with the additional features.
6. Concatenate the source and target tensors to create the final feature tensor 'x'.
7. Create a PyG Data object with the feature tensor, edge index, and label tensor.
8. Apply transformations to split the data into train, validation, and test sets.
9. Print the train, validation, and test data.

Note: The code assumes the existence of certain columns in the 'train_set' DataFrame, such as 'degree_source', 'centrality_source', 'community_source', 'degree_target', 'centrality_target', 'community_target', 'node_info_source', and 'node_info_target'.
"""

G = nx.from_pandas_edgelist(train_set, 'source', 'target', 'label')
edge_index = torch.tensor(list(G.edges)).t().contiguous()
y = torch.tensor(train_set['label'].values, dtype=torch.float)
x_source_features = train_set[['degree_source', 'centrality_source', 'community_source']].values
x_target_features = train_set[['degree_target', 'centrality_target', 'community_target']].values
x_source_info = torch.tensor(train_set['node_info_source'].values.tolist(), dtype=torch.float)
x_target_info = torch.tensor(train_set['node_info_target'].values.tolist(), dtype=torch.float)
x_source = torch.cat((x_source_info, torch.tensor(x_source_features, dtype=torch.float)), dim=1)
x_target = torch.cat((x_target_info, torch.tensor(x_target_features, dtype=torch.float)), dim=1)
x = torch.cat([x_source, x_target], dim=1)
data = Data(x=x, edge_index=edge_index, y=y)
transform = RandomLinkSplit(num_val=0.10, num_test=0.05, neg_sampling_ratio=1.0, is_undirected=True, add_negative_train_samples=False)
transformed_data = transform(data)
train_data, val_data, test_data = transformed_data
print(train_data)
print(val_data)
print(test_data)


Data(x=[10496, 1870], edge_index=[2, 8788], y=[10496], edge_label=[4394], edge_label_index=[2, 4394])
Data(x=[10496, 1870], edge_index=[2, 8788], y=[10496], edge_label=[1032], edge_label_index=[2, 1032])
Data(x=[10496, 1870], edge_index=[2, 9820], y=[10496], edge_label=[516], edge_label_index=[2, 516])


In [6]:
def create_pyg_dataset(test_set):
    """
    Create a PyG dataset from a given test set.

    Args:
        test_set (pandas.DataFrame): The test set containing the edge information.

    Returns:
        data_test (torch_geometric.data.Data): The PyG dataset with the new features.
    """

    # Create a NetworkX graph
    G = nx.from_pandas_edgelist(test_set, 'source', 'target')

    # Convert the graph to PyG format
    edge_index = torch.tensor(list(G.edges)).t().contiguous()

    # Add additional features for each node
    x_source_features = test_set[['degree_source', 'centrality_source', 'community_source']].values
    x_target_features = test_set[['degree_target', 'centrality_target', 'community_target']].values

    # Concatenate the existing features with the new features
    x_source_info = torch.tensor(test_set['node_info_source'].values.tolist(), dtype=torch.float)
    x_target_info = torch.tensor(test_set['node_info_target'].values.tolist(), dtype=torch.float)

    # Concatenate the new features with the existing features
    x_source = torch.cat((x_source_info, torch.tensor(x_source_features, dtype=torch.float)), dim=1)
    x_target = torch.cat((x_target_info, torch.tensor(x_target_features, dtype=torch.float)), dim=1)

    x = torch.cat([x_source, x_target], dim=1)

    # Calculate the number of isolated nodes
    num_nodes = max(max(edge_index[0]), max(edge_index[1])) + 1
    num_isolated_nodes = num_nodes - x.size(0)

    # Create a tensor of zeros of the appropriate size
    isolated_nodes = torch.zeros((num_isolated_nodes, x.size(1)))

    # Concatenate the tensor of zeros to x
    x = torch.cat([x, isolated_nodes], dim=0)

    # Create your PyG dataset with the new features
    data_test = Data(x=x, edge_index=edge_index)

    return data_test

data_test = create_pyg_dataset(test_set)


In [7]:
if data_test.edge_index.max() >= data_test.x.size(0):
    print("edge_index contains node indices that are out of bounds!")

In [12]:
data_test.validate()

True

In [13]:
class Net(torch.nn.Module):
    """
    A graph neural network model for link prediction.

    Args:
        in_channels (int): Number of input features.
        hidden_channels (int): Number of hidden channels.
        out_channels (int): Number of output channels.

    Attributes:
        conv1 (GCNConv): First graph convolutional layer.
        conv2 (GCNConv): Second graph convolutional layer.
        conv3 (GCNConv): Third graph convolutional layer.
        dropout (torch.nn.Dropout): Dropout layer.

    Methods:
        encode: Encodes the input features and computes node embeddings.
        decode: Decodes the node embeddings to predict the existence of a specific edge.
        decode_all: Decodes all node embeddings to predict the existence of all possible edges.
        forward: Performs the forward pass of the model.

    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)  
        self.conv3 = GCNConv(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(0.5)  

    def encode(self, x, edge_index):
        """
        Encodes the input features and computes node embeddings.

        Args:
            x (torch.Tensor): Input node features.
            edge_index (torch.Tensor): Graph edge indices.

        Returns:
            torch.Tensor: Node embeddings.

        """
        x = self.conv1(x, edge_index).relu()
        x = self.dropout(x)  
        x = self.conv2(x, edge_index).relu()
        x = self.dropout(x)  
        x = self.conv3(x, edge_index)
        return x

    def decode(self, z, edge_label_index):
        """
        Decodes the node embeddings to predict the existence of a specific edge.

        Args:
            z (torch.Tensor): Node embeddings.
            edge_label_index (torch.Tensor): Indices of the specific edge.

        Returns:
            torch.Tensor: Predicted existence of the specific edge.

        """
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        """
        Decodes all node embeddings to predict the existence of all possible edges.

        Args:
            z (torch.Tensor): Node embeddings.

        Returns:
            torch.Tensor: Predicted existence of all possible edges.

        """
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

    def forward(self, data):
        """
        Performs the forward pass of the model.

        Args:
            data: Input data containing node features and graph information.

        Returns:
            torch.Tensor: Predicted existence of edges.

        """
        if data.edge_label_index is not None:
            z = self.encode(data.x, data.edge_index)
            return self.decode(z, data.edge_label_index)
        else:
            z = self.encode(data.x, data.edge_index)
            return self.decode_all(z)


In [14]:
# Create the model instance
num_features = train_data.x.size(1) 
model = Net(num_features, 128, 64)

# Create the optimizer
optimizer = Adam(params=model.parameters(), lr=0.01)

# Define the loss function
criterion = nn.BCEWithLogitsLoss()

In [15]:
def train():
    """
    Trains the model using the provided training data.

    Returns:
        loss (torch.Tensor): The loss value after training.
    """
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1))

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss


@torch.no_grad()
def test(data):
    """
    Evaluate the performance of the model on the test data.

    Args:
        data (torch_geometric.data.Data): The test data.

    Returns:
        float: The ROC AUC score of the model's predictions.
    """
    model.eval()
    z = model.encode(data.x, data.edge_index)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())



In [16]:
best_val_auc = final_test_auc = 0
for epoch in range(1, 101):
    loss = train()
    val_auc = test(val_data)
    test_auc = test(test_data)
    if val_auc > best_val_auc:
        best_val = val_auc
        final_test_auc = test_auc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
          f'Test: {test_auc:.4f}')

print(f'Final Test: {final_test_auc:.4f}')

Epoch: 001, Loss: 160.7310, Val: 0.6760, Test: 0.7154
Epoch: 002, Loss: 38.8990, Val: 0.7033, Test: 0.7302
Epoch: 003, Loss: 18.6486, Val: 0.7077, Test: 0.7301
Epoch: 004, Loss: 5.5407, Val: 0.7068, Test: 0.7244
Epoch: 005, Loss: 2.3638, Val: 0.6426, Test: 0.6177
Epoch: 006, Loss: 2.0764, Val: 0.5849, Test: 0.4973
Epoch: 007, Loss: 1.9145, Val: 0.5918, Test: 0.5099
Epoch: 008, Loss: 1.1802, Val: 0.4958, Test: 0.4084
Epoch: 009, Loss: 1.0476, Val: 0.4978, Test: 0.4741
Epoch: 010, Loss: 0.9578, Val: 0.5083, Test: 0.5247
Epoch: 011, Loss: 0.8342, Val: 0.4675, Test: 0.4863
Epoch: 012, Loss: 0.8217, Val: 0.4611, Test: 0.4689
Epoch: 013, Loss: 0.8305, Val: 0.3894, Test: 0.4000
Epoch: 014, Loss: 0.7936, Val: 0.3380, Test: 0.3629
Epoch: 015, Loss: 0.7598, Val: 0.3591, Test: 0.3837
Epoch: 016, Loss: 0.7522, Val: 0.4438, Test: 0.4948
Epoch: 017, Loss: 0.7092, Val: 0.5176, Test: 0.5785
Epoch: 018, Loss: 0.7063, Val: 0.5757, Test: 0.6229
Epoch: 019, Loss: 0.7089, Val: 0.6020, Test: 0.6513
Epoch: 0

In [17]:
len(data_test.edge_index[0])

3498

In [18]:
z = model.encode(data_test.x, data_test.edge_index)
final_edge_index = model.decode_all(z)

In [19]:
# Convert the predicted edge indices to a DataFrame
predictions_df = pd.DataFrame(final_edge_index.t().numpy(), columns=['source', 'target'])

# Add a 'Predicted' column with a value of 1 to indicate that these are predicted edges
# because the model only predicts the existence of edges
predictions_df['Predicted'] = 1

predictions_df

Unnamed: 0,source,target,Predicted
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
54402929,7599,7595,1
54402930,7599,7596,1
54402931,7599,7597,1
54402932,7599,7598,1


In [20]:
true_test

Unnamed: 0,source,target
0,3425,4524
1,1620,2617
2,4832,6317
3,4984,7298
4,385,5481
...,...,...
3493,1548,2957
3494,717,1756
3495,1731,3976
3496,426,1120


In [21]:
merged_df = true_test.merge(predictions_df, on=['source', 'target'], how='left')
merged_df['Predicted'] = merged_df['Predicted'].fillna(0)


In [22]:
merged_df

Unnamed: 0,source,target,Predicted
0,3425,4524,1.0
1,1620,2617,1.0
2,4832,6317,1.0
3,4984,7298,1.0
4,385,5481,1.0
...,...,...,...
3493,1548,2957,1.0
3494,717,1756,1.0
3495,1731,3976,1.0
3496,426,1120,1.0


In [24]:
# Create a DataFrame with index and predicted labels
submission = pd.DataFrame({'ID': merged_df.index, 'Predicted': merged_df.Predicted})

submission.to_csv('../submission/GNN/predictions_gnn.csv', index=False)