In [2]:
import numpy as np
import csv
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from torch_geometric.data import Data
import torch
import torch.nn as nn
import dgl
from dgl.nn import GraphConv
import torch.nn.functional as F

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

from sklearn.metrics.pairwise import cosine_similarity

from torch_geometric.nn import GCNConv
import itertools

from dgl.nn import SAGEConv
import scipy.sparse as sp
import dgl.function as fn
from sklearn.metrics import roc_auc_score


In [4]:
test_set_final = pd.read_csv("../data/test_set_final.csv")
train_set_final = pd.read_csv("../data/train_set_final.csv")

In [5]:
edges = train_set_final[train_set_final['label'] == 1][['source', 'target']].values.tolist()

graph = nx.Graph()
graph.add_edges_from(edges)

In [6]:
degrees = [graph.degree(n) for n in graph.nodes()]

isolated_nodes = degrees == 0

# Convert the isolated_nodes variable to a tensor
isolated_nodes = torch.tensor(isolated_nodes)

# Get the indices of the isolated nodes
isolated_nodes = isolated_nodes.nonzero().flatten()

# Print the indices of the isolated nodes
print(isolated_nodes)

tensor([], dtype=torch.int64)


In [7]:
u, v = graph.edges()

eids = np.arange(graph.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = graph.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
num_nodes = graph.number_of_nodes()

# Create the adjacency matrix with an explicit shape
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(num_nodes, num_nodes))

# Now, when subtracting from a dense matrix of ones and the identity matrix, shapes will match
adj_neg = 1 - adj.todense() - np.eye(num_nodes)
neg_u, neg_v = np.where(adj_neg != 0)

# The rest of your code for selecting negative edges can remain the same
neg_eids = np.random.choice(len(neg_u), graph.number_of_edges())

test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]


ValueError: too many values to unpack (expected 2)

In [None]:
train_g = dgl.remove_edges(graph, eids[:test_size])

In [None]:
class GraphSAGE(nn.Module):
    """
    GraphSAGE model for node classification.

    Args:
        in_feats (int): Number of input features.
        h_feats (int): Number of hidden features.

    Attributes:
        conv1 (SAGEConv): First GraphSAGE convolutional layer.
        conv2 (SAGEConv): Second GraphSAGE convolutional layer.
    """

    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """
        Forward pass of the GraphSAGE model.

        Args:
            g (DGLGraph): Input graph.
            in_feat (torch.Tensor): Input node features.

        Returns:
            torch.Tensor: Output node representations.
        """
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [None]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_nodes())

In [None]:
class DotPredictor(nn.Module):
    """
    DotPredictor is a PyTorch module that computes a dot product between the source node feature 'h' and 
    destination node feature 'h' for each edge in the input graph 'g'. It returns the computed scores as 
    edge features.
    """
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [None]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        """
        Initializes the MLP Predictor.

        Parameters
        ----------
        h_feats : int
            The number of input features for each node.

        """
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.

        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        """
        Performs forward pass through the MLP Predictor.

        Parameters
        ----------
        g : dgl.DGLGraph
            The input graph.
        h : torch.Tensor
            The input node features.

        Returns
        -------
        torch.Tensor
            The predicted scores for each edge in the graph.

        """
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [None]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """
    Compute the binary cross entropy loss given positive and negative scores.

    Args:
        pos_score (torch.Tensor): Tensor containing the positive scores.
        neg_score (torch.Tensor): Tensor containing the negative scores.

    Returns:
        torch.Tensor: The computed binary cross entropy loss.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """
    Compute the Area Under the Receiver Operating Characteristic Curve (AUC-ROC) score.

    Args:
        pos_score (torch.Tensor): Tensor containing the positive scores.
        neg_score (torch.Tensor): Tensor containing the negative scores.

    Returns:
        float: The AUC-ROC score.

    """
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [None]:
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

all_logits = []
for e in range(400):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))


with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In [None]:
class GCN(nn.Module):
    """
    Graph Convolutional Network (GCN) class.

    This class implements a two-layer GCN model for node classification.

    Parameters:
    - in_feats (int): Number of input features.
    - h_feats (int): Number of hidden features.
    - num_classes (int): Number of output classes.

    Attributes:
    - conv1 (GraphConv): First graph convolutional layer.
    - conv2 (GraphConv): Second graph convolutional layer.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, num_classes, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        """
        Forward pass of the GCN model.

        Parameters:
        - g (DGLGraph): Input graph.
        - in_feat (torch.Tensor): Input features.

        Returns:
        - h (torch.Tensor): Output features.
        """
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [None]:
# Model parameters
in_feats = graph.ndata['feat'].shape[1]
h_feats = 16  # Hidden layer size
num_classes = 2  # Assuming binary classification for link prediction

model = GCN(in_feats, h_feats, num_classes)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [None]:
def compute_loss(pos_score, neg_score):
    """
    Compute the binary cross-entropy loss given positive and negative scores.

    Args:
        pos_score (torch.Tensor): Tensor containing the positive scores.
        neg_score (torch.Tensor): Tensor containing the negative scores.

    Returns:
        torch.Tensor: The computed binary cross-entropy loss.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.size(0)), torch.zeros(neg_score.size(0))])
    return F.binary_cross_entropy_with_logits(scores, labels)

def train(model, g, features, train_pos_edges, train_neg_edges, optimizer):
    """
    Trains the given model using the provided graph, features, positive edges, negative edges, and optimizer.

    Args:
        model: The model to be trained.
        g: The graph used for training.
        features: The features used for training.
        train_pos_edges: The positive edges used for training.
        train_neg_edges: The negative edges used for training.
        optimizer: The optimizer used for training.

    Returns:
        The loss value as a float.
    """
    model.train()
    optimizer.zero_grad()
    embeddings = model(g, features)
    pos_score = (embeddings[train_pos_edges[:, 0]] * embeddings[train_pos_edges[:, 1]]).sum(dim=1)
    neg_score = (embeddings[train_neg_edges[:, 0]] * embeddings[train_neg_edges[:, 1]]).sum(dim=1)
    loss = compute_loss(pos_score, neg_score)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
# Assuming your dataset and graph are ready
features = graph.ndata['feat']
in_feats, h_feats, out_feats = features.shape[1], 16, 16

model = GCN(in_feats, h_feats, out_feats)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(50):
    loss = train(model, graph, features, train_pos_edges, train_neg_edges, optimizer)
    print(f'Epoch {epoch}: Loss {loss}')


In [None]:
predictions_df = pd.DataFrame(predicted_links, columns=['Predicted'])

# Export the DataFrame to a CSV file
predictions_df.to_csv('GCN_predictions.csv', index=False)

In [None]:
from torch_geometric.nn import GATConv

class GAT(torch.nn.Module):
    """
    Graph Attention Network (GAT) model.

    Args:
        num_node_features (int): Number of input node features.
        hidden_channels (int): Number of hidden channels in the GAT layers.
        out_features (int): Number of output features.

    Attributes:
        conv1 (torch_geometric.nn.conv.GATConv): First GAT convolutional layer.
        conv2 (torch_geometric.nn.conv.GATConv): Second GAT convolutional layer.
        conv3 (torch_geometric.nn.conv.GATConv): Third GAT convolutional layer.

    """

    def __init__(self, num_node_features, hidden_channels, out_features):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_node_features, hidden_channels, heads=4, dropout=0.2)
        # Concatenation will happen on the output features so the dimension will increase
        self.conv2 = GATConv(hidden_channels * 4, hidden_channels, heads=4, concat=True, dropout=0.2)
        # Since concat is set to True, the input features of the next layer are the hidden_channels * num_heads
        self.conv3 = GATConv(hidden_channels * 4, out_features, concat=False, heads=1, dropout=0.2)  # No concatenation in the final layer

    def forward(self, x, edge_index):
        """
        Forward pass of the GAT model.

        Args:
            x (torch.Tensor): Input node features.
            edge_index (torch.Tensor): Graph edge indices.

        Returns:
            torch.Tensor: Output predictions.

        """
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.2, training=self.training)  # Add dropout for regularization
        x = F.elu(self.conv2(x, edge_index))
        x = F.dropout(x, p=0.2, training=self.training)  # Add dropout for regularization
        x = self.conv3(x, edge_index)
        return torch.sigmoid(x.squeeze())

model = GAT(num_node_features=data.num_node_features, hidden_channels=64, out_features=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

for epoch in range(200):
    loss = train()
    test_loss, auc_score = test()
    print(f'Epoch: {epoch+1:03d}, Loss: {loss:.4f}, Test Loss: {test_loss:.4f}, AUC: {auc_score:.4f}')

In [None]:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation
    test_pred = model(data.x, test_edge_index).squeeze()

# Convert predictions to probabilities using a sigmoid function (if not already done within the model)
test_pred_prob = torch.sigmoid(test_pred)

# You can set a threshold to classify edges as existing or not, e.g., threshold = 0.5
threshold = 0.5
predicted_links = (test_pred_prob >= threshold).int()

# Print or save your predictions
print(predicted_links)