# MLNS Kaggle challenge: link prediction using a GNN

### Import the necessary packages

In [None]:
import copy
import csv
import torch
import pandas as pd
import numpy as np

import torch_geometric.transforms as T
import torch.nn.functional as F

from torch_geometric.data import Data
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling
from torch_geometric.nn import GCNConv

## Training

### Import and pre-process data

In [None]:
# Node attribute table. The file has no header row, so pandas labels the
# columns 0, 1, 2, ...
node_df = pd.read_csv(
    "node_information.csv",
    header=None,
)

In [None]:
# Sanity check: table dimensions, then a preview of the first ten rows.
print(node_df.shape)
node_df.head(n=10)

In [None]:
# Training pairs: space-separated, no header row. Columns 0 and 1 are the two
# node ids, column 2 is the label.
graph_df = pd.read_csv("train.txt", sep=" ", header=None)

In [None]:
# Build a feature table with exactly one row per node id, so that row i holds
# the features of node i. Ids that are absent from node_information.csv get an
# all-zero feature vector.
#
# A single reindex replaces the former insert-and-resort loop: it does not
# depend on the input being sorted, does not hard-code the feature width, and
# does not fail when the highest ids are the ones that are missing.
NUM_NODES = 7599  # ids 0..7598 are always materialised

# Also cover any id >= NUM_NODES that appears in the file, so no row is dropped
# and the "row i == node i" invariant holds for every id present.
num_nodes = max(NUM_NODES, int(node_df[0].max()) + 1)

node_df_train_padded = (
    node_df.set_index(0)  # column 0 is the node id
    .reindex(index=range(num_nodes), fill_value=0)
    .reset_index()  # put the id back as the first column
)

node_df_train_padded.head()

In [None]:
# Drop the leading id column; what remains is the per-node feature matrix.
feature_columns = node_df_train_padded.iloc[:, 1:]
node_features = feature_columns.to_numpy()

In [None]:
# Keep only the rows whose label (column 2) equals 1: the positive examples.
is_positive = graph_df[2].eq(1)
positive_edges = graph_df.loc[is_positive]

In [None]:
# Positive edges as a (2, num_edges) index tensor: row 0 holds the first
# endpoint of each pair, row 1 the second.
edge_index = torch.tensor(
    [positive_edges[0].tolist(), positive_edges[1].tolist()],
    dtype=torch.long,
)
x = torch.tensor(node_features, dtype=torch.float)

# The link split below is configured with is_undirected=True, which assumes
# edge_index already contains both directions of every edge. The edge list
# built above stores each row of train.txt in a single direction, so
# symmetrise it here; otherwise pairs listed as (larger id, smaller id) would
# be dropped by the split.
data = T.ToUndirected()(Data(x=x, edge_index=edge_index))

In [None]:
# Hold out 5% of the edges for validation and none for testing. Negative
# samples are not added to the training split here
# (add_negative_train_samples=False); train_link_predictor draws fresh ones
# every epoch.
split = T.RandomLinkSplit(
    is_undirected=True,
    num_val=0.05,
    num_test=0.0,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=False,
)
train_data, val_data, test_data = split(data)

### Model definition

In [None]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder paired with a dot-product link decoder."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Return node embeddings computed as conv1 -> ReLU -> conv2."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """Score each candidate pair by the dot product of its two embeddings."""
        first_endpoint = z[edge_label_index[0]]
        second_endpoint = z[edge_label_index[1]]
        return torch.sum(first_endpoint * second_endpoint, dim=-1)

    def decode_all(self, z):
        """Return the (i, j) index pairs whose embedding dot product is positive.

        The result has one column per selected pair (row 0 = i, row 1 = j).
        """
        pairwise_scores = torch.matmul(z, z.t())
        positive_mask = pairwise_scores > 0
        return torch.nonzero(positive_mask, as_tuple=False).t()
    

def train_link_predictor(
    model, train_data, val_data, optimizer, criterion, n_epochs=100, log_every=10
):
    """Train ``model`` for link prediction and return it.

    Each epoch encodes the training graph, draws a fresh set of negative
    edges (as many as there are supervision edges in
    ``train_data.edge_label_index``), scores positives and negatives with
    ``model.decode`` and takes one optimizer step on ``criterion``.

    Args:
        model: Object exposing ``encode(x, edge_index)`` and
            ``decode(z, edge_label_index)``.
        train_data: Training split providing ``x``, ``edge_index``,
            ``edge_label_index``, ``edge_label`` and ``num_nodes``.
        val_data: Validation split passed to ``eval_link_predictor``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking ``(scores, labels)``.
        n_epochs: Number of training epochs.
        log_every: Report training loss and validation AUC every this many
            epochs; ``0`` or ``None`` disables reporting.

    Returns:
        The trained model (the same object that was passed in).
    """
    for epoch in range(1, n_epochs + 1):

        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        # sampling training negatives for every training epoch
        neg_edge_index = negative_sampling(
            edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
            num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

        edge_label_index = torch.cat(
            [train_data.edge_label_index, neg_edge_index],
            dim=-1,
        )
        # Sampled negatives are labelled 0 and appended after the existing labels.
        edge_label = torch.cat([
            train_data.edge_label,
            train_data.edge_label.new_zeros(neg_edge_index.size(1))
        ], dim=0)

        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        # The validation AUC is only reported, never used for training, so it
        # is computed only on the epochs that actually print it.
        if log_every and epoch % log_every == 0:
            val_auc = eval_link_predictor(model, val_data)
            print(f"Epoch: {epoch:03d}, Train Loss: {loss.item():.3f}, Val AUC: {val_auc:.3f}")

    return model

@torch.no_grad()
def eval_link_predictor(model, data):
    """Return the ROC AUC of ``model`` on the labelled pairs of ``data``.

    The model is switched to eval mode, ``data`` is encoded over its
    ``edge_index``, and the pairs in ``data.edge_label_index`` are scored and
    compared against ``data.edge_label``.
    """
    model.eval()
    embeddings = model.encode(data.x, data.edge_index)
    logits = model.decode(embeddings, data.edge_label_index).view(-1)
    probabilities = torch.sigmoid(logits)

    y_true = data.edge_label.cpu().numpy()
    y_score = probabilities.cpu().numpy()
    return roc_auc_score(y_true, y_score)

@torch.no_grad()
def test_link_predictor(model, data, threshold=0.5):
    """Return a hard 0/1 prediction for every pair in ``data.edge_label_index``.

    Runs under ``torch.no_grad()`` (like ``eval_link_predictor``) so inference
    does not build an autograd graph.

    Args:
        model: Object exposing ``eval()``, ``encode(x, edge_index)`` and
            ``decode(z, edge_label_index)``.
        data: Object providing ``x``, ``edge_index`` and ``edge_label_index``.
        threshold: A pair is predicted as 1 when its sigmoid score is greater
            than or equal to this value.

    Returns:
        A list of Python ints (0 or 1), one per candidate pair, in order.
    """
    model.eval()
    z = model.encode(data.x, data.edge_index)
    prob = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    # Threshold the whole tensor at once instead of looping element by element.
    return (prob >= threshold).long().tolist()


# Not a unit test despite the ``test_`` prefix: keep pytest from collecting it.
test_link_predictor.__test__ = False

### Model training

In [None]:
# Size the encoder input from the feature matrix instead of hard-coding the
# feature count, so the cell keeps working if the node features change.
model = Net(train_data.x.size(1), 128, 64)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
# The decoder returns raw scores, so the loss applies the sigmoid itself.
criterion = torch.nn.BCEWithLogitsLoss()
model = train_link_predictor(model, train_data, val_data, optimizer, criterion, n_epochs=200)

## Testing

### Import and pre-process data

In [None]:
# Candidate node pairs to score: space-separated, no header row.
test_df = pd.read_csv("test.txt", sep=" ", header=None)

In [None]:
# Candidate pairs to score, as a (2, num_pairs) index tensor.
edge_index_test = torch.tensor(
    [test_df[0].tolist(), test_df[1].tolist()],
    dtype=torch.long,
)
x = torch.tensor(node_features, dtype=torch.float)

# The encoder must propagate messages over the *known* edges of the training
# graph. The candidate pairs are unlabeled (some of them are not edges at
# all), so they are used only as edge_label_index and never as graph
# structure. The known edges are symmetrised and de-duplicated so the graph
# is undirected, matching the undirected split used for training.
known_edges = torch.cat([data.edge_index, data.edge_index.flip(0)], dim=1)
known_edges = torch.unique(known_edges, dim=1)

data_test = Data(x=x, edge_index=known_edges, edge_label_index=edge_index_test)

### Prediction generation on the test set

In [None]:
# One 0/1 prediction per candidate pair of the test set.
y_pred = test_link_predictor(model=model, data=data_test)

In [None]:
# Load test samples (one space-separated node pair per line); their count
# defines the submission row ids.

nb_submission = 18
with open("test.txt", "r") as f:
    test_set = [line.split(" ") for line in f.read().splitlines() if line]

# Fail loudly instead of letting zip() silently truncate the submission.
if len(test_set) != len(y_pred):
    raise ValueError(
        f"Expected {len(test_set)} predictions, got {len(y_pred)}"
    )

# Pair each test-sample id (its row number) with the predicted label.

predictions = list(zip(range(len(test_set)), y_pred))

# note: the submission file starts with an "ID", "Predicted" header row

# newline="" lets the csv module control line endings (otherwise blank rows
# appear between records on Windows). The with-block closes the file.
with open(f"data/submission_{nb_submission}.csv", "w", newline="") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(["ID", "Predicted"])
    csv_out.writerows(predictions)