In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch import Tensor
print(torch.__version__)

2.3.0+cu121


In [41]:
import os
os.environ['TORCH'] = torch.__version__

# Siguiendo el modelo de:
https://github.com/Orbifold/pyg-link-prediction/blob/main/Pokec-Pyg-Neo4j.ipynb

In [42]:
#Get the current working directory
cwd = os.getcwd()
file_path = cwd + '/points_50.csv'

In [43]:
# Importing the dataset
df = pd.read_csv(file_path)

#Round the values of the dataset to 4 decimal places
df = df.round(4)

#Add a column to use as index from 0 to the length of the dataset
df['n_label'] = range(0, len(df))

#delete the column p_label
df = df.drop('p_label', axis=1)

In [44]:
df.head()

Unnamed: 0,x,y,z,N_side,N_layer,t_label,phi,eta,q,pt,d0,z0,n_label
0,-0.112,-9.9682,-6.3331,8,1,T0,-1.6049,-0.6008,-1,48.2712,0.2156,0.022,0
1,-0.4144,-19.8918,-12.6639,8,2,T0,-1.6049,-0.6008,-1,48.2712,0.2156,0.022,1
2,-0.693,-29.8162,-18.9948,8,3,T0,-1.6049,-0.6008,-1,48.2712,0.2156,0.022,2
3,-0.9483,-39.7538,-25.3337,8,4,T0,-1.6049,-0.6008,-1,48.2712,0.2156,0.022,3
4,-1.1794,-49.6794,-31.6646,8,5,T0,-1.6049,-0.6008,-1,48.2712,0.2156,0.022,4


In [45]:
#Create a pytorch geometric data object
from torch_geometric.data import Data

data=Data()

In [46]:
#Add nodes to the data object from n_label column
data.x = torch.tensor(df[['x','y','z','phi','eta','pt','d0','z0']].values, dtype=torch.float)

In [47]:
edge_path = cwd + '/grap_50.csv'

# Importing the dataset
df_edge = pd.read_csv(edge_path)

#Add edges to the data object
data.edge_index = torch.tensor(df_edge[['Source','Target']].values, dtype=torch.long).t().contiguous()

In [48]:
data

Data(x=[500, 8], edge_index=[2, 692])

In [49]:
data.validate(raise_on_error=True)

True

In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [51]:
data.edge_attr = torch.tensor(df_edge['weight'], dtype=torch.float).view(-1, 1)

In [52]:
import torch_geometric.transforms as T
data = T.ToUndirected()(data)

In [53]:
data=data.to(device, non_blocking=True)

In [54]:
data

Data(x=[500, 8], edge_index=[2, 1384], edge_attr=[1384, 1])

In [55]:
def create_node_masks(d):
        print("Creating classification masks")
        amount = len(d.x)
        # actually the index to the nodes
        nums = np.arange(amount)
        np.random.shuffle(nums)

        train_size = int(amount * 0.7)
        test_size = int(amount * 0.85) - train_size
        val_size = amount - train_size - test_size

        train_set = nums[0:train_size]
        test_set = nums[train_size:train_size + test_size]
        val_set = nums[train_size + test_size:]

        assert len(train_set) + len(test_set) + len(val_set) == amount, "The split should be coherent."
        
        train_mask = torch.zeros(amount, dtype = torch.long, device = device)
        for i in train_set:
            train_mask[i] = 1.

        test_mask = torch.zeros(amount, dtype = torch.long, device = device)
        for i in test_set:
            test_mask[i] = 1.

        val_mask = torch.zeros(amount, dtype = torch.long, device = device)
        for i in val_set:
            val_mask[i] = 1.

        d.train_mask = train_mask
        d.test_mask = test_mask
        d.val_mask = val_mask

In [56]:
create_node_masks(data)

Creating classification masks


In [57]:
import torch_geometric.transforms as T

transform = T.Compose([
    T.ToUndirected(merge = True),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val = 0.0005, num_test = 0.0001, is_undirected = True, add_negative_train_samples = False),
])
transform(data)

(Data(x=[500, 8], edge_index=[2, 1384], edge_attr=[1384, 1], train_mask=[500], test_mask=[500], val_mask=[500], edge_label=[692], edge_label_index=[2, 692]),
 Data(x=[500, 8], edge_index=[2, 1384], edge_attr=[1384, 1], train_mask=[500], test_mask=[500], val_mask=[500], edge_label=[0], edge_label_index=[2, 0]),
 Data(x=[500, 8], edge_index=[2, 1384], edge_attr=[1384, 1], train_mask=[500], test_mask=[500], val_mask=[500], edge_label=[0], edge_label_index=[2, 0]))

In [58]:
from torch_geometric.loader import NeighborLoader

In [59]:
# the larger the batch size the faster things will be
batch_size = 32

# define batch loaders for the three sets
train_loader = NeighborLoader(data, num_neighbors = [10] * 2, shuffle = True, input_nodes = data.train_mask, batch_size = batch_size)
val_loader = NeighborLoader(data, num_neighbors = [10] * 2, input_nodes = data.val_mask, batch_size = batch_size)
test_loader = NeighborLoader(data, num_neighbors = [10] * 2, input_nodes = data.test_mask, batch_size = batch_size)


In [60]:
from datetime import datetime
from tqdm import tqdm
from torch_geometric.utils import negative_sampling
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_auc_score, f1_score
import os

In [61]:
class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        # chaining two convolutions with a standard relu activation

        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        # cosine similarity
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim = -1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple = False).t()

In [62]:
model = Net(data.num_features, 128, 64).to(device)
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.01)
# BCELoss creates a criterion that measures the Binary Cross Entropy between the target and the output.
criterion = torch.nn.BCEWithLogitsLoss()

In [63]:
model

Net(
  (conv1): GCNConv(8, 128)
  (conv2): GCNConv(128, 64)
)

In [64]:
def train():
    """
    Single epoch model training in batches.
    :return: total loss for the epoch
    """
    model.train()
    total_examples = total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        batch = batch.to(device)
        batch_size = batch.batch_size
        z = model.encode(batch.x, batch.edge_index)
        neg_edge_index = negative_sampling(edge_index = batch.edge_index, num_nodes = batch.num_nodes, num_neg_samples = None, method = 'sparse')
        edge_label_index = torch.cat([batch.edge_index, neg_edge_index], dim = -1, )
        edge_label = torch.cat([torch.ones(batch.edge_index.size(1)), torch.zeros(neg_edge_index.size(1))], dim = 0).to(device)
        out = model.decode(z, edge_label_index).view(-1)
        # loss = criterion(out[:batch_size], edge_label[:batch_size])
        loss = criterion(out, edge_label)
        # standard torch mechanics here
        loss.backward()
        optimizer.step()
        total_examples += batch_size
        total_loss += float(loss) * batch_size
    return total_loss / total_examples

In [65]:
@torch.no_grad()
def test(loader):
    """
    Evalutes the model on the test set.
    :param loader: the batch loader
    :return: a score
    """
    model.eval()
    scores = []
    threshold = torch.tensor([0.7]).to(device)
    for batch in tqdm(loader):
        batch.to(device)
        z = model.encode(batch.x, batch.edge_index)
        out = model.decode(z, batch.edge_index).view(-1).sigmoid()
        pred = (out > threshold).float() * 1
        score = f1_score(np.ones(batch.edge_index.size(1)), pred.cpu().numpy())
        scores.append(score)
    return np.average(scores)

In [66]:
def predictions(max = 50, threshold = 0.99):
    """
    Creates predictions for the specified run.
    :param run_id: model id
    :param max: the maximum amount of predictions to output
    """
    pred_edges = []

    loader = NeighborLoader(data, num_neighbors = [10] * 2, shuffle = True, input_nodes = None, batch_size = batch_size)
    threshold_tensor = torch.tensor([threshold]).to(device)
    for batch in tqdm(loader):
        batch.to(device)
        z = model.encode(batch.x, batch.edge_index)
        # collecting negative edge tuples ensure that the decode are actual non-existing edges
        neg_edge_index = negative_sampling(edge_index = batch.edge_index, num_nodes = None, num_neg_samples = None, method = 'sparse')
        out = model.decode(z, neg_edge_index).view(-1).sigmoid()
        pred = ((out > threshold_tensor).float() * 1).cpu().numpy()
        found = np.argwhere(pred == 1)
        if found.size > 0:
            edge_tuples = neg_edge_index.t().cpu().numpy()
            select_index = found.reshape(1, found.size)[0]
            edges = edge_tuples[select_index]
            pred_edges += edges.tolist()
            if len(pred_edges) >= max:
                break
    return pd.DataFrame.from_dict([{'source': a, 'target': b} for a,b in pred_edges])

In [67]:
def run():
    """
        Run the training and makes predictions.
    """
    run_id = int(datetime.timestamp(datetime.now()))
    start_time = datetime.now()
    epochs = 10
    #with trange(epochs + 1) as t:
    for epoch in range(epochs):
        try:
            #t.set_description('Epoch %i/%i train' % (epoch, epochs))
            loss = train()
            #t.set_description('Epoch %i/%i test' % (epoch, epochs))
            val_acc = test(test_loader)
            #t.set_postfix(loss = loss, accuracy = val_acc)
            print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Acc: {val_acc:.4f}")
        except KeyboardInterrupt:
            break
    #torch.save(model.state_dict(), f"model_{run_id}")
    time_elapsed = datetime.now() - start_time
    print("Creating predictions")
    print(f"\nRun {run_id}:")
    print(f"\tEpochs: {epoch}")
    print(f"\tTime: {time_elapsed}")
    print(f"\tAccuracy: {val_acc * 100:.01f}")

In [68]:
run()

100%|██████████| 16/16 [00:00<00:00, 28.93it/s]
100%|██████████| 16/16 [00:00<00:00, 100.13it/s]


Epoch: 000, Loss: 176.8665, Acc: 1.0000


100%|██████████| 16/16 [00:00<00:00, 70.47it/s]
100%|██████████| 16/16 [00:00<00:00, 178.19it/s]


Epoch: 001, Loss: 11.6302, Acc: 1.0000


100%|██████████| 16/16 [00:00<00:00, 94.39it/s]
100%|██████████| 16/16 [00:00<00:00, 182.35it/s]


Epoch: 002, Loss: 3.0353, Acc: 1.0000


100%|██████████| 16/16 [00:00<00:00, 101.40it/s]
100%|██████████| 16/16 [00:00<00:00, 160.10it/s]


Epoch: 003, Loss: 1.0609, Acc: 0.9931


100%|██████████| 16/16 [00:00<00:00, 91.39it/s]
100%|██████████| 16/16 [00:00<00:00, 177.88it/s]


Epoch: 004, Loss: 0.8388, Acc: 0.9575


100%|██████████| 16/16 [00:00<00:00, 52.92it/s]
100%|██████████| 16/16 [00:00<00:00, 173.91it/s]


Epoch: 005, Loss: 0.6941, Acc: 0.9575


100%|██████████| 16/16 [00:00<00:00, 95.59it/s]
100%|██████████| 16/16 [00:00<00:00, 166.45it/s]


Epoch: 006, Loss: 0.7460, Acc: 0.9575


100%|██████████| 16/16 [00:00<00:00, 100.35it/s]
100%|██████████| 16/16 [00:00<00:00, 147.12it/s]


Epoch: 007, Loss: 0.6984, Acc: 0.9897


100%|██████████| 16/16 [00:00<00:00, 84.81it/s]
100%|██████████| 16/16 [00:00<00:00, 145.16it/s]


Epoch: 008, Loss: 0.7127, Acc: 0.8745


100%|██████████| 16/16 [00:00<00:00, 62.19it/s]
100%|██████████| 16/16 [00:00<00:00, 153.39it/s]

Epoch: 009, Loss: 0.6803, Acc: 0.9764
Creating predictions

Run 1717694849:
	Epochs: 9
	Time: 0:00:03.491002
	Accuracy: 97.6





In [69]:
preds_df = predictions()
print(preds_df.head())

  0%|          | 0/16 [00:00<?, ?it/s]

   source  target
0      10      83
1      67      98
2     156     157
3       8      99
4      28      75





In [72]:
for i in df_edge.itertuples():
    for j in preds_df.itertuples():
        if i.Source == j.source and i.Target == j.target:
            print("Encontrado: ", i.Source, i.Target)

Encontrado:  156 157


In [73]:
#Compare predictions with the real edges
real_edges = df_edge[['Source', 'Target']]
real_edges = real_edges.rename(columns={'Source': 'source', 'Target': 'target'})

#Merge the two dataframes
merged = pd.merge(preds_df, real_edges, on=['source', 'target'], how='inner')

#Calculate the number of correct predictions
correct = len(merged)
print(f"Correct predictions: {correct}")

Correct predictions: 1
