In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, VGAE
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx, to_dense_adj, dense_to_sparse, train_test_split_edges
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
# Load the space-separated STRING protein-link table into a DataFrame.
ppi_df = pd.read_csv('9606.protein.links.v12.0.txt', sep=' ')
def find_common_prefix(strings):
    """Return the longest prefix shared by every string in ``strings``.

    Args:
        strings: Any iterable of ``str`` (list, tuple, generator, ...).
            ``None`` and empty iterables are accepted.

    Returns:
        The longest common leading substring, or ``""`` when the input is
        ``None``/empty or the strings share no prefix.
    """
    # Local import so the function is self-contained within its cell.
    import os.path

    if strings is None:
        return ""
    # Materialise once so generators and other non-indexable iterables work.
    candidates = list(strings)
    if not candidates:
        return ""
    # commonprefix compares character by character (it is not path-aware),
    # which is exactly the semantics needed here.
    return os.path.commonprefix(candidates)


protein1_prefix = find_common_prefix(ppi_df['protein1'].tolist())
protein2_prefix = find_common_prefix(ppi_df['protein2'].tolist())

# Strip ONE prefix shared by both columns so the same protein keeps the same
# identifier whether it appears as protein1 or protein2 (the original stripped
# protein1_prefix from both columns and never used protein2_prefix).
shared_prefix = find_common_prefix([protein1_prefix, protein2_prefix])

# Every value starts with shared_prefix by construction, so slicing removes
# exactly the leading prefix; str.replace would also delete any later
# occurrence of the same substring inside an identifier.
ppi_df['protein1'] = ppi_df['protein1'].str[len(shared_prefix):]
ppi_df['protein2'] = ppi_df['protein2'].str[len(shared_prefix):]

In [4]:
df = ppi_df
df['protein1'] = df['protein1'].astype(str)
df['protein2'] = df['protein2'].astype(str)

# Build the undirected weighted graph in one call instead of a Python-level
# iterrows() loop; each (u, v, score) triple is stored under the 'weight'
# edge attribute, and a repeated pair overwrites the earlier weight exactly
# as successive add_edge calls would.
G = nx.Graph()
G.add_weighted_edges_from(
    zip(df['protein1'], df['protein2'], df['combined_score'])
)

# Convert to a PyG Data object and hold out 20% of the edges for validation
# and 20% for testing.
# NOTE(review): train_test_split_edges is marked deprecated in recent PyG
# releases in favour of transforms.RandomLinkSplit -- confirm against the
# installed version before upgrading.
data = from_networkx(G)
data = train_test_split_edges(data, val_ratio=0.2, test_ratio=0.2)




In [5]:
# Replace the training edges with the non-zero pattern of A @ A, where A is
# the dense adjacency of the training edges.
# NOTE: this materialises dense num_nodes x num_nodes tensors, and because the
# cell reads and overwrites data.train_pos_edge_index, running it a second
# time squares the already-squared adjacency.
dense_adj = to_dense_adj(
    data.train_pos_edge_index,
    max_num_nodes=data.num_nodes,
)
higher_order_adj = dense_adj @ dense_adj  # A^2
higher_order_edge_index = dense_to_sparse(higher_order_adj)[0]
data.train_pos_edge_index = higher_order_edge_index

In [6]:
class GCNEncoder(torch.nn.Module):
    """GCN encoder that outputs the Gaussian parameters a VGAE needs.

    A variational graph auto-encoder unpacks its encoder's output into
    ``(mu, logstd)``. The previous version returned a single concatenated
    tensor, which cannot be unpacked that way, so ``model.encode`` failed.
    This version shares one hidden GCN layer and feeds it to two separate
    output heads.

    Args:
        in_channels: Number of input features per node.
        out_channels: Dimensionality of the latent space; the hidden layer
            uses ``2 * out_channels`` units.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        # Separate heads for the mean and the log standard deviation.
        self.conv_mu = GCNConv(hidden_channels, out_channels)
        self.conv_logstd = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Encode nodes into latent Gaussian parameters.

        Args:
            x: Node feature tensor passed to the first GCN layer.
            edge_index: Edge index tensor used for message passing.

        Returns:
            A ``(mu, logstd)`` tuple of tensors, each with ``out_channels``
            columns.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(hidden, edge_index)
        logstd = self.conv_logstd(hidden, edge_index)
        return mu, logstd

In [7]:
# Placeholder features if the graph has no node features: one constant
# feature per node, so the encoder input width is 1 in that case.
if data.x is None:
    data.x = torch.ones((data.num_nodes, 1))

out_channels = 32  # Latent dimension
in_channels = data.num_node_features
encoder = GCNEncoder(in_channels, out_channels)
model = VGAE(encoder)

# 5. Optimization and Training Setup
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.005,
    weight_decay=5e-4,
)

In [8]:
def train():
    """Run one full-batch optimisation step and return the scalar loss.

    Uses the module-level ``model``, ``optimizer`` and ``data``. The loss is
    the reconstruction loss on the training edges plus the KL term scaled by
    ``1 / data.num_nodes``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    # recon_loss takes (z, pos_edge_index, neg_edge_index=None); it has no
    # pos_weight parameter, so passing one raised a TypeError. Negative edges
    # are sampled by recon_loss itself when none are supplied.
    recon = model.recon_loss(z, data.train_pos_edge_index)
    loss = recon + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    """Score the current model on the held-out test edges.

    Uses the module-level ``model`` and ``data``. Node embeddings are computed
    from the training edges with gradients disabled, then passed to
    ``model.test`` together with the positive and negative test edges.

    Returns:
        The ``(auc, ap)`` pair produced by ``model.test``.
    """
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(data.x, data.train_pos_edge_index)
    roc_auc, avg_precision = model.test(
        embeddings,
        data.test_pos_edge_index,
        data.test_neg_edge_index,
    )
    return roc_auc, avg_precision

In [None]:
epochs = 200
train_losses, auc_scores = [], []

# One optimisation step followed by one test-set evaluation per epoch; the
# per-epoch loss and AUC are kept for plotting, and every 10th epoch is logged.
for epoch in range(1, epochs + 1):
    train_loss = train()
    auc, ap = test()

    train_losses.append(train_loss)
    auc_scores.append(auc)

    if not epoch % 10:
        print(f"Epoch {epoch:03d}, Loss: {train_loss:.4f}, AUC: {auc:.4f}, AP: {ap:.4f}")


In [None]:
# Side-by-side panels: training loss (left) and test AUC (right) per epoch.
fig, (loss_ax, auc_ax) = plt.subplots(1, 2, figsize=(10, 5))
epoch_axis = range(1, epochs + 1)

loss_ax.plot(epoch_axis, train_losses, label="Train Loss", marker="o")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training Loss")
loss_ax.grid()
loss_ax.legend()

auc_ax.plot(epoch_axis, auc_scores, label="AUC", marker="o", color="orange")
auc_ax.set_xlabel("Epoch")
auc_ax.set_ylabel("AUC")
auc_ax.set_title("AUC Score")
auc_ax.grid()
auc_ax.legend()

fig.tight_layout()
plt.show()