In [1]:
import numpy as np
import pandas as pd 
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, VGAE
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx, train_test_split_edges
import networkx as nx
import torch.nn as nn


In [2]:
# Dependencies -- install once into the kernel's environment if they are missing:
#   %pip install torch torchvision torchaudio
#   %pip install torch-geometric

In [3]:
# Read the space-separated protein-link table into a DataFrame
# (the displayed columns are protein1, protein2 and combined_score).
ppi_df = pd.read_csv("9606.protein.links.v12.0.txt", sep=" ")

In [4]:
# Show the freshly loaded table; as the cell's last expression it is rendered
# with the DataFrame's rich display.
ppi_df

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,173
1,9606.ENSP00000000233,9606.ENSP00000427567,154
2,9606.ENSP00000000233,9606.ENSP00000253413,151
3,9606.ENSP00000000233,9606.ENSP00000493357,471
4,9606.ENSP00000000233,9606.ENSP00000324127,201
...,...,...,...
13715399,9606.ENSP00000501317,9606.ENSP00000475489,195
13715400,9606.ENSP00000501317,9606.ENSP00000370447,158
13715401,9606.ENSP00000501317,9606.ENSP00000312272,226
13715402,9606.ENSP00000501317,9606.ENSP00000402092,169


In [None]:
def find_common_prefix(strings):
    """Return the longest leading substring shared by every entry of ``strings``.

    Args:
        strings: A sequence of strings (indexable and sliceable, e.g. a list).

    Returns:
        The common prefix, or ``""`` when ``strings`` is empty or the entries
        share no leading characters.
    """
    if not strings:
        return ""

    candidate = strings[0]
    for current in strings[1:]:
        # Count how many leading characters the candidate and this string share,
        # then cut the candidate down to exactly that length.
        limit = min(len(candidate), len(current))
        shared = 0
        while shared < limit and candidate[shared] == current[shared]:
            shared += 1
        candidate = candidate[:shared]
        if not candidate:
            # Nothing in common any more; no later string can change that.
            return ""
    return candidate


# Compute, per id column, the leading substring that every id in it shares.
protein1_prefix, protein2_prefix = (
    find_common_prefix(ppi_df[column].tolist())
    for column in ('protein1', 'protein2')
)

print("Common prefix in 'protein1':", protein1_prefix)
print("Common prefix in 'protein2':", protein2_prefix)



Common prefix in 'protein1': 9606.ENSP00000
Common prefix in 'protein2': 9606.ENSP00000


In [6]:
def _strip_prefix(ids, prefix):
    """Remove ``prefix`` from the start of each id in ``ids`` that begins with it.

    Ids that do not start with ``prefix`` (including ids already shortened by an
    earlier run of this cell, and missing values) are returned unchanged, so
    re-running the cell does not strip anything a second time.
    """
    has_prefix = ids.str.startswith(prefix, na=False)
    # Series.where keeps values where the condition is True, so keep the ids
    # WITHOUT the prefix as-is and substitute the sliced form for the others.
    return ids.where(~has_prefix, ids.str[len(prefix):])


# Use ONE prefix for both columns: the part shared by the two per-column
# prefixes. Stripping a different prefix per column would give the same protein
# two different shortened ids depending on the column it appears in, while
# stripping protein1's prefix from protein2 (as before) silently ignores
# protein2_prefix. Only the leading occurrence is removed, never a match in the
# middle of an id.
shared_prefix = find_common_prefix([protein1_prefix, protein2_prefix])
ppi_df['protein1'] = _strip_prefix(ppi_df['protein1'], shared_prefix)
ppi_df['protein2'] = _strip_prefix(ppi_df['protein2'], shared_prefix)

In [7]:
# Display the table again to inspect the ids now that the common prefix has
# been removed from both protein columns.
ppi_df

Unnamed: 0,protein1,protein2,combined_score
0,000233,356607,173
1,000233,427567,154
2,000233,253413,151
3,000233,493357,471
4,000233,324127,201
...,...,...,...
13715399,501317,475489,195
13715400,501317,370447,158
13715401,501317,312272,226
13715402,501317,402092,169


In [8]:
# `df` is another name for the same DataFrame object as `ppi_df` (no copy is
# made), so the casts below are visible through both names.
df = ppi_df
for column in ('protein1', 'protein2'):
    # Make sure the node ids are plain strings before they become graph nodes.
    df[column] = df[column].astype(str)


In [9]:
# Build an undirected graph with one edge per row: protein1 -- protein2, with
# combined_score stored in the edge attribute "weight".
#
# The columns are zipped and handed to networkx in a single call instead of
# looping over DataFrame.iterrows(), which builds a pandas Series for every row
# and is extremely slow on a table with millions of rows. Edge order is
# unchanged, so a later row for the same pair still overwrites the earlier
# weight exactly as before.
G = nx.Graph()
G.add_weighted_edges_from(
    zip(df['protein1'], df['protein2'], df['combined_score'])
)

In [18]:
# Convert the networkx graph into a PyTorch Geometric data object and split its
# edges, holding out 20% for validation and 20% for testing.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# split presumably differs between runs -- confirm whether that is intended.
data = train_test_split_edges(from_networkx(G), val_ratio=0.2, test_ratio=0.2)




In [20]:
class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder that outputs the ``(mu, logstd)`` pair used by ``VGAE``.

    A shared first ``GCNConv`` layer maps the input features to
    ``2 * out_channels`` hidden channels; two separate ``GCNConv`` heads then
    produce ``mu`` and ``logstd``, each with ``out_channels`` channels.

    Args:
        in_channels: Number of input features per node.
        out_channels: Size of the latent embedding per node.
        dropout: Dropout probability applied to the hidden activations.
            Defaults to 0.3, the value that was previously hard-coded.
    """

    def __init__(self, in_channels, out_channels, dropout=0.3):
        super().__init__()
        hidden_channels = 2 * out_channels
        # Attribute names are kept unchanged so saved state_dict keys still match.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv_mu = GCNConv(hidden_channels, out_channels)
        self.conv_logstd = GCNConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, edge_index):
        """Encode the nodes and return the tuple ``(mu, logstd)``.

        Args:
            x: Node feature tensor (assumed shape ``(num_nodes, in_channels)``
                -- TODO confirm against the caller).
            edge_index: Edge index tensor passed to every ``GCNConv`` layer.

        Returns:
            ``(mu, logstd)`` as produced by the two output heads.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        # nn.Dropout is only active in training mode.
        hidden = self.dropout(hidden)
        mu = self.conv_mu(hidden, edge_index)
        logstd = self.conv_logstd(hidden, edge_index)
        return mu, logstd


In [21]:
# Latent embedding size produced by the encoder.
out_channels = 32

# Fall back to a single input channel when the graph carries no node features.
if data.x is None:
    in_channels = 1
else:
    in_channels = data.num_node_features

encoder = GCNEncoder(in_channels, out_channels)
model = VGAE(encoder)

# Adam with L2 regularisation (weight_decay) over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)


In [None]:

# Early-stopping configuration: stop once the validation loss has failed to
# improve by more than `delta` for `patience` consecutive epochs.
patience = 7
delta = 0.0001
best_loss = float('inf')
counter = 0
early_stop = False
checkpoint_path = "best_model.pth"

# The input features never change between epochs, so build them once. When the
# graph has no node features, every node gets the same constant 1-d feature.
node_features = data.x if data.x is not None else torch.ones((data.num_nodes, 1))

num_epochs = 100
for epoch in range(num_epochs):
    # Training step: reconstruction loss on the training edges plus the KL
    # term scaled by 1 / num_nodes.
    model.train()
    optimizer.zero_grad()
    z = model.encode(node_features, data.train_pos_edge_index)
    train_loss = model.recon_loss(z, data.train_pos_edge_index)
    train_loss = train_loss + (1 / data.num_nodes) * model.kl_loss()
    train_loss.backward()
    optimizer.step()

    # Validation step. `model.test(...)` returns an (AUC, AP) tuple rather than
    # a loss, so it can neither be formatted with ':.4f' nor compared against
    # `best_loss`. Compute the same objective as in training instead, but on
    # the held-out VALIDATION edges -- the test split must not drive model
    # selection. Message passing still uses only the training edges.
    model.eval()
    with torch.no_grad():
        z = model.encode(node_features, data.train_pos_edge_index)
        val_loss = model.recon_loss(z, data.val_pos_edge_index, data.val_neg_edge_index)
        val_loss = (val_loss + (1 / data.num_nodes) * model.kl_loss()).item()

    print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss.item():.4f}, Val Loss: {val_loss:.4f}")

    # Early stopping logic
    if val_loss < best_loss - delta:
        best_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), checkpoint_path)  # Save the best model
    else:
        counter += 1
        if counter >= patience:
            early_stop = True
            print("Early stopping triggered. Training stopped.")
            break

# Restore the weights of the best epoch (lowest validation loss) seen above.
model.load_state_dict(torch.load(checkpoint_path))
print("Best model restored.")


In [23]:
# Run epochs 1..24, reporting the metrics on every fifth epoch.
# NOTE(review): `train()` and `test()` are not defined in the visible cells of
# this notebook; they presumably come from a removed or out-of-order cell, so
# this cell will fail on a fresh kernel unless they are defined first -- confirm.
for epoch in range(1, 25):
    loss = train()
    auc, ap = test()
    if epoch % 5 != 0:
        continue
    print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, AUC: {auc:.4f}, AP: {ap:.4f}')

Epoch 005, Loss: 3.8583, AUC: 0.7828, AP: 0.7876
Epoch 010, Loss: 2.9609, AUC: 0.7835, AP: 0.7879
Epoch 015, Loss: 2.3160, AUC: 0.7843, AP: 0.7883
Epoch 020, Loss: 1.8376, AUC: 0.7847, AP: 0.7885
