In [1]:
import os
# Set the OMP_NUM_THREADS environment variable for this process.
# NOTE(review): presumably this has to run before the numeric libraries are
# imported in later cells for the limit to take effect — confirm.
os.environ['OMP_NUM_THREADS'] = '6'  # Adjust the number of threads as necessary

In [14]:
import networkx as nx
import numpy as np
import csv

def build_multidigraph_from_csv(csv_file, encoding=None):
    """Build a directed multigraph of relations from a CSV file.

    Every row whose ``relation_type`` is not ``'no_relation'`` contributes one
    directed edge ``starter_ID -> receiver_ID``. Both endpoints are added as
    nodes carrying a ``name`` attribute equal to their ID.

    Args:
        csv_file: Path of a CSV file with a header row that contains the
            columns ``starter_ID``, ``receiver_ID``, ``relation_type``,
            ``weight``, ``subtype_name``, ``pathway_source`` and
            ``credibility``.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``networkx.MultiDiGraph``. Each edge stores ``weight`` (float),
        ``interaction_type``, ``relation_type``, ``pathway_sources`` and
        ``credibility`` (the last four as the raw CSV strings).

    Raises:
        KeyError: If a required column is missing from the file.
        ValueError: If a ``weight`` cell cannot be parsed as a float; the
            message names the file and the line number.
    """
    G = nx.MultiDiGraph()

    # newline='' leaves newline handling to the csv module, so quoted fields
    # that contain line breaks are read back unchanged.
    with open(csv_file, 'r', newline='', encoding=encoding) as file:
        reader = csv.DictReader(file)
        for row in reader:
            # Exclude 'no_relation' edges
            if row['relation_type'] == 'no_relation':
                continue

            # Parse the weight first so a malformed row is reported with its location.
            try:
                weight = float(row['weight'])
            except ValueError as exc:
                raise ValueError(
                    f"{csv_file}: line {reader.line_num}: invalid weight {row['weight']!r}"
                ) from exc

            source = row['starter_ID']
            target = row['receiver_ID']

            # Add nodes with the 'name' attribute
            G.add_node(source, name=source)
            G.add_node(target, name=target)

            # Add the directed edge with its additional attributes
            G.add_edge(
                source,
                target,
                weight=weight,
                interaction_type=row['subtype_name'],
                relation_type=row['relation_type'],
                pathway_sources=row['pathway_source'],
                credibility=row['credibility'],
            )

    return G

# Relation tables on disk: one CSV for training, one for validation
train_csv_path = 'relations_train_final.csv'
val_csv_path = 'cleaned_relations_val_final.csv'

# Turn each table into its own MultiDiGraph (training graph first)
train_MDG = build_multidigraph_from_csv(csv_file=train_csv_path)
val_MDG = build_multidigraph_from_csv(csv_file=val_csv_path)

In [15]:
from torch_geometric.utils import from_networkx
import torch

# Number of nodes in the training graph; also the width of the identity features
num_nodes = len(train_MDG.nodes())

# Map every training node to its position in the graph's node iteration order
node_to_index = dict(zip(train_MDG.nodes(), range(num_nodes)))

# Identity-matrix node features: one row per training node
node_features = torch.eye(num_nodes)

# Training graph as a PyTorch Geometric Data object
train_data = from_networkx(train_MDG)
train_data.x = node_features

# Validation graph as a PyTorch Geometric Data object; its identity features
# are sized by the validation graph's own node count
val_data = from_networkx(val_MDG)
val_data.x = torch.eye(len(val_MDG.nodes()))



In [17]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GCNConv
from torch_geometric.nn.models import GAE

class GATGCNEncoder(torch.nn.Module):
    """Two-layer graph encoder: a GCN layer followed by a GAT layer.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output embedding. The hidden GCN layer is
            twice this wide.
        dropout: Dropout probability. It is applied between the two layers
            and also passed to the GAT layer's ``dropout`` argument.
            Defaults to 0.2.
    """

    def __init__(self, in_channels, out_channels, dropout=0.2):
        super().__init__()
        # Single source of truth for the rate used in forward() and in conv2.
        self.dropout = dropout
        # First layer is GCN
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        # Second layer is GAT
        self.conv2 = GATConv(2 * out_channels, out_channels, heads=1, dropout=dropout)

    def forward(self, x, edge_index):
        """Encode node features ``x`` over the graph given by ``edge_index``.

        Applies conv1, ReLU, dropout (active only in training mode), then
        conv2, and returns conv2's output.
        """
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

# Model configuration
in_channels = num_nodes  # input width equals the training node count; adjust as per your data
out_channels = 16  # size of each node embedding

# Build the encoder, then wrap it in a GAE model
encoder = GATGCNEncoder(in_channels=in_channels, out_channels=out_channels)
model = GAE(encoder)



In [18]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import umap.umap_ as umap
import numpy as np

def validate(model, node_to_index, val_nodes, initial_step=10, detailed_step=1, max_clusters=1000, data=None):
    """Score the current embeddings by how well the validation nodes cluster.

    The model encodes the graph in ``data``. The embeddings of ``val_nodes``
    are projected to 2-D with UMAP (``random_state=42``), and the cluster
    count with the best silhouette score is found by a coarse search
    followed by a fine search around the coarse optimum.

    Args:
        model: Object exposing ``eval()`` and ``encode(x, edge_index)``.
        node_to_index: Mapping from node to its row in the encoded embeddings.
        val_nodes: Iterable of nodes to evaluate. Nodes missing from
            ``node_to_index`` have no embedding and are skipped (their count
            is printed).
        initial_step: Step between cluster counts in the coarse search.
        detailed_step: Step between cluster counts in the fine search.
        max_clusters: Upper bound on the cluster count. It is additionally
            capped at ``n_samples - 1``, the largest count the silhouette
            score is defined for.
        data: Graph object with ``x`` and ``edge_index`` attributes. ``None``
            (the default) falls back to the notebook-level ``train_data``.

    Returns:
        The best silhouette score found by the fine search.

    Raises:
        ValueError: If fewer than three validation nodes have an embedding.
    """
    if data is None:
        # Backward-compatible default: the training graph defined in an earlier cell.
        data = train_data

    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.edge_index)
        z_np = z.cpu().numpy()

    val_nodes = list(val_nodes)
    known_nodes = [node for node in val_nodes if node in node_to_index]
    skipped = len(val_nodes) - len(known_nodes)
    if skipped:
        print(f"Skipping {skipped} validation nodes that are absent from node_to_index")
    if len(known_nodes) < 3:
        raise ValueError(
            "validate() needs at least 3 validation nodes with an embedding, "
            f"got {len(known_nodes)}"
        )

    val_embeddings = np.array([z_np[node_to_index[node]] for node in known_nodes])
    reducer = umap.UMAP(n_components=2, random_state=42)
    umap_embeddings = reducer.fit_transform(val_embeddings)

    # The silhouette score requires 2 <= n_clusters <= n_samples - 1.
    max_clusters = min(max_clusters, len(umap_embeddings) - 1)

    # Initial broad search
    broad_best_score, broad_best_n_clusters = broad_search(umap_embeddings, initial_step, max_clusters)

    # Detailed search within promising range
    start = max(2, broad_best_n_clusters - initial_step)
    end = min(broad_best_n_clusters + initial_step, max_clusters)
    best_score, best_n_clusters = detailed_search(umap_embeddings, start, end, detailed_step)

    print(f"Best silhouette score: {best_score} for {best_n_clusters} clusters")
    return best_score

def broad_search(embeddings, step, max_clusters):
    """Coarse scan over cluster counts 2, 2 + step, ... up to ``max_clusters``.

    Returns a ``(score, n_clusters)`` pair for the highest silhouette score
    seen, or ``(-1, 0)`` when no candidate scores above -1.
    """
    winner = (-1, 0)  # (score, n_clusters) before anything has been evaluated
    for k in range(2, max_clusters + 1, step):
        candidate = calculate_silhouette_score(embeddings, k)
        # Strict comparison: on a tie the earlier cluster count is kept.
        if candidate > winner[0]:
            winner = (candidate, k)
    return winner

def detailed_search(embeddings, start, end, step):
    """Scan cluster counts from ``start`` to ``end`` inclusive in increments of ``step``.

    Returns ``(score, n_clusters)`` for the highest silhouette score seen, or
    ``(-1, 0)`` when no candidate scores above -1.
    """
    top_score, top_k = -1, 0
    for k in range(start, end + 1, step):
        current = calculate_silhouette_score(embeddings, k)
        if not current > top_score:
            # Not a strict improvement: keep the earlier best.
            continue
        top_score, top_k = current, k
    return top_score, top_k

def calculate_silhouette_score(embeddings, n_clusters):
    """Cluster ``embeddings`` into ``n_clusters`` groups and score the labelling.

    Uses KMeans with ``n_init=10`` and ``random_state=42`` and returns the
    silhouette score of the labels it assigns.
    """
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(embeddings)
    return silhouette_score(embeddings, labels)

In [19]:
from torch.optim import Adam

# Adam over all of the model's parameters
optimizer = Adam(params=model.parameters(), lr=1e-2)
# BCE-with-logits loss object; the train() step in this cell calls model.recon_loss rather than this
criterion = torch.nn.BCEWithLogitsLoss()

def train():
    """Run one optimisation step on the training graph.

    Puts the model in training mode, encodes ``train_data``, back-propagates
    the model's reconstruction loss, steps the optimizer, and returns the
    loss value obtained through ``.item()``.
    """
    model.train()
    optimizer.zero_grad()

    features, edges = train_data.x, train_data.edge_index
    latent = model.encode(features, edges)
    recon = model.recon_loss(latent, edges)

    recon.backward()
    optimizer.step()
    return recon.item()

In [7]:
import torch
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Early stopping and model saving parameters
patience = 10
best_val_score = -1
epochs_no_improve = 0
early_stop = False
best_model_state = None  # Snapshot of the weights from the best-scoring epoch

# Scheduler: halve the learning rate after 5 epochs without a better validation score
scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=5, factor=0.5)

# The validation node list does not change between epochs, so build it once.
val_nodes = list(val_MDG.nodes())

for epoch in range(200):
    loss = train()
    val_score = validate(model, node_to_index, val_nodes)

    print(f'Epoch: {epoch + 1}, Loss: {loss:.4f}, Val Score: {val_score:.4f}')

    scheduler.step(val_score)

    # Save model if validation score improved
    if val_score > best_val_score:
        best_val_score = val_score
        epochs_no_improve = 0
        # state_dict() hands back references to the live parameter tensors.
        # Clone them, otherwise later optimizer steps overwrite the snapshot
        # and the file written below would hold the last epoch, not the best.
        best_model_state = {
            key: value.detach().clone()
            for key, value in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print("Early stopping triggered")
        early_stop = True
        break

if early_stop:
    print("Stopped early due to no improvement")

# Save the best model state to a file after the training loop
if best_model_state is not None:
    torch.save(best_model_state, 'best_gae_model.pth')
    print("Best model saved.")
else:
    print("No model improvement was observed.")


Best silhouette score: 0.6488627195358276 for 75 clusters
Epoch: 1, Loss: 1.3863, Val Score: 0.6489
Best silhouette score: 0.6517704129219055 for 72 clusters
Epoch: 2, Loss: 1.3846, Val Score: 0.6518
Best silhouette score: 0.6490039825439453 for 89 clusters
Epoch: 3, Loss: 1.3732, Val Score: 0.6490
Best silhouette score: 0.6540904641151428 for 102 clusters
Epoch: 4, Loss: 1.3451, Val Score: 0.6541
Best silhouette score: 0.662098228931427 for 172 clusters
Epoch: 5, Loss: 1.3003, Val Score: 0.6621
Best silhouette score: 0.6646788120269775 for 86 clusters
Epoch: 6, Loss: 1.2368, Val Score: 0.6647
Best silhouette score: 0.6597326993942261 for 118 clusters
Epoch: 7, Loss: 1.1993, Val Score: 0.6597
Best silhouette score: 0.6748242378234863 for 162 clusters
Epoch: 8, Loss: 1.1869, Val Score: 0.6748
Best silhouette score: 0.6747502088546753 for 60 clusters
Epoch: 9, Loss: 1.1686, Val Score: 0.6748
Best silhouette score: 0.6844045519828796 for 86 clusters
Epoch: 10, Loss: 1.1199, Val Score: 0.6

Epoch: 1, Loss: 1.6981, Val Loss: 1.6455
Epoch: 2, Loss: 1.4270, Val Loss: 1.5079
Epoch: 3, Loss: 1.3376, Val Loss: 1.4548
Epoch: 4, Loss: 1.3223, Val Loss: 1.4393
Epoch: 5, Loss: 1.3106, Val Loss: 1.4387
Epoch: 6, Loss: 1.3036, Val Loss: 1.4335
Epoch: 7, Loss: 1.2900, Val Loss: 1.4346
Epoch: 8, Loss: 1.2818, Val Loss: 1.4299
Epoch: 9, Loss: 1.2578, Val Loss: 1.4169
Epoch: 10, Loss: 1.2504, Val Loss: 1.3958
Epoch: 11, Loss: 1.2331, Val Loss: 1.3833
Epoch: 12, Loss: 1.2171, Val Loss: 1.3745
Epoch: 13, Loss: 1.1938, Val Loss: 1.3747
Epoch: 14, Loss: 1.1852, Val Loss: 1.3738
Epoch: 15, Loss: 1.1765, Val Loss: 1.3730
Epoch: 16, Loss: 1.1562, Val Loss: 1.3818
Epoch: 17, Loss: 1.1449, Val Loss: 1.3811
Epoch: 18, Loss: 1.1403, Val Loss: 1.3890
Epoch: 19, Loss: 1.1235, Val Loss: 1.3969
Epoch: 20, Loss: 1.1225, Val Loss: 1.4065
Epoch: 21, Loss: 1.1155, Val Loss: 1.4069
Epoch: 22, Loss: 1.1004, Val Loss: 1.4180
Epoch: 23, Loss: 1.1046, Val Loss: 1.4144
Epoch: 24, Loss: 1.0984, Val Loss: 1.4170
E

In [24]:
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load('gae_sihoulette_score.pth'))
model.eval()

with torch.no_grad():
    train_embeddings = model.encode(train_data.x, train_data.edge_index).cpu().numpy()

from sklearn.cluster import KMeans

n_clusters = 92  # Set the number of clusters, or determine it based on your criteria
# n_init is pinned to 10, matching calculate_silhouette_score, so the result
# does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
train_cluster_labels = kmeans.fit_predict(train_embeddings)

updated_data = []
# Read the existing CSV file, keyed by the gene name in the first column
existing_data = {}
with open('gene_cluster_assignments.csv', mode='r', newline='') as file:
    reader = csv.reader(file)
    header = next(reader)  # Keep the header for the output file

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; they carry no gene.
            continue
        gene = row[0]
        existing_data[gene] = row

# Add the new cluster column to every row. Genes that are not in the training
# graph get an empty cell so that all rows stay aligned with the header.
for gene, row in existing_data.items():
    index = node_to_index.get(gene)
    row.append('' if index is None else int(train_cluster_labels[index]))

# Save the updated data to a new CSV file
with open('updated_gene_cluster_assignments.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header + ['GAE_Silhouette_Cluster'])  # New header with 'GAE_Silhouette_Cluster'
    writer.writerows(existing_data.values())


