In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '6'  # Adjust the number of threads as necessary

In [2]:
import networkx as nx
import numpy as np
import csv

def build_multidigraph_from_csv(csv_file):
    G = nx.MultiDiGraph()

    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # Exclude 'no_relation' edges
            if row['relation_type'] != 'no_relation':
                # Add nodes with the 'name' attribute
                G.add_node(row['starter_ID'], name=row['starter_ID'])
                G.add_node(row['receiver_ID'], name=row['receiver_ID'])

                # Add directed edges with additional attributes
                weight = float(row['weight'])
                G.add_edge(
                    row['starter_ID'], 
                    row['receiver_ID'], 
                    weight=weight,
                    interaction_type=row['subtype_name'],
                    relation_type=row['relation_type'],
                    pathway_sources=row['pathway_source'],
                    credibility=row['credibility']
                )
    
    return G

def create_global_node_to_index_mapping(train_graph, val_graph):
    all_nodes = set(train_graph.nodes()).union(set(val_graph.nodes()))
    return {node: i for i, node in enumerate(all_nodes)}

# Paths to the CSV files
train_csv_path = 'relations_train_final.csv'
val_csv_path = 'cleaned_relations_val_final.csv'
# Build the MultiDiGraphs
train_MDG = build_multidigraph_from_csv(train_csv_path)
val_MDG = build_multidigraph_from_csv(val_csv_path)

# Create a global node to index mapping
global_node_to_index = create_global_node_to_index_mapping(train_MDG, val_MDG)

In [3]:
import infomap

def apply_infomap(graph):
    # Initialize Infomap
    im = infomap.Infomap("--directed")

    # Create a mapping of node names to integers
    node_to_int = {node: i for i, node in enumerate(graph.nodes())}
    int_to_node = {i: node for node, i in node_to_int.items()}

    # Add nodes and edges to the Infomap network
    for node in graph.nodes():
        im.add_node(node_to_int[node])
    for u, v in graph.edges():
        im.add_link(node_to_int[u], node_to_int[v])

    # Run the Infomap community detection
    im.run()

    # Extract the communities
    communities = {int_to_node[node.node_id]: node.module_id for node in im.nodes}

    return communities

# Apply Infomap to your directed graph
communities = apply_infomap(train_MDG)
val_communities = apply_infomap(val_MDG)

# Analyze the results
num_communities = len(set(communities.values()))
num_val_communities = len(set(val_communities.values()))
print(f"Number of communities detected in the training MDG: {num_communities}")
print(f"Number of communities detected in the val MDG: {num_val_communities}")

Number of communities detected in the training MDG: 92
Number of communities detected in the val MDG: 69


In [4]:
import torch
from torch_geometric.utils import from_networkx

def apply_mapping_and_get_indices(graph, mapping):
    # Create a tensor of node indices based on the global mapping
    num_nodes = len(mapping)
    node_indices = torch.arange(num_nodes)

    # Remap nodes in the graph according to the global mapping
    remapped_graph = nx.relabel_nodes(graph, mapping)

    return remapped_graph, node_indices

# Apply mapping to training and validation graphs
remapped_train_MDG, train_indices = apply_mapping_and_get_indices(train_MDG, global_node_to_index)
remapped_val_MDG, val_indices = apply_mapping_and_get_indices(val_MDG, global_node_to_index)

# Convert to PyTorch Geometric Data
train_data = from_networkx(remapped_train_MDG)
train_data.x = train_indices

val_data = from_networkx(remapped_val_MDG)
val_data.x = val_indices

train_data.x = train_data.x.long()  # Convert to LongTensor
val_data.x = val_data.x.long()  # Convert to LongTensor

In [5]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GCNConv
from torch_geometric.nn.models import GAE

class GATGCNEncoder(torch.nn.Module):
    def __init__(self, max_nodes, embedding_dim, out_channels):
        super(GATGCNEncoder, self).__init__()
        self.node_emb = torch.nn.Embedding(max_nodes, embedding_dim)

        # First layer is GCN
        self.conv1 = GCNConv(embedding_dim, 2 * out_channels)
        
        # Second layer is GAT
        self.conv2 = GATConv(2 * out_channels, out_channels, heads=1, dropout=0.2)

    def forward(self, x, edge_index):
        x = self.node_emb(x)  # x is now node indices
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.2, training=self.training)
        return self.conv2(x, edge_index)

# Usage
max_nodes = 5000  # Set to a number higher than your expected number of nodes
embedding_dim = 16
out_channels = 16

encoder = GATGCNEncoder(max_nodes, embedding_dim, out_channels)
model = GAE(encoder)

In [6]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import umap.umap_ as umap
import numpy as np

def validate(model, val_data):
    model.eval()
    with torch.no_grad():
        # Encode the validation data
        z_val = model.encode(val_data.x, val_data.edge_index)

        # Calculate the reconstruction loss
        val_loss = model.recon_loss(z_val, val_data.edge_index)

    return val_loss.item()

def broad_search(embeddings, step, max_clusters):
    best_score = -1
    best_n_clusters = 0
    for n_clusters in range(2, max_clusters + 1, step):
        score = calculate_silhouette_score(embeddings, n_clusters)
        if score > best_score:
            best_score = score
            best_n_clusters = n_clusters
    return best_score, best_n_clusters

def detailed_search(embeddings, start, end, step):
    best_score = -1
    best_n_clusters = 0
    for n_clusters in range(start, end + 1, step):
        score = calculate_silhouette_score(embeddings, n_clusters)
        if score > best_score:
            best_score = score
            best_n_clusters = n_clusters
    return best_score, best_n_clusters

def calculate_silhouette_score(embeddings, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings)
    return silhouette_score(embeddings, cluster_labels)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [7]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
    loss = model.recon_loss(z, train_data.edge_index)
    loss.backward()
    optimizer.step()
    return loss.item()

In [8]:
import torch
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Early stopping and model saving parameters
patience = 10
best_val_score = -1
epochs_no_improve = 0
early_stop = False
best_model_state = None  # To store the best model state

# Scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

for epoch in range(200):
    loss = train()
    val_loss = validate(model, val_data)  # Updated to use the new validate function

    print(f'Epoch: {epoch + 1}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')

    scheduler.step(val_loss)  # Update based on val_loss

    # Check for improvement based on decreased loss
    if val_loss < best_val_score or best_val_score == -1:
        best_val_score = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print("Early stopping triggered")
        break

if best_model_state is not None:
    torch.save(best_model_state, 'best_gae_model_recon_loss.pth')
    print("Best model saved.")
else:
    print("No model improvement was observed.")



Epoch: 1, Loss: 1.6981, Val Loss: 1.6455
Epoch: 2, Loss: 1.4270, Val Loss: 1.5079
Epoch: 3, Loss: 1.3376, Val Loss: 1.4548
Epoch: 4, Loss: 1.3223, Val Loss: 1.4393
Epoch: 5, Loss: 1.3106, Val Loss: 1.4387
Epoch: 6, Loss: 1.3036, Val Loss: 1.4335
Epoch: 7, Loss: 1.2900, Val Loss: 1.4346
Epoch: 8, Loss: 1.2818, Val Loss: 1.4299
Epoch: 9, Loss: 1.2578, Val Loss: 1.4169
Epoch: 10, Loss: 1.2504, Val Loss: 1.3958
Epoch: 11, Loss: 1.2331, Val Loss: 1.3833
Epoch: 12, Loss: 1.2171, Val Loss: 1.3745
Epoch: 13, Loss: 1.1938, Val Loss: 1.3747
Epoch: 14, Loss: 1.1852, Val Loss: 1.3738
Epoch: 15, Loss: 1.1765, Val Loss: 1.3730
Epoch: 16, Loss: 1.1562, Val Loss: 1.3818
Epoch: 17, Loss: 1.1449, Val Loss: 1.3811
Epoch: 18, Loss: 1.1403, Val Loss: 1.3890
Epoch: 19, Loss: 1.1235, Val Loss: 1.3969
Epoch: 20, Loss: 1.1225, Val Loss: 1.4065
Epoch: 21, Loss: 1.1155, Val Loss: 1.4069
Epoch: 22, Loss: 1.1004, Val Loss: 1.4180
Epoch: 23, Loss: 1.1046, Val Loss: 1.4144
Epoch: 24, Loss: 1.0984, Val Loss: 1.4170
E

In [10]:
model.load_state_dict(torch.load('best_gae_model_recon_loss.pth'))
model.eval()

with torch.no_grad():
    z = model.encode(train_data.x, train_data.edge_index)
    embeddings = z.cpu().numpy()

from sklearn.cluster import KMeans

n_clusters = 92
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

index_to_gene = {index: gene for gene, index in global_node_to_index.items()}

gene_names = [index_to_gene[i] for i in range(len(embeddings))]

# Combine gene names with their cluster labels
gene_cluster_pairs = list(zip(gene_names, cluster_labels))

import csv

with open('gene_cluster_assignments.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Gene', 'Cluster'])
    for gene, cluster in gene_cluster_pairs:
        writer.writerow([gene, cluster])



In [5]:
gene_cluster_pairs = [(node, communities[node]) for node in train_MDG.nodes()]

import csv

# Read the existing CSV file
existing_data = {}
with open('gene_cluster_assignments.csv', mode='r') as file:
    reader = csv.reader(file)
    header = next(reader)  # Skip the header
    for row in reader:
        gene = row[0]
        cluster = row[1]
        existing_data[gene] = {'Cluster': cluster, 'Infomap': None}

# Add Infomap cluster assignments
for gene, infomap_cluster in gene_cluster_pairs:
    if gene in existing_data:
        existing_data[gene]['Infomap'] = infomap_cluster

# Save the updated data to a new CSV file
with open('updated_gene_cluster_assignments.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header + ['Infomap'])  # New header with 'Infomap'
    for gene, data in existing_data.items():
        writer.writerow([gene, data['Cluster'], data['Infomap']])
