In [1]:
import pandas as pd
import numpy as np
import networkit as nk

## Load Data

In [2]:
# Read the corrected annotation table from parquet and report its dimensions
annot_path = "../data/annotation_table_with_uniprot_corrected.parquet"
annot_df = pd.read_parquet(annot_path)
print(f"Annotation table shape: {annot_df.shape}")

Annotation table shape: (114537, 750)


In [None]:
# Read the filtered ChEMBL affinity table from parquet, then summarise its
# size and the number of distinct source UniProt IDs it references
affinity_path = "../data/filtered_chembl_affinity.parquet"
affinity_df = pd.read_parquet(affinity_path)

print(f"Affinity data shape: {affinity_df.shape}")
print(f"Unique UniProt IDs in affinity data: {affinity_df['source_uniprot_id'].nunique()}")

In [None]:
# Collect the distinct, non-null UniProt IDs referenced by the affinity data
source_uniprot_column = affinity_df['source_uniprot_id']
target_uniprots = set(source_uniprot_column.dropna().unique())

print(f"Target UniProt IDs: {len(target_uniprots)}")

## Build Bipartite Graph (UniProt - PDB mapping)
Using NetworKit to create a bipartite graph where:
- One set of nodes: UniProt IDs (from affinity data)
- Other set of nodes: PDB chains (from annotation table)
- Edges: connect a UniProt ID and a PDB chain whenever the annotation table maps one to the other

In [None]:
# Filter annotation table to only include UniProt IDs from affinity data
# Note: uniprot_ids_str can contain multiple comma-separated UniProt IDs
# We need to check if any of the target UniProt IDs are in the string

def contains_target_uniprot(uniprot_str):
    """Tell whether a comma-separated ID string names any target UniProt ID.

    Missing values (as judged by ``pd.isna``) give False. Any other value is
    converted to ``str``, split on commas, and each whitespace-stripped token
    is looked up in the module-level ``target_uniprots`` collection.
    """
    if not pd.isna(uniprot_str):
        for token in str(uniprot_str).split(','):
            if token.strip() in target_uniprots:
                return True
    return False

# Keep only the annotation rows whose uniprot_ids_str mentions a target UniProt ID
mask = annot_df['uniprot_ids_str'].map(contains_target_uniprot)
filtered_annot = annot_df.loc[mask].copy()
print(f"Filtered annotation rows: {len(filtered_annot)}")

In [None]:
# Expand each comma-separated uniprot_ids_str into one (uniprot, pdb_chain)
# record per ID, keeping only IDs that are present in target_uniprots.
# Only two columns are needed, so they are iterated directly with zip()
# rather than through iterrows(), which builds a Series spanning every
# column of the frame for each row.
edges_list = []
for pdb_chain, uniprot_str in zip(filtered_annot['pdb_chain_key'],
                                  filtered_annot['uniprot_ids_str']):
    if pd.isna(uniprot_str):
        continue
    for uniprot in (token.strip() for token in str(uniprot_str).split(',')):
        if uniprot in target_uniprots:
            edges_list.append({'uniprot': uniprot, 'pdb_chain': pdb_chain})

# Passing columns= guarantees the 'uniprot' and 'pdb_chain' columns exist even
# when no edge was found; without it an empty list yields a column-less frame
# and the summary below fails with KeyError.
edges_df = pd.DataFrame(edges_list, columns=['uniprot', 'pdb_chain']).drop_duplicates()
print(f"Number of unique edges: {len(edges_df)}")
print(f"Unique UniProt IDs in edges: {edges_df['uniprot'].nunique()}")
print(f"Unique PDB chains in edges: {edges_df['pdb_chain'].nunique()}")

In [None]:
# Check coverage: how many target UniProt IDs have at least one PDB chain?
covered_uniprots = set(edges_df['uniprot'].unique())
missing_uniprots = target_uniprots - covered_uniprots

# Guard the percentages against an empty target set, which would otherwise
# raise ZeroDivisionError; with no targets both shares are reported as 0.00%.
n_targets = len(target_uniprots)
covered_pct = len(covered_uniprots) / n_targets * 100 if n_targets else 0.0
missing_pct = len(missing_uniprots) / n_targets * 100 if n_targets else 0.0

print(f"UniProt IDs with PDB structures: {len(covered_uniprots)} ({covered_pct:.2f}%)")
print(f"UniProt IDs without PDB structures: {len(missing_uniprots)} ({missing_pct:.2f}%)")

In [None]:
# Assign contiguous integer node IDs to every label for use with NetworKit
unique_uniprots = sorted(edges_df['uniprot'].unique())
unique_pdb_chains = sorted(edges_df['pdb_chain'].unique())

# UniProt labels take the first block of IDs (starting at 0); PDB chain
# labels follow immediately after, offset by the number of UniProt labels.
pdb_offset = len(unique_uniprots)
uniprot_to_id = dict(zip(unique_uniprots, range(pdb_offset)))
pdb_chain_to_id = dict(
    zip(unique_pdb_chains, range(pdb_offset, pdb_offset + len(unique_pdb_chains)))
)

# Inverse lookups: integer node ID -> original label
id_to_uniprot = {node: label for label, node in uniprot_to_id.items()}
id_to_pdb_chain = {node: label for label, node in pdb_chain_to_id.items()}

print(f"UniProt node IDs: 0 to {len(unique_uniprots)-1}")
print(f"PDB chain node IDs: {len(unique_uniprots)} to {len(unique_uniprots) + len(unique_pdb_chains) - 1}")
print(f"Total nodes: {len(unique_uniprots) + len(unique_pdb_chains)}")

In [None]:
# Construct the undirected, unweighted NetworKit graph over all node IDs
n_nodes = len(unique_uniprots) + len(unique_pdb_chains)
G = nk.Graph(n_nodes, weighted=False, directed=False)

# One edge per (uniprot, pdb_chain) record, translated to integer node IDs
for uniprot_label, chain_label in zip(edges_df['uniprot'], edges_df['pdb_chain']):
    G.addEdge(uniprot_to_id[uniprot_label], pdb_chain_to_id[chain_label])

print(f"Graph created successfully!")
print(f"Number of nodes: {G.numberOfNodes()}")
print(f"Number of edges: {G.numberOfEdges()}")

In [None]:
# Label every node ID with its partition: the first len(unique_uniprots) IDs
# are UniProt nodes, the remaining IDs up to n_nodes are PDB chain nodes
n_uniprot_nodes = len(unique_uniprots)
node_types = dict.fromkeys(range(n_uniprot_nodes), 'uniprot')
node_types.update(dict.fromkeys(range(n_uniprot_nodes, n_nodes), 'pdb_chain'))

# The graph is bipartite when no edge joins two nodes of the same type
is_bipartite = not any(node_types[u] == node_types[v] for u, v in G.iterEdges())

print(f"Graph is bipartite: {is_bipartite}")
print(f"\n--- Graph Statistics ---")
print(f"Number of UniProt nodes: {len(unique_uniprots)}")
print(f"Number of PDB chain nodes: {len(unique_pdb_chains)}")
print(f"Total edges: {G.numberOfEdges()}")
print(f"Average degree: {2 * G.numberOfEdges() / G.numberOfNodes():.2f}")

In [None]:
# Helper functions
def get_pdb_chains_for_uniprot(uniprot_id):
    """Return the PDB chain labels adjacent to a UniProt ID in graph ``G``.

    The UniProt ID is translated to its integer node via ``uniprot_to_id``;
    an ID with no entry there yields an empty list. Each neighbouring node
    is translated back to its label through ``id_to_pdb_chain``.
    """
    node = uniprot_to_id.get(uniprot_id)
    if node is None:
        return []
    return [id_to_pdb_chain[neighbor] for neighbor in G.iterNeighbors(node)]

def get_uniprots_for_pdb_chain(pdb_chain):
    """Return the UniProt IDs adjacent to a PDB chain label in graph ``G``.

    The chain label is translated to its integer node via ``pdb_chain_to_id``;
    a label with no entry there yields an empty list. Each neighbouring node
    is translated back to its UniProt ID through ``id_to_uniprot``.
    """
    node = pdb_chain_to_id.get(pdb_chain)
    if node is None:
        return []
    return [id_to_uniprot[neighbor] for neighbor in G.iterNeighbors(node)]

# Example usage: exercise both lookup helpers on the first entry of each
# sorted label list.
# Indexing [0] raises IndexError if the corresponding list is empty.
example_uniprot = unique_uniprots[0]
pdb_chains = get_pdb_chains_for_uniprot(example_uniprot)
# Only the first five chains are printed to keep the output short
print(f"UniProt {example_uniprot} maps to {len(pdb_chains)} PDB chains: {pdb_chains[:5]}...")

example_pdb = unique_pdb_chains[0]
uniprots = get_uniprots_for_pdb_chain(example_pdb)
print(f"PDB chain {example_pdb} maps to UniProt IDs: {uniprots}")