In [2]:
import pandas as pd
import os
import numpy as np

In [3]:
# Read the cluster-label table. As the preview below shows, it carries a
# `uniprot` accession column plus one cluster-label column per qcov
# threshold (50 / 70 / 95 / 100).
uniprot_clusters_louvian = pd.read_csv(
    "../data/uniprot_bipartite_cluster_labels.csv"
)
uniprot_clusters_louvian.head()

Unnamed: 0,uniprot,cluster_bipartite_qcov_50,cluster_bipartite_qcov_70,cluster_bipartite_qcov_95,cluster_bipartite_qcov_100
0,O00141,128,101,143,333
1,O00206,117,130,302,319
2,O00238,128,101,143,333
3,O00255,0,0,212,0
4,O00311,128,101,143,333


In [4]:
# Sanity check: number of rows in the table and number of distinct
# cluster labels at the qcov_50 threshold.
print(
    f"Total UniProt IDs: {len(uniprot_clusters_louvian)}",
    f"Unique clusters (qcov_50): {uniprot_clusters_louvian['cluster_bipartite_qcov_50'].nunique()}",
    sep="\n",
)

Total UniProt IDs: 837
Unique clusters (qcov_50): 258


In [5]:
import requests
import time

def _format_uniprot_entry(res):
    """Return ``(accession, display_label)`` for one search result, or ``None`` if it has no accession."""
    acc = res.get('primaryAccession')
    if not acc:
        return None
    entry_name = res.get('uniProtkbId', '')

    # Prefer the recommended name; fall back to the first submitted name.
    desc = res.get('proteinDescription') or {}
    rec_name = desc.get('recommendedName', {}).get('fullName', {}).get('value')
    if not rec_name:
        sub_names = desc.get('submissionNames') or []
        if sub_names:
            rec_name = sub_names[0].get('fullName', {}).get('value')
    if not rec_name:
        rec_name = "Unknown Protein"

    # Format: O43318 · M3K7_HUMAN Protein Mitogen-activated protein kinase kinase kinase 7
    return acc, f"{acc} · {entry_name} Protein {rec_name}"


def fetch_uniprot_info(uniprot_ids, batch_size=50, max_retries=3, timeout=30, retry_delay=1.0):
    """Fetch a human-readable label for each UniProt accession from the UniProt REST search API.

    The accessions are de-duplicated and queried in batches; every batch is sent
    as one ``accession:A OR accession:B ...`` search. A batch that fails is
    retried (with a linearly growing pause) instead of being dropped after the
    first error, and every request carries a timeout so a stalled connection
    cannot hang the notebook.

    Parameters
    ----------
    uniprot_ids : iterable
        UniProt accessions. Entries that are not non-blank strings (e.g. NaN)
        are ignored.
    batch_size : int, default 50
        Number of accessions per request; also sent as the page ``size``.
    max_retries : int, default 3
        Maximum number of attempts per batch (at least one attempt is made).
    timeout : float, default 30
        Per-request timeout in seconds, passed to ``requests.get``.
    retry_delay : float, default 1.0
        Base pause in seconds between attempts; attempt ``k`` waits
        ``retry_delay * k`` before the next try.

    Returns
    -------
    dict
        Maps each returned primary accession to
        ``"<accession> · <entry name> Protein <protein name>"``. Accessions for
        which no record came back are simply absent (best-effort lookup).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    attempts_allowed = max(1, int(max_retries))

    base_url = "https://rest.uniprot.org/uniprotkb/search"
    info_map = {}

    # De-duplicate, drop missing/blank values, and sort so that batch
    # composition is reproducible between runs (set order is not).
    ids_list = sorted({uid.strip() for uid in uniprot_ids if isinstance(uid, str) and uid.strip()})
    total = len(ids_list)

    print(f"Fetching info for {total} UniProt IDs...")

    for i in range(0, total, batch_size):
        batch = ids_list[i:i + batch_size]
        # Construct query: accession:ID1 OR accession:ID2 ...
        params = {
            "query": " OR ".join(f"accession:{uid}" for uid in batch),
            "fields": "accession,id,protein_name",
            "format": "json",
            "size": batch_size,
        }

        for attempt in range(1, attempts_allowed + 1):
            try:
                response = requests.get(base_url, params=params, timeout=timeout)
                response.raise_for_status()
                results = response.json()['results']
            except (requests.RequestException, ValueError, KeyError) as e:
                # Network/HTTP failure, undecodable JSON, or a payload without 'results'.
                print(f"Error fetching batch {i} (attempt {attempt}/{attempts_allowed}): {e}")
                if attempt < attempts_allowed:
                    time.sleep(retry_delay * attempt)
                continue

            # Parse outside the try block so one odd record cannot discard the batch.
            for res in results:
                parsed = _format_uniprot_entry(res)
                if parsed is not None:
                    acc, display_str = parsed
                    info_map[acc] = display_str
            break

    # Surface IDs that never came back so NaNs downstream are not a surprise.
    missing = [uid for uid in ids_list if uid not in info_map]
    if missing:
        print(f"No UniProt record returned for {len(missing)} of {total} IDs (e.g. {missing[:5]})")

    return info_map

In [6]:
# --- Cluster sizes -----------------------------------------------------------
# Tag every row with the number of proteins that share its qcov_50 cluster.
cluster_col = 'cluster_bipartite_qcov_50'
cluster_counts = uniprot_clusters_louvian[cluster_col].value_counts()
uniprot_clusters_louvian['cluster_size'] = (
    uniprot_clusters_louvian[cluster_col].map(cluster_counts)
)

# --- Ordering ----------------------------------------------------------------
# Largest clusters first; ties are broken by ascending cluster ID.
df_sorted = uniprot_clusters_louvian.sort_values(
    by=['cluster_size', cluster_col],
    ascending=[False, True],
)

# --- Annotation --------------------------------------------------------------
# Look up a display label for each distinct accession and attach it as a column.
all_uniprots = df_sorted['uniprot'].unique()
uniprot_info = fetch_uniprot_info(all_uniprots)
df_sorted = df_sorted.assign(protein_info=df_sorted['uniprot'].map(uniprot_info))

# --- Preview -----------------------------------------------------------------
# Do not truncate the (long) protein labels when rendering frames.
pd.options.display.max_colwidth = None
# First 20 rows, as a quick check of the result
df_sorted.head(20)[[cluster_col, 'cluster_size', 'uniprot', 'protein_info']]

Fetching info for 837 UniProt IDs...


Unnamed: 0,cluster_bipartite_qcov_50,cluster_size,uniprot,protein_info
0,128,204,O00141,O00141 · SGK1_HUMAN Protein Serine/threonine-protein kinase Sgk1
2,128,204,O00238,O00238 · BMR1B_HUMAN Protein Bone morphogenetic protein receptor type-1B
4,128,204,O00311,O00311 · CDC7_HUMAN Protein Cell division cycle 7-related protein kinase
9,128,204,O00444,O00444 · PLK4_HUMAN Protein Serine/threonine-protein kinase PLK4
16,128,204,O14733,O14733 · MP2K7_HUMAN Protein Dual specificity mitogen-activated protein kinase kinase 7
18,128,204,O14757,O14757 · CHK1_HUMAN Protein Serine/threonine-protein kinase Chk1
22,128,204,O14920,O14920 · IKKB_HUMAN Protein Inhibitor of nuclear factor kappa-B kinase subunit beta
24,128,204,O14965,O14965 · AURKA_HUMAN Protein Aurora kinase A
25,128,204,O14976,O14976 · GAK_HUMAN Protein Cyclin-G-associated kinase
27,128,204,O15075,O15075 · DCLK1_HUMAN Protein Serine/threonine-protein kinase DCLK1


In [7]:
# Write the annotated table to disk so it can be inspected outside the notebook.
output_file = "../data/uniprot_clusters_annotated.csv"
df_sorted.to_csv(path_or_buf=output_file, index=False)
print(f"Annotated clusters saved to {output_file}")

# df_sorted is ordered by descending cluster size, so its first row
# belongs to the largest cluster.
largest_cluster_id = df_sorted.iloc[0][cluster_col]
print(f"\nMembers of the largest cluster (ID: {largest_cluster_id}):")
df_sorted.loc[
    df_sorted[cluster_col].eq(largest_cluster_id),
    [cluster_col, 'uniprot', 'protein_info'],
]

Annotated clusters saved to ../data/uniprot_clusters_annotated.csv

Members of the largest cluster (ID: 128):


Unnamed: 0,cluster_bipartite_qcov_50,uniprot,protein_info
0,128,O00141,O00141 · SGK1_HUMAN Protein Serine/threonine-protein kinase Sgk1
2,128,O00238,O00238 · BMR1B_HUMAN Protein Bone morphogenetic protein receptor type-1B
4,128,O00311,O00311 · CDC7_HUMAN Protein Cell division cycle 7-related protein kinase
9,128,O00444,O00444 · PLK4_HUMAN Protein Serine/threonine-protein kinase PLK4
16,128,O14733,O14733 · MP2K7_HUMAN Protein Dual specificity mitogen-activated protein kinase kinase 7
...,...,...,...
814,128,Q9UQB9,Q9UQB9 · AURKC_HUMAN Protein Aurora kinase C
821,128,Q9Y2K2,Q9Y2K2 · SIK3_HUMAN Protein Serine/threonine-protein kinase SIK3
825,128,Q9Y463,Q9Y463 · DYR1B_HUMAN Protein Dual specificity tyrosine-phosphorylation-regulated kinase 1B
827,128,Q9Y572,Q9Y572 · RIPK3_HUMAN Protein Receptor-interacting serine/threonine-protein kinase 3


In [9]:
import plotly.express as px
import pandas as pd
import os

# Reuse the in-memory annotated frame when present; otherwise reload it
# from the CSV written by the earlier cell.
if 'df_sorted' not in locals():
    csv_path = "../data/uniprot_clusters_annotated.csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError("Data not found. Please run the previous cells to generate the data.")
    print(f"Loading data from {csv_path}...")
    df_sorted = pd.read_csv(csv_path)
    cluster_col = 'cluster_bipartite_qcov_50'

# X-axis label per row: cluster ID together with its member count
df_sorted['cluster_display'] = (
    df_sorted[cluster_col].astype(str).radd("ID: ")
    + df_sorted['cluster_size'].astype(str).radd(" (n=")
    + ")"
)

# Interactive strip plot: one point per protein, grouped by cluster label
fig = px.strip(
    data_frame=df_sorted,
    x='cluster_display',
    hover_name='uniprot',
    hover_data={
        'cluster_display': False,   # already shown on the axis
        'protein_info': True,
        cluster_col: True
    },
    labels={'cluster_display': 'Cluster'},
    title=f'UniProt Clusters ({cluster_col})',
    height=600,
)

# Keep the categories in the order they first appear in df_sorted
# (the frame is sorted by descending cluster size upstream).
unique_labels = df_sorted['cluster_display'].unique()
fig.update_xaxes(
    type='category',
    categoryorder='array',
    categoryarray=unique_labels,
)

fig.show()

In [10]:
import networkx as nx
import plotly.graph_objects as go

# 1. Keep only the members of the three most populated clusters
top_3_clusters = df_sorted[cluster_col].value_counts().nlargest(3).index
df_top3 = df_sorted[df_sorted[cluster_col].isin(top_3_clusters)]

print(f"Visualizing top 3 clusters: {list(top_3_clusters)}")

# 2. Build the graph: one hub node per cluster, one leaf node per protein.
# Marker size/colour are stored on the nodes and read back when the Plotly
# trace is built, so the graph metadata and the rendering cannot drift apart.
G = nx.Graph()

# Add cluster centers
for cid in top_3_clusters:
    cluster_node = f"Cluster {cid}"
    G.add_node(cluster_node, node_type='cluster', size=25, color='#EF553B',  # Red-ish
               hover_text=f"Cluster {cid} Center")

# Add protein nodes and edges to their cluster center.
# Iterate over the columns directly instead of DataFrame.iterrows():
# iterrows() builds one Series per row and does not preserve dtypes, so an
# integer cluster ID may come back as a float ("Cluster 128.0") and silently
# create a second, attribute-less hub instead of linking to the real one.
for prot_node, cid, info in zip(df_top3['uniprot'], df_top3[cluster_col], df_top3['protein_info']):
    cluster_node = f"Cluster {cid}"
    # protein_info is NaN when no record was fetched for the accession;
    # fall back to the accession so the hover label is never "nan".
    if not isinstance(info, str):
        info = str(prot_node)

    G.add_node(prot_node, node_type='protein', size=10, color='#636EFA',  # Blue-ish
               hover_text=info)
    G.add_edge(prot_node, cluster_node)

# 3. Compute Layout (spring layout; fixed seed keeps the picture reproducible)
pos = nx.spring_layout(G, seed=42, k=0.15, iterations=50)

# 4. Prepare Plotly Traces

# Edge Trace: each edge is a two-point segment; None breaks the line between edges
edge_x = []
edge_y = []
for src, dst in G.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Node Trace: coordinates from the layout, styling from the node attributes
node_x = []
node_y = []
node_text = []
node_color = []
node_size = []

for node, attrs in G.nodes(data=True):
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(attrs['hover_text'])
    node_color.append(attrs['color'])
    node_size.append(attrs['size'])

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        showscale=False,
        color=node_color,
        size=node_size,
        line_width=1,
        line_color='white'))

# 5. Create Figure
fig_network = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title=f'Network Graph of Top 3 Clusters ({cluster_col})',
                title_x=0.5,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                height=700
             ))

fig_network.show()

Visualizing top 3 clusters: [128, 63, 170]
