In [12]:
import pandas as pd

# Load your CSV
node_file = "./data/node_table_with_centrality.csv"
nodes = pd.read_csv(node_file)

# Get all column names
column_names = nodes.columns.tolist()

# Print column names
print(column_names)


['@id', 'AverageShortestPathLength', 'BetweennessCentrality', 'ClosenessCentrality', 'ClusteringCoefficient', 'compartment::cytoskeleton', 'compartment::cytosol', 'compartment::endoplasmic reticulum', 'compartment::endosome', 'compartment::extracellular', 'compartment::golgi apparatus', 'compartment::lysosome', 'compartment::mitochondrion', 'compartment::nucleus', 'compartment::peroxisome', 'compartment::plasma membrane', 'Degree', 'display name', 'Eccentricity', 'IsSingleNode', 'name', 'NeighborhoodConnectivity', 'NumberOfDirectedEdges', 'NumberOfUndirectedEdges', 'PartnerOfMultiEdgedNodePairs', 'query term', 'Radiality', 'selected', 'SelfLoops', 'shared name', 'Stress', 'stringdb::canonical name', 'stringdb::database identifier', 'stringdb::description', 'stringdb::enhancedLabel Passthrough', 'stringdb::full name', 'stringdb::imageurl', 'stringdb::namespace', 'stringdb::node color', 'stringdb::node type', 'stringdb::sequence', 'stringdb::species', 'stringdb::STRING style', 'stringdb:

In [13]:
# Keep only rows that have a STRING database identifier
nodes_with_string_id = nodes[nodes['stringdb::database identifier'].notna()]

# Select the necessary network/centrality columns + the string ID column
necessary_cols = [
    'stringdb::database identifier',  # to identify the protein
    'Degree', 
    'BetweennessCentrality', 
    'ClosenessCentrality', 
    'Stress', 
    'Radiality', 
    'NeighborhoodConnectivity', 
    'Eccentricity', 
    'ClusteringCoefficient', 
    'TopologicalCoefficient'
]

nodes_filtered = nodes_with_string_id[necessary_cols]

# Save to new CSV
nodes_filtered.to_csv("./data/node_table_filtered.csv", index=False)

print("Filtered CSV saved with columns:", nodes_filtered.columns.tolist())

Filtered CSV saved with columns: ['stringdb::database identifier', 'Degree', 'BetweennessCentrality', 'ClosenessCentrality', 'Stress', 'Radiality', 'NeighborhoodConnectivity', 'Eccentricity', 'ClusteringCoefficient', 'TopologicalCoefficient']


In [14]:

## Disease - (Type 2 Diabetes)


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# ==========================================================
# 1. LOAD NODE TABLE
# ==========================================================
node_file = "./data/node_table_filtered.csv"
nodes = pd.read_csv(node_file)

# ==========================================================
# 2. SELECT CENTRALITY COLUMNS THAT EXIST IN  TABLE
# ==========================================================

centrality_cols = [
    "Degree",
    "BetweennessCentrality",
    "ClosenessCentrality",
    "AverageShortestPathLength",
    "Stress",
    "Eccentricity",
    "TopologicalCoefficient",
    "NeighborhoodConnectivity"
]


centrality_cols = [c for c in centrality_cols if c in nodes.columns]

print("Using centrality columns:", centrality_cols)

# Fill missing values
nodes[centrality_cols] = nodes[centrality_cols].fillna(0)

# ==========================================================
# 3. NORMALIZE ALL CENTRALITY VALUES (0–1)
# ==========================================================
scaler = MinMaxScaler()
norm = scaler.fit_transform(nodes[centrality_cols])

norm_df = pd.DataFrame(
    norm,
    columns=[c + "_norm" for c in centrality_cols]
)

nodes = pd.concat([nodes, norm_df], axis=1)

# ==========================================================
# 4. COMPOSITE HUB SCORE (MEAN OF NORMALIZED CENTRALITIES)
# ==========================================================
nodes["HubScore"] = norm_df.mean(axis=1)

# Rank proteins
nodes = nodes.sort_values("HubScore", ascending=False)

# Mark top 5% as essential
cutoff = nodes["HubScore"].quantile(0.95)
nodes["EssentialFlag"] = nodes["HubScore"] >= cutoff

# Save full output
nodes.to_csv("./data/Hub_Essential_Proteins.csv", index=False)
print("Saved: Hub_Essential_Proteins.csv")

# ==========================================================
# 5. EXTRACT TOP 30 HUB PROTEINS
# ==========================================================
top30 = nodes.head(30)

top30.to_csv("./data/top30_hub_proteins.csv", index=False)
print("Saved: top30_hub_proteins.csv")

# ==========================================================
# 6. VISUALIZATION 1 — HUB SCORE DISTRIBUTION
# ==========================================================
plt.figure(figsize=(7,4))
plt.hist(nodes["HubScore"], bins=30)
plt.title("Hub Score Distribution")
plt.xlabel("Hub Score")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig("./data/hubscore_distribution.png", dpi=150)
plt.close()

# ==========================================================
# 7. VISUALIZATION 2 — TOP 30 HUB BARPLOT
# ==========================================================
plt.figure(figsize=(10,6))
plt.barh(top30["stringdb::database identifier"] if "stringdb::database identifier" in nodes.columns else top30.index,
         top30["HubScore"])
plt.gca().invert_yaxis()
plt.title("Top 30 Hub Proteins")
plt.xlabel("Hub Score")
plt.tight_layout()
plt.savefig("./data/top30_hubs.png", dpi=150)
plt.close()

print("Generated: ./data/hubscore_distribution.png and ./data/top30_hubs.png")

Using centrality columns: ['Degree', 'BetweennessCentrality', 'ClosenessCentrality', 'Stress', 'Eccentricity', 'TopologicalCoefficient', 'NeighborhoodConnectivity']
Saved: Hub_Essential_Proteins.csv
Saved: top30_hub_proteins.csv
Generated: ./data/hubscore_distribution.png and ./data/top30_hubs.png


In [15]:
import pandas as pd

# Load the edge table
edge_file = "./data/edge_table.csv"
edge_table = pd.read_csv(edge_file)

# Get all column names
column_names = edge_table.columns.tolist()

# Print column names
print(column_names)


['interaction', 'name', 'selected', 'shared interaction', 'shared name', 'stringdb::coexpression', 'stringdb::cooccurrence', 'stringdb::databases', 'stringdb::experiments', 'stringdb::fusion', 'stringdb::interspecies', 'stringdb::neighborhood', 'stringdb::score', 'stringdb::textmining']


In [16]:
import pandas as pd

# Load the edge table
edge_file = "./data/edge_table.csv"
edge_table = pd.read_csv(edge_file)

# Select relevant columns
necessary_cols = [
    'interaction',
    'stringdb::score',
    'stringdb::experiments',
    'stringdb::coexpression',
    'stringdb::cooccurrence',
    'stringdb::textmining'
]

edge_filtered = edge_table[necessary_cols]

# Save filtered edge table
edge_filtered.to_csv("./data/edge_table_filtered.csv", index=False)

print("Filtered edge table saved with columns:", edge_filtered.columns.tolist())


Filtered edge table saved with columns: ['interaction', 'stringdb::score', 'stringdb::experiments', 'stringdb::coexpression', 'stringdb::cooccurrence', 'stringdb::textmining']


In [None]:
# ===============================
# NODE TABLE FILTERED COLUMNS
# ===============================

# 'stringdb::database identifier' : STRING ID of the protein
#    - Necessary to uniquely identify proteins and map results back to known proteins
#    - Keeps only valid proteins with identifiers; removes unknown/unmapped entries

# 'Degree' : Number of direct interactions a protein has in the PPI network
#    - Highly connected proteins are often essential
#    - Measures local connectivity

# 'BetweennessCentrality' : Number of shortest paths that pass through the protein
#    - Proteins acting as bridges in the network are often critical
#    - Indicates potential control points in the network

# 'ClosenessCentrality' : How close a protein is to all others in the network
#    - Proteins with high closeness can quickly interact or influence others
#    - Important for overall network integration

# 'Stress' : Total number of shortest paths passing through a protein (like betweenness)
#    - Highlights proteins under "traffic" in network communication
#    - Useful to capture essential nodes

# 'Radiality' : Measures how close a protein is to all others relative to the network diameter
#    - Higher radiality indicates central position in network
#    - Can signal essentiality

# 'NeighborhoodConnectivity' : Average degree of a protein's neighbors
#    - Shows if a protein interacts with hubs or peripheral nodes
#    - Hubs connected to hubs are often more essential

# 'Eccentricity' : Maximum distance from a protein to any other in the network
#    - Proteins with lower eccentricity are more central
#    - Helps in identifying central/essential nodes

# 'ClusteringCoefficient' : Measures how interconnected a protein’s neighbors are
#    - Proteins in tightly knit modules might be critical for specific functions
#    - Helps understand modular essentiality

# 'TopologicalCoefficient' : Measures how similar a protein's neighbors are compared to others
#    - Indicates redundancy and functional uniqueness
#    - Useful for identifying proteins with unique network roles


# ===============================
# EDGE / H TABLE FILTERED COLUMNS
# ===============================

# 'interaction' : Pair of proteins involved in the interaction
#    - Required to reconstruct the PPI network if needed
#    - Can be used to assign weights or calculate derived network features

# 'stringdb::score' : Overall confidence score of the interaction
#    - High scores indicate more reliable interactions
#    - Can be used to filter weak or noisy edges

# 'stringdb::experiments' : Evidence from experimental data
#    - Useful for weighted network construction
#    - Gives reliability based on experimental verification

# 'stringdb::coexpression' : Evidence from gene co-expression
#    - Proteins with co-expression may indicate functional linkage
#    - Can enhance confidence of predicted essentiality

# 'stringdb::cooccurrence' : Evidence from co-occurrence across species
#    - Suggests functional linkage based on evolutionary conservation
#    - May support essentiality assessment

# 'stringdb::textmining' : Evidence from literature
#    - Captures reported interactions
#    - Can reinforce confidence of biologically important interactions
