# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# ==========================================================
# 1. LOAD NODE TABLE
# ==========================================================
# CSV whose columns may include the centrality measures listed below.
node_file = "node_table.csv"
nodes = pd.read_csv(node_file)

# ==========================================================
# 2. SELECT CENTRALITY COLUMNS THAT EXIST IN THE TABLE
# ==========================================================

# Measures we would like to use; only those present in the CSV are kept.
candidate_cols = [
    "Degree",
    "BetweennessCentrality",
    "ClosenessCentrality",
    "AverageShortestPathLength",
    "Stress",
    "Eccentricity",
    "TopologicalCoefficient",
    "NeighborhoodConnectivity",
]

centrality_cols = [c for c in candidate_cols if c in nodes.columns]

# Fail early with a readable message: without at least one column the
# scaling step further down cannot run and would raise an opaque error.
if not centrality_cols:
    raise ValueError(
        f"None of the expected centrality columns were found in {node_file!r}. "
        f"Expected at least one of: {candidate_cols}"
    )

print("Using centrality columns:", centrality_cols)

# Force numeric dtype (unparseable cells become NaN), then fill missing
# values with 0 so every selected column can be scaled.
nodes[centrality_cols] = (
    nodes[centrality_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

# ==========================================================
# 3. NORMALIZE ALL CENTRALITY VALUES (0–1)
# ==========================================================
# Min-max scale each selected column independently to the [0, 1] range.
scaler = MinMaxScaler()
norm = scaler.fit_transform(nodes[centrality_cols])

# Reuse the index of `nodes` so the scaled values stay attached to the rows
# they were computed from. With a fresh 0..n-1 index, the label-based
# alignment in pd.concat (and in later column assignments) would silently
# mismatch rows whenever `nodes` does not carry a default index.
norm_df = pd.DataFrame(
    norm,
    columns=[c + "_norm" for c in centrality_cols],
    index=nodes.index,
)

# Append the "<measure>_norm" columns next to the raw measures.
nodes = pd.concat([nodes, norm_df], axis=1)

# ==========================================================
# 4. COMPOSITE HUB SCORE (MEAN OF NORMALIZED CENTRALITIES)
# ==========================================================
# AverageShortestPathLength and Eccentricity are distances: a central node
# has SMALL values. Averaging them as-is would reward peripheral nodes and
# work against ClosenessCentrality, so they are flipped to
# "higher = more central" before the mean is taken. The "<measure>_norm"
# columns stored in `nodes` are left untouched; only the score uses the
# flipped copies.
distance_measures = ("AverageShortestPathLength", "Eccentricity")

oriented = norm_df.copy()
for raw_col in distance_measures:
    norm_col = raw_col + "_norm"
    if norm_col not in oriented.columns:
        continue
    flipped = 1.0 - oriented[norm_col]
    # Rows whose raw value is 0 (e.g. missing values that were filled with 0)
    # carry no distance information; give them 0 instead of the top score
    # the flip would otherwise assign.
    oriented[norm_col] = flipped.where(nodes[raw_col] > 0, 0.0)

nodes["HubScore"] = oriented.mean(axis=1)

# Rank proteins, highest hub score first
nodes = nodes.sort_values("HubScore", ascending=False)

# Mark top 5% as essential (score at or above the 95th percentile)
cutoff = nodes["HubScore"].quantile(0.95)
nodes["EssentialFlag"] = nodes["HubScore"] >= cutoff

# Save full output
nodes.to_csv("Hub_Essential_Proteins.csv", index=False)
print("Saved: Hub_Essential_Proteins.csv")

# ==========================================================
# 5. EXTRACT TOP 30 HUB PROTEINS
# ==========================================================
# `nodes` is already ordered by descending HubScore, so the first 30 rows
# are the 30 highest-scoring entries (fewer if the table is shorter).
top30 = nodes.iloc[:30]

top30_file = "top30_hub_proteins.csv"
top30.to_csv(top30_file, index=False)
print(f"Saved: {top30_file}")

# ==========================================================
# 6. VISUALIZATION 1 — HUB SCORE DISTRIBUTION
# ==========================================================
# Histogram of the composite score over all nodes, written to a PNG file.
hist_fig, hist_ax = plt.subplots(figsize=(7, 4))
hist_ax.hist(nodes["HubScore"], bins=30)
hist_ax.set_title("Hub Score Distribution")
hist_ax.set_xlabel("Hub Score")
hist_ax.set_ylabel("Frequency")
hist_fig.tight_layout()
hist_fig.savefig("hubscore_distribution.png", dpi=150)
# Release the figure so it does not stay open after saving.
plt.close(hist_fig)

# ==========================================================
# 7. VISUALIZATION 2 — TOP 30 HUB BARPLOT
# ==========================================================
# Label each bar with the "name" column when present, otherwise with the
# row's index label. Labels are converted to strings and the bars are drawn
# at explicit positions 0..n-1: passing integer index labels straight to
# barh would use them as numeric y coordinates (scattering the bars), and
# duplicate names would be merged into a single bar.
if "name" in top30.columns:
    bar_labels = top30["name"].astype(str).tolist()
else:
    bar_labels = [str(label) for label in top30.index]

bar_positions = list(range(len(top30)))

bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
bar_ax.barh(bar_positions, top30["HubScore"].to_numpy())
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(bar_labels)
# Highest-scoring entry at the top of the chart.
bar_ax.invert_yaxis()
bar_ax.set_title("Top 30 Hub Proteins")
bar_ax.set_xlabel("Hub Score")
bar_fig.tight_layout()
bar_fig.savefig("top30_hubs.png", dpi=150)
plt.close(bar_fig)

print("Generated: hubscore_distribution.png and top30_hubs.png")
