# Adjusted Mutual Information

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# Load expression data
genes = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/aak100_cpmdat.csv")

# Keep only expression columns (drop the row-index and mgs_level columns)
expr = genes.drop(["Unnamed: 0", "mgs_level"], axis=1)
gene_names = expr.columns
expr_values = expr.to_numpy()

# --- Discretize each gene into bins (needed for AMI) ---
# adjusted_mutual_info_score compares label vectors, so every gene column is
# replaced by its quantile-bin codes.
n_bins = 5  # you can tune this
binned_expr = np.zeros_like(expr_values, dtype=int)

for i in range(expr_values.shape[1]):
    values = pd.Series(expr_values[:, i])

    if values.nunique(dropna=True) > 1:
        # duplicates='drop' merges tied quantile edges, so a gene may end up
        # with fewer than n_bins bins.
        codes = pd.qcut(values, q=n_bins, labels=False, duplicates='drop')
    else:
        # A gene with at most one distinct value leaves qcut a single edge
        # after duplicate removal, which produces NaN codes; put every
        # sample in one bin instead.
        codes = pd.Series(0, index=values.index)

    # Missing expression values have no bin. Give them the explicit label -1
    # rather than casting NaN into the integer matrix (an undefined cast).
    binned_expr[:, i] = codes.where(values.notna(), -1).astype(int).to_numpy()

# --- Compute pairwise Adjusted Mutual Information ---
# Upper triangle only (i < j): one edge per unordered gene pair.
edges = []
num_genes = len(gene_names)

for i in range(num_genes):
    g1 = binned_expr[:, i]  # invariant across the inner loop
    for j in range(i + 1, num_genes):
        g2 = binned_expr[:, j]
        ami = adjusted_mutual_info_score(g1, g2)
        edges.append([gene_names[i], gene_names[j], ami])

# --- Create dataframe in MEGENA format ---
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Ensure finite weights in [0, 1]: clipping alone lets NaN through, so NaN is
# replaced by 0 first; negative scores are then clipped to 0.
edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

# Save to CSV
edges_df.to_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges.csv", index=False)

print("‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.")

‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.


In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# ===== 1. Read the QN global expression table =====
qn = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Scripts/Data Quantile Normalization/QN_global_L_global.csv")

# Candidate metadata columns; whichever of these are present get removed
meta_cols = ["mgs_level", "sample_id", "Unnamed: 0", "X"]
present_meta = [name for name in meta_cols if name in qn.columns]

# What remains after dropping metadata and non-numeric columns is treated as genes
expr = qn.drop(columns=present_meta, errors="ignore").select_dtypes(include=[np.number])

gene_names = expr.columns
expr_values = expr.to_numpy()

print(f"‚úÖ Loaded QN Global expression matrix: {expr_values.shape[0]} samples √ó {expr_values.shape[1]} genes")

# ===== 2. Turn each gene into quantile-bin labels for AMI =====
n_bins = 5  # can fine-tune
binned_expr = np.zeros_like(expr_values, dtype=int)

# Iterating over the transpose walks the matrix one gene (column) at a time
for gene_idx, col in enumerate(expr_values.T):
    if (col != col[0]).any():
        # Gene varies across samples -> quantile-bin it (tied edges are merged)
        binned_expr[:, gene_idx] = pd.qcut(col, q=n_bins, labels=False, duplicates='drop')
    else:
        # Gene is constant across samples -> every sample goes in one bin
        binned_expr[:, gene_idx] = 0

# ===== 3. AMI for every unordered gene pair (first < second) =====
num_genes = len(gene_names)

edges = [
    [
        gene_names[first],
        gene_names[second],
        adjusted_mutual_info_score(binned_expr[:, first], binned_expr[:, second]),
    ]
    for first in range(num_genes)
    for second in range(first + 1, num_genes)
]

# ===== 4. Assemble the MEGENA-compatible edge list =====
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Valid weights only: NaN becomes 0, then everything is clipped into [0, 1]
edges_df["weight"] = np.clip(edges_df["weight"].fillna(0), 0, 1)

save_path = "C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_QN_global_L_global.csv"
edges_df.to_csv(save_path, index=False)

print(f"‚úÖ AMI computed for {num_genes} genes.")
print(f"üß† Total edges: {len(edges_df)}")
print(f"üìÅ Saved to {save_path}")


‚úÖ Loaded QN Global expression matrix: 166 samples √ó 81 genes
‚úÖ AMI computed for 81 genes.
üß† Total edges: 3240
üìÅ Saved to C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_QN_global_L_global.csv


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# Load expression data
genes = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/aak100_cpmdat.csv")

# Restrict to the samples labelled MGS1 (saved below as the "control" edge list)
genes = genes[genes['mgs_level'] == "MGS1"]

# An empty subset cannot be binned or scored; fail with a clear message
# instead of an opaque error further down.
if genes.empty:
    raise ValueError('No samples with mgs_level == "MGS1"; cannot compute AMI.')

# Keep only expression columns (drop the row-index and mgs_level columns)
expr = genes.drop(["Unnamed: 0", "mgs_level"], axis=1)
gene_names = expr.columns
expr_values = expr.to_numpy()

# --- Discretize each gene into bins (needed for AMI) ---
# adjusted_mutual_info_score compares label vectors, so every gene column is
# replaced by its quantile-bin codes.
n_bins = 5  # you can tune this
binned_expr = np.zeros_like(expr_values, dtype=int)

for i in range(expr_values.shape[1]):
    values = pd.Series(expr_values[:, i])

    if values.nunique(dropna=True) > 1:
        # duplicates='drop' merges tied quantile edges, so a gene may end up
        # with fewer than n_bins bins.
        codes = pd.qcut(values, q=n_bins, labels=False, duplicates='drop')
    else:
        # A gene with at most one distinct value leaves qcut a single edge
        # after duplicate removal, which produces NaN codes; put every
        # sample in one bin instead.
        codes = pd.Series(0, index=values.index)

    # Missing expression values have no bin. Give them the explicit label -1
    # rather than casting NaN into the integer matrix (an undefined cast).
    binned_expr[:, i] = codes.where(values.notna(), -1).astype(int).to_numpy()

# --- Compute pairwise Adjusted Mutual Information ---
# Upper triangle only (i < j): one edge per unordered gene pair.
edges = []
num_genes = len(gene_names)

for i in range(num_genes):
    g1 = binned_expr[:, i]  # invariant across the inner loop
    for j in range(i + 1, num_genes):
        g2 = binned_expr[:, j]
        ami = adjusted_mutual_info_score(g1, g2)
        edges.append([gene_names[i], gene_names[j], ami])

# --- Create dataframe in MEGENA format ---
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Ensure finite weights in [0, 1]: clipping alone lets NaN through, so NaN is
# replaced by 0 first; negative scores are then clipped to 0.
edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

# Save to CSV
edges_df.to_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_control.csv", index=False)

print("‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.")

‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# Load expression data
genes = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/aak100_cpmdat.csv")

# Restrict to the samples labelled MGS4 (saved below as the "late" edge list)
genes = genes[genes['mgs_level'] == "MGS4"]

# An empty subset cannot be binned or scored; fail with a clear message
# instead of an opaque error further down.
if genes.empty:
    raise ValueError('No samples with mgs_level == "MGS4"; cannot compute AMI.')

# Keep only expression columns (drop the row-index and mgs_level columns)
expr = genes.drop(["Unnamed: 0", "mgs_level"], axis=1)
gene_names = expr.columns
expr_values = expr.to_numpy()

# --- Discretize each gene into bins (needed for AMI) ---
# adjusted_mutual_info_score compares label vectors, so every gene column is
# replaced by its quantile-bin codes.
n_bins = 5  # you can tune this
binned_expr = np.zeros_like(expr_values, dtype=int)

for i in range(expr_values.shape[1]):
    values = pd.Series(expr_values[:, i])

    if values.nunique(dropna=True) > 1:
        # duplicates='drop' merges tied quantile edges, so a gene may end up
        # with fewer than n_bins bins.
        codes = pd.qcut(values, q=n_bins, labels=False, duplicates='drop')
    else:
        # A gene with at most one distinct value leaves qcut a single edge
        # after duplicate removal, which produces NaN codes; put every
        # sample in one bin instead.
        codes = pd.Series(0, index=values.index)

    # Missing expression values have no bin. Give them the explicit label -1
    # rather than casting NaN into the integer matrix (an undefined cast).
    binned_expr[:, i] = codes.where(values.notna(), -1).astype(int).to_numpy()

# --- Compute pairwise Adjusted Mutual Information ---
# Upper triangle only (i < j): one edge per unordered gene pair.
edges = []
num_genes = len(gene_names)

for i in range(num_genes):
    g1 = binned_expr[:, i]  # invariant across the inner loop
    for j in range(i + 1, num_genes):
        g2 = binned_expr[:, j]
        ami = adjusted_mutual_info_score(g1, g2)
        edges.append([gene_names[i], gene_names[j], ami])

# --- Create dataframe in MEGENA format ---
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Ensure finite weights in [0, 1]: clipping alone lets NaN through, so NaN is
# replaced by 0 first; negative scores are then clipped to 0.
edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

# Save to CSV
edges_df.to_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_late.csv", index=False)

print("‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.")

‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.
