# Adjusted Mutual Information

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# Load expression data: metadata columns plus one column per gene
genes = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/aak100_cpmdat.csv")

# Keep only expression columns.
# The metadata columns of this file are not named consistently across this
# notebook's cells ("sample_id" vs "Unnamed: 0"), so drop whichever of the known
# ones are present instead of raising a KeyError or binning an ID column as a gene.
meta_cols = ["sample_id", "mgs_level", "Unnamed: 0"]
expr = genes.drop(columns=[c for c in meta_cols if c in genes.columns])
gene_names = expr.columns
expr_values = expr.to_numpy()

if expr_values.shape[0] == 0:
    raise ValueError("The expression file contains no samples; nothing to discretize.")

# --- Discretize each gene into bins (needed for AMI) ---
# Quantile binning: divide each gene's expression into categories.
# With duplicates='drop' a gene may end up with fewer than n_bins categories.
n_bins = 5  # you can tune this
binned_expr = np.zeros_like(expr_values, dtype=int)

for i in range(expr_values.shape[1]):
    col = expr_values[:, i]

    # A gene that is constant across samples has no usable quantile edges:
    # leave it in the single bin 0 the matrix was initialised with.
    if np.all(col == col[0]):
        continue

    codes = pd.qcut(col, q=n_bins, labels=False, duplicates='drop')

    # qcut can return NaN codes (e.g. for missing expression values). Give those
    # samples an explicit -1 label instead of relying on an undefined NaN -> int cast.
    binned_expr[:, i] = np.where(pd.isna(codes), -1, codes)

edges = []
num_genes = len(gene_names)

# --- Compute pairwise Adjusted Mutual Information ---
# Only the upper triangle (i < j) is visited, so each unordered pair appears once.
for i in range(num_genes):
    g1 = binned_expr[:, i]  # invariant for the whole inner loop
    for j in range(i + 1, num_genes):
        g2 = binned_expr[:, j]
        ami = adjusted_mutual_info_score(g1, g2)
        edges.append([gene_names[i], gene_names[j], ami])

# --- Create dataframe in MEGENA format ---
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Ensure finite & nonnegative: undefined (NaN) scores become 0, then clamp to [0, 1]
edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

# Save to CSV
edges_df.to_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges.csv", index=False)

print("‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.")

KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# ===== 1. Load QN Global Expression Data =====
qn = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Scripts/Data Quantile Normalization/QN_global_L_global.csv")

# Candidate metadata columns; only the ones actually present in the file are removed
meta_cols = ["mgs_level", "sample_id", "Unnamed: 0", "X"]
present_meta = [name for name in meta_cols if name in qn.columns]

# Whatever numeric columns remain are treated as gene expression
expr = qn.drop(columns=present_meta, errors="ignore").select_dtypes(include=[np.number])

gene_names = expr.columns
expr_values = expr.to_numpy()

print(f"‚úÖ Loaded QN Global expression matrix: {expr_values.shape[0]} samples √ó {expr_values.shape[1]} genes")

# ===== 2. Discretize Expression for AMI =====
n_bins = 5  # can fine-tune
binned_expr = np.zeros_like(expr_values, dtype=int)

# Walk the matrix gene by gene (rows of the transpose are the gene columns)
for gene_idx, col in enumerate(expr_values.T):
    # A gene that is constant across samples keeps the single bin 0 it was
    # initialised with; every other gene is split into quantile bins.
    if not np.all(col == col[0]):
        binned_expr[:, gene_idx] = pd.qcut(col, q=n_bins, labels=False, duplicates='drop')

# ===== 3. Compute Pairwise AMI =====
num_genes = len(gene_names)

# One [from, to, weight] record per unordered gene pair (a < b)
edges = [
    [
        gene_names[a],
        gene_names[b],
        adjusted_mutual_info_score(binned_expr[:, a], binned_expr[:, b]),
    ]
    for a in range(num_genes)
    for b in range(a + 1, num_genes)
]

# ===== 4. Build MEGENA-Compatible Edge List =====
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Valid weights only: missing scores count as 0, everything is clamped to [0, 1]
filled_weights = edges_df["weight"].fillna(0)
edges_df["weight"] = np.clip(filled_weights, 0, 1)

save_path = "C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_QN_global_L_global.csv"
edges_df.to_csv(save_path, index=False)

print(f"‚úÖ AMI computed for {num_genes} genes.")
print(f"üß† Total edges: {len(edges_df)}")
print(f"üìÅ Saved to {save_path}")


‚úÖ Loaded QN Global expression matrix: 166 samples √ó 81 genes
‚úÖ AMI computed for 81 genes.
üß† Total edges: 3240
üìÅ Saved to C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_QN_global_L_global.csv


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# Load expression data: metadata columns plus one column per gene
genes = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/aak100_cpmdat.csv")

# Restrict to the samples labelled "MGS1" (written out below as the "control" network)
genes = genes[genes['mgs_level'] == "MGS1"]

if genes.empty:
    # Fail loudly rather than writing an edge list computed from zero samples
    raise ValueError('No samples with mgs_level == "MGS1" were found in the expression file.')

# Keep only expression columns.
# The metadata columns of this file are not named consistently across this
# notebook's cells ("sample_id" vs "Unnamed: 0"), so drop whichever of the known
# ones are present instead of raising a KeyError or binning an ID column as a gene.
meta_cols = ["Unnamed: 0", "sample_id", "mgs_level"]
expr = genes.drop(columns=[c for c in meta_cols if c in genes.columns])
gene_names = expr.columns
expr_values = expr.to_numpy()

# --- Discretize each gene into bins (needed for AMI) ---
# Quantile binning: divide each gene's expression into categories.
# With duplicates='drop' a gene may end up with fewer than n_bins categories.
n_bins = 5  # you can tune this
binned_expr = np.zeros_like(expr_values, dtype=int)

for i in range(expr_values.shape[1]):
    col = expr_values[:, i]

    # A gene that is constant across the selected samples has no usable quantile
    # edges: leave it in the single bin 0 the matrix was initialised with.
    if np.all(col == col[0]):
        continue

    codes = pd.qcut(col, q=n_bins, labels=False, duplicates='drop')

    # qcut can return NaN codes (e.g. for missing expression values). Give those
    # samples an explicit -1 label instead of relying on an undefined NaN -> int cast.
    binned_expr[:, i] = np.where(pd.isna(codes), -1, codes)

edges = []
num_genes = len(gene_names)

# --- Compute pairwise Adjusted Mutual Information ---
# Only the upper triangle (i < j) is visited, so each unordered pair appears once.
for i in range(num_genes):
    g1 = binned_expr[:, i]  # invariant for the whole inner loop
    for j in range(i + 1, num_genes):
        g2 = binned_expr[:, j]
        ami = adjusted_mutual_info_score(g1, g2)
        edges.append([gene_names[i], gene_names[j], ami])

# --- Create dataframe in MEGENA format ---
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Ensure finite & nonnegative: undefined (NaN) scores become 0, then clamp to [0, 1]
edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

# Save to CSV
edges_df.to_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_control.csv", index=False)

print("‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.")

‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score

# Load expression data: metadata columns plus one column per gene
genes = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/aak100_cpmdat.csv")

# Restrict to the samples labelled "MGS4" (written out below as the "late" network)
genes = genes[genes['mgs_level'] == "MGS4"]

if genes.empty:
    # Fail loudly rather than writing an edge list computed from zero samples
    raise ValueError('No samples with mgs_level == "MGS4" were found in the expression file.')

# Keep only expression columns.
# The metadata columns of this file are not named consistently across this
# notebook's cells ("sample_id" vs "Unnamed: 0"), so drop whichever of the known
# ones are present instead of raising a KeyError or binning an ID column as a gene.
meta_cols = ["Unnamed: 0", "sample_id", "mgs_level"]
expr = genes.drop(columns=[c for c in meta_cols if c in genes.columns])
gene_names = expr.columns
expr_values = expr.to_numpy()

# --- Discretize each gene into bins (needed for AMI) ---
# Quantile binning: divide each gene's expression into categories.
# With duplicates='drop' a gene may end up with fewer than n_bins categories.
n_bins = 5  # you can tune this
binned_expr = np.zeros_like(expr_values, dtype=int)

for i in range(expr_values.shape[1]):
    col = expr_values[:, i]

    # A gene that is constant across the selected samples has no usable quantile
    # edges: leave it in the single bin 0 the matrix was initialised with.
    if np.all(col == col[0]):
        continue

    codes = pd.qcut(col, q=n_bins, labels=False, duplicates='drop')

    # qcut can return NaN codes (e.g. for missing expression values). Give those
    # samples an explicit -1 label instead of relying on an undefined NaN -> int cast.
    binned_expr[:, i] = np.where(pd.isna(codes), -1, codes)

edges = []
num_genes = len(gene_names)

# --- Compute pairwise Adjusted Mutual Information ---
# Only the upper triangle (i < j) is visited, so each unordered pair appears once.
for i in range(num_genes):
    g1 = binned_expr[:, i]  # invariant for the whole inner loop
    for j in range(i + 1, num_genes):
        g2 = binned_expr[:, j]
        ami = adjusted_mutual_info_score(g1, g2)
        edges.append([gene_names[i], gene_names[j], ami])

# --- Create dataframe in MEGENA format ---
edges_df = pd.DataFrame(edges, columns=["from", "to", "weight"])

# Ensure finite & nonnegative: undefined (NaN) scores become 0, then clamp to [0, 1]
edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

# Save to CSV
edges_df.to_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/ami_edges_late.csv", index=False)

print("‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.")

‚úÖ Pairwise Adjusted Mutual Information computed and saved for MEGENA.


In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from joblib import Parallel, delayed
import os

# ============================================================
# 1. LOAD DATA
# ============================================================

print("Loading gene expression matrix...")

genes_df = pd.read_csv("C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/gene_input.csv")

# Strip the metadata columns (when present); every remaining column is a gene
metadata_columns = ["sample_id", "mgs_level"]
expr_df = genes_df.drop(columns=metadata_columns, errors="ignore")

expr_matrix = expr_df.to_numpy()
gene_names = expr_df.columns.to_numpy()

print(f"Loaded expression matrix with {expr_matrix.shape[1]} genes.")

# ============================================================
# 2. GENES OF INTEREST
# ============================================================

requested_core_genes = ["C1R", "B2M", "HLA-DRA", "SLC9A9", "CTSZ", "APOBEC3C"]

# Only the requested genes that are actually columns of the matrix are kept
core_genes = np.array([name for name in requested_core_genes if name in gene_names])

print("Core genes included every run:", core_genes)

# Sampling pool: every gene in the matrix that is not a core gene
pool_members = []
for candidate in gene_names:
    if candidate in core_genes:
        continue
    pool_members.append(candidate)
sampling_pool = np.array(pool_members)

runs = 100          # number of experiments
subset_size = 500   # total genes per experiment (core genes + random genes)

# Destination for the per-run edge lists; created if it does not exist yet
output_folder = "C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/AMI/ami_runs/"
os.makedirs(output_folder, exist_ok=True)


# ============================================================
# 3. FUNCTION: COMPUTE AMI FOR ONE SUBSET
# ============================================================

def compute_ami_for_subset(selected_genes, run_id, n_bins=5):
    """Compute pairwise adjusted mutual information for one gene subset and save it.

    Reads the module-level ``gene_names``, ``expr_matrix`` and ``output_folder``.
    Each selected gene is quantile-binned, AMI is computed for every unordered
    pair of selected genes in parallel, and the result is written as an edge
    list with columns ``from``, ``to`` and ``weight``.

    Parameters
    ----------
    selected_genes : sequence of str
        Gene names to include; each must occur in ``gene_names``.
    run_id : int or str
        Identifier used in the progress messages and in the output file name.
    n_bins : int, default 5
        Number of quantile bins requested per gene. Because duplicate bin
        edges are dropped, a gene can end up with fewer bins.

    Returns
    -------
    str
        Path of the CSV file that was written.

    Raises
    ------
    IndexError
        If a selected gene is not present in ``gene_names``.
    """

    print(f"‚ñ∂ Running AMI for experiment {run_id} with {len(selected_genes)} genes...")

    # Column index of each selected gene (first match in gene_names)
    idx = [np.where(gene_names == g)[0][0] for g in selected_genes]
    sub_expr = expr_matrix[:, idx]

    # --- Quantile discretization ---
    binned = np.zeros_like(sub_expr, dtype=int)
    for col_idx in range(sub_expr.shape[1]):
        col = sub_expr[:, col_idx]

        # A gene that is constant across samples has no usable quantile edges:
        # leave it in the single bin 0 the matrix was initialised with.
        if col.size == 0 or np.all(col == col[0]):
            continue

        codes = pd.qcut(col, q=n_bins, labels=False, duplicates='drop')

        # qcut can return NaN codes (e.g. for missing expression values). Give
        # those samples an explicit -1 label instead of relying on an undefined
        # NaN -> int cast.
        binned[:, col_idx] = np.where(pd.isna(codes), -1, codes)

    n = len(selected_genes)

    def pair_ami(i, j):
        """Return the (from, to, weight) record for the gene pair (i, j)."""
        return (
            selected_genes[i],
            selected_genes[j],
            adjusted_mutual_info_score(binned[:, i], binned[:, j]),
        )

    # Parallel computation of AMI for every unordered pair (i < j)
    results = Parallel(n_jobs=-1, verbose=0)(
        delayed(pair_ami)(i, j) for i in range(n) for j in range(i + 1, n)
    )

    edges_df = pd.DataFrame(results, columns=["from", "to", "weight"])

    # Ensure weights lie in 0-1: undefined (NaN) scores become 0, then clamp
    edges_df["weight"] = edges_df["weight"].fillna(0).clip(lower=0, upper=1)

    # Save to CSV
    outfile = os.path.join(output_folder, f"ami_edges_run_{run_id}.csv")
    edges_df.to_csv(outfile, index=False)

    print(f"‚úî Saved {outfile}")

    return outfile


# ============================================================
# 4. MAIN LOOP - RUN THE AMI EXPERIMENTS
# ============================================================

saved_files = []

# Seeded generator so the random gene subsets can be reproduced after a kernel
# restart; change the seed to draw a different series of subsets.
random_seed = 42
rng = np.random.default_rng(random_seed)

# Core genes are part of every run, so only the remainder is sampled.
# This count is the same for every run, hence computed once.
remaining_needed = subset_size - len(core_genes)

if not 0 <= remaining_needed <= len(sampling_pool):
    raise ValueError(
        f"Cannot sample {remaining_needed} additional genes: subset_size={subset_size}, "
        f"{len(core_genes)} core genes, {len(sampling_pool)} genes in the sampling pool."
    )

for run_id in range(1, runs + 1):

    # Sample the additional random genes without replacement
    random_genes = rng.choice(sampling_pool, size=remaining_needed, replace=False)

    # Core genes first, then the random ones
    selected_genes = np.concatenate([core_genes, random_genes])

    # Run AMI + save
    outfile = compute_ami_for_subset(selected_genes, run_id)
    saved_files.append(outfile)


# ============================================================
# DONE
# ============================================================

print("\nAll AMI experiments completed!")
print("List of saved files:")
for f in saved_files:
    print("   -", f)


Loading gene expression matrix...
Loaded expression matrix with 18056 genes.
Core genes included every run: ['C1R' 'B2M' 'HLA-DRA' 'SLC9A9' 'CTSZ' 'APOBEC3C']
‚ñ∂ Running AMI for experiment 1 with 500 genes...
‚úî Saved C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/AMI/ami_runs/ami_edges_run_1.csv
‚ñ∂ Running AMI for experiment 2 with 500 genes...
‚úî Saved C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/AMI/ami_runs/ami_edges_run_2.csv
‚ñ∂ Running AMI for experiment 3 with 500 genes...
‚úî Saved C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/AMI/ami_runs/ami_edges_run_3.csv
‚ñ∂ Running AMI for experiment 4 with 500 genes...
‚úî Saved C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/AMI/ami_runs/ami_edges_run_4.csv
‚ñ∂ Running AMI for experiment 5 with 500 genes...
‚úî Saved C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/Dataset/AMI/ami_runs/ami_edges_run_5.csv
‚ñ∂ Running AMI for experiment 6 with 500 genes...
‚úî Saved C:/Users/Brayan Gutierrez/Desktop/RNAseq-AMD/