<a href="https://colab.research.google.com/github/Enso-bio/Neuroimmune-Synchronization-Across-the-Sleep-Wake-Cycl/blob/main/Inf_Theory_Necrop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Full-automatic necroptosis information analysis
# Works in Google Colab or locally (Python 3.9+)

import os, re, json, gzip, textwrap, math
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

BASE_DIR = os.getcwd()
OUT_DIR = os.path.join(BASE_DIR, "necroptosis_information")
DATA_DIR = os.path.join(OUT_DIR, "data_fasta")
FIG_DIR  = os.path.join(OUT_DIR, "figures")
TAB_DIR  = os.path.join(OUT_DIR, "tables")
TEX_DIR  = os.path.join(OUT_DIR, "manuscript")

for d in [OUT_DIR, DATA_DIR, FIG_DIR, TAB_DIR, TEX_DIR]:
    os.makedirs(d, exist_ok=True)

print("Output folder:", OUT_DIR)
# Species (UniProt taxonomy IDs)
SPECIES = {
    "Homo_sapiens": 9606,
    "Mus_musculus": 10090,
    "Danio_rerio": 7955,
}

# Protein panel: necroptosis core + controls
# gene: primary gene symbol used in UniProt query
# role: label for grouping
PROTEINS = [
    {"gene": "MLKL",     "role": "necroptosis"},
    {"gene": "RIPK1",    "role": "necroptosis"},
    {"gene": "RIPK3",    "role": "necroptosis"},
    {"gene": "FADD",     "role": "apoptosis_control"},
    {"gene": "CASP8",    "role": "apoptosis_control"},
    {"gene": "CASP3",    "role": "apoptosis_control"},
    {"gene": "ACTB",     "role": "housekeeping_control"},
    {"gene": "GAPDH",    "role": "housekeeping_control"},
]

UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
UNIPROT_FASTA_URL  = "https://rest.uniprot.org/uniprotkb/{}.fasta"

HEADERS = {"User-Agent": "necroptosis-info-paper/1.0", "Accept": "application/json"}

def uniprot_search_best_accession(gene: str, taxid: int) -> Optional[Dict]:
    """
    Returns best UniProt entry dict with accession for a gene in a species.
    Prefers reviewed: true. If none, falls back to unreviewed.
    """
    def _query(reviewed: bool):
        q = f'(gene_exact:{gene} AND organism_id:{taxid}'
        q += f' AND reviewed:{str(reviewed).lower()})'
        params = {
            "query": q,
            "format": "json",
            "size": 5,
            "fields": "accession,id,protein_name,gene_primary,organism_name,reviewed,length"
        }
        r = requests.get(UNIPROT_SEARCH_URL, params=params, headers=HEADERS, timeout=30)
        r.raise_for_status()
        data = r.json()
        results = data.get("results", [])
        return results

    # try reviewed first
    res = _query(True)
    if not res:
        res = _query(False)
        if not res:
            return None

    # choose: exact primary gene match if available, else longest
    def get_primary_gene(entry):
        genes = entry.get("genes", [])
        if genes and "geneName" in genes[0] and "value" in genes[0]["geneName"]:
            return genes[0]["geneName"]["value"]
        return None

    exact = [e for e in res if (get_primary_gene(e) or "").upper() == gene.upper()]
    candidates = exact if exact else res

    # pick longest sequence
    def get_len(entry):
        return entry.get("sequence", {}).get("length", 0)

    best = sorted(candidates, key=get_len, reverse=True)[0]
    return best

def fetch_fasta(accession: str) -> str:
    r = requests.get(UNIPROT_FASTA_URL.format(accession), timeout=30)
    r.raise_for_status()
    return r.text

def parse_fasta(fasta_text: str) -> Tuple[str, str]:
    lines = [l.strip() for l in fasta_text.splitlines() if l.strip()]
    header = lines[0]
    seq = "".join(lines[1:]).replace(" ", "").upper()
    seq = re.sub(r"[^ACDEFGHIKLMNPQRSTVWY]", "", seq)  # keep standard AA
    return header, seq

records = []

for sp_name, taxid in SPECIES.items():
    for p in PROTEINS:
        gene = p["gene"]
        role = p["role"]

        best = uniprot_search_best_accession(gene, taxid)
        if best is None:
            print(f"[WARN] Not found: {gene} in {sp_name}")
            continue

        acc = best["primaryAccession"]
        reviewed = best.get("entryType", "") == "UniProtKB reviewed (Swiss-Prot)"
        org = best.get("organism", {}).get("scientificName", sp_name.replace("_"," "))
        prot_name = best.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", "")

        fasta = fetch_fasta(acc)
        header, seq = parse_fasta(fasta)

        fname = f"{gene}__{sp_name}__{acc}.fasta"
        fpath = os.path.join(DATA_DIR, fname)
        with open(fpath, "w", encoding="utf-8") as f:
            f.write(fasta)

        records.append({
            "gene": gene,
            "role": role,
            "species": sp_name,
            "taxid": taxid,
            "accession": acc,
            "organism": org,
            "protein_name": prot_name,
            "length": len(seq),
            "fasta_file": fpath
        })

df_meta = pd.DataFrame(records).sort_values(["role","gene","species"])
df_meta.to_csv(os.path.join(TAB_DIR, "uniprot_metadata.csv"), index=False)
df_meta.head(10)

AA = list("ACDEFGHIKLMNPQRSTVWY")
AA_TO_I = {a:i for i,a in enumerate(AA)}

def shannon_entropy(seq: str) -> float:
    if not seq:
        return np.nan
    counts = np.zeros(len(AA), dtype=float)
    for ch in seq:
        if ch in AA_TO_I:
            counts[AA_TO_I[ch]] += 1
    p = counts / counts.sum() if counts.sum() > 0 else counts
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum()) if len(p) else 0.0

def sliding_entropy(seq: str, w: int = 50, step: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    xs, hs = [], []
    for start in range(0, max(1, len(seq) - w + 1), step):
        window = seq[start:start+w]
        xs.append(start + w/2)
        hs.append(shannon_entropy(window))
    return np.array(xs), np.array(hs)

def gzip_compress_ratio(seq: str) -> float:
    if not seq:
        return np.nan
    raw = seq.encode("utf-8")
    comp = gzip.compress(raw, compresslevel=9)
    return len(comp) / max(1, len(raw))

def split_halves(seq: str) -> Tuple[str, str]:
    mid = len(seq)//2
    return seq[:mid], seq[mid:]

def load_seq_from_fasta(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        header, seq = parse_fasta(f.read())
    return seq
rows = []
W = 50
STEP = 5

for _, r in df_meta.iterrows():
    seq = load_seq_from_fasta(r["fasta_file"])
    n_half, c_half = split_halves(seq)

    ent_full = shannon_entropy(seq)
    ent_n = shannon_entropy(n_half)
    ent_c = shannon_entropy(c_half)

    gz_full = gzip_compress_ratio(seq)
    gz_n = gzip_compress_ratio(n_half)
    gz_c = gzip_compress_ratio(c_half)

    rows.append({
        "role": r["role"],
        "gene": r["gene"],
        "species": r["species"],
        "accession": r["accession"],
        "length": len(seq),
        "entropy_full": ent_full,
        "entropy_Nhalf": ent_n,
        "entropy_Chalf": ent_c,
        "gzip_ratio_full": gz_full,
        "gzip_ratio_Nhalf": gz_n,
        "gzip_ratio_Chalf": gz_c,
        "delta_entropy_NminusC": ent_n - ent_c,
        "delta_gzip_NminusC": gz_n - gz_c
    })

df = pd.DataFrame(rows).sort_values(["role","gene","species"])
df.to_csv(os.path.join(TAB_DIR, "main_metrics.csv"), index=False)
df.head(12)
# entropy_full grouped by role
roles = df["role"].unique().tolist()
data = [df.loc[df["role"]==role, "entropy_full"].dropna().values for role in roles]

plt.figure(figsize=(8,4.5))
plt.boxplot(data, labels=roles, showfliers=False)
plt.ylabel("Shannon entropy (bits)")
plt.title("Global sequence entropy by functional group")
plt.xticks(rotation=20, ha="right")
plt.tight_layout()
out = os.path.join(FIG_DIR, "Fig1_entropy_by_role.png")
plt.savefig(out, dpi=300)
plt.close()
print("Saved:", out)
roles = df["role"].unique().tolist()
data = [df.loc[df["role"]==role, "gzip_ratio_full"].dropna().values for role in roles]

plt.figure(figsize=(8,4.5))
plt.boxplot(data, labels=roles, showfliers=False)
plt.ylabel("gzip compressed size / raw size")
plt.title("Algorithmic compressibility by functional group")
plt.xticks(rotation=20, ha="right")
plt.tight_layout()
out = os.path.join(FIG_DIR, "Fig2_gzip_by_role.png")
plt.savefig(out, dpi=300)
plt.close()
print("Saved:", out)
def plot_sliding_profiles(species="Homo_sapiens", genes=("MLKL","RIPK3","CASP3","ACTB"), w=W, step=STEP):
    plt.figure(figsize=(9,4.8))
    for g in genes:
        sub = df_meta[(df_meta["species"]==species) & (df_meta["gene"]==g)]
        if sub.empty:
            continue
        fpath = sub.iloc[0]["fasta_file"]
        seq = load_seq_from_fasta(fpath)
        x,h = sliding_entropy(seq, w=w, step=step)
        plt.plot(x, h, label=f"{g} ({species})")
    plt.xlabel("Position (aa)")
    plt.ylabel(f"Shannon entropy in window (w={w})")
    plt.title("Sliding-window entropy profiles")
    plt.legend()
    plt.tight_layout()
    out = os.path.join(FIG_DIR, "Fig3_sliding_entropy_profiles.png")
    plt.savefig(out, dpi=300)
    plt.close()
    print("Saved:", out)

plot_sliding_profiles()
sub = df.copy()
plt.figure(figsize=(7.5,5))
plt.scatter(sub["entropy_Nhalf"], sub["entropy_Chalf"])
plt.xlabel("Entropy N-half (bits)")
plt.ylabel("Entropy C-half (bits)")
plt.title("N-terminal vs C-terminal sequence entropy")
# identity line
mn = np.nanmin([sub["entropy_Nhalf"].min(), sub["entropy_Chalf"].min()])
mx = np.nanmax([sub["entropy_Nhalf"].max(), sub["entropy_Chalf"].max()])
plt.plot([mn,mx],[mn,mx], linestyle="--")
plt.tight_layout()
out = os.path.join(FIG_DIR, "Fig4_N_vs_C_entropy.png")
plt.savefig(out, dpi=300)
plt.close()
print("Saved:", out)


Output folder: /content/necroptosis_information
[WARN] Not found: MLKL in Danio_rerio
[WARN] Not found: RIPK1 in Danio_rerio
[WARN] Not found: ACTB in Danio_rerio


  plt.boxplot(data, labels=roles, showfliers=False)


Saved: /content/necroptosis_information/figures/Fig1_entropy_by_role.png


  plt.boxplot(data, labels=roles, showfliers=False)


Saved: /content/necroptosis_information/figures/Fig2_gzip_by_role.png
Saved: /content/necroptosis_information/figures/Fig3_sliding_entropy_profiles.png
Saved: /content/necroptosis_information/figures/Fig4_N_vs_C_entropy.png
