In [1]:
import pandas as pd
from pathlib import Path

# Edge tables for each inter-timepoint transition, keyed by the suffix of the
# source file name (the text after the second underscore of the stem).
frames: dict[str, pd.DataFrame] = {}

DATA_DIR = Path("data2")
FILE_PATTERN = "ranked_interactions_*.csv"

# Columns to load, each mapped to the compact dtype it is parsed as.
EDGE_DTYPES = {
    "gene1": "string",
    "gene2": "string",
    "strength": "float32",
    "sign": "int8",
    "rank": "int32",
    "gene1_is_TF": "boolean",
    "gene2_is_TF": "boolean",
}

for fp in sorted(DATA_DIR.glob(FILE_PATTERN)):
    label = fp.stem.split("_", 2)[-1]
    df = pd.read_csv(fp, usecols=list(EDGE_DTYPES), dtype=EDGE_DTYPES)
    print(f"  {label}: {len(df)} edges")
    # Upper-case both gene columns so later lookups can ignore the case of
    # the queried symbol.
    for gene_col in ("gene1", "gene2"):
        df[gene_col] = df[gene_col].str.upper()
    frames[label] = df

  0: 24995000 edges
  1: 24995000 edges
  2: 24995000 edges
  3: 24995000 edges
  4: 24995000 edges
  5: 24995000 edges


In [2]:
# Summarise how many transition tables were loaded and how to read them.
print(f'{len(frames)} inter-timepoints available')
print("Keep in mind that we explore networks underlying the transition between the timepoints.")
print("Interactions displayed are ranked by strength.")

6 inter-timepoints avaiable
Keep in mind that we explore networks underlying the transition between the timepoints.
Interactions displayed are ranked by strength.


In [3]:
# Helper to easily query the loaded per-timepoint edge tables by source gene.
def get_links_from_gene(gene: str, topn: int = 100, timepoints: "list[int] | None" = None, rankmax: "int | None" = None) -> pd.DataFrame:
    """Collect the outgoing edges of `gene` from the per-timepoint tables in `frames`.

    Parameters
    ----------
    gene : str
        Gene symbol matched against the ``gene1`` column. It is upper-cased
        before the comparison, so the lookup is case-insensitive as long as
        ``gene1`` holds upper-case symbols.
    topn : int, default 100
        Number of strongest edges kept *per timepoint*. Ignored when
        `rankmax` is given.
    timepoints : list[int] or None, default None
        Integer labels of the tables to query. ``None`` (or an empty list)
        queries every table in `frames`.
    rankmax : int or None, default None
        When given, keep every edge whose ``rank`` is ``<= rankmax`` instead
        of the `topn` strongest ones.

    Returns
    -------
    pd.DataFrame
        The selected edges of all queried timepoints, with an extra
        ``timepoint`` column formatted ``"<label>-<label+1>"``, sorted by
        decreasing ``strength`` (ties broken by ``timepoint``) and re-indexed
        from 0. An empty frame with the same column names is returned when
        nothing matches.
    """
    gene = gene.upper()
    selected = []
    for label, df in frames.items():
        if timepoints and int(label) not in timepoints:
            continue
        matches = df[df["gene1"] == gene]
        if rankmax is not None:
            matches = matches[matches["rank"] <= rankmax]
        else:
            matches = matches.nlargest(topn, "strength")
        # Check emptiness only after filtering, so that a timepoint emptied by
        # the rank filter is skipped instead of being passed to concat.
        if matches.empty:
            continue
        # assign() builds a new frame, so the cached table is never mutated
        # and only the rows that are kept get copied.
        selected.append(matches.assign(timepoint=f"{label}-{int(label) + 1}"))

    if not selected:
        return pd.DataFrame(columns=["gene1", "gene2", "strength", "sign", "rank", "gene1_is_TF", "gene2_is_TF", "timepoint"])

    result = pd.concat(selected)
    result = result.sort_values(by=["strength", "timepoint"], ascending=[False, True])
    return result.reset_index(drop=True)


In [None]:
# First ten rows of the table for transition 1-2; these are the top
# interactions overall only if the file is stored in rank order (TODO confirm).
frames["1"].iloc[:10]

Unnamed: 0,gene1,gene2,strength,sign,rank,gene1_is_TF,gene2_is_TF
0,EGR1,POLR2A,2.175567,1,0,True,False
1,EGR1,GM42418,2.132893,-1,1,True,False
2,FOS,GM42418,1.972763,-1,2,True,False
3,ATF3,CAMK1D,1.957211,-1,3,True,False
4,FOS,POLR2A,1.95293,1,4,True,False
5,FOS,JUNB,1.94665,1,5,True,True
6,ID3,CMSS1,1.946378,-1,6,True,False
7,JUNB,GM42418,1.940891,-1,7,True,False
8,JUNB,ATF3,1.921662,1,8,True,True
9,FOS,CMSS1,1.916956,-1,9,True,False


In [None]:
# Example query: the 500 strongest Klf4 edges of every transition.
# Pass e.g. timepoints=[0, 1] to restrict the query to some transitions.
links = get_links_from_gene("Klf4", topn=500)
# WARNING: `rank` is computed per timepoint, so after merging transitions it
# need not follow the order of the strength-sorted rows.

In [354]:
# Export the current `links` table as one CSV per transition, naming each file
# after the age window that the transition covers.
# NOTE(review): the file name hard-codes "top50", but the rows written are
# whatever `links` holds when this cell runs - confirm it was built with
# topn=50 (the query cell just above uses topn=500).
correspondance = {
    "0-1": "0-3wk",
    "1-2": "3-5wk",
    "2-3": "5-7wk",
    "3-4": "7-9wk",
    "4-5": "9-12wk",
    "5-6": "12-16wk",
}
out_dir = Path("output")
out_dir.mkdir(parents=True, exist_ok=True)
for tp, label in correspondance.items():
    # Keep this transition's rows only, without the TF flags and the
    # timepoint column (the latter is already encoded in the file name).
    links_tp = links.loc[links["timepoint"] == tp].drop(
        columns=["gene1_is_TF", "gene2_is_TF", "timepoint"]
    )
    out_fp = out_dir / f"top50interactions_{label}.csv"
    links_tp.to_csv(out_fp, index=False)
    print(f"Stored {len(links_tp)} interactions for timepoint {tp} in {out_fp}")

Stored 50 interactions for timepoint 0-1 in output/top50interactions_0-3wk.csv
Stored 50 interactions for timepoint 1-2 in output/top50interactions_3-5wk.csv
Stored 50 interactions for timepoint 2-3 in output/top50interactions_5-7wk.csv
Stored 50 interactions for timepoint 3-4 in output/top50interactions_7-9wk.csv
Stored 50 interactions for timepoint 4-5 in output/top50interactions_9-12wk.csv
Stored 50 interactions for timepoint 5-6 in output/top50interactions_12-16wk.csv


In [348]:
# Rebuild `links` for Tgfb1, keeping the 50 strongest edges of every transition.
links = get_links_from_gene("tgfb1", topn=50)

In [349]:
# Evaluate how stable the selected interactions are across time: treat the
# links of each transition as a set of (gene1, gene2) pairs and measure which
# share of them is common to each pair of consecutive transitions.
# Uses `links` (one row per edge, with a "timepoint" column) and the keys of
# `correspondance`, both defined in earlier cells.

def _edge_set(links_df: pd.DataFrame, timepoint: str) -> set:
    """Return the set of (gene1, gene2) pairs that `links_df` holds for `timepoint`."""
    subset = links_df.loc[links_df["timepoint"] == timepoint, ["gene1", "gene2"]]
    # Zip the two columns instead of using a row-wise apply: on an empty
    # subset, apply(..., axis=1) returns an empty DataFrame, and set() of a
    # DataFrame collects its column names, which would be counted as links.
    return set(zip(subset["gene1"], subset["gene2"]))


# Order transitions by their numeric start, so a key such as "10-11" would
# come after "9-10" rather than between "1-2" and "2-3".
timepoint_keys = sorted(correspondance, key=lambda key: int(key.split("-")[0]))
sensitivity_results = []
for tp1, tp2 in zip(timepoint_keys, timepoint_keys[1:]):
    links_tp1 = _edge_set(links, tp1)
    links_tp2 = _edge_set(links, tp2)
    common_links = links_tp1 & links_tp2
    # Normalise by the larger set; report 0 when both transitions are empty.
    largest = max(len(links_tp1), len(links_tp2))
    sensitivity = len(common_links) / largest if largest > 0 else 0
    sensitivity_results.append({
        "timepoint_pair": f"{tp1} -> {tp2}",
        "links_tp1": len(links_tp1),
        "links_tp2": len(links_tp2),
        "common_links": len(common_links),
        "sensitivity": sensitivity
    })

sensitivity_df = pd.DataFrame(sensitivity_results)
print(sensitivity_df)

  timepoint_pair  links_tp1  links_tp2  common_links  sensitivity
0     0-1 -> 1-2         50         50            11         0.22
1     1-2 -> 2-3         50         50            21         0.42
2     2-3 -> 3-4         50         50             4         0.08
3     3-4 -> 4-5         50         50             2         0.04
4     4-5 -> 5-6         50         50             3         0.06
