In [None]:
import pandas as pd
from pathlib import Path

# Edge tables keyed by the trailing label of each CSV file name
# ("ranked_interactions_<label>.csv" -> "<label>").
frames: dict[str, pd.DataFrame] = {}

DATA_DIR = Path("data")
FILE_PATTERN = "ranked_interactions_*.csv"

# Columns to read and the compact dtype requested for each one.
# The keys double as the `usecols` selection so both stay in sync.
COLUMN_DTYPES = {
    "gene1": "string",
    "gene2": "string",
    "strength": "float32",
    "sign": "int8",
    "rank": "int32",
    "gene1_is_TF": "boolean",
    "gene2_is_TF": "boolean",
}

for fp in sorted(DATA_DIR.glob(FILE_PATTERN)):
    # Everything after the second underscore of the stem is the label.
    label = fp.stem.split("_", maxsplit=2)[-1]
    df = pd.read_csv(fp, usecols=list(COLUMN_DTYPES), dtype=COLUMN_DTYPES)
    print(f"  {label}: {len(df)} edges")
    # Store gene symbols upper-cased; get_links_from_gene upper-cases its
    # query the same way before comparing.
    for gene_col in ("gene1", "gene2"):
        df[gene_col] = df[gene_col].str.upper()
    frames[label] = df

  0: 24995000 edges
  1: 24995000 edges
  2: 24995000 edges
  3: 24995000 edges
  4: 24995000 edges
  5: 24995000 edges


In [2]:
# Report how many tables were loaded, followed by two reading notes.
# A single print with a newline separator emits the same three lines.
print(
    f"{len(frames)} inter-timepoints avaiable",
    "Keep in mind that we explore networks underlying the transition between the timepoints.",
    "Interactions displayed are ranked by strength.",
    sep="\n",
)

6 inter-timepoints avaiable
Keep in mind that we explore networks underlying the transition between the timepoints.
Interactions displayed are ranked by strength.


In [3]:
# Convenience query over the per-transition edge tables stored in `frames`.
def get_links_from_gene(gene: str, topn: int = 100, timepoints: list[int] = None) -> pd.DataFrame:
    """Collect the strongest rows whose ``gene1`` equals ``gene`` from each table in ``frames``.

    Parameters
    ----------
    gene
        Gene symbol to look up. It is upper-cased before being compared
        with the ``gene1`` column.
    topn
        Maximum number of rows kept *per table* (largest ``strength`` first).
    timepoints
        Integer labels of the tables to search. Any iterable of integers is
        accepted (list, tuple, range, NumPy array, pandas Series, ...).
        ``None`` or an empty iterable means "search every table".

    Returns
    -------
    pd.DataFrame
        The selected rows of all searched tables, with an extra ``timepoint``
        column of the form ``"<label>-<label+1>"``, sorted by descending
        ``strength`` (ties broken by ``timepoint``) and re-indexed from 0.
        When nothing matches, an empty frame with the same column names is
        returned.
    """
    gene = gene.upper()

    # Materialise the filter once. Testing the truth value of an array-like
    # directly is unsafe: multi-element arrays raise, and a one-element array
    # such as np.array([0]) is falsy, which would silently disable the filter.
    wanted = [] if timepoints is None else list(timepoints)

    dfs = []
    for label, df in frames.items():
        if wanted and int(label) not in wanted:
            continue
        df_sub = df[df["gene1"] == gene]
        if df_sub.empty:
            continue
        # nlargest already returns a new frame, so no defensive copy of the
        # whole selection is needed; assign() appends the label column.
        dfs.append(
            df_sub.nlargest(topn, "strength").assign(timepoint=f"{label}-{int(label)+1}")
        )

    if not dfs:
        return pd.DataFrame(columns=["gene1", "gene2", "strength", "sign","rank", "gene1_is_TF", "gene2_is_TF", "timepoint"])

    result = pd.concat(dfs)
    result = result.sort_values(by=["strength", "timepoint"], ascending=[False, True])
    return result.reset_index(drop=True)


In [None]:
# Example query for Klf4: up to 50 rows per table, all tables searched.
# Pass e.g. timepoints=[0, 1] instead of None to restrict the search.
# WARNING: `rank` is per timepoint, so it can be disconnected from the
# strength ordering of the combined table shown below.
links = get_links_from_gene("Klf4", topn=50, timepoints=None)
links.head(10)

Unnamed: 0,gene1,gene2,strength,sign,rank,gene1_is_TF,gene2_is_TF,timepoint
0,KLF4,EGR1,1.838872,1,245,True,True,2-3
1,KLF4,GM42418,1.797313,-1,42,True,False,1-2
2,KLF4,EGR1,1.779017,1,51,True,True,1-2
3,KLF4,FOS,1.771451,1,459,True,True,2-3
4,KLF4,CMSS1,1.763666,-1,60,True,False,1-2
5,KLF4,FOS,1.736025,1,87,True,True,0-1
6,KLF4,GADD45B,1.713643,1,694,True,False,2-3
7,KLF4,FOS,1.699698,1,106,True,True,1-2
8,KLF4,EGR1,1.697398,1,110,True,True,0-1
9,KLF4,JUNB,1.618347,1,1091,True,True,2-3
