# Comparison between GoiStrat and naïve solution

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

Imports

In [2]:
import sys
import pandas as pd
import seaborn as sns
import numpy as np

from IPython.display import display
from typing import Iterable, Dict
from itertools import product
from matplotlib import pyplot as plt
from pathlib import Path
from collections import defaultdict

Setup

In [3]:
# Put the local source tree at the front of the import search path.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
src_path: str = "/home/uziel/Development/biopipes/src"
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

In [4]:
# Root folder holding one "<dataset>_<marker>" sub-folder per analysed dataset.
ROOT: Path = Path("/mnt/d/phd_data/")
# MSigDB collections to iterate over: hallmark ("H") plus C1..C8.
MSIGDB_CATS: Iterable[str] = ("H", *[f"C{i}" for i in range(1, 9)])
# Dataset -> gene of interest (GOI) used to split its samples; each comment is the
# literature reference given for that choice.
DATASETS_MARKERS: Dict[str, str] = {
    "TCGA-BRCA": "FOXA1",  # https://www.sciencedirect.com/science/article/abs/pii/S0960977616000242
    "TCGA-LUAD": "NKX2-1",  # https://www.nature.com/articles/nature09881
    "TCGA-THCA": "BRAF",  # https://www.frontiersin.org/journals/endocrinology/articles/10.3389/fendo.2024.1372553/full
    "TCGA-UCEC": "MCM10",  # https://onlinelibrary.wiley.com/doi/full/10.1111/jcmm.17772
    "TCGA-LUSC": "SOX2",  # https://www.cell.com/cancer-cell/fulltext/S1535-6108(16)30436-6
    "TCGA-KIRC": "CA9",  # https://www.sciencedirect.com/science/article/abs/pii/S0959804910006982
    "TCGA-HNSC": "TP63",  # https://aacrjournals.org/mcr/article/17/6/1279/270274/Loss-of-TP63-Promotes-the-Metastasis-of-Head-and
    "TCGA-LGG": "IDH1",  # https://www.neurology.org/doi/abs/10.1212/wnl.0b013e3181f96282
    "PCTA_WCDT": "FOLH1",  # https://www.nature.com/articles/nrurol.2016.26
}
# Percentiles used by the top/bottom splitting strategy ("<marker>_level_<percentile>").
PERCENTILES: Iterable[int] = (10, 15, 20, 25, 30)
# Weight applied to each strategy's score in the weighted-score cells below.
# A top/bottom split at percentile p gets 2 * p / 100; this matches the low + high
# share of samples in the annotation counts displayed in section 1.
# "GOI_level" is the GSVA-based split and is weighted 0.5.
# The percentile entries are derived from PERCENTILES so the two cannot drift apart.
PERC_SAMPLES: Dict[str, float] = {
    "GOI_level": 50 / 100,
    **{
        f"GOI_level_{percentile}": 2 * percentile / 100 for percentile in PERCENTILES
    },
}
# Sample-type token embedded in the result file names read further down.
SAMPLE_TYPE: str = "prim"
# Name of the seaborn palette used for plotting.
PALETTE_STR: str = "flare"

# Global plotting look: seaborn "whitegrid" theme with the shared palette.
sns.set_theme(style="whitegrid", palette=PALETTE_STR)
# Font overrides are applied after the theme call, keeping the original ordering.
plt.rcParams.update(
    {
        "font.family": "serif",
        "font.serif": "Ubuntu Mono",
        "font.monospace": "Ubuntu Mono",
    }
)

In [5]:
# Echo the dataset -> marker gene mapping so it is recorded in the notebook output.
print(DATASETS_MARKERS)

{'TCGA-BRCA': 'FOXA1', 'TCGA-LUAD': 'NKX2-1', 'TCGA-THCA': 'BRAF', 'TCGA-UCEC': 'MCM10', 'TCGA-LUSC': 'SOX2', 'TCGA-KIRC': 'CA9', 'TCGA-HNSC': 'TP63', 'TCGA-LGG': 'IDH1', 'PCTA_WCDT': 'FOLH1'}


Global variables

In [6]:
# Output folder for the cross-dataset tables written by the cells below.
MULTI_DATASET_PATH: Path = ROOT / "MULTI_DATASET"
MULTI_DATASET_PATH.mkdir(exist_ok=True, parents=True)

## 1. Explore dataset characteristics

### 1.1. Annotation data from GSVA splits

In [7]:
# For every dataset, count the samples in each "<marker>_level" group of the GSVA
# annotation file and record the total number of annotated samples.
group_counts_gsva = {}
for dataset, marker in DATASETS_MARKERS.items():
    data_root = ROOT / f"{dataset}_{marker}"
    annot_df = pd.read_csv(
        data_root / "data" / f"samples_annotation_{marker}_gsva.csv",
        index_col=0,
    )
    level_counts = annot_df[f"{marker}_level"].value_counts().to_dict()
    level_counts["total"] = len(annot_df)
    group_counts_gsva[f"{dataset} ({marker})"] = level_counts

# One row per dataset, one column per group (plus "total").
group_counts_gsva_df = pd.DataFrame(group_counts_gsva).T
group_counts_gsva_df.to_csv(MULTI_DATASET_PATH / "group_counts_gsva_df.csv")
display(group_counts_gsva_df)

Unnamed: 0,mid,high,low,total
TCGA-BRCA (FOXA1),553,330,223,1106
TCGA-LUAD (NKX2-1),264,52,212,528
TCGA-THCA (BRAF),252,194,59,505
TCGA-UCEC (MCM10),274,186,89,549
TCGA-LUSC (SOX2),250,54,197,501
TCGA-KIRC (CA9),268,53,216,537
TCGA-HNSC (TP63),260,61,199,520
TCGA-LGG (IDH1),258,207,51,516
PCTA_WCDT (FOLH1),331,90,241,662


### 1.2. Annotation data from top/bottom splits

In [8]:
# For every dataset and percentile, count the samples in each
# "<marker>_level_<percentile>" group of the percentile annotation file and record
# the total number of annotated samples.
group_counts_perc = dict()
for dataset, marker in DATASETS_MARKERS.items():
    data_root = ROOT.joinpath(f"{dataset}_{marker}")
    # The annotation file does not depend on the percentile: read it once per
    # dataset instead of once per (dataset, percentile) pair.
    annot_df = pd.read_csv(
        data_root.joinpath("data").joinpath(f"samples_annotation_{marker}_perc.csv"),
        index_col=0,
    )
    dataset_label = f"{dataset} ({marker})"
    n_samples = len(annot_df)

    for percentile in PERCENTILES:
        contrast_factor = f"{marker}_level_{percentile}"
        group_counts_perc[(dataset_label, contrast_factor)] = {
            **annot_df[contrast_factor].value_counts().to_dict(),
            "total": n_samples,
        }

# One row per (dataset, contrast factor), one column per group (plus "total").
group_counts_perc_df = pd.DataFrame(group_counts_perc).T
group_counts_perc_df.to_csv(MULTI_DATASET_PATH.joinpath("group_counts_perc_df.csv"))
display(group_counts_perc_df)

Unnamed: 0,Unnamed: 1,mid,low,high,total
TCGA-BRCA (FOXA1),FOXA1_level_10,884,111,111,1106
TCGA-BRCA (FOXA1),FOXA1_level_15,774,166,166,1106
TCGA-BRCA (FOXA1),FOXA1_level_20,664,221,221,1106
TCGA-BRCA (FOXA1),FOXA1_level_25,552,277,277,1106
TCGA-BRCA (FOXA1),FOXA1_level_30,442,332,332,1106
TCGA-LUAD (NKX2-1),NKX2-1_level_10,422,53,53,528
TCGA-LUAD (NKX2-1),NKX2-1_level_15,368,80,80,528
TCGA-LUAD (NKX2-1),NKX2-1_level_20,316,106,106,528
TCGA-LUAD (NKX2-1),NKX2-1_level_25,264,132,132,528
TCGA-LUAD (NKX2-1),NKX2-1_level_30,210,159,159,528


## 2. Gather and compare differential expression results

Differential expression results between low and high groups for each dataset and splitting strategy.

In [9]:
def _root_mean_square(values):
    """Return the root mean square of ``values`` (here: log2 fold changes)."""
    return np.sqrt(np.mean(np.power(values, 2)))


# Collect, per dataset and splitting strategy, the log2 fold changes from the
# DESeq2 results file and a single summary score (their root mean square).
all_degs = dict()
all_degs_scores = dict()
for dataset, marker in DATASETS_MARKERS.items():
    data_root = ROOT / f"{dataset}_{marker}"
    dataset_label = f"{dataset} ({marker})"
    # Percentile-based strategies first, the plain "<marker>_level" split last.
    contrast_factors = [f"{marker}_level_{percentile}" for percentile in PERCENTILES]
    contrast_factors.append(f"{marker}_level")

    for contrast_factor in contrast_factors:
        # The double underscore before the "high_vs_low" part is reproduced as-is.
        results_file = (
            f"sample_type_{SAMPLE_TYPE}_{contrast_factor}_"
            f"{SAMPLE_TYPE}_high+{SAMPLE_TYPE}_low_"
            f"_{SAMPLE_TYPE}_high_vs_{SAMPLE_TYPE}_low_"
            "padj_0_05_all_1_0_deseq_results_unique.csv"
        )
        deseq_results = pd.read_csv(data_root / "deseq2" / results_file, index_col=0)
        log2_fold_changes = deseq_results["log2FoldChange"]

        # Replace the gene name with "GOI" so strategies are comparable across datasets.
        contrast_factor_str = contrast_factor.replace(marker, "GOI")
        result_key = (dataset_label, contrast_factor_str)
        all_degs[result_key] = log2_fold_changes
        all_degs_scores[result_key] = _root_mean_square(log2_fold_changes)

all_degs_df = pd.DataFrame(all_degs)

In [10]:
# Pivot the {(dataset, strategy): score} mapping into a dataset x strategy table.
scores_by_strategy = pd.Series(all_degs_scores).unstack(level=0)
all_degs_scores_df = scores_by_strategy.T
all_degs_scores_df.to_csv(MULTI_DATASET_PATH / "all_degs_scores_df.csv")
display(all_degs_scores_df)

Unnamed: 0,GOI_level,GOI_level_10,GOI_level_15,GOI_level_20,GOI_level_25,GOI_level_30
PCTA_WCDT (FOLH1),1.523208,1.724276,1.597775,1.571454,1.508589,1.479444
TCGA-BRCA (FOXA1),2.432425,2.733373,2.650776,2.514013,2.306919,2.197974
TCGA-HNSC (TP63),1.483313,1.598803,1.541048,1.481256,1.474818,1.442893
TCGA-KIRC (CA9),1.896113,2.267806,2.327118,2.354664,2.323584,2.206625
TCGA-LGG (IDH1),1.781794,1.878264,1.786128,1.681172,1.612114,1.537598
TCGA-LUAD (NKX2-1),1.947659,2.276284,2.17866,2.110323,2.083919,2.007937
TCGA-LUSC (SOX2),1.795002,1.999208,1.926709,1.873019,1.822779,1.799866
TCGA-THCA (BRAF),1.492055,1.825604,1.674442,1.642668,1.621172,1.623575
TCGA-UCEC (MCM10),1.880049,2.070832,2.019452,1.95781,1.916015,1.817026


In [11]:
# Scale every strategy column by its PERC_SAMPLES weight.
# Looking the weights up by column name keeps the KeyError for an unknown strategy.
strategy_weights = pd.Series(
    [PERC_SAMPLES[strategy] for strategy in all_degs_scores_df.columns],
    index=all_degs_scores_df.columns,
)
all_degs_scores_df_weighted = all_degs_scores_df.mul(strategy_weights, axis="columns")
all_degs_scores_df_weighted.to_csv(MULTI_DATASET_PATH / "all_degs_scores_df_weighted.csv")
display(all_degs_scores_df_weighted)

Unnamed: 0,GOI_level,GOI_level_10,GOI_level_15,GOI_level_20,GOI_level_25,GOI_level_30
PCTA_WCDT (FOLH1),0.761604,0.344855,0.479333,0.628582,0.754295,0.887667
TCGA-BRCA (FOXA1),1.216213,0.546675,0.795233,1.005605,1.153459,1.318785
TCGA-HNSC (TP63),0.741656,0.319761,0.462314,0.592502,0.737409,0.865736
TCGA-KIRC (CA9),0.948056,0.453561,0.698136,0.941866,1.161792,1.323975
TCGA-LGG (IDH1),0.890897,0.375653,0.535839,0.672469,0.806057,0.922559
TCGA-LUAD (NKX2-1),0.973829,0.455257,0.653598,0.844129,1.041959,1.204762
TCGA-LUSC (SOX2),0.897501,0.399842,0.578013,0.749208,0.911389,1.07992
TCGA-THCA (BRAF),0.746028,0.365121,0.502333,0.657067,0.810586,0.974145
TCGA-UCEC (MCM10),0.940025,0.414166,0.605836,0.783124,0.958008,1.090216


In [12]:
# Rank the strategies within each dataset: rank 1 is the highest weighted score,
# and tied scores share the largest rank of their group.
strategy_ranks = all_degs_scores_df_weighted.rank(
    axis="columns", method="max", ascending=False
)
all_degs_scores_df_weighted_ranked = strategy_ranks.astype(int)
all_degs_scores_df_weighted_ranked.to_csv(
    MULTI_DATASET_PATH / "all_degs_scores_df_weighted_ranked.csv"
)
display(all_degs_scores_df_weighted_ranked)

Unnamed: 0,GOI_level,GOI_level_10,GOI_level_15,GOI_level_20,GOI_level_25,GOI_level_30
PCTA_WCDT (FOLH1),2,6,5,4,3,1
TCGA-BRCA (FOXA1),2,6,5,4,3,1
TCGA-HNSC (TP63),2,6,5,4,3,1
TCGA-KIRC (CA9),3,6,5,4,2,1
TCGA-LGG (IDH1),2,6,5,4,3,1
TCGA-LUAD (NKX2-1),3,6,5,4,2,1
TCGA-LUSC (SOX2),3,6,5,4,2,1
TCGA-THCA (BRAF),3,6,5,4,2,1
TCGA-UCEC (MCM10),3,6,5,4,2,1


In [13]:
# Median rank of each strategy across datasets, best (lowest) rank first.
median_rank_per_strategy = all_degs_scores_df_weighted_ranked.median(axis="index")
all_degs_scores_df_weighted_ranked_summary = median_rank_per_strategy.sort_values()
all_degs_scores_df_weighted_ranked_summary.to_csv(
    MULTI_DATASET_PATH / "all_degs_scores_df_weighted_ranked_summary.csv"
)
display(all_degs_scores_df_weighted_ranked_summary)

GOI_level_30    1.0
GOI_level_25    2.0
GOI_level       3.0
GOI_level_20    4.0
GOI_level_15    5.0
GOI_level_10    6.0
dtype: float64

## 3. Gather and compare differential enrichment results

Differential enrichment results between low and high groups for each dataset and splitting strategy.

In [14]:
def _enrichment_score(log2_fold_changes, n_meta_rows):
    """Score one (dataset, strategy, MSigDB category) differential GSVA result.

    The score is the number of non-missing log2 fold changes divided by
    ``n_meta_rows`` (the row count of the category's meta table), multiplied by
    the root mean square of the log2 fold changes.
    """
    return (log2_fold_changes.count() / n_meta_rows) * np.sqrt(
        np.mean(np.power(log2_fold_changes, 2))
    )


# Collect, per dataset, splitting strategy and MSigDB category, the log2 fold
# changes from the differential GSVA top table and their summary score.
all_degss = dict()
all_degss_scores = dict()
for dataset, marker in DATASETS_MARKERS.items():
    data_root = ROOT / f"{dataset}_{marker}"
    dataset_label = f"{dataset} ({marker})"
    # Percentile-based strategies first, the plain "<marker>_level" split last.
    contrast_factors = [f"{marker}_level_{percentile}" for percentile in PERCENTILES]
    contrast_factors.append(f"{marker}_level")

    # Meta tables live under the plain dataset folder (no marker suffix); only
    # their length is used, as the denominator of the score.
    gsva_meta_dir = ROOT / dataset / "data" / "gsva"
    msigdb_cats_meta_dfs = {}
    for msigdb_cat in MSIGDB_CATS:
        msigdb_cats_meta_dfs[msigdb_cat] = pd.read_csv(
            gsva_meta_dir / f"{msigdb_cat}_meta.csv", index_col=0
        )

    for contrast_factor, msigdb_cat in product(contrast_factors, MSIGDB_CATS):
        # The double underscore before the "high_vs_low" part is reproduced as-is.
        top_table_file = (
            f"sample_type_{SAMPLE_TYPE}_{contrast_factor}_"
            f"{SAMPLE_TYPE}_high+{SAMPLE_TYPE}_low_"
            f"_{SAMPLE_TYPE}_high_vs_{SAMPLE_TYPE}_low_"
            "top_table_padj_0_05_all_0_0.csv"
        )
        gsva_results = pd.read_csv(
            data_root / "diff_gsva" / msigdb_cat / top_table_file, index_col=0
        )
        log2_fold_changes = gsva_results["log2FoldChange"]

        # Replace the gene name with "GOI" so strategies are comparable across datasets.
        contrast_factor_str = contrast_factor.replace(marker, "GOI")
        result_key = (dataset_label, contrast_factor_str, msigdb_cat)
        all_degss[result_key] = log2_fold_changes
        all_degss_scores[result_key] = _enrichment_score(
            log2_fold_changes, len(msigdb_cats_meta_dfs[msigdb_cat])
        )

all_degss_df = pd.DataFrame(all_degss)

In [15]:
# Rows: MSigDB category; columns: (dataset, strategy).
per_category_scores = pd.Series(all_degss_scores).unstack(level=[0, 1])
# Collapse the categories with the median, then pivot to a dataset x strategy table.
median_scores = per_category_scores.median()
all_degss_scores_df = median_scores.unstack()
all_degss_scores_df.to_csv(MULTI_DATASET_PATH / "all_degss_scores_df.csv")
display(all_degss_scores_df)

Unnamed: 0,GOI_level,GOI_level_10,GOI_level_15,GOI_level_20,GOI_level_25,GOI_level_30
PCTA_WCDT (FOLH1),0.142125,0.13321,0.137942,0.131065,0.12251,0.108408
TCGA-BRCA (FOXA1),0.154515,0.17045,0.170926,0.162655,0.145538,0.134819
TCGA-HNSC (TP63),0.099136,0.097009,0.091157,0.092526,0.084081,0.081235
TCGA-KIRC (CA9),0.098297,0.110873,0.094056,0.086699,0.082785,0.079951
TCGA-LGG (IDH1),0.125393,0.140012,0.124962,0.111795,0.101627,0.09225
TCGA-LUAD (NKX2-1),0.105013,0.101345,0.089102,0.08888,0.089804,0.086732
TCGA-LUSC (SOX2),0.147486,0.163678,0.149648,0.130866,0.122489,0.118642
TCGA-THCA (BRAF),0.165678,0.173495,0.171389,0.168645,0.159379,0.153225
TCGA-UCEC (MCM10),0.17147,0.194194,0.188377,0.176342,0.164723,0.153989


In [16]:
# Scale every strategy column by its PERC_SAMPLES weight.
# Looking the weights up by column name keeps the KeyError for an unknown strategy.
enrichment_weights = pd.Series(
    [PERC_SAMPLES[strategy] for strategy in all_degss_scores_df.columns],
    index=all_degss_scores_df.columns,
)
all_degss_scores_df_weighted = all_degss_scores_df.mul(
    enrichment_weights, axis="columns"
)
all_degss_scores_df_weighted.to_csv(
    MULTI_DATASET_PATH / "all_degss_scores_df_weighted.csv"
)
display(all_degss_scores_df_weighted)

Unnamed: 0,GOI_level,GOI_level_10,GOI_level_15,GOI_level_20,GOI_level_25,GOI_level_30
PCTA_WCDT (FOLH1),0.071063,0.026642,0.041383,0.052426,0.061255,0.065045
TCGA-BRCA (FOXA1),0.077257,0.03409,0.051278,0.065062,0.072769,0.080892
TCGA-HNSC (TP63),0.049568,0.019402,0.027347,0.03701,0.042041,0.048741
TCGA-KIRC (CA9),0.049149,0.022175,0.028217,0.034679,0.041393,0.047971
TCGA-LGG (IDH1),0.062696,0.028002,0.037489,0.044718,0.050814,0.05535
TCGA-LUAD (NKX2-1),0.052506,0.020269,0.026731,0.035552,0.044902,0.052039
TCGA-LUSC (SOX2),0.073743,0.032736,0.044894,0.052346,0.061244,0.071185
TCGA-THCA (BRAF),0.082839,0.034699,0.051417,0.067458,0.079689,0.091935
TCGA-UCEC (MCM10),0.085735,0.038839,0.056513,0.070537,0.082361,0.092393


In [17]:
# Rank the strategies within each dataset: rank 1 is the highest weighted score,
# and tied scores share the largest rank of their group.
enrichment_ranks = all_degss_scores_df_weighted.rank(
    axis="columns", method="max", ascending=False
)
all_degss_scores_df_weighted_ranked = enrichment_ranks.astype(int)
all_degss_scores_df_weighted_ranked.to_csv(
    MULTI_DATASET_PATH / "all_degss_scores_df_weighted_ranked.csv"
)
display(all_degss_scores_df_weighted_ranked)

Unnamed: 0,GOI_level,GOI_level_10,GOI_level_15,GOI_level_20,GOI_level_25,GOI_level_30
PCTA_WCDT (FOLH1),1,6,5,4,3,2
TCGA-BRCA (FOXA1),2,6,5,4,3,1
TCGA-HNSC (TP63),1,6,5,4,3,2
TCGA-KIRC (CA9),1,6,5,4,3,2
TCGA-LGG (IDH1),1,6,5,4,3,2
TCGA-LUAD (NKX2-1),1,6,5,4,3,2
TCGA-LUSC (SOX2),1,6,5,4,3,2
TCGA-THCA (BRAF),2,6,5,4,3,1
TCGA-UCEC (MCM10),2,6,5,4,3,1


In [18]:
# Median rank of each strategy across datasets, best (lowest) rank first.
median_enrichment_rank = all_degss_scores_df_weighted_ranked.median(axis="index")
all_degss_scores_df_weighted_ranked_summary = median_enrichment_rank.sort_values()
all_degss_scores_df_weighted_ranked_summary.to_csv(
    MULTI_DATASET_PATH / "all_degss_scores_df_weighted_ranked_summary.csv"
)
display(all_degss_scores_df_weighted_ranked_summary)

GOI_level       1.0
GOI_level_30    2.0
GOI_level_25    3.0
GOI_level_20    4.0
GOI_level_15    5.0
GOI_level_10    6.0
dtype: float64

## 4. Visualize results