In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up on the next cell run without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import logging
import os
import random
import sys
import warnings
from pathlib import Path
from typing import Tuple, Dict

import numpy as np
import pandas as pd
from rich import traceback
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
from tqdm.rich import tqdm

In [None]:
# Put the project's source directory on the import path for this session.
src_path: str = "../../src"
sys.path.append(src_path)

# Install rich's traceback handler; the return value is parked in `_`.
_ = traceback.install()

# Only ERROR and above gets through, both for rpy2's callback logger and for
# the root logger. force=True makes basicConfig replace any handlers that were
# already attached to the root logger.
rpy2_logger.setLevel(logging.ERROR)
logging.basicConfig(force=True, level=logging.ERROR)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Project-local import, kept below the sys.path setup: presumably `components`
# is resolved through the "../../src" entry appended in the previous cell
# (verify), so this cannot be moved into the top import cell as-is.
from components.functional_analysis.orgdb import OrgDB

# OrgDB instance for "Homo sapiens"; not referenced again in the cells visible here.
org_db = OrgDB("Homo sapiens")

## Integrative analysis of Differential Expression, WGCNA and ML results

---


### 1. Load all gene sets


In [None]:
# Root directory of the project's results: every table below is read from it
# and the JSON gene lists are written back into it.
# Set the TCGA_PRAD_SU2C_ROOT environment variable to run the notebook against
# another copy of the data without editing this cell. When the variable is
# unset or empty, the original location is used.
root = Path(
    os.environ.get("TCGA_PRAD_SU2C_ROOT")
    or "/media/ssd/Perez/storage/TCGA_PRAD_SU2C_RNASeq"
)

#### 1.1. Differentially expressed genes for prim/norm and met/prim


In [None]:
# prim vs. norm contrast: read the results table from the "deseq2" directory
# (first CSV column becomes the initial index), discard rows with a missing
# ENTREZID or SYMBOL, then re-index by SYMBOL.
degs_pri_vs_norm = (
    pd.read_csv(
        root
        / "deseq2"
        / (
            "sample_cluster_no_replicates_met_bb+norm+prim__"
            "prim_vs_norm_padj_0_05_up_1_0_deseq_results_unique.csv"
        ),
        index_col=0,
    )
    .dropna(subset=["ENTREZID", "SYMBOL"])
    .set_index("SYMBOL")
)
print(degs_pri_vs_norm.shape)

# met_bb vs. prim contrast: same treatment as the table above.
degs_met_vs_pri = (
    pd.read_csv(
        root
        / "deseq2"
        / (
            "sample_cluster_no_replicates_met_bb+norm+prim__"
            "met_bb_vs_prim_padj_0_05_up_1_0_deseq_results_unique.csv"
        ),
        index_col=0,
    )
    .dropna(subset=["ENTREZID", "SYMBOL"])
    .set_index("SYMBOL")
)
print(degs_met_vs_pri.shape)

#### 1.2. Machine Learning results for prim/norm and met/prim


In [None]:
# prim vs. norm contrast: bootstrap SHAP-values table written under
# ml_classifiers/<contrast>/random_forest/genes_features/bootstrap.
# Unlike the DEG tables, the CSV is read without index_col; rows with a missing
# ENTREZID or SYMBOL are dropped and SYMBOL becomes the index.
ml_pri_vs_nor = (
    pd.read_csv(
        root
        / "ml_classifiers"
        / (
            "sample_cluster_no_replicates_met_bb+norm+prim__"
            "prim_vs_norm_padj_0_05_up_1_0"
        )
        / "random_forest"
        / "genes_features"
        / "bootstrap"
        / "bootstrap_10000_shap_values_0_001.csv"
    )
    .dropna(subset=["ENTREZID", "SYMBOL"])
    .set_index("SYMBOL")
)
print(ml_pri_vs_nor.shape)

# met_bb vs. prim contrast: same file name, read from that contrast's directory.
ml_met_vs_pri = (
    pd.read_csv(
        root
        / "ml_classifiers"
        / (
            "sample_cluster_no_replicates_met_bb+norm+prim__"
            "met_bb_vs_prim_padj_0_05_up_1_0"
        )
        / "random_forest"
        / "genes_features"
        / "bootstrap"
        / "bootstrap_10000_shap_values_0_001.csv"
    )
    .dropna(subset=["ENTREZID", "SYMBOL"])
    .set_index("SYMBOL")
)
print(ml_met_vs_pri.shape)

In [None]:
# GENETYPE frequency table for each ML result, prim/norm first, one after the other.
print(
    ml_pri_vs_nor["GENETYPE"].value_counts(),
    ml_met_vs_pri["GENETYPE"].value_counts(),
    sep="\n",
)

#### 1.3. WGCNA results for prim/norm and met/prim


In [None]:
# prim vs. norm contrast: gene table of module M3, read from
# wgcna/<contrast>/standard/results. Rows with a missing ENTREZID or SYMBOL are
# dropped and SYMBOL becomes the index.
prim_norm_wgcna_m3_genes = (
    pd.read_csv(
        root
        / "wgcna"
        / (
            "sample_cluster_no_replicates_met_bb+norm+prim__"
            "prim_vs_norm_padj_0_05_up_1_0"
        )
        / "standard"
        / "results"
        / "bicor_signed_M3_204_genes.csv"
    )
    .dropna(subset=["ENTREZID", "SYMBOL"])
    .set_index("SYMBOL")
)
print(prim_norm_wgcna_m3_genes.shape)

# met_bb vs. prim contrast: gene table of module M4, same treatment.
met_prim_wgcna_m4_genes = (
    pd.read_csv(
        root
        / "wgcna"
        / (
            "sample_cluster_no_replicates_met_bb+norm+prim__"
            "met_bb_vs_prim_padj_0_05_up_1_0"
        )
        / "standard"
        / "results"
        / "bicor_signed_M4_336_genes.csv"
    )
    .dropna(subset=["ENTREZID", "SYMBOL"])
    .set_index("SYMBOL")
)
print(met_prim_wgcna_m4_genes.shape)

In [None]:
# GENETYPE frequency table for each WGCNA module, prim/norm first, one after the other.
print(
    prim_norm_wgcna_m3_genes["GENETYPE"].value_counts(),
    met_prim_wgcna_m4_genes["GENETYPE"].value_counts(),
    sep="\n",
)

---


### 2. Calculate intersections between comparisons within methods


#### 2.1. Differentially expressed genes for met/prim and prim/norm


In [None]:
# Symbols present in the index of both DEG tables (prim/norm and met/prim).
# sort=False is the default of Index.intersection; it is only spelled out here.
intersecting_degs = degs_pri_vs_norm.index.intersection(
    degs_met_vs_pri.index, sort=False
)
print(intersecting_degs)

#### 2.2. Machine Learning results for met/prim and prim/norm


In [None]:
# Symbols present in the index of both ML tables (prim/norm and met/prim).
# sort=False is the default of Index.intersection; it is only spelled out here.
intersecting_ml = ml_pri_vs_nor.index.intersection(
    ml_met_vs_pri.index, sort=False
)
print(intersecting_ml)

#### 2.3. WGCNA results for met/prim and prim/norm


In [None]:
# Symbols present in both WGCNA module tables (prim/norm M3 and met/prim M4).
# sort=False is the default of Index.intersection; it is only spelled out here.
intersecting_wgcna = prim_norm_wgcna_m3_genes.index.intersection(
    met_prim_wgcna_m4_genes.index, sort=False
)
print(intersecting_wgcna)

---


### 3. Calculate intersections between methods


#### 3.1. Between ML and WGCNA (genes shared by both comparisons)


In [None]:
# Symbols that survive both cross-comparison intersections: the ML one and the
# WGCNA one. sort=False is the default of Index.intersection, spelled out here.
intersecting_ml_wgcna = intersecting_ml.intersection(
    intersecting_wgcna, sort=False
)
print(intersecting_ml_wgcna)

---


### 4. Calculate intersections between comparisons _AND_ methods


#### 4.1. Between ML and WGCNA, within each comparison


In [None]:
# prim/norm contrast: symbols found both in the WGCNA module table and in the
# ML table.
# NOTE(review): the "_m2" suffix (variable and JSON file name) does not match
# the module this is computed from (prim_norm_wgcna_m3_genes, i.e. M3). It is
# left unchanged because later cells and the file on disk use this name;
# confirm before renaming.
intersecting_pri_norm_ml_wgcna_m2 = prim_norm_wgcna_m3_genes.index.intersection(
    ml_pri_vs_nor.index, sort=False
)
print(intersecting_pri_norm_ml_wgcna_m2)

# Persist the symbols as an alphabetically sorted JSON list. indent=1 is what
# the previous indent=True amounted to (one space per nesting level).
with (root / "intersecting_pri_norm_ml_wgcna_m2.json").open("w") as fp:
    json.dump(sorted(intersecting_pri_norm_ml_wgcna_m2), fp, indent=1)

In [None]:
# met/prim contrast: symbols found both in the WGCNA M4 module table and in the
# ML table.
intersecting_met_pri_ml_wgcna_m4 = met_prim_wgcna_m4_genes.index.intersection(
    ml_met_vs_pri.index, sort=False
)
print(intersecting_met_pri_ml_wgcna_m4)

# Persist the symbols as an alphabetically sorted JSON list. indent=1 is what
# the previous indent=True amounted to (one space per nesting level).
with (root / "intersecting_met_pri_ml_wgcna_m4.json").open("w") as fp:
    json.dump(sorted(intersecting_met_pri_ml_wgcna_m4), fp, indent=1)

#### 4.2. Between comparisons


In [None]:
# Split the two per-contrast "ML & WGCNA" gene sets into three disjoint groups:
# genes in both contrasts, genes only in prim/norm, genes only in met/prim.
intersecting_all_ml_wgcna = intersecting_pri_norm_ml_wgcna_m2.intersection(
    intersecting_met_pri_ml_wgcna_m4
)
prim_norm_only_ml_wgcna = intersecting_pri_norm_ml_wgcna_m2.difference(
    intersecting_all_ml_wgcna
)
met_prim_only_ml_wgcna = intersecting_met_pri_ml_wgcna_m4.difference(
    intersecting_all_ml_wgcna
)

# Every list is sorted before it is serialised, as is already done for the two
# per-contrast JSON files. Index.intersection does not sort by default, so
# without this the "both contrasts" list would follow the row order of the
# input table and the file would not be stable across re-runs.
# The Index objects themselves are left untouched for the cells that follow.
# indent=1 is equivalent to the former indent=True (one space per level).
with root.joinpath("intersecting_ml_wgcna_genes.json").open("w") as fp:
    json.dump(
        {
            "DEGs (ML & WGCNA) in both contrasts": sorted(intersecting_all_ml_wgcna),
            "DEGs (ML & WGCNA) only in prim/norm": sorted(prim_norm_only_ml_wgcna),
            "DEGs (ML & WGCNA) only in met/prim": sorted(met_prim_only_ml_wgcna),
        },
        fp,
        indent=1,
    )

In [None]:
# Number of genes in each of the three groups, printed as JSON. len() is taken
# on the Index directly instead of on a list copy; indent=1 is equivalent to
# the former indent=True.
print(
    json.dumps(
        {
            "DEGs (ML & WGCNA) in both contrasts": len(intersecting_all_ml_wgcna),
            "DEGs (ML & WGCNA) only in prim/norm": len(prim_norm_only_ml_wgcna),
            "DEGs (ML & WGCNA) only in met/prim": len(met_prim_only_ml_wgcna),
        },
        indent=1,
    )
)

---
