# Python Sandbox


## Setup

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to the project sources are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Imports

In [14]:
import sys
from pathlib import Path
from typing import Iterable

import pandas as pd
import rpy2.robjects as ro
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from itertools import product

In [3]:
# Make the project's `src` directory importable from this notebook.
# NOTE: the path is relative, so it is resolved against the kernel's current
# working directory when modules are imported.
src_path: str = "../src"
# Only append once: re-running this cell must not pile up duplicate entries
# in `sys.path`.
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
from components.functional_analysis.orgdb import OrgDB
from r_wrappers.deseq2 import vst_transform
from r_wrappers.utils import pd_df_to_rpy2_df, rpy2_df_to_pd_df

### Global Variables

In [5]:
# Root directory of the TCGA_PRAD_SU2C_PCF_GSE221601 data.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this mount point exists.
TCGA_PRAD_SU2C_PCF_GSE221601_ROOT: Path = Path(
    "/media/ssd/Perez/storage/TCGA_PRAD_SU2C_PCF_GSE221601"
)
# Gene symbols of interest.
# NOTE(review): "COX1" looks ambiguous (it is used as an alias for both PTGS1
# and MT-CO1) — confirm which gene is intended.
GOIS: Iterable[str] = ("TPX2", "EZH2", "TROAP", "COX1", "UHRF1")
# OrgDB instance constructed for "Homo sapiens"; presumably the organism
# annotation database used for gene annotation — TODO confirm against OrgDB.
ORG_DB: OrgDB = OrgDB("Homo sapiens")

## Sandbox

### For all integrative analysis results, find appearances of genes of interest.

### Copy original tar counts files to analysis directory

### Annotate LRT deseq files

### Batch Correction PCA figures

---

In [6]:
ROOT: Path = Path("/media/ssd/Perez/storage/PCTA_WCDT_GSE221601_FILTERED/ml_classifiers")

# Fragment searched for in a run's contrast directory name -> contrast key
# stored in `test_scores`. Fragments are tried in insertion order and the
# first match wins.
# NOTE(review): runs found under "hspc_vs_prim" directories are stored under
# the key "hspc_vs_norm". The key is kept unchanged because the summary cell
# looks it up as CONTRASTS_NAMES_MAP["hspc_vs_norm"]; rename both together.
CONTRAST_DIR_TO_KEY = {
    "prim_vs_norm": "prim_vs_norm",
    "hspc_vs_prim": "hspc_vs_norm",
    "mcrpc_vs_hspc": "mcrpc_vs_hspc",
}


def load_test_scores(root: Path, contrast_dir_to_key: dict = CONTRAST_DIR_TO_KEY) -> dict:
    """Summarise every ``*test_scores.csv`` file found below ``root``.

    The directory layout is read positionally: for each file, ``parents[2]``
    names the model type and ``parents[3]`` names the contrast directory.
    Files whose contrast directory matches none of the known fragments are
    skipped.

    Args:
        root: Directory that is searched recursively.
        contrast_dir_to_key: Mapping from a fragment of the contrast directory
            name to the contrast key used in the result.

    Returns:
        A dict mapping ``(contrast_key, model_type)`` to a DataFrame holding
        the column-wise ``mean`` and ``std`` of the scores file.
    """
    scores = {}
    # Sorted so the traversal order (and therefore which file wins if two map
    # to the same key) does not depend on the filesystem.
    for scores_file in sorted(root.rglob("*test_scores.csv")):
        model_type = scores_file.parents[2].stem
        contrast_dir = scores_file.parents[3].stem
        contrast_key = next(
            (
                key
                for fragment, key in contrast_dir_to_key.items()
                if fragment in contrast_dir
            ),
            None,
        )
        if contrast_key is None:
            continue
        scores[(contrast_key, model_type)] = pd.read_csv(
            scores_file, index_col=0
        ).agg(["mean", "std"])
    return scores


test_scores = load_test_scores(ROOT)

In [22]:
# Internal contrast key -> label shown in the summary table.
# NOTE(review): the key "hspc_vs_norm" carries the label "hspc/prim"; it is the
# key under which the loading cell stores scores read from "hspc_vs_prim"
# directories.
CONTRASTS_NAMES_MAP = {
    "prim_vs_norm": "prim/norm",
    "hspc_vs_norm": "hspc/prim",
    "mcrpc_vs_hspc": "mcrpc/hspc",
}
# Model directory name -> label shown in the summary table.
MODEL_NAMES_MAP = {
    "decision_tree": "Decision Tree",
    "random_forest": "Random Forest",
    "light_gbm": "LightGBM",
}
# Row order of the final table: contrasts in map order, models within each
# contrast. Materialised as a list because a bare `product` iterator is
# exhausted after a single pass and would be empty on any later reuse.
sorted_indices = list(
    product(CONTRASTS_NAMES_MAP.values(), MODEL_NAMES_MAP.values())
)

# 1. Flatten the per-run summaries into one record per (contrast, model, metric).
#    A comprehension keeps the loop variables out of the notebook namespace.
records = [
    {
        "contrast": CONTRASTS_NAMES_MAP[contrast],
        "model": MODEL_NAMES_MAP[model],
        "metric": metric,
        "mean": scores.loc["mean", metric],
        "std": scores.loc["std", metric],
    }
    for (contrast, model), scores in test_scores.items()
    for metric in scores.columns
]

# 2. Convert to a long DataFrame and pivot to (contrast, model) rows with
#    (statistic, metric) columns, in the presentation order defined above.
df = pd.DataFrame(records)
df_pivot = (
    df.pivot_table(
        index=["contrast", "model"], columns="metric", values=["mean", "std"]
    )
    .round(4)
    .loc[sorted_indices]
)

display(df_pivot)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,std,std,std,std
Unnamed: 0_level_1,metric,balanced_accuracy,f1,precision,recall,balanced_accuracy,f1,precision,recall
contrast,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
prim/norm,Decision Tree,0.8887,0.9625,0.9477,0.9781,0.0335,0.011,0.0166,0.0156
prim/norm,Random Forest,0.8987,0.9678,0.9512,0.9852,0.0313,0.0094,0.0155,0.0115
prim/norm,LightGBM,0.9204,0.9761,0.9605,0.9923,0.0287,0.0084,0.0143,0.0082
hspc/prim,Decision Tree,0.6929,0.9592,0.9571,0.9616,0.0778,0.0105,0.0112,0.0182
hspc/prim,Random Forest,0.698,0.9675,0.9573,0.9782,0.0741,0.0081,0.0104,0.0127
hspc/prim,LightGBM,0.671,0.9699,0.9533,0.9873,0.0707,0.0066,0.0098,0.0102
mcrpc/hspc,Decision Tree,0.6377,0.7403,0.7698,0.7183,0.091,0.3286,0.3378,0.328
mcrpc/hspc,Random Forest,0.6481,0.924,0.9089,0.9406,0.0741,0.0202,0.0194,0.0369
mcrpc/hspc,LightGBM,0.6621,0.9432,0.9108,0.9783,0.0673,0.0125,0.0168,0.0179


In [29]:
# Average every column across contrasts, giving one row per model.
# NOTE: the "std" columns become the mean of the per-contrast standard
# deviations, not a pooled standard deviation.
df_pivot.groupby(level="model").mean()

Unnamed: 0_level_0,mean,mean,mean,mean,std,std,std,std
metric,balanced_accuracy,f1,precision,recall,balanced_accuracy,f1,precision,recall
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Decision Tree,0.739767,0.887333,0.891533,0.886,0.067433,0.1167,0.121867,0.1206
LightGBM,0.751167,0.963067,0.941533,0.985967,0.055567,0.009167,0.013633,0.0121
Random Forest,0.748267,0.9531,0.939133,0.968,0.059833,0.012567,0.0151,0.020367
