# Python Sandbox


## Setup

In [5]:
# Load IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 2: reload imported modules before each cell is executed, so edits to
# project code are picked up without restarting the kernel.
%autoreload 2

### Imports

In [1]:
import sys
from itertools import product
from pathlib import Path
from typing import Iterable

import pandas as pd

In [2]:
# Directory added to the import search path so that packages located under
# it can be imported from this notebook.
# NOTE: the path is relative, so it is resolved against the kernel's current
# working directory when imports are performed.
src_path: str = "../src"

# Guard the append so that re-running this cell does not accumulate duplicate
# entries in sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [3]:
from components.functional_analysis.orgdb import OrgDB

### Global Variables

In [None]:
# Root directory of the TCGA_PRAD_SU2C_PCF_GSE221601 storage tree
# (absolute, machine-specific path).
TCGA_PRAD_SU2C_PCF_GSE221601_ROOT: Path = Path(
    "/media/ssd/Perez/storage/TCGA_PRAD_SU2C_PCF_GSE221601"
)
# Gene symbols of interest (GOIs) to look for in the analysis results.
GOIS: Iterable[str] = ("TPX2", "EZH2", "TROAP", "COX1", "UHRF1")
# OrgDB handle constructed for "Homo sapiens".
# NOTE(review): presumably an organism annotation database wrapper; confirm
# against components.functional_analysis.orgdb.
ORG_DB: OrgDB = OrgDB("Homo sapiens")

## Sandbox

### For all integrative analysis results, find appearances of genes of interest.

### Copy original tar counts files to analysis directory

### Annotate LRT deseq files

### Batch Correction PCA figures

---

In [None]:
# Root directory that is searched recursively for "*test_scores.csv" files.
ROOT: Path = Path(
    "/media/ssd/Perez/storage/PCTA_WCDT_GSE221601_FILTERED/ml_classifiers"
)

# Substring looked for in the contrast directory name -> label stored as the
# first element of the `test_scores` key. Checked in insertion order; the
# first matching marker wins.
# NOTE: results found under "hspc_vs_prim" directories are stored with the
# label "hspc_vs_norm".
_CONTRAST_LABEL_BY_MARKER = {
    "prim_vs_norm": "prim_vs_norm",
    "hspc_vs_prim": "hspc_vs_norm",
    "mcrpc_vs_hspc": "mcrpc_vs_hspc",
}

# (contrast label, model type) -> DataFrame with a "mean" row and a "std" row,
# one column per column of the scores file.
test_scores = {}

for test_scores_file in ROOT.rglob("*test_scores.csv"):
    # The model type is taken from the directory three levels above the file,
    # the contrast from the directory four levels above it.
    model_type = test_scores_file.parents[2].stem
    contrast_dir = test_scores_file.parents[3].stem

    contrast_label = next(
        (
            label
            for marker, label in _CONTRAST_LABEL_BY_MARKER.items()
            if marker in contrast_dir
        ),
        None,
    )
    if contrast_label is None:
        # Files that belong to no known contrast are ignored.
        continue

    scores = pd.read_csv(test_scores_file, index_col=0)
    test_scores[(contrast_label, model_type)] = scores.agg(["mean", "std"])

In [None]:
# Build and display a summary table of test-score means and standard
# deviations, indexed by (contrast, model) with one column per metric.

# Display names for the contrast labels used as keys of `test_scores`.
# NOTE: "hspc_vs_norm" is the label under which the loading cell stores
# results found in "hspc_vs_prim" directories, hence the "hspc/prim" name.
CONTRASTS_NAMES_MAP = {
    "prim_vs_norm": "prim/norm",
    "hspc_vs_norm": "hspc/prim",
    "mcrpc_vs_hspc": "mcrpc/hspc",
}
# Display names for the model identifiers used as keys of `test_scores`.
MODEL_NAMES_MAP = {
    "decision_tree": "Decision Tree",
    "random_forest": "Random Forest",
    "light_gbm": "LightGBM",
}

# Desired row order of the final table: every contrast combined with every
# model. Materialised as a list because a bare product() iterator is
# exhausted after a single use and would silently select nothing if reused.
sorted_indices = list(
    product(CONTRASTS_NAMES_MAP.values(), MODEL_NAMES_MAP.values())
)

# Flatten the per-(contrast, model) statistics into long-format records,
# one record per metric column.
records = []
for (contrast, model), stats in test_scores.items():
    mean_row = stats.loc["mean"]
    std_row = stats.loc["std"]
    records.extend(
        {
            "contrast": CONTRASTS_NAMES_MAP[contrast],
            "model": MODEL_NAMES_MAP[model],
            "metric": metric,
            "mean": mean_row[metric],
            "std": std_row[metric],
        }
        for metric in stats.columns
    )

# Reshape to wide format: rows are (contrast, model), columns are
# ("mean" | "std", metric). Each combination occurs once in `records`, so the
# pivot only rearranges values.
df = pd.DataFrame(records)
df_pivot = (
    df.pivot_table(
        index=["contrast", "model"], columns="metric", values=["mean", "std"]
    )
    .round(4)
    # Selecting by the full list of (contrast, model) labels imposes the row
    # order and requires every combination to be present in the table.
    .loc[sorted_indices]
)

display(df_pivot)

In [None]:
# Average every column of the summary table over the contrasts, grouping rows
# by the second index level (the model name).
# NOTE(review): this also takes a plain arithmetic mean of the "std" columns,
# which is not a pooled standard deviation; confirm that is intended.
df_pivot.groupby(level=1).mean()