# Python Sandbox


## Setup

In [1]:
%load_ext autoreload
%autoreload 2

### Imports

In [2]:
import sys
from pathlib import Path
from typing import Iterable

import pandas as pd

In [3]:
# Make the project's `src` directory importable for the cell that imports the
# project modules. The location is resolved to an absolute path so that imports
# keep working even if the kernel's working directory changes later, and the
# entry is appended only once so that re-running this cell does not pile up
# duplicates on `sys.path`.
src_path: str = str(Path("../src").resolve())
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
from components.functional_analysis.orgdb import OrgDB
from data.io import clean_star_counts
from r_wrappers.utils import annotate_deseq_result, pd_df_to_rpy2_df

### Global Variables

In [5]:
# Storage directory named after the TCGA_PRAD / SU2C_PCF / GSE221601 combination.
# NOTE(review): no cell in this part of the notebook reads it — confirm it is
# still needed.
TCGA_PRAD_SU2C_PCF_GSE221601_ROOT: Path = Path(
    "/media/ssd/Perez/storage/TCGA_PRAD_SU2C_PCF_GSE221601"
)
# Gene symbols of interest (see the "genes of interest" section header below).
GOIS: Iterable[str] = ("TPX2", "EZH2", "TROAP", "COX1", "UHRF1")
# Organism annotation handle built for "Homo sapiens"; it is passed to
# `annotate_deseq_result` when the LRT result files are annotated.
ORG_DB: OrgDB = OrgDB("Homo sapiens")

## Sandbox

### For all integrative analysis results, find appearances of genes of interest.

### Copy original STAR counts files to analysis directory

In [6]:
# Where the STAR mapping output lives, and the analysis tree that receives the
# count tables derived from it.
STAR_ROOT: Path = Path("/rawdata/GSE221601/mapping/star")
ANALYSIS_ROOT: Path = Path("/media/ssd/Perez/storage/GSE221601/data")

# One output directory per kind of count table produced by the cells below.
STAR_ORIGINAL_COUNTS_PATH: Path = ANALYSIS_ROOT.joinpath("star_original_counts")
STAR_UNSTRANDED_COUNTS_PATH: Path = ANALYSIS_ROOT.joinpath("star_unstranded_counts")
STAR_FIRST_READ_STRAND_COUNTS_PATH: Path = ANALYSIS_ROOT.joinpath(
    "star_first_read_strand_counts"
)
STAR_SECOND_READ_STRAND_COUNTS_PATH: Path = ANALYSIS_ROOT.joinpath(
    "star_second_read_strand_counts"
)

# Create every output directory (and any missing parents); directories that
# already exist are left untouched.
for _counts_dir in (
    STAR_ORIGINAL_COUNTS_PATH,
    STAR_UNSTRANDED_COUNTS_PATH,
    STAR_FIRST_READ_STRAND_COUNTS_PATH,
    STAR_SECOND_READ_STRAND_COUNTS_PATH,
):
    _counts_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Mirror every `ReadsPerGene.out.tab` found under STAR_ROOT into the
# original-counts directory. All source files share the same file name, so each
# copy is renamed after the directory that contains it
# (presumably one directory per sample — confirm against the STAR output layout).
for tab_file in STAR_ROOT.rglob("ReadsPerGene.out.tab"):
    parent_dir_name = tab_file.parent.name
    destination_file = STAR_ORIGINAL_COUNTS_PATH.joinpath(parent_dir_name + ".tab")
    raw_table = tab_file.read_bytes()
    destination_file.write_bytes(raw_table)

In [None]:
# Run `clean_star_counts` once per count flavour, pairing each output directory
# with the `subset_col` value it was originally called with.
# NOTE(review): `subset_col` presumably selects a column of the STAR
# `ReadsPerGene.out.tab` table (1 = unstranded, 2 = first-read strand,
# 3 = second-read strand, going by the directory names) — confirm against
# `clean_star_counts`.
for _counts_dir, _counts_col in (
    (STAR_UNSTRANDED_COUNTS_PATH, 1),
    (STAR_FIRST_READ_STRAND_COUNTS_PATH, 2),
    (STAR_SECOND_READ_STRAND_COUNTS_PATH, 3),
):
    clean_star_counts(
        star_path=STAR_ROOT,
        star_counts_path=_counts_dir,
        subset_col=_counts_col,
    )

### Annotate LRT deseq files

In [None]:
# Collect the LRT DESeq2 result tables to annotate, then list them so the
# selection can be checked by eye before the annotation cell runs.
_lrt_deseq2_dir = Path("/media/ssd/Perez/storage/PCTA_WCDT_GSE221601_LRT/deseq2")
_lrt_pattern = "Sig_res_LRT_across_sample_types_overall_effects_hspc+mcrpc+norm+prim_1232samples*deseq_results.csv"
lrt_files = [*_lrt_deseq2_dir.glob(_lrt_pattern)]

for lrt_file in lrt_files:
    print(lrt_file)

In [None]:
# Annotate every LRT result table and keep only unambiguously annotated genes.
# The filtered table is written next to its input as `<input stem>_unique.csv`.
for lrt_file in lrt_files:
    print("Processing file:", lrt_file)
    lrt_results = pd.read_csv(lrt_file, index_col=0)

    # The annotated table is used below as a pandas DataFrame with "ENTREZID"
    # and "SYMBOL" columns.
    lrt_results_annot = annotate_deseq_result(pd_df_to_rpy2_df(lrt_results), ORG_DB)

    # NOTE(review): a "/" inside an identifier presumably marks several mappings
    # joined together by `annotate_deseq_result` — confirm.
    ambiguous_entrez = lrt_results_annot["ENTREZID"].str.contains("/", na=False)
    ambiguous_symbol = lrt_results_annot["SYMBOL"].str.contains("/", na=False)

    lrt_results_annot = (
        # Drop a row when EITHER identifier is ambiguous. The previous mask
        # (`~entrez | ~symbol`) only dropped rows where BOTH were ambiguous, so
        # rows with a single ambiguous identifier ended up in the "unique" output.
        lrt_results_annot[~(ambiguous_entrez | ambiguous_symbol)]
        # Both identifiers must be present ...
        .dropna(subset=["ENTREZID", "SYMBOL"])
        # ... and any (ENTREZID, SYMBOL) pair that occurs more than once is
        # removed entirely (keep=False), not reduced to a single row.
        .drop_duplicates(subset=["ENTREZID", "SYMBOL"], keep=False)
    )

    lrt_results_annot.to_csv(lrt_file.with_name(lrt_file.stem + "_unique.csv"))