In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
from pathlib import Path

import pandas as pd
import rpy2.robjects as ro

In [None]:
# Make the project's source directory importable. The path is relative, so it
# is interpreted against the kernel's current working directory.
src_path: str = "../../src"
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from components.functional_analysis.orgdb import OrgDB
from r_wrappers.rnaseq_power import rnapower
from r_wrappers.utils import map_gene_id

In [None]:
# Storage root for this dataset (machine-specific absolute path) and the
# "deseq2" sub-directory directly beneath it.
root = Path("/media/ssd/Perez/storage/TCGA_PRAD_SU2C_RNASeq")
deseq_path = root / "deseq2"

# Gene of interest and the OrgDB instance for Homo sapiens.
goi = "FOLH1"
org_db = OrgDB("Homo sapiens")

## 1. RNASeq sample size calculation


### 1.1. TCGA-PRAD + SU2C-PCF data


Load raw counts


In [None]:
# Read the count table from the DESeq2 directory; the first CSV column is used
# as the row index.
dds_df = pd.read_csv(
    deseq_path / "sample_cluster_no_replicates_Metastatic_BB+Normal+Primary_dds.csv",
    index_col=0,
)

Keep only genes with SYMBOL IDs


In [None]:
# Translate the row identifiers from ENSEMBL to SYMBOL, then discard entries
# for which no symbol was returned.
genes_symbol = map_gene_id(
    dds_df.index,
    org_db,
    from_type="ENSEMBL",
    to_type="SYMBOL",
)
genes_symbol = genes_symbol.dropna()

# Keep only the rows that received a symbol and relabel them with it.
# NOTE: this rebinds dds_df, so the cell is meant to be run once per kernel.
dds_df = dds_df.loc[genes_symbol.index]
dds_df.index = genes_symbol.values

Calculate mean gene counts


In [None]:
# Average every row (gene) across all columns and order from highest to lowest.
mean_gene_counts = dds_df.mean(axis=1).sort_values(ascending=False)

# Index.get_loc returns a 0-based position; add 1 so the most expressed gene is
# reported as rank 1 instead of rank 0.
# NOTE(review): assumes the label in `goi` is unique in the index — confirm.
goi_rank = mean_gene_counts.index.get_loc(goi) + 1
print(
    f"{goi} is the {goi_rank}th most expressed gene "
    f"with an average row count of {mean_gene_counts[goi]:.2f}."
)

In [None]:
# Discard genes whose mean count is 10 or lower.
mean_gene_counts_filtered = mean_gene_counts[mean_gene_counts > 10]

# Position 80% of the way down the descending list, i.e. roughly 80% of the
# retained genes have a higher mean count than the gene picked here.
gene_80th_pos = int(len(mean_gene_counts_filtered) * 0.8)
gene_80th = mean_gene_counts_filtered.index[gene_80th_pos]

# Reuse the known position instead of looking the label up again: this reports
# a 1-based rank (the position is 0-based) and does not depend on the label
# being unique in the index.
print(
    f"{gene_80th} is the {gene_80th_pos + 1}th most"
    " expressed gene with an average row count of"
    f" {mean_gene_counts_filtered.iloc[gene_80th_pos]:.2f}."
)

In [None]:
# Inputs for the power calculation performed by the `rnapower` call below.
# NOTE(review): presumably these map onto RNASeqPower's `rnapower` arguments
# (depth = average coverage, cv = biological coefficient of variation,
# effect = fold change, alpha = significance level, power = target power) —
# confirm against the wrapper.
# Annotations corrected to match the values actually assigned: `depth` is a
# float, and `effect` / `power` are R float vectors rather than scalars.
depth: float = 29539.67  # FOLH1 mean raw counts
cv: float = 0.5
effect: ro.FloatVector = ro.FloatVector([1.25, 1.5, 1.75, 2])
alpha: float = 0.05
power: ro.FloatVector = ro.FloatVector([0.8, 0.85, 0.9, 0.95])

In [None]:
# Run the power calculation with the parameters defined earlier and print
# whatever the wrapper returns.
print(
    rnapower(
        depth=depth,
        cv=cv,
        effect=effect,
        alpha=alpha,
        power=power,
    )
)