In [1]:
import pandas as pd
import numpy as np
import pathlib as pl

from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

from scipy.stats import mannwhitneyu, fisher_exact, pearsonr, kruskal

In [2]:
import sys
sys.path.append("../../FinalCode/")
import download.download as dwnl
import utils.plotting as plting
import adVMP.adVMP_discovery as discov
import adVMP.adVMP_plots as advmpplt

In [9]:
import mygene

INFO:biothings.client:querying 1-104...
INFO:biothings.client:done.


In [3]:
# Shared figure settings: each tissue category is given a colour from
# seaborn's "muted" palette, and figures are saved under `fig_dir`.
colors = sns.color_palette("muted")
_palette_slots = (
    ("Normal", 0),
    ("NADJ-A", 3),  # NOTE(review): same slot as "Adenoma" below — confirm this is intended
    ("NADJ-C", 2),
    ("Adenoma", 3),
    ("SSL", 4),
    ("Cancer", 5),
)
palette = {category: colors[slot] for category, slot in _palette_slots}
fig_dir = pl.Path("/Users/josephineyates/Documents/CRC_polyp/ArticleFigures/unit")

In [4]:
# Input locations. These are machine-specific absolute paths; adapt them
# before running the notebook on another machine.
base_dir = pl.Path("/Users/josephineyates/Documents/CRC_polyp/sesame_processed_EPIC/")
base_dir4 = pl.Path("/Users/josephineyates/Documents/CRC_polyp/sesame_processed_EPIC4/")

data_dir = pl.Path("/Users/josephineyates/Documents/CRC_polyp/FinalData/")

# Flat array of the probe identifiers stored in the auxiliary probe list.
bad_probes = pd.read_csv(
    data_dir / "auxiliary" / "sketchy_probe_list_epic.csv",
    index_col=0,
).values.ravel()

# Clinical tables; `data_dir / ...` already yields a Path object.
sample_origin_path = data_dir / "clinical" / "sample_origin_wbatch.csv"
clinical_path = data_dir / "clinical" / "cleaned_clinical_reduced_diet.csv"
target_path = data_dir / "clinical" / "targets.csv"

In [5]:
# With EPIC4=False the loader returns eight objects: four for EPIC2
# followed by four for EPIC3.
(
    EPIC2_b, EPIC2_clin, EPIC2_samples, EPIC2_phenotypes,
    EPIC3_b, EPIC3_clin, EPIC3_samples, EPIC3_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=False,
)

30it [00:03,  7.59it/s]


In [6]:
# With EPIC4=True the loader reads from `base_dir4` and returns four objects.
(
    EPIC4_b,
    EPIC4_clin,
    EPIC4_samples,
    EPIC4_phenotypes,
) = dwnl.download_EPIC(
    sample_origin_path=sample_origin_path,
    base_dir=base_dir4,
    clinical_path=clinical_path,
    target_path=target_path,
    bad_probes=bad_probes,
    EPIC4=True,
)

18it [00:03,  5.68it/s]


# Deconvolve

In [7]:
# Reference atlas used for deconvolution. Its index is passed to mygene as the
# list of gene IDs to resolve. The path is built from `data_dir` (defined in the
# path-configuration cell) instead of repeating the absolute location.
atlas = pd.read_csv(data_dir / "auxiliary" / "Colon_highlevel_ref.csv", index_col=0)
mg = mygene.MyGeneInfo()
mapping_genes = mg.getgenes(atlas.index.to_numpy(), fields='name,symbol,entrezgene,taxid', as_dataframe=True)
# Positional assignment: assumes getgenes returns exactly one row per queried ID,
# in query order — TODO confirm. `.to_numpy()` replaces the deprecated Series.ravel().
atlas["GeneID"] = mapping_genes["symbol"].to_numpy()

In [13]:
from typing import Dict
import statsmodels.api as sm
def get_patient_values(beta_values: pd.DataFrame, gene_to_promoter_cpg: Dict[str, np.ndarray]) -> pd.DataFrame:
    """Average beta values over each gene's promoter CpGs, per sample.

    Parameters
    ----------
    beta_values
        Table whose columns are CpG identifiers; one row per sample.
    gene_to_promoter_cpg
        Maps a gene name to the CpG identifiers associated with it.

    Returns
    -------
    pd.DataFrame
        One row per row of ``beta_values`` and one column per gene that has
        at least one CpG present in ``beta_values``. Columns that still
        contain a missing value after imputation are dropped. If no gene
        matches, an empty frame carrying the original row index is returned.
    """
    values = []
    for gn, promoter_cpgs in gene_to_promoter_cpg.items():
        common_cpgs = beta_values.columns.intersection(promoter_cpgs)
        if len(common_cpgs) == 0:
            print(f"No common cpgs for gene {gn}, skipping")
            # Actually skip the gene, as the message states, instead of
            # building an all-NaN column that is only discarded later.
            continue
        gene_betas = beta_values.loc[:, common_cpgs]
        # Impute missing betas with the per-CpG median across samples.
        gene_betas = gene_betas.fillna(gene_betas.median())
        avg = gene_betas.mean(axis=1)
        avg.name = gn
        values.append(avg)

    if not values:
        # pd.concat raises on an empty list; keep the row index so callers
        # still receive a frame aligned with the input samples.
        return pd.DataFrame(index=beta_values.index)

    return pd.concat(values, axis=1).dropna(axis=1)

def get_patient_deconvolution(atlas: pd.DataFrame, values: pd.DataFrame) -> pd.DataFrame:
    """Estimate per-sample fractions by weighted robust regression on the atlas.

    Parameters
    ----------
    atlas
        Reference table with a "GeneID" column, a "weight" column and one
        column per reference profile.
    values
        One row per sample and one column per gene; every column name must
        be present in ``atlas["GeneID"]``.

    Returns
    -------
    pd.DataFrame
        One row per row of ``values`` and one column per reference profile.
        Negative coefficients are clipped to zero and each row is normalised
        by its sum (a row whose clipped coefficients are all zero is NaN).
    """
    red_atlas = atlas.set_index("GeneID")
    red_atlas = red_atlas.loc[values.columns]

    sqrt_weight = red_atlas["weight"].apply(np.sqrt)

    # Select the reference profiles by name rather than by position, so the
    # result does not depend on "weight" being the last column.
    ref = red_atlas.drop(columns="weight")

    # The weighted design matrix is the same for every sample: build it once.
    # Scaling each row by sqrt(weight) is equivalent to diag(sqrt_weight) @ ref
    # without materialising the square diagonal matrix.
    X = ref.to_numpy() * sqrt_weight.to_numpy()[:, None]

    all_est = []
    for pat, row in tqdm(values.iterrows(), total=len(values)):
        # Keep y as a Series (indexed by gene) so the fitted params come back
        # as a Series and can carry the sample name.
        y = sqrt_weight * row.to_numpy()

        beta_hat = sm.RLM(y, X).fit().params
        beta_hat = beta_hat.clip(0, None)
        est_frac = beta_hat / beta_hat.sum()
        est_frac.name = pat
        all_est.append(est_frac)
    all_est = pd.concat(all_est, axis=1)
    all_est.index = ref.columns
    return all_est.T

In [14]:
# Build a {"<Sentrix_ID>_<Sentrix_Position>": Sample_Name} lookup from the sample sheet.
# For the EPIC4 batch, read this sheet instead:
#sample_sheet = pd.read_csv("/Users/josephineyates/Documents/CRC_polyp/sesame_processed_EPIC4/sample_sheet_EPIC4.csv")
sample_sheet = pd.read_csv("/Users/josephineyates/Documents/CRC_polyp/sesame_processed_EPIC/SWEPIC_full_sample_sheet.csv")
# `.to_numpy()` replaces the deprecated Series.ravel().
idx = (sample_sheet["Sentrix_ID"].astype(str) + "_" + sample_sheet["Sentrix_Position"]).to_numpy()
# Keep the DataFrame and the final dict under separate names so that
# `mapping` always refers to the dict.
mapping = sample_sheet.set_index(idx)["Sample_Name"].astype(str).to_dict()

In [15]:
# Directory iterated by the deconvolution loop below; swap in the commented
# line to process the EPIC4 batch instead.
# NOTE(review): this rebinds `base_dir`, which the path-configuration cell
# defined as the parent "sesame_processed_EPIC/" folder and passed to
# dwnl.download_EPIC. Re-running that download cell after this one would
# therefore use the "beta_values" sub-folder — consider a distinct name.
#base_dir = pl.Path("/Users/josephineyates/Documents/CRC_polyp/sesame_processed_EPIC4/beta_values/")
base_dir = pl.Path("/Users/josephineyates/Documents/CRC_polyp/sesame_processed_EPIC/beta_values/")

In [None]:
# Run the deconvolution on every beta-value file and collect the per-file tables.
# NOTE(review): `gene_to_promoter_cpg` is not defined in any cell visible here;
# it must exist in the kernel before this cell runs — confirm where it is built.
all_frac_estimates = []
# Skip hidden entries (e.g. macOS ".DS_Store") and sub-directories, which
# pd.read_pickle cannot load, and sort so the output row order is reproducible.
beta_files = sorted(p for p in base_dir.iterdir() if p.is_file() and not p.name.startswith("."))
for f in beta_files:
    # Unpickling can execute arbitrary code: only load files produced by this project.
    beta_values = pd.read_pickle(f).T
    # Replace array identifiers with sample names where the sample sheet knows them.
    beta_values = beta_values.rename(index=mapping)
    values = get_patient_values(beta_values=beta_values, gene_to_promoter_cpg=gene_to_promoter_cpg)
    frac_estimates = get_patient_deconvolution(atlas=atlas, values=values)
    all_frac_estimates.append(frac_estimates)

In [None]:
# Stack the per-file estimate tables row-wise into a single DataFrame.
# NOTE: this rebinds `all_frac_estimates` from a list to a DataFrame, so the
# cell is not idempotent — running it a second time without re-running the
# loop cell makes pd.concat raise, because it receives a DataFrame, not a list.
all_frac_estimates = pd.concat(all_frac_estimates)

In [None]:
# Save the combined estimates. The destination is built from `data_dir`
# (defined in the path-configuration cell) instead of repeating the absolute
# location; use the commented line when processing the EPIC4 batch.
all_frac_estimates.to_csv(data_dir / "EpiSCORE_results" / "epic123_estimates.csv")
#all_frac_estimates.to_csv(data_dir / "EpiSCORE_results" / "epic4_estimates.csv")