In [None]:
import pandas as pd
import numpy as np
import pathlib as pl

from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

from scipy.stats import mannwhitneyu, fisher_exact, pearsonr, kruskal

In [None]:
import sys
sys.path.append("../../FinalCode/")
import download.download as dwnl
import utils.plotting as plting
import adVMP.adVMP_discovery as discov
import adVMP.adVMP_plots as advmpplt

In [None]:
import mygene

In [None]:
from collections import defaultdict

In [None]:
from tqdm.notebook import tqdm

# Build the gene-to-promoter-CpG mapping

In [None]:
# Root folder of the input data (placeholder: replace with the local path before running).
# The Illumina manifest and the reference atlas are read from sub-folders of this directory.
data_dir = pl.Path("/add/path/here")

In [None]:
# Read the Illumina EPIC manifest: the first 7 lines of the file are skipped and
# the first column is used as the index.
epic_manifest = pd.read_csv(
    data_dir / "illumina_manifests" / "GPL21145_MethylationEPIC_15073387_v-1-0.csv.gz",
    skiprows=7,
    index_col=0,
)

# Reduced manifest: position and gene-annotation columns only, restricted to the
# rows that are annotated with at least one gene name.
red_manifest = epic_manifest.loc[
    epic_manifest["UCSC_RefGene_Name"].notna(),
    ["CHR", "MAPINFO", "UCSC_RefGene_Name", "UCSC_RefGene_Group"],
]

In [None]:
# For each CpG, the manifest stores two ";"-separated strings that match position by position:
# the annotated genes (UCSC_RefGene_Name) and the gene region of each annotation (UCSC_RefGene_Group).
# Here we collect, for each gene, the CpGs whose region label contains "TSS"
# (the promoter labels of the manifest, covering up to 1,500 bp from the TSS).
full_gene_to_promoter = defaultdict(list)
# Iterate over the columns directly rather than doing one `.loc` lookup per CpG:
# this is much faster on the full manifest and does not break on duplicated probe IDs.
for cg, gene_names, gene_groups in tqdm(
    zip(red_manifest.index, red_manifest["UCSC_RefGene_Name"], red_manifest["UCSC_RefGene_Group"]),
    total=len(red_manifest),
):
    genes = gene_names.split(";")
    fcts = gene_groups.split(";")
    # A CpG can be listed several times for the same gene (several annotations of that gene);
    # `dict.fromkeys` removes the repeats while keeping the order of first appearance,
    # so each CpG is recorded at most once per gene.
    promoter_genes = dict.fromkeys(gene for gene, fct in zip(genes, fcts) if "TSS" in fct)
    for gene in promoter_genes:
        full_gene_to_promoter[gene].append(cg)

# get the atlas and translate the genes into official gene IDs
atlas = pd.read_csv(data_dir / "auxiliary" / "Colon_highlevel_ref.csv",index_col=0)
mg = mygene.MyGeneInfo()
mapping_genes = mg.getgenes(atlas.index.to_numpy(), fields='name,symbol,entrezgene,taxid', as_dataframe=True)
# The symbols are attached by position, which assumes that mygene returns exactly one row per
# queried gene, in query order (pandas raises a ValueError if the lengths differ).
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in recent pandas versions.
atlas["GeneID"] = mapping_genes["symbol"].to_numpy()

# keep only the genes that we need for deconvolution
# `.get` is used instead of indexing: a gene that is absent from the manifest still maps to an
# empty list, but it is no longer inserted into the `full_gene_to_promoter` defaultdict as a side effect.
gene_to_promoter_cpg = {
    gene: full_gene_to_promoter.get(gene, [])
    for gene in atlas["GeneID"].to_numpy()
}

# Deconvolve

In [None]:
from typing import Dict
import statsmodels.api as sm
def get_patient_values(beta_values: pd.DataFrame, gene_to_promoter_cpg: Dict[str, np.ndarray]) -> pd.DataFrame:
    """Compute the average promoter beta value of each gene, for each patient.

    Parameters
    ----------
    beta_values : pd.DataFrame
        Beta values with one row per patient and one column per CpG.
    gene_to_promoter_cpg : dict
        Maps each gene to the collection of CpG identifiers located in its promoter.

    Returns
    -------
    pd.DataFrame
        One row per patient (same index as ``beta_values``) and one column per gene.
        Genes without any promoter CpG in ``beta_values`` are skipped, and genes whose
        average is missing for at least one patient are dropped. The frame has no
        columns when no gene can be computed.
    """
    values = []
    # for each gene, get the average beta value of the CpGs in the promoter region, for each patient
    for gn, promoter_cpgs in gene_to_promoter_cpg.items():
        common_cpgs = beta_values.columns.intersection(promoter_cpgs)
        if len(common_cpgs)==0:
            print(f"No common cpgs for gene {gn}, skipping")
            # actually skip the gene, instead of averaging an empty selection into an
            # all-NaN column that only disappears in the final dropna
            continue
        avg = beta_values.loc[:,common_cpgs]
        # missing values of a CpG are replaced by the median of that CpG across patients
        avg = avg.fillna(avg.median())
        avg = avg.mean(axis=1)
        avg.name = gn
        values.append(avg)

    # pd.concat raises on an empty list: return a frame without gene columns instead
    if not values:
        return pd.DataFrame(index=beta_values.index)

    # drop the genes that still have a missing value for at least one patient
    return pd.concat(values,axis=1).dropna(axis=1)

def get_patient_deconvolution(atlas: pd.DataFrame, values: pd.DataFrame) -> pd.DataFrame:
    """Estimate, for each patient, the fractions of the atlas reference columns.

    Each patient profile is regressed on the reference with a robust linear model
    (``statsmodels`` RLM), after scaling both sides by the square root of the per-gene
    weight. Negative coefficients are set to zero and the remaining ones are rescaled
    to sum to one.

    Parameters
    ----------
    atlas : pd.DataFrame
        Reference table with a "GeneID" column, a "weight" column, and one column per
        reference profile.
    values : pd.DataFrame
        One row per patient and one column per gene; every gene must be present in
        ``atlas["GeneID"]``.

    Returns
    -------
    pd.DataFrame
        One row per patient and one column per reference profile. A row is NaN when
        all the coefficients of that patient are clipped to zero. The frame is empty
        (but has the reference columns) when ``values`` has no row.
    """
    # only keep the genes present in the dataset, in the order of the columns of `values`
    red_atlas = atlas.set_index("GeneID").loc[values.columns]

    # reference profiles: every column except the weight, selected by name so that the
    # result does not depend on "weight" being the last column of the atlas
    ref = red_atlas.drop(columns="weight")
    sqrt_weight = np.sqrt(red_atlas["weight"].to_numpy(dtype=float))

    # nothing to deconvolve: pd.concat would raise on an empty list
    if values.shape[0] == 0:
        return pd.DataFrame(index=values.index, columns=ref.columns, dtype=float)

    # weighted design matrix: each gene (row) of the reference is scaled by sqrt(weight),
    # which is what diag(sqrt(weight)) @ ref computes; it is identical for every patient,
    # so it is built once, outside the loop
    X = ref.to_numpy(dtype=float) * sqrt_weight[:, None]

    all_est = []
    # use the EpiSCORE algorithms
    for pat, row in tqdm(values.iterrows(), total=len(values)):
        y = sqrt_weight * row.to_numpy(dtype=float)

        # wrap the coefficients explicitly, so the labels do not depend on how statsmodels
        # returns `params` for the given input types
        beta_hat = pd.Series(np.asarray(sm.RLM(y, X).fit().params), index=ref.columns)
        # negative coefficients are not valid fractions
        beta_hat = beta_hat.clip(0,None)
        est_frac = beta_hat/beta_hat.sum()
        est_frac.name = pat
        all_est.append(est_frac)

    # one column per patient after the concatenation: transpose to get one row per patient
    return pd.concat(all_est,axis=1).T

In [None]:
# Sample sheet of the arrays (placeholder path: replace with the local path before running)
mapping_path = pl.Path("add/path/here/")
mapping = pd.read_csv(mapping_path / "SWEPIC_full_sample_sheet.csv")
# alternative sample sheet:
#mapping = pd.read_csv(mapping_path / "sample_sheet_EPIC4.csv")

# Array identifier of each row, built as "<Sentrix_ID>_<Sentrix_Position>".
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in recent pandas versions.
idx = (mapping["Sentrix_ID"].astype(str) + "_" + mapping["Sentrix_Position"]).to_numpy()
mapping.index = idx
# `mapping` becomes a dictionary: array identifier -> sample name (as a string)
mapping = mapping["Sample_Name"].astype(str).to_dict()

In [None]:
# Directory containing the beta-value files (placeholder path: replace before running)
# alternative dataset:
#base_dir = pl.Path("/add/path/here/sesame_processed_EPIC4")
base_dir = pl.Path("add/path/here/sesame_processed_EPIC").joinpath("beta_values")

In [None]:
all_frac_estimates = []
# `iterdir()` yields the entries in arbitrary order: sort them so that the row order of the
# final table is reproducible, and ignore sub-directories and hidden files (e.g. ".DS_Store"),
# which cannot be unpickled.
beta_files = sorted(f for f in base_dir.iterdir() if f.is_file() and not f.name.startswith("."))
for f in tqdm(beta_files):
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    beta_values = pd.read_pickle(f).T
    # after the transpose, the rows are renamed from array identifiers to sample names
    beta_values = beta_values.rename(index=mapping)
    values = get_patient_values(beta_values=beta_values, gene_to_promoter_cpg=gene_to_promoter_cpg)
    frac_estimates = get_patient_deconvolution(atlas=atlas, values=values)
    all_frac_estimates.append(frac_estimates)

In [None]:
# Stack the per-file tables into a single one.
# The name is rebound from a list to a DataFrame, so the concatenation is only done while it
# is still a list: re-running this cell is then a no-op instead of an error.
if isinstance(all_frac_estimates, list):
    all_frac_estimates = pd.concat(all_frac_estimates)

In [None]:
# Output folder (placeholder: replace with the local path before running)
resdir = pl.Path("/add/path/here/")

# Save the estimated fractions as CSV
# alternative output name:
#all_frac_estimates.to_csv(resdir / "epic4_estimates.csv")
all_frac_estimates.to_csv(resdir.joinpath("epic123_estimates.csv"))