In [None]:
# import packages
import glob
import os

import pandas as pd

# R_HOME is set before rpy2 is imported (same order as the original cell)
os.environ['R_HOME'] = '/Users/geba9152/miniconda3/envs/rpy2_env/lib/R'
import rpy2.rinterface

# loads ipython extension
%load_ext rpy2.ipython

05/14/2024

There was a bug in the previous code I wrote to make complex heatmaps centered on mT (`spring_2024_3prime/`).

I am writing this notebook to fix this bug & retry the heatmaps on ground truth genes in HCTs. 

Then I will try to use this code to identify genes that are not transcribed well enough in the CDK7i or Ethan/Eric HS data to run LIET on them. (I tried running LIET without filtering genes first — on the ground truth set only — and the CDK7i HCT data took >3 days to finish, so I have concluded that we need a filtering step.)

### 1. Pull out mT values
This function saves a df for all parameters for each metarun

Results are stored here: ```/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison/prior-dfs/tight-sT/```

In [None]:
def pull_out_params_meta_samples(
    refpoint,
    base_path="/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison",
    out_dir=None,
):
    """Collect the fitted LIET parameters of every ``*.liet`` file of one run.

    For each ``{base_path}/results/{refpoint}/*.liet`` file the parameters
    exposed by ``FitParse`` are put into a DataFrame (one row per gene), merged
    with the gene lengths from the annotation file, and ``mT`` is shifted by
    subtracting that length.

    Parameters
    ----------
    refpoint : str
        Sub-directory of ``{base_path}/results`` holding the ``.liet`` files.
    base_path : str, optional
        Project root containing ``results/`` and ``annotation/``.
    out_dir : str or None, optional
        If given, each table is written to ``{out_dir}/{cellname}-priors.txt``
        (tab-separated, with header). With the default ``None`` nothing is
        written to disk.

    Returns
    -------
    dict
        ``{cellname: DataFrame}``; empty when no ``.liet`` file matches.

    Notes
    -----
    NOTE(review): ``FitParse`` is not defined or imported in the cells visible
    here; it must be in scope before this function is called.
    """
    # Fitted parameters read from FitParse; each has a matching "<name>_std"
    param_names = ["mL", "sL", "tI", "mT", "sT", "w", "mL_a", "sL_a", "tI_a", "w_a"]

    # sorted() makes the processing order reproducible between runs
    lietfiles = sorted(glob.glob(f"{base_path}/results/{refpoint}/*.liet"))
    if not lietfiles:
        return {}

    # The annotation is identical for every sample, so it is read only once.
    ann = pd.read_csv(
        f"{base_path}/annotation/lietanns/chr1-6-3p-UTR.liet.ann",
        sep="\t",
        header=None,
    )
    ann = ann[[3, 4]]
    ann.columns = ['Gene', 'Length']

    frames = {}
    for lietfile in lietfiles:
        fit_parser = FitParse(lietfile)

        # Column order: Gene, then value/std pairs in param_names order
        data = {"Gene": fit_parser.genes}
        for name in param_names:
            data[name] = getattr(fit_parser, name)
            data[f"{name}_std"] = getattr(fit_parser, f"{name}_std")
        data_df = pd.DataFrame(data)

        df = ann.merge(data_df, on='Gene')
        # presumably re-expresses mT relative to the annotated end -- confirm
        df['mT'] = df['mT'] - df['Length']

        # e.g. "meta_HCT116.liet" -> "HCT116"
        cellname = os.path.basename(lietfile).replace("meta_", "").replace('.liet', "")

        if out_dir is not None:
            df.to_csv(f"{out_dir}/{cellname}-priors.txt", sep="\t", header=True, index=None)

        frames[cellname] = df

    return frames
                
# Run directory whose fits are collected; the trailing ";" keeps the cell output empty
refpoint = "tight-sT/"
pull_out_params_meta_samples(refpoint);

### 2. Create bedfiles centered on mT & PAS

In [None]:
# Cell types for which a "<celltype>-priors.txt" table is read below
celltypes = [
    'OCILY1', 'G401', 'HCT116', 'MEL624', 'HeLa', 'THP1', 'foreskin_fibroblast',
    'A375', 'SUDHL4', 'lymphoblast', 'Jurkat_T_cell', 'HEp2', 'ESC', 'LC2ad',
    'Ramos', 'HEK239T-HEK239', 'BEAS2B', 'NUDUL1', 'MV411', 'K562', 'MCF7',
    'Kasumi1', 'S2VP10', 'KBM7', 'CD34_erythroblast', 'HAP1', 'U936',
    'CD4_T_cell', 'A549',
]

# Genes dropped from every bed file. Entries must match the 'Gene' column
# exactly, so stray trailing spaces and the duplicated RHOB entry were removed.
genestoremove = [
    'DUSP2|NM_004418.4',
    'HNRNPA0|NM_006805.4',
    'ID2|NM_002166.5',
    'LBH|NM_030915.4',
    'NDUFS5|NM_004552.3',
    'RHOB|NM_004040.4',
    'RNF187|NM_001010858.3',
    'SPCS3|NM_021928.4',
    'TMSB10|NM_021103.4',
    'OMA1|NM_145243.5',
    'MIR181A1HG|NR_040073.1',
]

def complexheat_mT_beds(celltypes, genestoremove, flank=0, priors_dir=None, ann_path=None):
    """Build BED-style intervals centered on each gene's mT, per cell type.

    For every cell type the table ``{priors_dir}/{celltype}-priors.txt`` is
    merged with the annotation on ``Gene``. The genomic mT position is
    ``stop + mT`` for "+" genes and ``start - mT`` for all other genes.

    Parameters
    ----------
    celltypes : iterable of str
        Cell-type names used to locate the priors tables.
    genestoremove : iterable of str
        Gene identifiers to drop; surrounding whitespace is ignored when
        matching so a stray space cannot silently disable the filter.
    flank : int, optional
        Bases added on each side of the mT position. The default ``0`` gives
        intervals with ``start-bed == stop-bed``.
    priors_dir, ann_path : str or None, optional
        Override the default input locations (mainly useful for testing).

    Returns
    -------
    dict
        ``{celltype: DataFrame}`` with columns
        ``chr, start-bed, stop-bed, Gene, ., strand``. Nothing is written to
        disk; callers can save the returned frames.
    """
    if priors_dir is None:
        priors_dir = '/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison/prior-dfs/tight-sT'
    if ann_path is None:
        ann_path = "/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison/annotation/lietanns/chr1-6-3p-UTR.liet.ann"

    celltypes = list(celltypes)
    if not celltypes:
        return {}

    # The annotation does not depend on the cell type: read it once.
    ann = pd.read_csv(ann_path, sep="\t", header=None)
    ann.columns = ['chr', 'start', 'stop', 'Gene', 'length', 'strand']
    ann['.'] = '.'  # placeholder for the BED score column

    unwanted = {str(gene).strip() for gene in genestoremove}

    beds = {}
    for celltype in celltypes:
        priors = pd.read_csv(f'{priors_dir}/{celltype}-priors.txt', sep='\t')
        df = priors.merge(ann, on='Gene')

        # .copy() so the column assignments below act on an independent frame
        df = df[~df['Gene'].str.strip().isin(unwanted)].copy()

        # Strand-aware genomic position of mT (vectorized form of the row-wise rule)
        is_plus = df['strand'] == "+"
        center = (df['stop'] + df['mT']).where(is_plus, df['start'] - df['mT'])

        df['start-bed'] = (center - flank).round().astype(int)
        df['stop-bed'] = (center + flank).round().astype(int)

        beds[celltype] = df[['chr', 'start-bed', 'stop-bed', 'Gene', '.', 'strand']]

    return beds

def complexheat_PAS_beds(celltypes, genestoremove, priors_dir=None, ann_path=None, out_dir=None):
    """Write BED files spanning 20 kb either side of each gene's PAS.

    For every cell type the table ``{priors_dir}/{celltype}-priors.txt`` is
    merged with the annotation on ``Gene`` and the result is written to
    ``{out_dir}/{celltype}-20kb-updown-PAS.bed`` (tab-separated, no header).

    The PAS is taken as ``stop`` for "+" genes and ``start`` for all other
    genes, matching the strand convention used by ``complexheat_mT_beds``.
    Previously ``stop`` was used for both strands, which misplaced the window
    for minus-strand genes.

    Parameters
    ----------
    celltypes : iterable of str
        Cell-type names used to locate the priors tables.
    genestoremove : iterable of str
        Gene identifiers to drop; surrounding whitespace is ignored when
        matching so a stray space cannot silently disable the filter.
    priors_dir, ann_path, out_dir : str or None, optional
        Override the default input/output locations.

    Returns
    -------
    pandas.DataFrame or None
        The "+"-strand rows of the LAST cell type processed (as before), or
        ``None`` when ``celltypes`` is empty.
    """
    flank = 20000  # bases on each side of the PAS; reflected in the file name

    if priors_dir is None:
        priors_dir = '/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison/prior-dfs/tight-sT'
    if ann_path is None:
        ann_path = "/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison/annotation/lietanns/chr1-6-3p-UTR.liet.ann"
    if out_dir is None:
        out_dir = "/scratch/Users/geba9152/LIET-summer2024/meta-celltype-comparison/prior-dfs/tight-sT-beds"

    celltypes = list(celltypes)
    if not celltypes:
        return None

    # The annotation does not depend on the cell type: read it once.
    ann = pd.read_csv(ann_path, sep="\t", header=None)
    ann.columns = ['chr', 'start', 'stop', 'Gene', 'length', 'strand']
    ann['.'] = '.'  # placeholder for the BED score column

    unwanted = {str(gene).strip() for gene in genestoremove}

    pos = None
    for celltype in celltypes:
        priors = pd.read_csv(f'{priors_dir}/{celltype}-priors.txt', sep='\t')
        df = priors.merge(ann, on='Gene')

        # .copy() so the column assignments below act on an independent frame
        df = df[~df['Gene'].str.strip().isin(unwanted)].copy()

        # Strand-aware PAS coordinate
        pas = df['stop'].where(df['strand'] == "+", df['start'])
        df['start-bed'] = (pas - flank).round().astype(int)
        df['stop-bed'] = (pas + flank).round().astype(int)

        bed = df[['chr', 'start-bed', 'stop-bed', 'Gene', '.', 'strand']]
        pos = bed.loc[bed['strand'] == "+"]

        bed.to_csv(f"{out_dir}/{celltype}-20kb-updown-PAS.bed", header=None, index=None, sep="\t")

    return pos
# Write the PAS-centered bed files; the returned frame is shown as the cell output
complexheat_PAS_beds(celltypes, genestoremove)