In [1]:
import pandas as pd
import numpy as np
from itertools import chain
from tqdm import tqdm
import os
from os import listdir
import glob
import json
import seaborn as sns
import cooler
import bioframe
import cooltools
from cooltools.lib.numutils import fill_diag
from statsmodels.stats.multitest import multipletests
from os import listdir
from os.path import isfile, join
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
from dotenv import load_dotenv

# Refuse to run outside the expected conda environment. `.get` is used so that a
# missing CONDA_DEFAULT_ENV fails with the same readable message instead of a KeyError.
assert os.environ.get('CONDA_DEFAULT_ENV') == "cultures_hic", (
    "This notebook expects the 'cultures_hic' conda environment, "
    f"but CONDA_DEFAULT_ENV is {os.environ.get('CONDA_DEFAULT_ENV')!r}"
)
# Kept as the last expression so the cell still displays load_dotenv()'s return value.
# Presumably this is what supplies PATH_TO_MAPS, read later via os.getenv — confirm.
load_dotenv()

True

# 1. Get loop positions on Hi-C maps

In [22]:
def get_hg38_arms():
    """Return the hg38 chromosome-arm table with chrM removed.

    Chromosome sizes and centromere positions are fetched through bioframe,
    combined into arms with ``bioframe.make_chromarms``, and every row whose
    ``chrom`` is ``'chrM'`` is dropped. The index is reset so it runs from 0.
    """
    chromsizes = bioframe.fetch_chromsizes('hg38')
    centromeres = bioframe.fetch_centromeres('hg38')
    arms = bioframe.make_chromarms(chromsizes, centromeres)
    is_not_mito = arms.chrom != 'chrM'
    return arms[is_not_mito].reset_index(drop=True)
    
def get_loops(map, loops_name, hg38_arms, path_to_save, maps_directory=None, loops_plus_directory=None,  binzise = 15_000):
    """Pile up expected-normalised Hi-C snippets around every loop of one map.

    Steps:
      1. Read the headerless, tab-separated loop file
         ``{loops_plus_directory}/{loops_name}``, sort it by coordinates and
         renumber the ``num`` column 0..n-1 in sorted order.
      2. Write the sorted table next to the input file.
      3. Open ``map`` at resolution ``binzise``, compute the cis expected over
         ``hg38_arms`` and call ``cooltools.pileup`` with a 200 kb flank.
      4. Save the resulting stack as ``.npy`` in ``path_to_save``.

    Parameters
    ----------
    map : str
        Path to an ``.mcool`` file. The sample name is the part of the file
        name before the first dot.
    loops_name : str
        File name of the loop list inside ``loops_plus_directory``.
    hg38_arms : DataFrame or None
        View frame passed to cooltools. It is fetched with ``get_hg38_arms()``
        only when ``None`` is given (previously the argument was ignored and
        the arms were re-fetched on every call).
    path_to_save : str
        Directory that receives the ``.npy`` stack.
    maps_directory : unused
        Kept for backward compatibility; the function never reads it.
    loops_plus_directory : str, optional
        Directory with the loop files. Defaults to the notebook-level
        ``path_to_loops``, looked up when the function is called so that the
        definition no longer depends on cell execution order.
    binzise : int
        Resolution (bp) of the cooler to open. Note that the output file name
        always contains ``15000res`` regardless of this value.

    Returns
    -------
    (name, stack, loops_plus) : (str, ndarray, DataFrame)
    """
    # Resolve notebook-level defaults lazily: evaluating them in the signature
    # raised NameError when this cell ran before the configuration cell.
    if loops_plus_directory is None:
        loops_plus_directory = path_to_loops
    if hg38_arms is None:
        hg38_arms = get_hg38_arms()

    coordinate_columns = ['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2']
    loops_plus = pd.read_csv(f'{loops_plus_directory}/{loops_name}', sep='\t', header=None)
    loops_plus.columns = coordinate_columns + ['num', 'kernel']
    loops_plus = loops_plus.sort_values(coordinate_columns).reset_index(drop=True)
    # After sorting, `num` is the row position and therefore the snippet index in the stack.
    loops_plus['num'] = range(loops_plus.shape[0])

    # NOTE(review): split('bed')[0] keeps the trailing dot, so "x.bed" is written as
    # "x._sorted.bed". Left unchanged because existing files may rely on that name.
    loops_plus.to_csv(f"{loops_plus_directory}/{loops_name.split('bed')[0]}_sorted.bed", sep='\t', index=False)

    # The former `if "Heffel" in map` branch recomputed exactly the same value and was dropped.
    name = map.split("/")[-1].split(".")[0]
    print(name)

    clr = cooler.Cooler(f'{map}::/resolutions/{binzise}')
    expected = cooltools.expected_cis(clr, view_df=hg38_arms, nproc=16)
    stack = cooltools.pileup(clr,
                             loops_plus,
                             hg38_arms,
                             expected_df=expected,
                             flank=200000,
                             nproc=17)

    # NOTE(review): the shape check below requires snippets on axis 0. With that layout
    # this mask reduces over (snippet, row) and blanks whole *columns* that are zero in
    # every snippet; it does not blank empty snippets. If the intent was to NaN-out
    # all-zero snippets, the reduction should be over axes (1, 2). Left unchanged
    # because it would alter the saved intensities — confirm.
    stack[:, :, np.all(stack == 0, axis=(0,1))] = np.nan

    # Validate before persisting so a mismatched stack is never written to disk.
    assert loops_plus.shape[0] == stack.shape[0], (
        f"{name}: {loops_plus.shape[0]} loops but stack has shape {stack.shape}"
    )
    np.save(f"{path_to_save}/{name}_0.13fdr_15000res_small_NaN5_loops.npy", stack)
    return name, stack, loops_plus

def load_loops_back(maps_plus, path_to_save=None):
    """Load previously saved pileup stacks for a list of map paths.

    Parameters
    ----------
    maps_plus : iterable of str
        Paths to the maps whose stacks should be loaded.
    path_to_save : str, optional
        Directory containing the ``*_loops.npy`` files. Defaults to the
        notebook-level ``path_to_loops_intensities``. The original body read an
        undefined global named ``path_to_save`` and failed with NameError.

    Returns
    -------
    dict
        ``{name: ndarray}`` with one loaded stack per map.
    """
    if path_to_save is None:
        path_to_save = path_to_loops_intensities

    loops = {}
    for map_path in tqdm(maps_plus):
        file_name = map_path.split("/")[-1]
        if "CTX" in map_path:
            # NOTE(review): get_loops() saves every stack under the part of the file
            # name before the first dot, so files named "Heffel_<third field>" are not
            # produced by it — confirm which naming scheme is intended.
            name = "Heffel_" + file_name.split(".")[2]
        else:
            name = file_name.split(".")[0]
        loops[name] = np.load(f"{path_to_save}/{name}_0.13fdr_15000res_small_NaN5_loops.npy")
    return loops

In [25]:
binsize = 15_000
path_to_maps = os.getenv('PATH_TO_MAPS')
# Fail early: with an unset variable, listdir(None) would silently list the current
# directory and the path concatenation would then die with an opaque TypeError.
if path_to_maps is None:
    raise RuntimeError("PATH_TO_MAPS is not set; export it or define it in the .env file")
path_to_loops = "./loops_data/loops_final_files"
path_to_loops_intensities = "./loops_data/loops_intensities/"
number_of_files = 23  # expected number of maps and of loop files

# Loop files: names containing the final-loops suffix, excluding '.bedped' files.
pattern = '0.13fdr_15000res_small_NaN5_final.bed'
not_pattern = '.bedped'
files = sorted(f for f in listdir(path_to_loops) if pattern in f and not_pattern not in f)

# Hi-C maps: os.path.join gives the same result as string concatenation when
# PATH_TO_MAPS ends with '/', and a valid path when it does not.
pattern = 'mcool'
maps = sorted(join(path_to_maps, f) for f in listdir(path_to_maps) if pattern in f)
assert len(maps) == number_of_files, f"expected {number_of_files} maps, found {len(maps)}"
assert len(files) == number_of_files, f"expected {number_of_files} loop files, found {len(files)}"

In [13]:
# Distinct leading name components (text before the first dot) of the map files.
authors = list({map_path.split('/')[-1].split(".")[0] for map_path in maps})
# Pair maps with loop files by position: both lists are sorted, so this is only
# correct if the two sort orders correspond to each other.
maps2loops = dict(zip(maps, files))
with open('./loops_data/maps2loops_mapping.json', 'w') as f:
    json.dump(maps2loops, f)

In [15]:
# Build the pileup stack and the sorted loop table for every (map, loop file) pair.
hg38_arms = get_hg38_arms()
loops, sorted_loops = {}, {}
for map_path, loops_path in maps2loops.items():
    name, loops_prep, loops_plus = get_loops(
        map_path, loops_path, hg38_arms, path_to_loops_intensities
    )
    # Both dictionaries are keyed by the sample name returned by get_loops.
    loops[name], sorted_loops[name] = loops_prep, loops_plus

# 2. Get loop intensity in the central 3x3 bins

In [29]:
def process_loops(loops, retain_all_values = False):
    """Summarise the central 3x3 window of every loop snippet, per map.

    Parameters
    ----------
    loops : dict
        ``{map_key: stack}``; each stack is indexed as ``stack[loop][row, col]``.
        The number of loops and the snippet size are taken from the first
        entry, so all stacks are expected to share them.
    retain_all_values : bool
        ``False`` (default): store, for each loop, the mean of the
        ``min_length`` largest values selected by ``process_each_loop`` (the
        string ``"NaN"`` when nothing was selected).
        ``True``: store the raw 3x3 window instead.

    Returns
    -------
    dict
        ``{map_key: [value for loop 0, value for loop 1, ...]}``. An empty
        ``loops`` mapping yields ``{}`` (it used to raise StopIteration).
    """
    if not loops:
        return {}

    first_key = next(iter(loops))
    len_loops = loops[first_key].shape[0]
    # Central bin of the snippet and the 3-bin window around it.
    center = (loops[first_key].shape[1] - 1) // 2
    start_index, end_index = center - 1, center + 2
    loops_values = {map_key: [] for map_key in loops}

    for loop in range(len_loops):
        selected_box, min_length, all_values = process_each_loop(loops, loop, start_index, end_index)

        for i, map_key in enumerate(loops):
            if retain_all_values:
                appended_value = all_values[i]
            else:
                # Use the same number of top values for every map so the means are comparable.
                top_values = selected_box[i][:min_length]
                # The string sentinel is kept as is: it is what ends up in the saved JSON files.
                appended_value = np.nanmean(top_values) if top_values else "NaN"
            loops_values[map_key].append(appended_value)

    return loops_values

def process_each_loop(loops, loop, start_index, end_index):
    """Collect the central window of one loop from every map.

    For each stack in ``loops`` the square window
    ``[start_index:end_index, start_index:end_index]`` of snippet ``loop`` is
    taken. Its non-NaN values are listed in descending order; zeros are dropped
    unless the window holds nothing but zeros.

    Returns
    -------
    selected_box : list of list
        Sorted values, one list per map, in the iteration order of ``loops``.
    min_length : int
        Length of the shortest list, capped at 9 (the cell count of a 3x3 window).
    all_values : list of ndarray
        The raw windows, one per map.
    """
    window = slice(start_index, end_index)
    all_values = [stack[loop][window, window] for stack in loops.values()]

    selected_box = []
    for box in all_values:
        finite = box[~np.isnan(box)].tolist()
        nonzero = [value for value in finite if value != 0]
        # Fall back to the zero-containing list when every finite value is zero.
        selected_box.append(sorted(nonzero or finite, reverse=True))

    min_length = min([9] + [len(values) for values in selected_box])
    return selected_box, min_length, all_values


def save_loops(processed_loops_all, data_mean, name_data_all, name_data_mean, path):
    """Write the per-loop windows and the per-loop means as two JSON files.

    Parameters
    ----------
    processed_loops_all : dict
        ``{map_key: [array, ...]}``; every array is converted with ``.tolist()``
        for serialisation. The input mapping itself is not modified.
    data_mean : dict
        Dumped unchanged.
    name_data_all, name_data_mean : str
        File names without the ``.json`` extension.
    path : str
        Output directory.
    """
    serialisable_all = {
        map_key: [window.tolist() for window in windows]
        for map_key, windows in processed_loops_all.items()
    }
    # The "all" file is written first, then the "mean" file.
    for file_stem, payload in ((name_data_all, serialisable_all), (name_data_mean, data_mean)):
        with open(f'{path}/{file_stem}.json', 'w') as handle:
            json.dump(payload, handle)

def get_3_to_3_loops(loops_intensities, name, binzise = 15_000, path= path_to_loops_intensities):
    """Compute the 3x3-window summaries for ``loops_intensities`` and save them as JSON.

    ``process_loops`` is run twice: once for the per-loop means and once with
    ``retain_all_values=True`` for the raw windows. Both results are written to
    ``path`` by ``save_loops`` under file names built from ``name``.
    ``binzise`` is accepted but not used.

    Returns ``(means, windows, loops)``.
    """
    mean_per_map = process_loops(loops_intensities)
    windows_per_map = process_loops(loops_intensities, retain_all_values=True)

    all_file_stem = f'loops_{name}_0.13fdr_15000res_small_NaN5_3x3bin_all'
    mean_file_stem = f'loops_{name}_0.13fdr_15000res_small_NaN5_3x3bin_mean'
    save_loops(windows_per_map, mean_per_map, all_file_stem, mean_file_stem, path)

    # NOTE(review): the third element is the notebook-level `loops`, not the
    # `loops_intensities` argument — possibly unintended; confirm before changing,
    # since callers may unpack it back into `loops`.
    return mean_per_map, windows_per_map, loops


In [30]:
# Summarise each map on its own, so the number of values averaged per loop is
# decided by that map alone, and merge the per-map results.
processed_loops_mean_MAPS = {}
processed_loops_all_MAPS = {}
for name, loops_one in loops.items():
    one_loop = {name: loops_one}
    # Only the first two return values are used. The third is deliberately dropped so
    # that `loops`, the dictionary being iterated, is never rebound inside the loop.
    processed_loops_mean, processed_loops_all = get_3_to_3_loops(one_loop, name)[:2]
    processed_loops_mean_MAPS.update(processed_loops_mean)
    processed_loops_all_MAPS.update(processed_loops_all)

# 3. Merge loop coordinates with intensities

In [40]:
# Sanity check: one '*_mean.json' file per map must exist (names containing 'svdv' are ignored).
pattern = '_mean.json'
not_pattern = 'svdv'
files_loops_intensities = sorted(
    f for f in listdir(path_to_loops_intensities) if pattern in f and not_pattern not in f
)
# Compare against the shared constant instead of repeating the literal 23.
assert len(files_loops_intensities) == number_of_files, (
    f"expected {number_of_files} mean-intensity files, found {len(files_loops_intensities)}"
)

In [43]:
# Attach the raw 3x3 windows and their means to each sorted loop table, then export it.
# The tables stored in `sorted_loops` are extended in place.
for name, df in sorted_loops.items():
    df["intensity_all"] = processed_loops_all_MAPS[name]
    df["intensity_mean"] = processed_loops_mean_MAPS[name]
    df.to_csv(
        f"{path_to_loops_intensities}/{name}_sampled_dots_final_12000000maxloci_0.13fdr_15000res_small_NaN5_final_sorted_with_intensity.bed",
        sep='\t',
        index=False,
    )

print('Done!')

Done!
