In [1]:
import os
import anndata
import cooler
import pandas as pd
from joblib import Parallel, delayed

In [2]:
# Directory holding the per-cell cooler files that load_coolers() reads.
folder_path = '/home/micl/workspace/lmh_data/Lee2019/Human_single_cell_10kb_cool'

# Shared joblib executor (48 loky workers, progress logging enabled);
# load_coolers() dispatches its per-file jobs through this object.
parallel = Parallel(n_jobs=48, backend='loky', verbose=1)

def load_coolers(folder_path):
    """Build a bins x cells table of per-bin contact totals from cooler files.

    Every file found under ``folder_path`` (searched recursively) is opened
    as a cooler and reduced by ``load_cooler`` to a single column of per-bin
    contact totals; the columns are then joined on their (chrom, start)
    index. Jobs are dispatched through the module-level ``parallel`` joblib
    executor.

    Parameters
    ----------
    folder_path : str
        Directory containing the cooler files.

    Returns
    -------
    pandas.DataFrame
        One row per (chrom, start) bin and one column per file (named after
        the file). Bins missing from a file are filled with 0.

    Raises
    ------
    ValueError
        If no files are found under ``folder_path``.
    """
    def load_cooler(folder_path, file_name):
        """Return a one-column float16 frame of per-bin contact totals.

        ``folder_path`` is the directory that directly contains
        ``file_name``; the resulting column is named ``file_name``.
        """
        c = cooler.Cooler(os.path.join(folder_path, file_name))
        contact = c.pixels(join=True)[:]

        # Drop self-contacts only. A bin is identified by (chrom, start), so
        # comparing starts alone would also discard inter-chromosomal pixels
        # that merely share a start offset (e.g. chr1:10000 - chr2:10000).
        is_self = (
            (contact['start1'] == contact['start2'])
            & (contact['chrom1'].astype(str) == contact['chrom2'].astype(str))
        )
        contact = contact[~is_self]
        binsize, chromsizes = int(c.binsize), c.chromsizes

        # Each pixel contributes its count to both of its bins.
        first_side = contact.groupby(['chrom1', 'start1'])['count'].sum()
        second_side = contact.groupby(['chrom2', 'start2'])['count'].sum()
        first_side.index.names = second_side.index.names = ['chrom', 'start']
        # Zero-total rows are removed here and re-added uniformly below
        # (presumably they stem from unobserved group combinations).
        first_side = first_side[first_side != 0]
        second_side = second_side[second_side != 0]
        info = (
            pd.concat([first_side, second_side], axis=1)
            .fillna(0)
            .sum(axis=1)
            .sort_index()
        )

        # Enumerate every bin of every chromosome. Ceiling division avoids
        # inventing a trailing bin when a chromosome length is an exact
        # multiple of the bin size.
        all_bins = {
            (chrom, i * binsize)
            for chrom in chromsizes.keys()
            for i in range(-(-int(chromsizes[chrom]) // binsize))
        }
        missing = list(all_bins - set(info.index))
        if missing:
            padding = pd.Series(
                0,
                index=pd.MultiIndex.from_tuples(missing, names=['chrom', 'start']),
            )
            info = pd.concat([info, padding]).sort_index()

        # NOTE: float16 keeps the table small, but it represents integers
        # exactly only up to 2048 and overflows beyond 65504.
        return info.to_frame(name=file_name).astype('float16')

    # os.walk recurses into sub-directories, so each file must be joined to
    # the directory it was actually found in (root), not the top-level folder.
    joblist = [
        delayed(load_cooler)(root, file_name)
        for root, _dirs, files in os.walk(folder_path, topdown=False)
        for file_name in files
    ]
    if not joblist:
        raise ValueError('no cooler files found under {!r}'.format(folder_path))

    infos = parallel(joblist)
    infos = pd.concat(infos, axis=1).fillna(0).sort_index()
    return infos

# Aggregate every file under folder_path into one table
# (rows: (chrom, start) bins, columns: file names).
infos = load_coolers(folder_path)
# Bare expression so the notebook displays the resulting frame.
infos

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:   19.3s
[Parallel(n_jobs=48)]: Done 354 tasks      | elapsed:   53.5s
[Parallel(n_jobs=48)]: Done 704 tasks      | elapsed:  1.7min
[Parallel(n_jobs=48)]: Done 1154 tasks      | elapsed:  2.7min
[Parallel(n_jobs=48)]: Done 1704 tasks      | elapsed:  4.0min
[Parallel(n_jobs=48)]: Done 2354 tasks      | elapsed:  5.5min
[Parallel(n_jobs=48)]: Done 3104 tasks      | elapsed:  7.2min
[Parallel(n_jobs=48)]: Done 3954 tasks      | elapsed:  9.2min
[Parallel(n_jobs=48)]: Done 4238 out of 4238 | elapsed:  9.8min finished


Unnamed: 0_level_0,Unnamed: 1_level_0,190315_29yr_6_E11_AD008_ODC_10kb_contacts.cool,181218_21yr_2_B11_AD002_Vip_10kb_contacts.cool,190315_29yr_4_A3_AD006_L5_10kb_contacts.cool,190315_21yr_6_F11_AD012_L23_10kb_contacts.cool,181218_21yr_2_G12_AD010_MP_10kb_contacts.cool,190305_21yr_2_B4_AD002_Vip_10kb_contacts.cool,190315_21yr_4_G1_AD012_Astro_10kb_contacts.cool,181218_21yr_2_G7_AD004_Astro_10kb_contacts.cool,181218_21yr_2_E8_AD002_L4_10kb_contacts.cool,190315_29yr_4_B7_AD010_ODC_10kb_contacts.cool,...,181218_21yr_3_D5_AD010_Vip_10kb_contacts.cool,190305_21yr_2_G6_AD002_L23_10kb_contacts.cool,190315_29yr_4_H11_AD007_L23_10kb_contacts.cool,190315_21yr_6_G12_AD006_OPC_10kb_contacts.cool,190315_29yr_4_G1_AD002_Ndnf_10kb_contacts.cool,190305_21yr_2_H6_AD007_MP_10kb_contacts.cool,190305_21yr_2_B1_AD001_OPC_10kb_contacts.cool,190315_21yr_4_C7_AD010_Vip_10kb_contacts.cool,181218_21yr_2_E2_AD012_L6_10kb_contacts.cool,190305_29yr_2_H6_AD001_Vip_10kb_contacts.cool
chrom,start,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,10000,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0
chr1,20000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,30000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
chr1,40000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY,59330000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrY,59340000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrY,59350000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrY,59360000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Per-cell metadata: one row per column of `infos`, indexed by 'cells',
# carrying a constant 'domain' label.
obs = pd.DataFrame({'domain': 'scHiC'}, index=infos.columns.rename('cells'))

# Flat "<chrom>_<start>" label for every bin, computed once and reused for
# both the variable table and the matrix itself.
bin_labels = infos.index.map('{0[0]}_{0[1]}'.format)

# Per-bin metadata: the original chrom/start pair, keyed by the flat label.
var = infos.index.to_frame(index=False)[['chrom', 'start']].set_index(bin_labels)

# Relabel the rows, then wrap the transposed table (cells x bins) together
# with its annotations.
infos.index = bin_labels
infos = anndata.AnnData(X=infos.T, obs=obs, var=var)

  infos = anndata.AnnData(X=infos.T, obs=obs, var=var)


In [4]:
# Persist the AnnData object built above as a gzip-compressed .h5ad file.
infos.write("/home/micl/workspace/lmh_data/Lee2019/scHiC.h5ad", compression="gzip")