In [1]:
import os
import anndata
import cooler
import pandas as pd
from joblib import Parallel, delayed

In [2]:
# Directory whose files are opened as coolers by load_coolers() below.
folder_path = '/lmh_data/data/sclab/GSE223917/cool'

# Shared joblib executor used inside load_coolers(): 48 workers on the loky
# backend; verbose=1 makes joblib log progress while the jobs run.
parallel = Parallel(n_jobs=48, backend='loky', verbose=1)

def load_coolers(folder_path):
    """Build a bins-by-files table of per-bin contact totals from cooler files.

    Every file found under ``folder_path`` (searched recursively with
    ``os.walk``) is opened with ``cooler.Cooler`` and reduced to a single
    column holding, for each genomic bin, the summed ``count`` of all
    non-self contacts that touch that bin. The per-file columns are computed
    through the module-level ``parallel`` joblib executor and then joined.

    Parameters
    ----------
    folder_path : str
        Directory containing the cooler files.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(chrom, start)``, one ``float16`` column per file named
        after the file. Bins missing from a file are filled with 0.

    Raises
    ------
    ValueError
        If no files are found under ``folder_path``.
    """
    def load_cooler(folder_path, file_name):
        """Return a one-column frame of per-bin contact totals for one file.

        ``folder_path`` is the directory that actually contains
        ``file_name``; the resulting column is named ``file_name``.
        """
        c = cooler.Cooler(os.path.join(folder_path, file_name))
        contact = c.pixels(join=True)[:]
        # Drop only true self-contacts (same chromosome AND same start).
        # Comparing the start coordinates alone would also discard
        # inter-chromosomal contacts whose two bins happen to share a start.
        same_chrom = contact['chrom1'].astype(str) == contact['chrom2'].astype(str)
        same_start = contact['start1'] == contact['start2']
        contact = contact[~(same_chrom & same_start)]
        binsize, chromsizes = c.binsize, c.chromsizes

        # Each contact contributes its count to both of its end bins.
        _1 = contact.groupby(['chrom1', 'start1'])['count'].sum()
        _2 = contact.groupby(['chrom2', 'start2'])['count'].sum()
        _1.index.names = _2.index.names = ['chrom', 'start']
        # Discard zero-sum groups before combining the two sides.
        _1, _2 = _1[_1!=0], _2[_2!=0]
        info = pd.concat([_1, _2], axis=1).fillna(0).sum(axis=1).sort_index()

        # Enumerate every bin start of every chromosome. Ceiling division
        # gives the exact bin count; int(size / binsize) + 1 added a phantom
        # bin whenever the chromosome length was a multiple of the bin size.
        _indexs = set((chrom, int(i * binsize))
                      for chrom, size in chromsizes.items()
                      for i in range(-(-int(size) // int(binsize))))
        # Pad bins without any contact with an explicit 0.
        _indexs -= set(info.index)
        info = pd.concat([info, pd.Series([0]*len(_indexs), index=list(_indexs))]).sort_index()

        return info.to_frame().astype('float16').rename(columns={0:file_name})

    joblist = []
    for root, _dirs, files in os.walk(folder_path, topdown=False):
        for file_name in files:
            # os.walk recurses, so the file lives in ``root``, which is not
            # necessarily the top-level ``folder_path``.
            joblist.append(delayed(load_cooler)(root, file_name))

    if not joblist:
        raise ValueError('no files found under {!r}'.format(folder_path))

    infos = parallel(joblist)
    infos = pd.concat(infos, axis=1).fillna(0).sort_index()
    return infos

# Aggregate every file under folder_path into one table
# (rows: (chrom, start) bins, columns: file names).
infos = load_coolers(folder_path)
# Bare expression so the notebook renders the resulting table.
infos

[Parallel(n_jobs=48)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 104 tasks      | elapsed:   17.7s
[Parallel(n_jobs=48)]: Done 354 tasks      | elapsed:   51.0s
[Parallel(n_jobs=48)]: Done 704 tasks      | elapsed:  1.6min
[Parallel(n_jobs=48)]: Done 1154 tasks      | elapsed:  2.6min
[Parallel(n_jobs=48)]: Done 1704 tasks      | elapsed:  3.8min
[Parallel(n_jobs=48)]: Done 2354 tasks      | elapsed:  5.2min
[Parallel(n_jobs=48)]: Done 3104 tasks      | elapsed:  6.8min
[Parallel(n_jobs=48)]: Done 3954 tasks      | elapsed:  8.8min
[Parallel(n_jobs=48)]: Done 4904 tasks      | elapsed: 11.0min
[Parallel(n_jobs=48)]: Done 5954 tasks      | elapsed: 13.4min
[Parallel(n_jobs=48)]: Done 7104 tasks      | elapsed: 16.3min
[Parallel(n_jobs=48)]: Done 7895 out of 7895 | elapsed: 18.2min finished


Unnamed: 0_level_0,Unnamed: 1_level_0,GasaE751051.cool,GasaE751001.cool,GasaE751053.cool,GasaE751002.cool,GasaE751052.cool,GasaE751003.cool,GasaE751054.cool,GasaE751004.cool,GasaE751055.cool,GasaE751005.cool,...,ValbB8w1183.cool,ValbB8w1184.cool,ValbB8w1185.cool,ValbB8w1186.cool,ValbB8w1187.cool,ValbB8w1188.cool,ValbB8w1189.cool,ValbB8w1190.cool,ValbB8w1191.cool,ValbB8w1192.cool
chrom,start,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
chr1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,20000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,30000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,40000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,170990000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,171000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,171010000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,171020000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Flatten each (chrom, start) row key into a single "chrom_start" label.
bin_labels = infos.index.map('{0[0]}_{0[1]}'.format)

# Observation annotations: one row per column of the table (indexed as
# 'cells'), each tagged with a constant 'domain' value.
obs = pd.DataFrame({'domain': 'scHiC'}, index=pd.Index(infos.columns, name='cells'))

# Variable annotations: the chrom/start pair of every bin, keyed by its label.
var = infos.index.to_frame(index=False)[['chrom', 'start']].set_index(bin_labels)

# Relabel the rows, then wrap the transposed table (files x bins) in AnnData.
infos.index = bin_labels
infos = anndata.AnnData(X=infos.T, obs=obs, var=var)

  infos = anndata.AnnData(X=infos.T, obs=obs, var=var)


In [4]:
# Persist the AnnData object built in the previous cell as a gzip-compressed .h5ad file.
infos.write("/lmh_data/data/sclab/GSE223917/scHiC.h5ad", compression="gzip")