In [1]:
import os
import random
from multiprocessing import Pool

import anndata
import cooler
import numpy as np
import pandas as pd
from tqdm import tqdm

# Shared configuration: dataset root, worker-process count, and RNG seed.
root_dir = '/lmh_data/data/sclab'
threads = 48
random.seed(0)

map_info = pd.read_csv(
    os.path.join(root_dir, 'sclab', 'map_result.csv'),
    sep='\t',
    index_col=0,
)

# hic: directory of per-cell .cool files plus the AnnData object describing the cells
cools_path = os.path.join(root_dir, 'Lee2019', 'Human_single_cell_10kb_cool')
hic = anndata.read_h5ad(os.path.join(root_dir, 'sclab', 'hic_result.h5ad'))

# rna: AnnData object and the marker-gene table
rna = anndata.read_h5ad(os.path.join(root_dir, 'sclab', 'rna_result.h5ad'))
rna_marker_gene = pd.read_csv(
    os.path.join(root_dir, 'sclab', 'rna_marker_gene.csv'),
    sep='\t',
    index_col=0,
)

# Restrict the marker genes to those whose chromEnd lies more than 50000
# past chromStart in rna.var.
_tmp = rna.var.loc[rna_marker_gene.index]
_tmp = _tmp.loc[lambda _var: _var['chromEnd'] > _var['chromStart'] + 50000]
rna_marker_gene = rna_marker_gene.loc[_tmp.index]

In [2]:
# Keep only the cells whose cell_type annotation is 'Astro'.
_is_astro = hic.obs['cell_type'] == 'Astro'
hic = hic[_is_astro]

def get_cooler(hic_name):
    """Open the contact file belonging to one cell.

    Parameters
    ----------
    hic_name
        Cell identifier; the file looked up is
        ``<cools_path>/<hic_name>_10kb_contacts.cool``.

    Returns
    -------
    tuple
        ``(hic_name, cooler.Cooler)`` so that a list of results can be fed
        straight into ``dict``.
    """
    cool_file = os.path.join(cools_path, '{}_10kb_contacts.cool'.format(hic_name))
    handle = cooler.Cooler(cool_file)
    return hic_name, handle
# Build the cell-name -> Cooler lookup, opening the files in worker processes.
with Pool(threads) as p:
    _named_coolers = p.map(get_cooler, hic.obs_names)
    _coolers = dict(_named_coolers)
    
def catch_matrix(_args):
    """Sum the contacts between two gene regions in one cell.

    Parameters
    ----------
    _args : tuple
        ``(hic_name, gene_name_1, gene_name_2)``. ``hic_name`` selects the
        entry of ``_coolers``; the gene names are looked up in ``rna.var``,
        whose ``chrom``, ``chromStart`` and ``chromEnd`` columns define each
        region.

    Returns
    -------
    tuple
        ``(gene_name_2, total)`` where ``total`` is the sum of the
        unbalanced (``balance=False``) matrix fetched for the two regions.
    """
    hic_name, gene_name_1, gene_name_2 = _args

    def _as_region(gene_info):
        # Region string in the "chrom:start-end" form passed to fetch().
        return '{}:{}-{}'.format(
            gene_info['chrom'], int(gene_info['chromStart']), int(gene_info['chromEnd'])
        )

    first_gene = rna.var.loc[gene_name_1]
    second_gene = rna.var.loc[gene_name_2]
    selector = _coolers[hic_name].matrix(balance=False)
    block = selector.fetch(_as_region(first_gene), _as_region(second_gene))
    return gene_name_2, block.sum()

In [3]:
# Contact counts between SLC1A2 and every gene in rna.var_names, one row per
# cell in hic.obs_names.
#
# A single worker pool is shared by all cells (the previous version started
# and tore down a fresh pool for every cell), and the per-cell records are
# gathered in a list so the DataFrame is built once instead of being
# re-copied by pd.concat on every iteration.
_target_genes = list(rna.var_names)  # loop-invariant, so computed once
_contact_rows = []
with Pool(threads) as p:
    for hic_name in tqdm(hic.obs_names):
        _args = [(hic_name, 'SLC1A2', gene_name) for gene_name in _target_genes]
        # catch_matrix returns (gene_name, count) pairs -> one dict per cell
        _contact_rows.append(dict(p.map(catch_matrix, _args)))
_contacts = pd.DataFrame(_contact_rows, index=list(hic.obs_names))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 449/449 [1:00:35<00:00,  8.10s/it]


In [4]:
# Display the per-cell contact table (rows: cells, columns: genes).
_contacts

Unnamed: 0,DDX11L1,WASH7P,MIR1302-2,FAM138A,OR4G4P,OR4G11P,OR4F5,CICP27,OR4F29,CICP7,...,RBMY2DP,PPP1R12BP1,CYCSP48,ANKRD36P1,CYCSP49,TPTE2P4,SLC25A15P1,PARP4P1,FAM58CP,CTBP2P1
190315_21yr_4_G1_AD012_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181218_21yr_2_G7_AD004_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_6_F12_AD006_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_21yr_2_A9_AD002_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_29yr_2_G9_AD001_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190305_29yr_2_E6_AD006_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_21yr_2_B5_AD006_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_29yr_4_H2_AD001_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_29yr_4_A8_AD007_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Add (or refresh) a row holding the per-gene total over all cells.
# A total row left behind by an earlier run of this cell is dropped before
# summing, so re-running the cell cannot fold the old total into the new one.
_contacts.loc["列总和"] = _contacts.drop(index="列总和", errors="ignore").sum()
_contacts

Unnamed: 0,DDX11L1,WASH7P,MIR1302-2,FAM138A,OR4G4P,OR4G11P,OR4F5,CICP27,OR4F29,CICP7,...,RBMY2DP,PPP1R12BP1,CYCSP48,ANKRD36P1,CYCSP49,TPTE2P4,SLC25A15P1,PARP4P1,FAM58CP,CTBP2P1
190315_21yr_4_G1_AD012_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181218_21yr_2_G7_AD004_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_6_F12_AD006_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_21yr_2_A9_AD002_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_29yr_2_G9_AD001_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190305_21yr_2_B5_AD006_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_29yr_4_H2_AD001_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_29yr_4_A8_AD007_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_21yr_2_A7_AD004_Astro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Per-gene totals ranked from highest to lowest, keeping only genes with at
# least one contact.
_test = (
    _contacts.loc['列总和']
    .sort_values(ascending=False)
    .loc[lambda _totals: _totals > 0]
)
_test

SLC1A2     35118
PAMR1       1275
LDLRAD3      510
PDHX         429
CD44         280
           ...  
DSCR3          1
SGSM2          1
KANSL3         1
DAO            1
DYRK1A         1
Name: 列总和, Length: 5407, dtype: int64

In [25]:
# Print every gene kept in _test, one name per line, in ranked order.
for gene in _test.index:
    print(gene)

SLC1A2
PAMR1
LDLRAD3
PDHX
CD44
TRIM44
ABTB2
APIP
PRR5L
EHF
LRRC4C
ELF5
NAV2
FJX1
TSPAN18
HSD17B12
NAT10
CAT
COMMD9
CAPRIN1
CD82
SOX6
EXT2
SHANK2
HIPK3
MIR1343
PRDM11
KIRREL3
KCNQ1
LINC00486
RAG1
TRAF6
ELP4
AMBRA1
LMO2
TTC17
TP53I11
NELL1
GRIK4
CD59
KIAA1549L
UVRAG
FBXO3
SBF2
MIR3973
DCDC1
PHF21A
FAT3
SPON1
NTM
RCN1
DLG2
RSF1
ASIC2
CNTN5
CSTF3
GRIA4
SERGEF
OSBPL5
PLEKHA7
PKNOX2
NUCB2
CARS
DISC1FP1
TENM4
AMPD3
PIK3C2A
SH3RF3
PVRL1
NAP1L4
C11orf74
GALNT18
GAB2
AMOTL1
C11orf49
STIM1
MPPED2
SYT13
RAD51B
MYO7A
ALK
OPCML
FCHSD2
LUZP2
ALKBH3
TCP11L1
DNHD1
ST5
EXOC4
MAML2
NSMCE2
DGKZ
GUCY1A2
MYT1L
CADM1
TNKS1BP1
CREB5
GPC5
CCDC73
LGR4
PTPRD
RAG2
NRXN2
POLD3
CAPN5
PRKG1
FADS2
PICALM
STK33
NARS2
SLC45A4
GPR20
JAZF1
PTPRT
KRT18P14
ALG9
QSER1
PTPN5
CDH12
DAGLA
CSMD1
DSCAML1
METTL15
CORO1C
ACER3
SWAP70
C11orf30
CRY2
ZNF521
CREB3L1
PARD3
NRXN1
NUP98
EDEM2
MACROD1
TTLL5
SEMA6D
ELP3
AQP4-AS1
LIMD1
ARHGAP32
SYT9
RNF38
CDH23
HBG2
PVT1
SIK3
RNF169
XRRA1
TRAPPC9
TSPAN7
KSR1
ARHGAP42
TRPC6
CAMTA1
DAB1
SORCS

In [28]:
# Hard-coded gene panel named "body".
# NOTE(review): the origin of this list is not shown in this notebook —
# presumably genes ranked by a gene-body-based score; confirm and record the source.
body_genes = ['MALAT1', 'SLC1A2', 'PCDH9', 'ERBB4', 'LSAMP', 'SLC1A3', 'GPM6A', 'NPAS3',
              'CTNND2', 'GPC5', 'SPARCL1', 'CST3', 'DPP10', 'XIST', 'NRXN1', 'NTRK2',
              'PTPRG', 'QKI', 'MAPK10', 'NKAIN3', 'TRPM3', 'NTM', 'ARHGAP24', 'FBXL7',
              'BMPR1B', 'ZBTB20', 'MAGI2', 'FTH1', 'RYR3', 'MEIS2', 'ANK2', 'ADCY2',
              'FAM171B', 'PSD3', 'PTPRZ1', 'DOCK4', 'NFIB', 'ZSWIM6', 'PTK2', 'CADM1',
              'AHCYL2', 'PAMR1', 'PLCB1', 'TMEM108', 'RORA', 'KCNMA1', 'PITPNC1', 'ZFHX4',
              'FTX', 'WIF1', 'ZHX3', 'PPAP2B', 'LINC00299', 'LRRC16A', 'TCF4', 'LIFR',
              'ASTN2', 'AQP4', 'N4BP2L2', 'AUTS2', 'TRIM2', 'RGS7', 'PNISR', 'NAV3', 'MKLN1',
              'HSPA1A', 'NRG3', 'NAALADL2', 'CAMTA1', 'ATG7', 'SLCO1C1', 'DST', 'TJP1',
              'PHACTR3', 'GLUD1', 'PDE4D', 'ARHGEF4', 'ASPH', 'PPP1R3C', 'CALM2', 'PRTFDC1',
              'PHF14', 'MED13L', 'SLC7A11', 'RERE', 'SLC25A18', 'ARHGAP5', 'DYNC2H1', 'NCAM1',
              'NAV2', 'FBXL17', 'UBR5', 'ARHGAP26', 'SORL1', 'CPE', 'ZNF638', 'FGFR2', 'ADD3',
              'RFX3', 'SCD5'
             ]
# Hard-coded gene panel named "promoter".
# NOTE(review): origin likewise not shown here — presumably the promoter-based
# counterpart of body_genes; confirm and record the source.
promoter_genes = ['MALAT1', 'SLC1A2', 'PCDH9', 'GPM6A', 'LSAMP', 'ERBB4', 'NPAS3', 'GPC5', 'NRXN1',
                  'CTNND2', 'SPARCL1', 'NTRK2', 'XIST', 'SLC1A3', 'CST3', 'ZBTB20', 'MAPK10', 'NTM',
                  'ANK2', 'FBXL7', 'ARHGAP24', 'RORA', 'DPP10', 'RYR3', 'TRPM3', 'NKAIN3', 'PTPRG',
                  'PITPNC1', 'FAM171B', 'FTH1', 'LRRC16A', 'MEIS2', 'QKI', 'PLCB1', 'ZSWIM6', 'FTX',
                  'DOCK4', 'BMPR1B', 'AUTS2', 'WIF1', 'PTK2', 'NEBL', 'ADCY2', 'PPAP2B', 'NFIB',
                  'PREX2', 'LRRC4C', 'LINC00299', 'RGS7', 'NRG3', 'KCNMA1', 'NFIA', 'PSD3', 'HSPA1A',
                  'TJP1', 'TMEM108', 'PTPRZ1', 'LIFR', 'AHCYL2', 'N4BP2L2', 'NAV3', 'GLUD1', 'ZHX3',
                  'SLC7A11', 'PHLPP1', 'ZFHX4', 'PAMR1', 'CAMTA1', 'TRIM2', 'PRTFDC1', 'MED13L',
                  'SORBS1', 'RERE', 'RFX3', 'CADM1', 'PHACTR3', 'CALM2', 'PNISR', 'PDE4B', 'DYNC2H1',
                  'LRIG1', 'ASTN2', 'ZNF638', 'JMJD1C', 'TCF4', 'ACSL6', 'SLC4A4', 'GLUL', 'AQP4',
                  'FGFR2', 'PSD2', 'HNRNPA2B1', 'SLCO1C1', 'SCD5', 'NCAM2', 'PHF14', 'MID1', 'DOCK3',
                  'ARHGEF4', 'GNAQ'
                 ]

def get_space_approach_genes(genes, reference=None):
    """Return the entries of ``genes`` that also occur in a reference gene set.

    Parameters
    ----------
    genes : iterable
        Candidate gene names. Their order (and any duplicates) is preserved
        in the result.
    reference : iterable, optional
        Gene names to test membership against. When omitted, the index of
        the notebook-level ``_test`` Series is used, which is the behaviour
        this function had before the parameter existed.

    Returns
    -------
    list
        The members of ``genes`` found in ``reference``.
    """
    if reference is None:
        # Default to the genes that have at least one recorded contact.
        reference = _test.index
    known = set(reference)  # constant-time membership checks
    return [_name for _name in genes if _name in known]

In [29]:
# Entries of body_genes that also appear in the index of _test.
get_space_approach_genes(body_genes)

['SLC1A2',
 'PCDH9',
 'ERBB4',
 'LSAMP',
 'SLC1A3',
 'GPM6A',
 'NPAS3',
 'CTNND2',
 'GPC5',
 'DPP10',
 'NRXN1',
 'NTRK2',
 'PTPRG',
 'QKI',
 'NKAIN3',
 'TRPM3',
 'NTM',
 'ARHGAP24',
 'FBXL7',
 'BMPR1B',
 'ZBTB20',
 'MAGI2',
 'RYR3',
 'MEIS2',
 'ANK2',
 'ADCY2',
 'PSD3',
 'DOCK4',
 'PTK2',
 'CADM1',
 'PAMR1',
 'PLCB1',
 'TMEM108',
 'RORA',
 'KCNMA1',
 'PITPNC1',
 'LINC00299',
 'LRRC16A',
 'TCF4',
 'LIFR',
 'ASTN2',
 'AQP4',
 'AUTS2',
 'TRIM2',
 'RGS7',
 'PNISR',
 'NAV3',
 'MKLN1',
 'HSPA1A',
 'NRG3',
 'NAALADL2',
 'CAMTA1',
 'ATG7',
 'PHACTR3',
 'PDE4D',
 'ASPH',
 'MED13L',
 'RERE',
 'DYNC2H1',
 'NCAM1',
 'NAV2',
 'FBXL17',
 'ARHGAP26',
 'SORL1',
 'ZNF638',
 'FGFR2',
 'ADD3']

In [30]:
# Entries of promoter_genes that also appear in the index of _test.
get_space_approach_genes(promoter_genes)

['SLC1A2',
 'PCDH9',
 'GPM6A',
 'LSAMP',
 'ERBB4',
 'NPAS3',
 'GPC5',
 'NRXN1',
 'CTNND2',
 'NTRK2',
 'SLC1A3',
 'ZBTB20',
 'NTM',
 'ANK2',
 'FBXL7',
 'ARHGAP24',
 'RORA',
 'DPP10',
 'RYR3',
 'TRPM3',
 'NKAIN3',
 'PTPRG',
 'PITPNC1',
 'LRRC16A',
 'MEIS2',
 'QKI',
 'PLCB1',
 'DOCK4',
 'BMPR1B',
 'AUTS2',
 'PTK2',
 'NEBL',
 'ADCY2',
 'PREX2',
 'LRRC4C',
 'LINC00299',
 'RGS7',
 'NRG3',
 'KCNMA1',
 'NFIA',
 'PSD3',
 'HSPA1A',
 'TMEM108',
 'LIFR',
 'NAV3',
 'PHLPP1',
 'PAMR1',
 'CAMTA1',
 'TRIM2',
 'MED13L',
 'SORBS1',
 'RERE',
 'CADM1',
 'PHACTR3',
 'PNISR',
 'PDE4B',
 'DYNC2H1',
 'ASTN2',
 'ZNF638',
 'JMJD1C',
 'TCF4',
 'SLC4A4',
 'AQP4',
 'FGFR2',
 'MID1',
 'DOCK3',
 'GNAQ']