In [1]:
import os
import random
from multiprocessing import Pool

import anndata
import cooler
import numpy as np
import pandas as pd
from tqdm import tqdm

# ---- Settings ----
root_dir = '/lmh_data/data/sclab'  # NOTE(review): absolute, machine-specific data root
threads = 48                       # worker-process count handed to multiprocessing.Pool
random.seed(0)                     # seeds only the stdlib `random` generator

# ---- Inputs ----
# Most processed inputs live in the 'sclab' sub-directory of the data root.
_sclab_dir = os.path.join(root_dir, 'sclab')

map_info = pd.read_csv(os.path.join(_sclab_dir, 'map_result.csv'), sep='\t', index_col=0)

# Hi-C: directory holding the .cool files, plus the AnnData object read from hic_result.h5ad.
cools_path = os.path.join(root_dir, 'Lee2019', 'Human_single_cell_10kb_cool')
hic = anndata.read_h5ad(os.path.join(_sclab_dir, 'hic_result.h5ad'))

# RNA: the AnnData object read from rna_result.h5ad and the marker-gene table.
rna = anndata.read_h5ad(os.path.join(_sclab_dir, 'rna_result.h5ad'))
rna_marker_gene = pd.read_csv(
    os.path.join(_sclab_dir, 'rna_marker_gene.csv'), sep='\t', index_col=0
)

# Keep only marker genes whose chromEnd exceeds chromStart by more than 50000
# (presumably base pairs -- confirm against the annotation in rna.var).
_tmp = rna.var.loc[rna_marker_gene.index]
_tmp = _tmp.loc[_tmp['chromEnd'] > _tmp['chromStart'] + 50000]
rna_marker_gene = rna_marker_gene.loc[_tmp.index]

In [2]:
# Restrict the Hi-C AnnData to the observations annotated with cell_type 'OPC'.
_is_opc = hic.obs['cell_type'] == 'OPC'
hic = hic[_is_opc]

def get_cooler(hic_name):
    """Open the .cool file belonging to one Hi-C cell.

    Parameters
    ----------
    hic_name : str
        Cell identifier; the file is looked up as
        ``<cools_path>/<hic_name>_10kb_contacts.cool``.

    Returns
    -------
    tuple
        ``(hic_name, cooler.Cooler)`` so the results of a parallel map can be
        passed straight to ``dict``.
    """
    cool_path = os.path.join(cools_path, f'{hic_name}_10kb_contacts.cool')
    return hic_name, cooler.Cooler(cool_path)
# Open one Cooler per selected cell in parallel and key the handles by cell name.
with Pool(threads) as p:
    _cooler_pairs = p.map(get_cooler, hic.obs_names)
    _coolers = dict(_cooler_pairs)
    
def catch_matrix(_args):
    """Sum the raw Hi-C contacts between two gene loci in one cell.

    Parameters
    ----------
    _args : tuple
        ``(hic_name, gene_name_1, gene_name_2)``. ``hic_name`` is a key of the
        module-level ``_coolers`` dict; both gene names are labels of
        ``rna.var``, which must provide ``chrom``, ``chromStart`` and
        ``chromEnd`` for each gene.

    Returns
    -------
    tuple
        ``(gene_name_2, total)`` where ``total`` is the sum of the unbalanced
        contact sub-matrix fetched for (gene 1 region) x (gene 2 region).
    """
    hic_name, gene_name_1, gene_name_2 = _args

    # Look up both annotation rows first, then the matrix selector.
    info_1 = rna.var.loc[gene_name_1]
    info_2 = rna.var.loc[gene_name_2]
    selector = _coolers[hic_name].matrix(balance=False)

    # Region strings in the "chrom:start-end" form passed to fetch().
    regions = [
        f"{info['chrom']}:{int(info['chromStart'])}-{int(info['chromEnd'])}"
        for info in (info_1, info_2)
    ]
    return gene_name_2, selector.fetch(*regions).sum()

In [3]:
# Build a (cell x gene) table of raw contact counts between the PDGFRA locus
# and every gene in rna.var_names.
#
# The worker pool is created once and reused for every cell (spawning 48
# processes per iteration is pure overhead), and the per-cell rows are
# collected in a list so the DataFrame is built a single time instead of being
# re-copied by pd.concat on every iteration.
_gene_names = list(rna.var_names)  # loop-invariant, so computed once
_cell_names = list(hic.obs_names)
_rows = []
with Pool(threads) as p:
    for hic_name in tqdm(_cell_names):
        _args = [(hic_name, 'PDGFRA', gene_name) for gene_name in _gene_names]
        # catch_matrix returns (gene_name, count) pairs -> one dict per cell.
        _rows.append(dict(p.map(catch_matrix, _args)))
_contacts = pd.DataFrame(_rows, index=_cell_names)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [28:21<00:00,  8.38s/it]


In [4]:
# Display the per-cell contact table (rows: cells, columns: genes).
_contacts

Unnamed: 0,DDX11L1,WASH7P,MIR1302-2,FAM138A,OR4G4P,OR4G11P,OR4F5,CICP27,OR4F29,CICP7,...,RBMY2DP,PPP1R12BP1,CYCSP48,ANKRD36P1,CYCSP49,TPTE2P4,SLC25A15P1,PARP4P1,FAM58CP,CTBP2P1
190315_21yr_6_A10_AD001_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_29yr_2_H9_AD008_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_4_G12_AD012_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_4_B11_AD007_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181218_21yr_2_D3_AD008_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190315_29yr_4_F8_AD012_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_29yr_2_B6_AD004_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_4_D3_AD008_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_6_G12_AD006_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Add a summary row holding the per-gene total across all cells. The row label
# "列总和" means "column total".
# Any existing summary row is excluded before summing, so re-running this cell
# refreshes the totals instead of folding the old totals into the new ones.
_contacts.loc["列总和"] = _contacts.drop(index="列总和", errors="ignore").sum()
_contacts

Unnamed: 0,DDX11L1,WASH7P,MIR1302-2,FAM138A,OR4G4P,OR4G11P,OR4F5,CICP27,OR4F29,CICP7,...,RBMY2DP,PPP1R12BP1,CYCSP48,ANKRD36P1,CYCSP49,TPTE2P4,SLC25A15P1,PARP4P1,FAM58CP,CTBP2P1
190315_21yr_6_A10_AD001_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_29yr_2_H9_AD008_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_4_G12_AD012_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_4_B11_AD007_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181218_21yr_2_D3_AD008_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190305_29yr_2_B6_AD004_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_4_D3_AD008_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190315_21yr_6_G12_AD006_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190305_21yr_2_B1_AD001_OPC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Genes ranked by their total contact count (largest first), keeping only the
# genes whose total is strictly positive.
_test = (
    _contacts.loc['列总和']
    .sort_values(ascending=False)
    .loc[lambda totals: totals > 0]
)
_test

FIP1L1      6723
PDGFRA      6155
SCFD2        325
LNX1         173
KIT           48
            ... 
SERBP1P2       1
MFAP3L         1
PEX14          1
HTR2C          1
MCC            1
Name: 列总和, Length: 1025, dtype: int64

In [7]:
# Print every gene label of _test, one per line, in _test's (descending-total) order.
for gene in _test.index:
    print(gene)

FIP1L1
PDGFRA
SCFD2
LNX1
KIT
LNX1-AS1
CHIC2
USP46
KIAA1211
LINC00486
CLOCK
DCUN1D4
ERVMER34-1
LNX1-AS2
FRYL
USP46-AS1
WRN
TMEM165
EPHA5
RPL21P44
EXOC1
RPL38P3
RASGEF1B
THEGL
GLIS1
MARCH1
TEC
KDR
MACROD2
SRD5A3
POP4
GSX2
ADAMTS9-AS2
ZNF704
GABRB1
LHPP
KLHL29
FHIT
PTPRD
UBA6-AS1
KCTD8
CDH10
FUT8
NMU
SLAIN2
SRD5A3-AS1
FCF1P8
SGCB
DANCR
GRID2
ARHGAP24
LINC00504
RASL11B
SORCS2
LDLRAD4-AS1
LDLRAD4
MORF4L2P1
ADAMTS3
TPD52
AGBL4
AASDH
PPAT
CLTCL1
SIL1
FAM155A
SPATA18
GLDCP1
COLEC12
STPG2
MAD2L1
ZNF536
CAMK2A
PACRG
CDH2
ZNF804A
LSAMP
NKAIN2
RAD21L1
SOX5
JMJD1C
ARID5B
SQSTM1
SNTG1
PSMG2
EXOC4
TRPC5
TOX2
AGPS
LINC00348
RALGAPB
FGGY
GNAI3
TENM1
LPP
NYAP1
IGFBP7-AS1
SPINK2
NFKB1
TET2
PAICS
PRKG1
PCDHB11
GREB1L
GOLGA7
PDCL2
CRB1
MYLK3
FAM19A5
ICK
UNC79
NSUN7
ROBO2
ZFYVE28
FAM193A
GRAMD2
SYNE1
RELL1
C4orf50
TRMT44
LINC00682
MLLT4
OR4C4P
ASIC2
ARHGEF10L
WDR1
ANKRD34C
MIR3138
RGS16
FGFRL1
TMEM175
UBE2H
PRIM2
FBXO9
MEGF9
CSN1S1
GALNTL6
ANPEP
CLDN10
CLDN10-AS1
OR9Q1
SCIN
DOCK9
EPHA6
SMYD3
TRHDE
GABRA2
ME

In [8]:
# Hand-entered list of gene symbols.
# NOTE(review): presumably a ranked gene list derived from gene-body signal;
# its provenance is not shown in this notebook -- confirm and record the source.
body_genes = ['KCNIP4', 'MALAT1', 'LHFPL3', 'LRRC4C', 'CSMD1', 'PCDH9', 'ERBB4', 'NRXN1', 'DMD',
              'IL1RAPL1', 'XIST', 'LSAMP', 'SOX2-OT', 'GRIA2', 'GRM5', 'TNR', 'PTPRG', 'PDZD2',
              'GRIA4', 'NTM', 'GALNT13', 'NPAS3', 'SHISA9', 'MARCH1', 'NTRK2', 'PHF14', 'ZSWIM6',
              'PTPRZ1', 'GPM6A', 'NFIB', 'PCDH15', 'ANKS1B', 'SOX5', 'NEGR1', 'TMEM108', 'ANK2',
              'SLC8A1', 'SPOCK3', 'PDE4B', 'CALM2', 'FGF14', 'LPPR1', 'MAPK10', 'NAV1', 'IL1RAP',
              'FHIT', 'NLGN4X', 'PDE4D', 'KAZN', 'FAM155A', 'GPC6', 'DOCK4', 'COL9A1', 'PTPRD',
              'ZNF521', 'CPE', 'OLIG1', 'SEZ6L', 'ANK3', 'MAGI2', 'LRFN5', 'DNER', 'GLCCI1', 'PSD3',
              'ASTN2', 'DCAF6', 'SORCS1', 'FGF12', 'XYLT1', 'GRID2', 'RGS7', 'TJP1', 'ALCAM', 'NCAM2',
              'CTTNBP2', 'GRM7', 'FTH1', 'GUCY1A2', 'FAF1', 'PLXDC2', 'SEMA6A', 'FBXL17', 'OPCML',
              'ZNF638', 'MEIS2', 'NOL4', 'TCF12', 'ZFPM2', 'DNM3', 'PPFIA2', 'AGAP1', 'NLGN1', 'LPP',
              'ADARB2', 'ZHX3', 'PHACTR3', 'SIK3', 'CCSER1', 'GSK3B', 'PRTFDC1'
             ]
# Hand-entered list of gene symbols.
# NOTE(review): presumably a ranked gene list derived from promoter signal;
# its provenance is not shown in this notebook -- confirm and record the source.
promoter_genes = ['KCNIP4', 'LHFPL3', 'MALAT1', 'NRXN1', 'CSMD1', 'PCDH9', 'LRRC4C', 'ERBB4',
                  'PCDH15', 'LSAMP', 'XIST', 'TNR', 'DMD', 'IL1RAPL1', 'GRIA2', 'NPAS3', 'NTM',
                  'MARCH1', 'GRM5', 'SOX2-OT', 'GALNT13', 'GRIA4', 'NTRK2', 'NFIB', 'GPM6A',
                  'ANK2', 'PTPRZ1', 'ANKS1B', 'ASTN2', 'FGF14', 'NEGR1', 'SHISA9', 'KAZN',
                  'DOCK4', 'MAGI2', 'ZSWIM6', 'SLC8A1', 'LPPR1', 'SPOCK3', 'SOX5', 'PDZD2',
                  'IL1RAP', 'PTPRD', 'PDE4B', 'GPC6', 'NLGN1', 'ZNF521', 'FAM155A', 'PDE4D',
                  'OPCML', 'NLGN4X', 'CPE', 'CLASP2', 'FHIT', 'SCD5', 'KCNMB2', 'PHF14', 'GRM7',
                  'RGS7', 'DNER', 'SEZ6L', 'HIP1', 'PPP1R12A', 'SLC44A5', 'TMEM132C', 'MAPK10',
                  'GRID2', 'TMEM108', 'TJP1', 'SORCS1', 'RFTN2', 'GSK3B', 'CTTNBP2', 'VCAN', 'NCAM2',
                  'N4BP2L2', 'PSD3', 'ANKRD17', 'PREX2', 'COL9A1', 'SIK3', 'SORBS2', 'JMJD1C',
                  'PLXDC2', 'CCSER1', 'PPFIA2', 'AGAP1', 'DPP10', 'PTPRG', 'RIT2', 'ZNF638', 'NFIA',
                  'XYLT1', 'SNTG1', 'CPEB4', 'LRFN5', 'KLHL13', 'ZHX3', 'CST3', 'RIN2'
                 ]

def get_space_approach_genes(genes, contacts=None):
    """Return the entries of ``genes`` that are labels of a contact table.

    Parameters
    ----------
    genes : iterable of str
        Candidate gene names. Order and duplicates are preserved in the result.
    contacts : pandas.Series or pandas.DataFrame, optional
        Object whose ``.index`` holds the gene names to match against.
        Defaults to the notebook-level ``_test`` series, looked up at call
        time, so existing one-argument calls behave as before.

    Returns
    -------
    list of str
        The names from ``genes`` found in ``contacts.index``.
    """
    if contacts is None:
        # Backward-compatible default: the notebook-level series of contacted genes.
        contacts = _test
    known = contacts.index
    return [_name for _name in genes if _name in known]

In [9]:
# Entries of body_genes that also appear in _test.index (genes with a non-zero contact total).
get_space_approach_genes(body_genes)

['KCNIP4',
 'LHFPL3',
 'LRRC4C',
 'CSMD1',
 'DMD',
 'LSAMP',
 'SOX2-OT',
 'NPAS3',
 'MARCH1',
 'GPM6A',
 'PCDH15',
 'ANKS1B',
 'SOX5',
 'NEGR1',
 'ANK2',
 'SLC8A1',
 'LPPR1',
 'MAPK10',
 'FHIT',
 'PDE4D',
 'FAM155A',
 'GPC6',
 'PTPRD',
 'ASTN2',
 'SORCS1',
 'GRID2',
 'OPCML',
 'LPP',
 'CCSER1',
 'GSK3B']

In [10]:
# Entries of promoter_genes that also appear in _test.index (genes with a non-zero contact total).
get_space_approach_genes(promoter_genes)

['KCNIP4',
 'LHFPL3',
 'CSMD1',
 'LRRC4C',
 'PCDH15',
 'LSAMP',
 'DMD',
 'NPAS3',
 'MARCH1',
 'SOX2-OT',
 'GPM6A',
 'ANK2',
 'ANKS1B',
 'ASTN2',
 'NEGR1',
 'SLC8A1',
 'LPPR1',
 'SOX5',
 'PTPRD',
 'GPC6',
 'FAM155A',
 'PDE4D',
 'OPCML',
 'FHIT',
 'HIP1',
 'MAPK10',
 'GRID2',
 'SORCS1',
 'GSK3B',
 'VCAN',
 'JMJD1C',
 'CCSER1',
 'DPP10',
 'SNTG1']