In [1]:
import numpy as np
import h5py as h5
import seaborn as sns
from collections import Counter

In [2]:
from scipy import sparse
import pandas as pd

In [3]:
!ls data/hg38

cistrome.txt
cluster_human
ER.profile
ER.Snakefile.model
ER.yml
hg38_100to1000window.out.npy
hg38_beta_peak5fold.h5
hg38.genome
hg38_promoter_TADann_H3K4me3_enhance_k27me3_Using.xls
hg38.tss
hg38_window1kb.bed
hg38_window1kb_DNase.h5
hg38_window1kb_H3K27ac.h5
hg38_window1kb_tss.bed
hs_tf_new_beta_rp.h5
hs_tf_new_peak_loct.h5
lisa_meta.xls
lisa_meta.xls.all
lisa_meta.xls.all.forR
lisa_v2_data.h5
lisa_v2_hg38.h5
marge2_motif_100bp_99.h5
margeRP_DNase.h5
margeRP_H3K27ac.h5


In [56]:
!head data/hg38/hg38_window1kb.bed

chr1	0	1000	1
chr1	1000	2000	2
chr1	2000	3000	3
chr1	3000	4000	4
chr1	4000	5000	5
chr1	5000	6000	6
chr1	6000	7000	7
chr1	7000	8000	8
chr1	8000	9000	9
chr1	9000	10000	10


In [46]:
!head data/hg38/lisa_meta.xls

id	species	factor	factor_type	cell_line	cell_type	tissue	qc
1	Homo sapiens	BTAF1	tf	HeLa	Epithelium	Cervix	0
2	Homo sapiens	GAPDH	not sure	HeLa	Epithelium	Cervix	0
4	Homo sapiens	EGR1	tf	K562	Erythroblast	Bone Marrow	0
6	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
8	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
9	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
11	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
12	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
17	Homo sapiens	TERF1	predicted chromatin regulator	BJ	Fibroblast	Skin	0


In [50]:
!cat data/hg38/lisa_meta.xls | grep "Homo sapiens" | wc -l

11198


In [53]:
cistrome_txt = pd.read_csv('data/hg38/cistrome.txt', sep = '\t')

In [54]:
cistrome_txt.columns

Index(['id', 'edition', 'source', 'sourcefile', 'status', 'numseqs', 'pmid',
       'dbd', 'family', 'description', 'species', 'cellline', 'entrez',
       'symbol', 'synonym', 'refseq', 'cluster', 'comment2', 'comment2.1',
       'comment3', 'comment4', 'comment5', 'datasetid', 'zscore', 'seqfactors',
       'seqdbds', 'seqdatasetid', 'nmotifs', 'pssm'],
      dtype='object')

In [77]:
cistrome_txt

Unnamed: 0,id,edition,source,sourcefile,status,numseqs,pmid,dbd,family,description,...,comment3,comment4,comment5,datasetid,zscore,seqfactors,seqdbds,seqdatasetid,nmotifs,pssm
0,M00041,,Transfac,,4,,8190638,Leucine zipper Family,,activating transcription factor 2|jun proto-on...,...,,,,,,,,,,"[[[0.010, 0.023, 0.010, 0.957], [0.047, 0.010,..."
1,M00062,,Transfac,,4,,7687740,Interferon Regulatory Factor,,interferon regulatory factor 1,...,,,,,,,,,,"[[[0.010, 0.381, 0.571, 0.038], [0.970, 0.010,..."
2,M00072,,Transfac,,4,,2233727,CP2 Transcription Factor Domain Family,,transcription factor CP2,...,,,,,,,,,,"[[[0.010, 0.167, 0.813, 0.010], [0.010, 0.813,..."
3,M00082,,Transfac,,4,,8321231,Runt Domain Family,,runt-related transcription factor 1,...,,,,,,,,,,"[[[0.737, 0.010, 0.053, 0.200], [0.053, 0.010,..."
4,M00085,,Transfac,,4,,7958847,BetaBetaAlpha-zinc finger Family,,zinc finger and BTB domain containing 6,...,,,,,,,,,,"[[[0.086, 0.343, 0.257, 0.314], [0.057, 0.143,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,UP00404,,UniPROBE,,9,,19443739,Ets Domain Family,,E74-like factor 2 (ets domain transcription fa...,...,,,,,,,,,,"[[[0.382, 0.199, 0.201, 0.218], [0.092, 0.332,..."
1057,UP00412,,UniPROBE,,9,,19443739,Ets Domain Family,,ets variant gene 5,...,,,,,,,,,,"[[[0.164, 0.479, 0.235, 0.122], [0.298, 0.128,..."
1058,UP00413,,UniPROBE,,9,,19443739,Ets Domain Family,,E74-like factor 4 (ets domain transcription fa...,...,,,,,,,,,,"[[[0.350, 0.139, 0.209, 0.302], [0.080, 0.359,..."
1059,UP00418,,UniPROBE,,9,,19443739,Ets Domain Family,,ets variant gene 6 (TEL oncogene),...,,,,,,,,,,"[[[0.355, 0.069, 0.299, 0.277], [0.210, 0.408,..."


# Read in DNase Data

In [4]:
dnnase_data = h5.File('data/hg38/hg38_window1kb_DNase.h5', 'r')

In [5]:
# Pull only the first 10 columns of the "OrderCount" dataset into memory as a
# NumPy array (all rows are read); used below as a small test slice.
test_dnaase_data = dnnase_data["OrderCount"][:, :10]

In [6]:
test_dnaase_data.shape

(3209513, 10)

# Read in ChIP data

In [None]:
chip = h5.File('data/hg38/hs_tf_new_peak_loct.h5', 'r')

In [96]:
%%timeit
chip_ds = []
for dataset in list(chip.keys())[:8500]:
    chip_ds.append(chip[dataset][...].astype(np.int32))

datasets = np.concatenate([np.full((chip.shape[0], ), i) for i, chip in enumerate(chip_ds)])

peaks = np.concatenate(chip_ds)

6.52 s ± 121 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
num_peaks = peaks.shape
num_bins = test_dnaase_data.shape[0]

# Lookup table from the index space of `peaks` to row indices of the 1 kb-window
# matrix. Plain `peaks // 10` does not agree with this table: in the recorded
# outputs converter[peaks].max() is 3088271 while peaks.max() // 10 is 3088261
# (presumably because windows restart at each chromosome -- TODO confirm).
converter = np.load('data/hg38/hg38_100to1000window.out.npy')

# Sparse (window x dataset) indicator matrix: one entry of 1.0 per peak.
# NOTE(review): several peaks of one dataset landing in the same window are
# summed when the COO matrix is converted, so entries can exceed 1 -- confirm
# whether counts or a binary indicator is wanted.
chip_sparse = sparse.coo_matrix(
    (
        np.ones(num_peaks),
        (converter[peaks], datasets)
    ),
    shape = (num_bins, len(chip_ds)))

In [24]:
test_dnaase_data.shape, chip_sparse.shape

((3209513, 10), (3209513, 8472))

In [28]:
test_dnaase_data[:, 0][:, np.newaxis].shape

(3209513, 1)

In [32]:
%%timeit
# Timing only: element-wise product of the sparse ChIP matrix with the first
# column of test_dnaase_data, broadcast as a (num_bins, 1) column vector.
# %%timeit discards the product, so nothing is stored by this cell.
chip_sparse.multiply(test_dnaase_data[:, 0][:, np.newaxis])

13.9 s ± 1.94 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
# Compute the product explicitly instead of reading it from `_`: `_` depends on
# which cell happened to run last and is not set by a %%timeit cell, so the
# notebook would not reproduce on a fresh kernel.
chip_overlaps = chip_sparse.multiply(test_dnaase_data[:, 0][:, np.newaxis])

In [31]:
chip_overlaps.shape

(3209513, 8472)

In [33]:
converter = np.load('data/hg38/hg38_100to1000window.out.npy')

In [34]:
converter.shape

(32093082,)

In [35]:
converter

array([      0,       0,       0, ..., 3209512, 3209512, 3209512])

In [38]:
peaks.max()

30882611

In [74]:
converter[peaks].max()

3088271

In [71]:
# Print every key of the ChIP store that is not an integer id.
for chip_key in list(chip.keys()):
    try:
        int(chip_key)
    except ValueError:
        # int() signals a non-numeric string with ValueError; any other
        # exception is unexpected and should surface rather than be printed over.
        print(chip_key)

IDs


In [73]:
chip['IDs'][...]

array([b'1', b'2', b'4', ..., b'77535', b'77536', b'77537'], dtype='|S25')

In [76]:
cistrome_txt.index.max()

1060

In [57]:
lisa_meta = pd.read_csv('data/hg38/lisa_meta.xls', sep = '\t', encoding='iso-8859-1')

In [88]:
lisa_meta.groupby(['species','factor']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,factor_type,cell_line,cell_type,tissue,qc
species,factor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Homo sapiens,552-SKD,5,5,5,5,5,5
Homo sapiens,598-SKD,9,9,9,9,9,9
Homo sapiens,5HMC,8,8,8,8,8,8
Homo sapiens,5MC,4,4,4,4,4,4
Homo sapiens,7SK,5,5,5,5,5,5
...,...,...,...,...,...,...,...
Mus musculus,ZMIZ1,4,4,4,4,2,4
Mus musculus,ZNF143,3,3,2,3,3,3
Mus musculus,ZNF322A,1,1,1,1,1,1
Mus musculus,ZNF335,6,6,0,4,2,6


In [89]:
lisa_meta[lisa_meta.species == 'Homo sapiens'].factor.nunique()

1319

In [92]:
tad_info = pd.read_csv('data/hg38/hg38_promoter_TADann_H3K4me3_enhance_k27me3_Using.xls', sep = '\t')

In [94]:
tad_info.columns

Index(['chrom', 'pstart', 'pend', 'geneID', 'geneName', 'strand', 'TADchrom',
       'TADstart', 'TADend', 'TADid', 'median_zk4me3_enhance',
       'median_TADk27ac', 'k4me3_cluster', 'tad_cluster',
       'k4me3_order_cluster', 'tad_order_cluster', 'median_zk27me3_enhance',
       'k27me3_cluster', 'k27me3_order_cluster', 'quadrant'],
      dtype='object')

In [101]:
tad_info

Unnamed: 0,chrom,pstart,pend,geneID,geneName,strand,TADchrom,TADstart,TADend,TADid,median_zk4me3_enhance,median_TADk27ac,k4me3_cluster,tad_cluster,k4me3_order_cluster,tad_order_cluster,median_zk27me3_enhance,k27me3_cluster,k27me3_order_cluster,quadrant
0,chr1,68091,70091,NM_001005484.1,OR4F5,+,chr1,0,834757,interTAD0,-1.065412,-0.993636,0,2,1,1,-1.004632,4,1,Third
1,chr1,180049,182049,XM_011543808.2,LOC102725121,+,chr1,0,834757,interTAD0,-1.023866,-0.993636,0,2,1,1,-0.957343,4,1,Third
2,chr1,924741,926741,NM_152486.2,SAMD11,+,chr1,834757,1314757,TAD1,-0.844750,0.747895,7,5,3,8,1.310685,2,10,second
3,chr1,959103,961103,XM_006710600.3,KLHL17,+,chr1,834757,1314757,TAD1,0.777922,0.747895,1,5,8,8,0.520080,0,6,first
4,chr1,964820,966820,XM_011542248.2,PLEKHN1,+,chr1,834757,1314757,TAD1,-0.773444,0.747895,7,5,3,8,0.295434,0,6,second
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19861,chrX,155263589,155265589,NM_171998.3,RAB39B,-,chrX,154878531,155717145,TAD3051,-0.433127,-0.880177,4,2,4,1,-0.811130,3,3,Third
19862,chrX,155333681,155335681,NM_001289.5,CLIC2,-,chrX,154878531,155717145,TAD3051,-0.996221,-0.880177,0,2,1,1,-0.871263,4,1,Third
19863,chrX,155457600,155459600,NM_001007524.1,F8A3,-,chrX,154878531,155717145,TAD3051,-1.071640,-0.880177,0,2,1,1,-1.105547,4,1,Third
19864,chrX,155458935,155460935,NM_080720.1,H2AFB3,-,chrX,154878531,155717145,TAD3051,-1.071640,-0.880177,0,2,1,1,-1.105547,4,1,Third


In [103]:
tad_group = tad_info.k4me3_order_cluster.astype(str) + ',' + tad_info.tad_order_cluster.astype(str)

In [104]:
tad_group

0        1,1
1        1,1
2        3,8
3        8,8
4        3,8
        ... 
19861    4,1
19862    1,1
19863    1,1
19864    1,1
19865    1,1
Length: 19866, dtype: object

In [107]:
gene_symbol = tad_info.geneName

In [109]:
gene_id = tad_info.geneID

In [110]:
gene_id.shape, gene_symbol.shape, tad_group.shape

((19866,), (19866,), (19866,))

In [118]:
num_genes = gene_id.shape

In [320]:
lisa_v2_hg38 = h5.File('./data/hg38/lisa_v2_hg38.h5', 'a')

In [321]:
# require_group returns the existing group when it is already present, so
# re-running this cell against the file opened in append mode does not raise
# (create_group fails if the name already exists).
lisa_v2_hg38.require_group('gene_info')

<HDF5 group "/gene_info" (0 members)>

In [323]:
# Store the per-gene annotations under /gene_info as byte strings.
# try/finally guarantees the file handle is released even when one of the
# writes raises (e.g. the dataset already exists from a previous run).
try:
    gene_symbols = lisa_v2_hg38.create_dataset('/gene_info/gene_symbols', data = gene_symbol.values.astype('S'))

    lisa_v2_hg38.create_dataset('/gene_info/gene_refseqIDs', data = gene_id.values.astype('S'))

    lisa_v2_hg38.create_dataset('/gene_info/gene_tad_info', data = tad_group.values.astype('S'))
finally:
    lisa_v2_hg38.close()

# Testing Model Training
<hr>

In [None]:
#LOADING IN DATA TO MY H5
# Source store holding the DNase regulatory-potential matrix; opened read-only
# and left open, as before.
rp_h5 = h5.File('data/hg38/margeRP_DNase.h5', 'r')

# Fifth ':'-separated field of every RefSeq entry. Kept for inspection only:
# the unparsed RefSeq entries are what is written to gene_ids below.
old_refseq = [
    stuff.split(':')[4] for stuff in rp_h5['RefSeq'][...].astype(str)
]

# Append mode ('a'), not 'w': 'w' truncates the file and would erase the
# /gene_info group written by the cells above. The context manager closes the
# file even if a write fails.
with h5.File('data/hg38/lisa_v2_hg38.h5', 'a') as lisa_v2_hg38:
    # Remove a previous copy so the cell can be re-run without "already exists" errors.
    if 'DNase_data' in lisa_v2_hg38:
        del lisa_v2_hg38['DNase_data']

    dnase_group = lisa_v2_hg38.create_group('DNase_data')

    lisa_v2_hg38.create_dataset('DNase_data/gene_ids', data = rp_h5['RefSeq'][...].astype('S'))

    lisa_v2_hg38.create_dataset('DNase_data/sample_ids', data = rp_h5['IDs'][...].astype('S'))

    lisa_v2_hg38.create_dataset('DNase_data/regulatory_potential', data = rp_h5['RP'][...])

In [266]:
lisa_v2_hg38 = h5.File('data/hg38/lisa_v2_hg38.h5', 'r')

In [325]:
lisa_v2_hg38.keys()

<KeysViewHDF5 ['DNase_data', 'gene_info']>

In [327]:
lisa_v2_hg38['DNase_data'].keys()

<KeysViewHDF5 ['gene_ids', 'regulatory_potential', 'sample_ids']>

In [341]:
available_genes = [x.split(':')[-2] for x in lisa_v2_hg38['DNase_data']['gene_ids'][...].astype(str)]

In [337]:
# Boolean mask over available_genes marking entries that are in selected_genes.
# NOTE(review): in the visible notebook `selected_genes` is only assigned in a
# later cell (the background-gene selection step), so this cell fails on a
# top-to-bottom run from a fresh kernel.
extract_genes = np.isin(available_genes, selected_genes)

# Setting up RP data, part 2

In [4]:
data = h5.File('data/hg38/lisa_v2_hg38.h5', 'r')

In [8]:
data['DNase_data'].keys()

<KeysViewHDF5 ['gene_ids', 'regulatory_potential', 'sample_ids']>

In [17]:
gene_ids = data['DNase_data']['gene_ids'][...]

In [20]:
symbols = [line.decode('utf-8').split(':')[-1] for line in gene_ids]

In [23]:
len(symbols), len(set(symbols))

(52876, 26631)

In [24]:
# Map each symbol to the index of its first appearance in `symbols`.
symbol_dict = {}
for i, symbol in enumerate(symbols):
    # setdefault leaves an existing entry untouched, so later duplicates are ignored.
    symbol_dict.setdefault(symbol, i)

In [28]:
# Boolean mask over the rows of gene_ids, True at the first row of each distinct symbol.
# The builtin `bool` replaces `np.bool`, an alias that was removed in NumPy 1.24
# and raises AttributeError there.
first_occurence = np.zeros(len(gene_ids), dtype = bool)
first_occurence[list(symbol_dict.values())] = True

In [30]:
first_occurence.sum()

26631

In [36]:
len(set(selected_genes).intersection(set(np.array(symbols)[first_occurence])))

2981

In [35]:
np.array(symbols)[first_occurence]

array(['DDX11L1', 'MIR6859-3', 'MIR6859-4', ..., 'CSPG4P1Y', 'GOLGA2P2Y',
       'GOLGA2P3Y'], dtype='<U22')

In [39]:
subset_rp_data = data['DNase_data']['regulatory_potential'][first_occurence, :][...]

In [40]:
subset_rp_data.shape

(26631, 1110)

In [41]:
subest_gene_ids = np.array(symbols)[first_occurence]

In [43]:
subest_gene_ids.shape

(26631,)

In [44]:
data.close()

In [99]:
# Replace the stored regulatory-potential matrix and sample ids with the
# in-memory `subset_rp_data` and `dnaase_ids`; gene_ids is left as stored
# (its delete/re-create lines are commented out).
# NOTE(review): going by the execution counts, this cell ran after the cells
# further down that build and QC-filter `dnaase_ids` / `subset_rp_data`
# (In [58]-In [95]). In the visible notebook `dnaase_ids` is first assigned
# below this cell, so a top-to-bottom run fails here, and it would delete
# sample_ids before the cell that reads them.
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r+') as data:
    del data['DNase_data']['regulatory_potential']
    #del data['DNase_data']['gene_ids']
    del data['DNase_data']['sample_ids']
    data.create_dataset('DNase_data/regulatory_potential', data = subset_rp_data)
    #data.create_dataset('DNase_data/gene_ids', data = np.array(subest_gene_ids).astype('S'))
    data.create_dataset('DNase_data/sample_ids', data = np.array(dnaase_ids).astype(np.int32))

In [102]:
data = h5.File('data/hg38/lisa_v2_hg38.h5', 'r+')

In [58]:
dnaase_ids = data['DNase_data']['sample_ids'][...].astype(int)

In [79]:
qc_map = lisa_meta[lisa_meta.id.isin(dnaase_ids)].set_index('id').qc.reindex(dnaase_ids)

In [85]:
# Boolean keep-mask, one entry per DNase sample id, taken from the qc column.
# The builtin `bool` replaces `np.bool`, an alias that was removed in NumPy 1.24.
# NOTE(review): ids absent from lisa_meta come out of the reindex as NaN, and
# NaN casts to True, so such samples would be kept -- confirm this is intended.
subset_qc = qc_map.values.astype(bool)

In [87]:
subset_rp_data.shape

(26631, 1110)

In [88]:
subset_gene_ids = subest_gene_ids

In [91]:
dnaase_ids = dnaase_ids[subset_qc]

In [93]:
dnaase_ids.shape

(934,)

In [95]:
subset_rp_data = subset_rp_data[:, subset_qc]

# Testing LR functions

In [122]:
import models
import build_chromatin_model
import background_genes_selection
import importlib

In [150]:
#load gene lists
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r') as store:
    gene_symbols = store['gene_info']['gene_symbols'][...].astype(str)
    gene_ids = store['gene_info']['gene_refseqIDs'][...].astype(str)
    gene_tag_groups = store['gene_info']['gene_tad_info'][...].astype(str)

#open user-supplied gene-list
with open('data/gene_list.txt', 'r') as f:
    user_gene_list = f.readlines()

In [151]:
# Normalise the user-supplied list: strip surrounding whitespace/newlines and upper-case.
gene_list = [gene.strip().upper() for gene in user_gene_list]

# Pick up edits made to the local module without restarting the kernel.
importlib.reload(background_genes_selection)

# Returns the genes chosen for the chromatin model together with `labels`;
# called with the 'TAD' method, num_selected = 3000 and no user background.
# NOTE(review): the exact meaning and ordering of `labels` is defined in
# background_genes_selection, which is not visible here.
selected_genes, labels = background_genes_selection\
.select_genes_for_chromatin_model(gene_list, gene_symbols, gene_ids, gene_tag_groups, 
                                  num_selected = 3000, user_background_genes = None, method = 'TAD')

len(selected_genes)

3091

In [152]:
# Load the regulatory-potential rows of the selected genes plus the sample ids.
# Note that `gene_symbols` and `data` are rebound here: `gene_symbols` now holds
# the DNase_data/gene_ids entries rather than the gene_info symbols.
# NOTE(review): rp_matrix rows follow the store's row order and cover only the
# selected genes that are present in the store, whereas `labels` was returned
# alongside `selected_genes` -- confirm labels are subset/re-ordered to match
# these rows before they are used for training.
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r') as data:
    
    gene_symbols = data['DNase_data']['gene_ids'][...].astype(str)
    # Mask over stored genes that are also in selected_genes.
    intersected_genes = set(selected_genes).intersection(set(gene_symbols))
    intersected_ids = np.isin(gene_symbols, list(intersected_genes))
    
    rp_matrix = data['DNase_data']['regulatory_potential'][intersected_ids, :][...]
    dataset_ids = data['DNase_data']['sample_ids'][...].astype(str)

In [261]:
# Reload the local modules so that edits to them take effect in this kernel.
importlib.reload(build_chromatin_model)
importlib.reload(models)

from models import LR_BinarySearch_SampleSelectionModel
from models import LR_ChromatinModel
from sklearn.model_selection import ParameterGrid

sample_selection_model = LR_BinarySearch_SampleSelectionModel()

# Grid for the 'C' parameter: 10**-4, 10**-3, ..., 10**3 (np.arange stops before 4).
chromatin_model = LR_ChromatinModel({'C' : list(10.0**np.arange(-4,4,1))})

# `chromatin_model` is rebound to the fourth value returned by the call below.
# n_jobs = -1 is passed through to build_chromatin_model.
selected_datasets, selected_dataset_ids, selection_model, chromatin_model, normalization_fn\
    = build_chromatin_model.build_chromatin_model(rp_matrix, dataset_ids, 
                                            labels, sample_selection_model, chromatin_model, n_jobs = -1)



In [258]:
import json

In [265]:
print(json.dumps(selection_model.get_info(), indent = 4))

{
    "search_params": {
        "epsilon": 1e-07,
        "max_iters": 50,
        "num_datasets_selected": 10,
        "penalty": "l1",
        "penalty_range": [
            -1,
            10
        ],
        "tol": 0.01
    },
    "search_model_params": {
        "C": 0.03482458570612164,
        "class_weight": null,
        "dual": false,
        "fit_intercept": true,
        "intercept_scaling": 1,
        "l1_ratio": null,
        "max_iter": 100,
        "multi_class": "auto",
        "n_jobs": null,
        "penalty": "l1",
        "random_state": null,
        "solver": "liblinear",
        "tol": 0.01,
        "verbose": 0,
        "warm_start": false
    },
    "dataset_coefs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.02227953496