In [1]:
import numpy as np
import h5py as h5
import seaborn as sns
from collections import Counter
from scipy import sparse
import pandas as pd

In [3]:
!ls data/hg38

cistrome.txt
cluster_human
ER.profile
ER.Snakefile.model
ER.yml
hg38_100to1000window.out.npy
hg38_beta_peak5fold.h5
hg38.genome
hg38_promoter_TADann_H3K4me3_enhance_k27me3_Using.xls
hg38.tss
hg38_window1kb.bed
hg38_window1kb_DNase.h5
hg38_window1kb_H3K27ac.h5
hg38_window1kb_tss.bed
hs_tf_new_beta_rp.h5
hs_tf_new_peak_loct.h5
lisa_meta.xls
lisa_meta.xls.all
lisa_meta.xls.all.forR
lisa_v2_data.h5
lisa_v2_hg38.h5
marge2_motif_100bp_99.h5
margeRP_DNase.h5
margeRP_H3K27ac.h5


In [56]:
!head data/hg38/hg38_window1kb.bed

chr1	0	1000	1
chr1	1000	2000	2
chr1	2000	3000	3
chr1	3000	4000	4
chr1	4000	5000	5
chr1	5000	6000	6
chr1	6000	7000	7
chr1	7000	8000	8
chr1	8000	9000	9
chr1	9000	10000	10


In [46]:
!head data/hg38/lisa_meta.xls

id	species	factor	factor_type	cell_line	cell_type	tissue	qc
1	Homo sapiens	BTAF1	tf	HeLa	Epithelium	Cervix	0
2	Homo sapiens	GAPDH	not sure	HeLa	Epithelium	Cervix	0
4	Homo sapiens	EGR1	tf	K562	Erythroblast	Bone Marrow	0
6	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
8	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
9	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
11	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
12	Homo sapiens	TCF4	tf	LS174T	Epithelium	Colon	0
17	Homo sapiens	TERF1	predicted chromatin regulator	BJ	Fibroblast	Skin	0


In [50]:
!cat data/hg38/lisa_meta.xls | grep "Homo sapiens" | wc -l

11198


In [227]:
cistrome_txt = pd.read_csv('data/hg38/cistrome.txt', sep = '\t')

In [228]:
cistrome_txt

Unnamed: 0,id,edition,source,sourcefile,status,numseqs,pmid,dbd,family,description,...,comment3,comment4,comment5,datasetid,zscore,seqfactors,seqdbds,seqdatasetid,nmotifs,pssm
0,M00041,,Transfac,,4,,8190638,Leucine zipper Family,,activating transcription factor 2|jun proto-on...,...,,,,,,,,,,"[[[0.010, 0.023, 0.010, 0.957], [0.047, 0.010,..."
1,M00062,,Transfac,,4,,7687740,Interferon Regulatory Factor,,interferon regulatory factor 1,...,,,,,,,,,,"[[[0.010, 0.381, 0.571, 0.038], [0.970, 0.010,..."
2,M00072,,Transfac,,4,,2233727,CP2 Transcription Factor Domain Family,,transcription factor CP2,...,,,,,,,,,,"[[[0.010, 0.167, 0.813, 0.010], [0.010, 0.813,..."
3,M00082,,Transfac,,4,,8321231,Runt Domain Family,,runt-related transcription factor 1,...,,,,,,,,,,"[[[0.737, 0.010, 0.053, 0.200], [0.053, 0.010,..."
4,M00085,,Transfac,,4,,7958847,BetaBetaAlpha-zinc finger Family,,zinc finger and BTB domain containing 6,...,,,,,,,,,,"[[[0.086, 0.343, 0.257, 0.314], [0.057, 0.143,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,UP00404,,UniPROBE,,9,,19443739,Ets Domain Family,,E74-like factor 2 (ets domain transcription fa...,...,,,,,,,,,,"[[[0.382, 0.199, 0.201, 0.218], [0.092, 0.332,..."
1057,UP00412,,UniPROBE,,9,,19443739,Ets Domain Family,,ets variant gene 5,...,,,,,,,,,,"[[[0.164, 0.479, 0.235, 0.122], [0.298, 0.128,..."
1058,UP00413,,UniPROBE,,9,,19443739,Ets Domain Family,,E74-like factor 4 (ets domain transcription fa...,...,,,,,,,,,,"[[[0.350, 0.139, 0.209, 0.302], [0.080, 0.359,..."
1059,UP00418,,UniPROBE,,9,,19443739,Ets Domain Family,,ets variant gene 6 (TEL oncogene),...,,,,,,,,,,"[[[0.355, 0.069, 0.299, 0.277], [0.210, 0.408,..."


In [54]:
cistrome_txt.columns

Index(['id', 'edition', 'source', 'sourcefile', 'status', 'numseqs', 'pmid',
       'dbd', 'family', 'description', 'species', 'cellline', 'entrez',
       'symbol', 'synonym', 'refseq', 'cluster', 'comment2', 'comment2.1',
       'comment3', 'comment4', 'comment5', 'datasetid', 'zscore', 'seqfactors',
       'seqdbds', 'seqdatasetid', 'nmotifs', 'pssm'],
      dtype='object')

# Read in DNase Data

In [4]:
dnnase_data = h5.File('data/hg38/hg38_window1kb_DNase.h5', 'r')

In [5]:
test_dnaase_data = dnnase_data["OrderCount"][:, :10]

In [6]:
test_dnaase_data.shape

(3209513, 10)

# Read in ChIP data

In [None]:
chip = h5.File('data/hg38/hs_tf_new_peak_loct.h5', 'r')

In [96]:
%%timeit
chip_ds = []
for dataset in list(chip.keys())[:8500]:
    chip_ds.append(chip[dataset][...].astype(np.int32))

datasets = np.concatenate([np.full((chip.shape[0], ), i) for i, chip in enumerate(chip_ds)])

peaks = np.concatenate(chip_ds)

6.52 s ± 121 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
num_peaks = peaks.shape
num_bins = test_dnaase_data.shape[0]

# Translate peak window indices to 1kb bin indices with the precomputed lookup
# table. Integer division by 10 does not reproduce it: for the same peaks the
# largest bin is 3088261 with `// 10` but 3088271 with the table.
bin_converter = np.load('data/hg38/hg38_100to1000window.out.npy')

# Bin x sample matrix; repeated (bin, sample) pairs are summed by the COO format.
chip_sparse = sparse.coo_matrix(
    (
        np.ones(num_peaks),
        (bin_converter[peaks], datasets)
    ),
    shape = (num_bins, len(chip_ds)))

In [24]:
test_dnaase_data.shape, chip_sparse.shape

((3209513, 10), (3209513, 8472))

In [28]:
test_dnaase_data[:, 0][:, np.newaxis].shape

(3209513, 1)

In [32]:
%%timeit
# Scale every row (bin) of the sparse ChIP matrix by the first column of
# test_dnaase_data, broadcast as a column vector across all ChIP columns.
# NOTE(review): the recorded timing is ~14 s per loop; the result is not kept.
chip_sparse.multiply(test_dnaase_data[:, 0][:, np.newaxis]) # works, but takes several seconds per call

13.9 s ± 1.94 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
# Compute the product explicitly instead of reading IPython's `_`, which holds
# whatever was displayed last and therefore depends on cell execution order.
chip_overlaps = chip_sparse.multiply(test_dnaase_data[:, 0][:, np.newaxis])

In [31]:
chip_overlaps.shape

(3209513, 8472)

In [33]:
converter = np.load('data/hg38/hg38_100to1000window.out.npy')

In [34]:
converter.shape

(32093082,)

In [35]:
converter

array([      0,       0,       0, ..., 3209512, 3209512, 3209512])

In [38]:
peaks.max()

30882611

In [74]:
converter[peaks].max()

3088271

In [71]:
for chip_key in list(chip.keys()):
    try:
        int(chip_key)
    except Exception:
        print(chip_key)

IDs


In [73]:
chip['IDs'][...]

array([b'1', b'2', b'4', ..., b'77535', b'77536', b'77537'], dtype='|S25')

In [76]:
cistrome_txt.index.max()

1060

In [2]:
lisa_meta = pd.read_csv('data/hg38/lisa_meta.xls', sep = '\t', encoding='iso-8859-1')

In [89]:
lisa_meta[lisa_meta.species == 'Homo sapiens'].factor.nunique()

1319

In [92]:
tad_info = pd.read_csv('data/hg38/hg38_promoter_TADann_H3K4me3_enhance_k27me3_Using.xls', sep = '\t')

In [94]:
tad_info.columns

Index(['chrom', 'pstart', 'pend', 'geneID', 'geneName', 'strand', 'TADchrom',
       'TADstart', 'TADend', 'TADid', 'median_zk4me3_enhance',
       'median_TADk27ac', 'k4me3_cluster', 'tad_cluster',
       'k4me3_order_cluster', 'tad_order_cluster', 'median_zk27me3_enhance',
       'k27me3_cluster', 'k27me3_order_cluster', 'quadrant'],
      dtype='object')

In [110]:
gene_id.shape, gene_symbol.shape, tad_group.shape

((19866,), (19866,), (19866,))

In [118]:
num_genes = gene_id.shape

In [320]:
lisa_v2_hg38 = h5.File('./data/hg38/lisa_v2_hg38.h5', 'a')

In [321]:
lisa_v2_hg38.create_group('gene_info')

<HDF5 group "/gene_info" (0 members)>

In [323]:
lisa_v2_hg38.keys()

gene_symbol.values

gene_symbols = lisa_v2_hg38.create_dataset('/gene_info/gene_symbols', data = gene_symbol.values.astype('S'))

lisa_v2_hg38.create_dataset('/gene_info/gene_refseqIDs', data = gene_id.values.astype('S'))

lisa_v2_hg38.create_dataset('/gene_info/gene_tad_info', data = tad_group.values.astype('S'))

lisa_v2_hg38['gene_info']['gene_tad_info']

lisa_v2_hg38.close()

# Testing Model Training
<hr>

In [None]:
#LOADING IN DATA TO MY H5
# Copy the gene ids, sample ids and regulatory-potential matrix from the old
# DNase RP file into the 'DNase_data' group of the new store.
#
# The store is opened in append mode: mode 'w' would truncate the whole file,
# including groups that other cells of this notebook write to the same path.
# Both files are closed by the context managers even if a step fails.
with h5.File('data/hg38/margeRP_DNase.h5', 'r') as rp_h5, \
        h5.File('data/hg38/lisa_v2_hg38.h5', 'a') as lisa_v2_hg38:

    # Rebuild only this group so the cell can be re-run safely.
    if 'DNase_data' in lisa_v2_hg38:
        del lisa_v2_hg38['DNase_data']
    dnase_group = lisa_v2_hg38.create_group('DNase_data')

    dnase_group.create_dataset('gene_ids', data = rp_h5['RefSeq'][...].astype('S'))
    dnase_group.create_dataset('sample_ids', data = rp_h5['IDs'][...].astype('S'))
    dnase_group.create_dataset('regulatory_potential', data = rp_h5['RP'][...])

    # Fifth ':'-separated field of every RefSeq entry, kept for inspection.
    old_refseq = [
        stuff.split(':')[4] for stuff in rp_h5['RefSeq'][...].astype(str)
    ]

    stored_groups = list(lisa_v2_hg38.keys())

stored_groups

In [266]:
lisa_v2_hg38 = h5.File('data/hg38/lisa_v2_hg38.h5', 'r')

In [325]:
lisa_v2_hg38.keys()

<KeysViewHDF5 ['DNase_data', 'gene_info']>

In [327]:
lisa_v2_hg38['DNase_data'].keys()

<KeysViewHDF5 ['gene_ids', 'regulatory_potential', 'sample_ids']>

In [341]:
available_genes = [x.split(':')[-2] for x in lisa_v2_hg38['DNase_data']['gene_ids'][...].astype(str)]

In [337]:
extract_genes = np.isin(available_genes, selected_genes)

# Setting up RP data pt 2

In [4]:
data = h5.File('data/hg38/lisa_v2_hg38.h5', 'r')

In [8]:
data['DNase_data'].keys()

<KeysViewHDF5 ['gene_ids', 'regulatory_potential', 'sample_ids']>

In [17]:
gene_ids = data['DNase_data']['gene_ids'][...]

In [20]:
# The gene symbol is the last ':'-separated field of each stored identifier.
symbols = [line.decode('utf-8').split(':')[-1] for line in gene_ids]

len(symbols), len(set(symbols))

# Row index of the first time each symbol appears.
symbol_dict = {}
for i, symbol in enumerate(symbols):
    symbol_dict.setdefault(symbol, i)

# Boolean row mask selecting exactly one row per symbol. The builtin `bool` is
# used because the `np.bool` alias was removed in NumPy 1.24.
first_occurence = np.zeros(len(gene_ids), dtype = bool)
first_occurence[list(symbol_dict.values())] = True

first_occurence.sum()

In [36]:
len(set(selected_genes).intersection(set(np.array(symbols)[first_occurence])))

2981

In [35]:
np.array(symbols)[first_occurence]

array(['DDX11L1', 'MIR6859-3', 'MIR6859-4', ..., 'CSPG4P1Y', 'GOLGA2P2Y',
       'GOLGA2P3Y'], dtype='<U22')

In [39]:
subset_rp_data = data['DNase_data']['regulatory_potential'][first_occurence, :][...]

In [40]:
subset_rp_data.shape

(26631, 1110)

In [41]:
subest_gene_ids = np.array(symbols)[first_occurence]

In [43]:
subest_gene_ids.shape

(26631,)

In [44]:
data.close()

In [99]:
# Replace the stored DNase tables with their subset versions.
# NOTE: this needs subset_rp_data, subest_gene_ids and dnaase_ids to already
# hold the subset values computed in the other cells of this section.
#
# gene_ids is rewritten together with regulatory_potential: the matrix rows are
# reduced to one row per symbol, so the stored ids must shrink with them or a
# mask built from gene_ids no longer matches the matrix rows.
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r+') as data:
    dnase_group = data['DNase_data']

    replacements = {
        'regulatory_potential' : subset_rp_data,
        'gene_ids' : np.array(subest_gene_ids).astype('S'),
        'sample_ids' : np.array(dnaase_ids).astype(np.int32),
    }

    for dataset_name, values in replacements.items():
        # Delete only what exists so the cell also works on a fresh group.
        if dataset_name in dnase_group:
            del dnase_group[dataset_name]
        dnase_group.create_dataset(dataset_name, data = values)

In [102]:
data = h5.File('data/hg38/lisa_v2_hg38.h5', 'r+')

In [58]:
dnaase_ids = data['DNase_data']['sample_ids'][...].astype(int)

In [79]:
qc_map = lisa_meta[lisa_meta.id.isin(dnaase_ids)].set_index('id').qc.reindex(dnaase_ids)

In [85]:
# Samples missing from the metadata come out of reindex() as NaN, which would
# cast to True; treat them as failing QC instead. The builtin `bool` replaces
# the `np.bool` alias, which was removed in NumPy 1.24.
subset_qc = qc_map.fillna(0).values.astype(bool)

In [87]:
subset_rp_data.shape

(26631, 1110)

In [88]:
subset_gene_ids = subest_gene_ids

In [91]:
dnaase_ids = dnaase_ids[subset_qc]

In [93]:
dnaase_ids.shape

(934,)

In [95]:
subset_rp_data = subset_rp_data[:, subset_qc]

# Testing LR functions

In [258]:
import json

In [265]:
print(json.dumps(selection_model.get_info(), indent = 4))

{
    "search_params": {
        "epsilon": 1e-07,
        "max_iters": 50,
        "num_datasets_selected": 10,
        "penalty": "l1",
        "penalty_range": [
            -1,
            10
        ],
        "tol": 0.01
    },
    "search_model_params": {
        "C": 0.03482458570612164,
        "class_weight": null,
        "dual": false,
        "fit_intercept": true,
        "intercept_scaling": 1,
        "l1_ratio": null,
        "max_iter": 100,
        "multi_class": "auto",
        "n_jobs": null,
        "penalty": "l1",
        "random_state": null,
        "solver": "liblinear",
        "tol": 0.01,
        "verbose": 0,
        "warm_start": false
    },
    "dataset_coefs": [
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.02227953496

# ChIP-seq data integration

In [4]:
# Open read-only explicitly: the file is only read here, and relying on h5py's
# default mode triggers a warning on the version that recorded this notebook.
chip_data = h5.File('./data/hg38/hs_tf_new_peak_loct.h5', 'r')

  """Entry point for launching an IPython kernel.


In [5]:
chip_data.keys()

<KeysViewHDF5 ['1', '1006', '1009', '1010', '1011', '1012', '1013', '1078', '1079', '1080', '1081', '1082', '1083', '1084', '1085', '1086', '1087', '11', '1140', '1141', '1143', '1144', '1168', '1172', '1173', '1174', '1175', '1176', '1177', '1178', '1194', '1195', '1196', '1197', '1198', '1203', '1208', '1212', '1213', '1214', '1215', '1218', '1219', '1220', '1221', '1222', '1223', '1224', '1225', '1226', '1227', '1228', '1229', '1230', '1298', '1299', '1300', '1301', '1302', '1303', '1304', '1305', '1306', '1307', '1308', '1309', '1312', '1313', '1314', '1329', '1330', '1331', '1332', '1333', '1334', '1335', '1336', '1337', '1338', '1339', '1340', '1344', '1345', '1346', '1347', '1348', '1349', '1350', '1351', '1352', '1354', '1356', '1357', '1358', '1359', '1360', '1361', '1362', '1367', '1368', '1374', '1375', '1380', '1381', '1382', '1383', '1447', '1448', '1449', '1452', '1463', '1464', '1470', '1471', '1492', '1493', '1507', '1508', '1628', '1629', '17', '1740', '1741', '1742', 

In [7]:
chip_data["IDs"][...]

array([b'1', b'2', b'4', ..., b'77535', b'77536', b'77537'], dtype='|S25')

In [12]:
example_chip = chip_data['1'][...]

In [15]:
window_converter = np.load('data/hg38/hg38_100to1000window.out.npy')

In [17]:
example_chip.map(window_converter)

AttributeError: 'numpy.ndarray' object has no attribute 'map'

In [19]:
example_chip_1000kb = window_converter[example_chip]

In [21]:
lisa_meta = lisa_meta.set_index('id')

In [22]:
lisa_meta

Unnamed: 0_level_0,species,factor,factor_type,cell_line,cell_type,tissue,qc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Homo sapiens,BTAF1,tf,HeLa,Epithelium,Cervix,0
2,Homo sapiens,GAPDH,not sure,HeLa,Epithelium,Cervix,0
4,Homo sapiens,EGR1,tf,K562,Erythroblast,Bone Marrow,0
6,Homo sapiens,TCF4,tf,LS174T,Epithelium,Colon,0
8,Homo sapiens,TCF4,tf,LS174T,Epithelium,Colon,0
...,...,...,...,...,...,...,...
57269,Mus musculus,YY1,tf,C2C12,Myoblast,Muscle,1
57271,Mus musculus,YY1,tf,C2C12,Myoblast,Muscle,1
57973,Mus musculus,TET3,cr,J1,Embryonic Stem Cell,Embryo,1
58789,Mus musculus,RNF2,predicted transcription factor,,Embryonic Stem Cell,Embryo,1


In [28]:
lisa_meta.loc[1]

species        Homo sapiens
factor                BTAF1
factor_type              tf
cell_line              HeLa
cell_type        Epithelium
tissue               Cervix
qc                        0
Name: 1, dtype: object

In [29]:
from collections import defaultdict

In [34]:
# HDF5 keys are strings, so the metadata index is cast to str for lookups.
# The builtin `str` replaces `np.str`, which was removed in NumPy 1.24.
lisa_meta.index = lisa_meta.index.astype(str)

In [61]:
# Union of converted bin indices per factor, over all samples with qc == 1.
tf_chip_dict = defaultdict(set)

sample_names = list(chip_data.keys())
num_samples = len(sample_names)

for position, sample in enumerate(sample_names, start = 1):
    print('\rSample {}/{}'.format(str(position), str(num_samples)), end = '')

    # 'IDs' is the sample index dataset, not a sample.
    if sample == 'IDs':
        continue

    metadata = lisa_meta.loc[sample]
    if not metadata.qc == 1:
        continue

    chip_sample = chip_data[sample][...]
    chip_sample_1000kb = window_converter[chip_sample]
    tf_chip_dict[metadata.factor].update(list(chip_sample_1000kb))

Sample 8472/8472

In [46]:
lisa_data = h5.File('./data/hg38/lisa_v2_hg38.h5', 'r+')

In [48]:
chip_group = lisa_data.create_group('ChIP_data')

In [55]:
chip_group.keys()

<KeysViewHDF5 ['AEBP2', 'AFF1', 'AFF4', 'AHR', 'AR', 'ARNT', 'ATF3', 'ATF7IP', 'BCL6', 'BCOR', 'BMI1', 'BRD2', 'BRD3', 'BRD4', 'BRD7', 'BRDU', 'BRF1', 'C17orf49', 'CAMTA2', 'CBFB', 'CBX1', 'CBX5', 'CCDC101', 'CDK8', 'CDK9', 'CDX2', 'CEBPA', 'CEBPB', 'CHD1', 'CHRM2', 'CLOCK', 'CREB1', 'CREB3L4', 'CREBBP', 'CREM', 'CTBP1', 'CTCF', 'DCP1A', 'DDX5', 'DLX1', 'DNMT3A', 'E2F1', 'E2F3', 'E2F4', 'E2F7', 'EBNA1BP2', 'EBNA3', 'EGR1', 'EGR2', 'EHF', 'ELF1', 'ELF2', 'ELF5', 'ELK1', 'ELK4', 'ELL2', 'EOMES', 'ERG', 'ESR1', 'ETS1', 'ETS2', 'ETV5', 'ETV7', 'EZH2', 'FAIRE', 'FLI1', 'FOS', 'FOXA1', 'FOXA2', 'FOXD2', 'FOXH1', 'FOXM1', 'FOXP1', 'GATA1', 'GATA2', 'GATA3', 'GATA4', 'GATA6', 'GATAD1', 'GLI2', 'GLIS1', 'GLYR1', 'GMEB2', 'GREB1', 'GRHL2', 'GRHL3', 'GTF2B', 'GTF3C1', 'HBP1', 'HDAC1', 'HES1', 'HINFP', 'HIRA', 'HNF4A', 'HOTAIR', 'HOXA13', 'HOXB13', 'HSF1', 'IRF2', 'IRF5', 'JUN', 'JUND', 'KDM1A', 'KLF10', 'KLF3', 'KLF4', 'KMT2A', 'LHX2', 'LMNA', 'LMNB1', 'LMO2', 'LYL1', 'MAX', 'MECP2', 'MED1', 'MED

In [67]:
# Store one integer array of bins per factor, replacing any earlier version.
for factor, peaks in tf_chip_dict.items():
    try:
        if factor in chip_group:
            del chip_group[factor]
        factor_bins = np.array(list(peaks)).astype('i')
        chip_group.create_dataset(factor, data = factor_bins)
    except OSError:
        # Factors printed here were not stored.
        print(factor)

SMAD2/3


In [70]:
lisa_data.close()

# Model Testing

In [30]:
import models
import build_chromatin_model
import background_genes_selection
from models import LR_BinarySearch_SampleSelectionModel
from models import LR_ChromatinModel
from sklearn.model_selection import ParameterGrid
import insilico_deletion_model as ISD

#load gene lists
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r') as store:
    gene_symbols = store['gene_info']['gene_symbols'][...].astype(str)
    gene_ids = store['gene_info']['gene_refseqIDs'][...].astype(str)
    gene_tag_groups = store['gene_info']['gene_tad_info'][...].astype(str)

#open user-supplied gene-list
with open('data/gene_list.txt', 'r') as f:
    user_gene_list = f.readlines()

# Blank lines are dropped so they do not become empty gene names.
gene_list = [gene.strip().upper() for gene in user_gene_list if gene.strip()]

#select background genes from TAD distributions
selected_genes, labels = background_genes_selection\
.select_genes_for_chromatin_model(gene_list, gene_symbols, gene_ids, gene_tag_groups,
                                  num_selected = 3000, user_background_genes = None, method = 'TAD')

#Load gene x dataset RP matrix
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r') as data:

    gene_symbols = data['DNase_data']['gene_ids'][...].astype(str)
    intersected_genes = set(selected_genes).intersection(set(gene_symbols))
    intersected_ids = np.isin(gene_symbols, list(intersected_genes))

    rp_matrix = data['DNase_data']['regulatory_potential'][intersected_ids, :][...]
    dataset_ids = data['DNase_data']['sample_ids'][...].astype(str)

# rp_matrix rows come back in the order of the stored gene ids, whereas labels
# follows the order of selected_genes. Look each row's label up by gene name so
# that labels[k] always describes rp_matrix[k].
label_by_gene = dict(zip(selected_genes, labels))
labels = np.array([label_by_gene[symbol] for symbol in gene_symbols[intersected_ids]])

#instantiate models
sample_selection_model = LR_BinarySearch_SampleSelectionModel()

chromatin_model = LR_ChromatinModel({'C' : list(10.0**np.arange(-4,4,1))})

#train models
selected_datasets, selected_dataset_ids, selection_model, chromatin_model, normalization_fn\
    = build_chromatin_model.build_chromatin_model(rp_matrix, dataset_ids,
                                            labels, sample_selection_model, chromatin_model, n_jobs = -1)

#read in chipseq data and build sparse matrix
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r') as data:

    # Column index assigned to each factor.
    tf_ids = {tf : i for i, tf in enumerate(data['ChIP_data'].keys())}

    peaks = np.concatenate([
        data['ChIP_data'][factor][...].astype(np.int32)
        for factor in tf_ids.keys()
    ])

    # tf_index[k] is the factor column that peaks[k] belongs to.
    tf_index = np.concatenate([
        np.full(data['ChIP_data'][factor].shape, factor_id)
        for factor, factor_id in tf_ids.items()
    ])

#read in DNase samples, filter by important to ChIP model
with h5.File('data/hg38/lisa_v2_hg38.h5', 'r') as data:

    num_peaks = peaks.shape
    num_bins = data['DNase_data']['reads_data'][selected_dataset_ids[0]].shape[0]

    chip_sparse = sparse.coo_matrix(
        (
            np.ones(num_peaks),
            (peaks, tf_index)
        ),
        shape = (num_bins, len(tf_ids))
    )

    # Keep only bins hit by at least one factor.
    chip_filter = np.array(chip_sparse.sum(axis = 1) >= 1).reshape(-1)

    chip_sparse = sparse.csr_matrix(chip_sparse)[chip_filter, :]

    dnnase_reads = np.concatenate([
        data['DNase_data']['reads_data'][dataset_id][chip_filter][...].reshape((-1,1))
        for dataset_id in selected_dataset_ids
    ], axis = 1)

#Import RP matrix
# Placeholder: a random sparse bin x gene matrix stands in for the RP map.
# The generator is seeded so that re-running the notebook gives the same result.
rp_shape = (dnnase_reads.shape[0], labels.shape[0])
num_datapoints = 100000
rng = np.random.default_rng(0)

random_gene = rng.choice(rp_shape[1], num_datapoints)
random_bin = rng.choice(rp_shape[0], num_datapoints)

rp_matrix = sparse.coo_matrix(
    (
        np.ones(num_datapoints),
        (random_bin, random_gene)
    ),
    shape = rp_shape
)

ISD.get_insilico_deletion_significance(datasets = dnnase_reads, ChIP_per_bin=chip_sparse,
                rp_matrix= rp_matrix, chromatin_model = chromatin_model, labels = labels)

#



# Reformat DNase Reads

In [123]:
data = h5.File('data/hg38/lisa_v2_hg38.h5', 'r+')

In [126]:
data['gene_info'].keys()

<KeysViewHDF5 ['gene_refseqIDs', 'gene_symbols', 'gene_tad_info']>

In [3]:
# Split the bin x sample 'OrderCount' matrix of the old DNase file into one
# dataset per sample id under DNase_data/reads_data, then spot-check one sample.
dnase_folder = data['DNase_data']

# Start from an empty group so the cell can be re-run.
if 'reads_data' in dnase_folder:
    del dnase_folder['reads_data']
dnase_reads_folder = dnase_folder.create_group('reads_data')

old_data = h5.File('data/hg38/hg38_window1kb_DNase.h5', 'r')

try:
    old_ids = old_data['IDs'][...].astype(str)

    for i, dataset_id in enumerate(old_ids):
        print('\rSample {}'.format(str(i+1)), end = '')
        dnase_reads_folder.create_dataset(
            dataset_id,
            data = old_data['OrderCount'][:, i][...]
        )

    # Compare the first 1000 bins of one sample against its source column.
    # The column is looked up by id rather than hard-coded.
    check_id = '1798'
    check_column = int(np.where(old_ids == check_id)[0][0])

    run_1798 = data['DNase_data']['reads_data'][check_id][...]
    old_1798 = old_data['OrderCount'][:1000, check_column]
    run_1798_test = run_1798[:1000]

    assert np.isclose(run_1798_test, old_1798, 1e-7).all(), \
        'Reformatted reads for sample {} differ from the source column'.format(check_id)

finally:
    # Close both files even when copying or the check fails.
    old_data.close()
    data.close()

# Config testing

In [4]:
import configparser

In [5]:
config = configparser.ConfigParser()

In [24]:
config.read('./h5_config.ini')

['./h5_config.ini']

In [26]:
config.sections()

['gene_info',
 'accessibility_assay',
 'factor_binding',
 'regulatory_potential_bin_map',
 'paths',
 'h5_paths']

In [23]:
config['factor_binding']['tf_binding_data'].format(technology = 'chipseq', factor_symbol = 'SMAD4')

'chipseq/SMAD4'

In [3]:
!ls data/hg38/

cistrome.txt
cluster_human
ER.profile
ER.Snakefile.model
ER.yml
hg38_100to1000window.out.npy
hg38_beta_peak5fold.h5
hg38.genome
hg38_promoter_TADann_H3K4me3_enhance_k27me3_Using.xls
hg38.tss
hg38_window1kb.bed
hg38_window1kb_DNase.h5
hg38_window1kb_H3K27ac.h5
hg38_window1kb_tss.bed
hs_tf_new_beta_rp.h5
hs_tf_new_peak_loct.h5
lisa_meta.xls
lisa_meta.xls.all
lisa_meta.xls.all.forR
lisa_v2_data.h5
lisa_v2_hg38.h5
marge2_motif_100bp_99.h5
margeRP_DNase.h5
margeRP_H3K27ac.h5


In [34]:
import configparser

In [None]:
def convert_lisa_data_formatting(new_dataset_name, create_new = True, *,
    metadata_file,
    tad_data,
    window_conversion_file,
    dnase_reads,
    acetylation_reads,
    chip_peaks,
    motif_peaks,
    dnase_rp,
    acetylation_rp,
    config_file,
):
    """Planned entry point for converting a lisa_v1 data repo to the lisa_v2 layout.

    Only the signature exists so far; the function has no implementation yet.

    Parameters
    ----------
    new_dataset_name
        Name of the output dataset (presumably the path of the new HDF5 file;
        confirm once implemented).
    create_new
        Flag defaulting to True (presumably: overwrite rather than append;
        confirm once implemented).
    metadata_file, tad_data, window_conversion_file, dnase_reads,
    acetylation_reads, chip_peaks, motif_peaks, dnase_rp, acetylation_rp,
    config_file
        Keyword-only inputs (presumably paths to the corresponding source
        files; confirm once implemented).

    Raises
    ------
    NotImplementedError
        Always, until the conversion steps are implemented.
    """
    raise NotImplementedError('convert_lisa_data_formatting is not implemented yet')

Steps for reformatting a lisa_v1 data repo into a lisa_v2 species-specific data repo:
1. ~~Get gene lists from tad dataframe, h3k27ac data, and Dnase data~~
2. ~~Intersect gene lists to make master list, subset the TAD data and define gene_info folder~~

For each accessibility assay:
3. ~~Get QC from metadata, and use qc to subset samples in Dnase and acetylation data. Also subset by used genes~~
4. ~~Reformat binned reads array~~

For each TF-binding assay:
5. ~~Aggregate by factor, save as factor-indexed arrays of binned hits~~

For all

6. Calculate RP map

In [290]:
class LoadingBar:
    """Text progress bar that advances by one step every time it is rendered.

    Rendering happens through ``str()``, so ``print(bar, end = '')`` inside a
    loop both advances and redraws the bar.

    Parameters
    ----------
    label : text printed in front of the bar.
    increments : number of steps after which the bar is complete.
    length : width of the bar in characters.
    """

    def __init__(self, label, increments, length = 25):
        self.increments = increments
        self.length = length
        self.label = label
        self.progress = 0

    def __str__(self):
        """Advance one step and return the bar, prefixed with a carriage return.

        A completed bar is returned as a full bar followed by a newline.
        """
        self.increment()

        # Checking completion first avoids dividing by zero when increments is 0
        # (a bar with no steps is complete from the start).
        if self.is_finished():
            return '\r{}: [{}]'.format(self.label, "="*self.length) + '\n'

        completed_steps = int(self.progress / self.increments * self.length)
        return '\r{}: [{}>{}]'.format(self.label, "="*completed_steps, " "*(self.length - completed_steps - 1))

    def increment(self):
        """Advance the bar by one step unless it is already complete."""
        if not self.is_finished():
            self.progress += 1

    def is_finished(self):
        """Return True once progress has reached the number of increments."""
        return self.progress >= self.increments

In [293]:
# Name of the HDF5 file produced by the conversion.
new_dataset_name = 'lisa_data_hg38.h5'

# Input files and options read by the conversion cell below.
conversion_params = {
    'create_new' : True,
    'tad_data' : 'data/hg38/hg38_promoter_TADann_H3K4me3_enhance_k27me3_Using.xls',
    'metadata_file' : 'data/hg38/lisa_meta.xls',
    'window_conversion_file' : 'data/hg38/hg38_100to1000window.out.npy',
    'dnase_reads' : 'data/hg38/hg38_window1kb_DNase.h5',
    'acetylation_reads' : 'data/hg38/hg38_window1kb_H3K27ac.h5',
    'chip_peaks' : 'data/hg38/hs_tf_new_peak_loct.h5',
    'motif_peaks' : 'data/hg38/marge2_motif_100bp_99.h5',
    'dnase_rp' : 'data/hg38/margeRP_DNase.h5',
    'acetylation_rp' : 'data/hg38/margeRP_H3K27ac.h5',
    'config_file' : './h5_config.ini',
    'motif_metadata' : 'data/hg38/cistrome.txt',
}

def subset_accessibility_rp_data(path_to_h5, gene_set, qc_status):
    """Read an RP file and keep one row per gene symbol and only QC-passing samples.

    Parameters
    ----------
    path_to_h5 : path of an HDF5 file with 'RefSeq', 'IDs' and 'RP' datasets.
    gene_set : accepted but currently not used by this function.
    qc_status : object indexed by sample id (as str) whose values are cast to
        bool; truthy means the sample is kept (presumably a pandas Series,
        since ``.values`` is read from the lookup result).

    Returns
    -------
    tuple of (rp_data, symbols, dataset_ids):
        the 'RP' matrix restricted to the kept rows and columns, the gene
        symbol of every kept row, and the id of every kept column.
    """

    with h5.File(path_to_h5, 'r') as acc_data:

        # The gene symbol is the last ':'-separated field of each RefSeq entry.
        symbols = [line.decode('utf-8').split(':')[-1] for line in acc_data['RefSeq'][...]]

        # Row index of the first occurrence of every symbol.
        first_row = {}
        for i, symbol in enumerate(symbols):
            first_row.setdefault(symbol, i)

        # Builtin `bool` is used: the `np.bool` alias was removed in NumPy 1.24.
        gene_subset = np.zeros(len(symbols), dtype = bool)
        gene_subset[list(first_row.values())] = True

        dataset_ids = acc_data['IDs'][...].astype(str)

        qc_subset = qc_status[dataset_ids].values.astype(bool)

        # Columns are selected while reading from the file, rows afterwards.
        rp_data = acc_data['RP'][:, qc_subset][gene_subset, :]

    return rp_data, np.array(symbols)[gene_subset], dataset_ids[qc_subset]

def reformat_accessibility_assay_RP(h5_object, technology, config, path_to_h5, gene_set, qc_status):
    """Write the subset RP data of one assay into ``h5_object``.

    The data comes from ``subset_accessibility_rp_data``. Three datasets are
    created, in this order: gene symbols, dataset ids (both stored as byte
    strings) and the RP matrix. Their paths are the matching entries of
    ``config['accessibility_assay']`` formatted with ``technology``.
    """

    rp_matrix, gene_symbols, sample_ids = subset_accessibility_rp_data(path_to_h5, gene_set, qc_status)

    assay_paths = config['accessibility_assay']

    # (config key, payload, store as byte strings?)
    outputs = (
        ('reg_potential_gene_symbols', gene_symbols, True),
        ('reg_potential_dataset_ids', sample_ids, True),
        ('reg_potential_matrix', rp_matrix, False),
    )

    for config_key, payload, as_bytes in outputs:
        destination = assay_paths[config_key].format(technology = technology)
        if as_bytes:
            payload = np.array(payload).astype('S')
        h5_object.create_dataset(destination, data = payload)

    
def reformat_TF_hits(h5_object, config, technology, path_to_h5, window_converter, metadata):
    """Copy the hits of every QC-passing sample into ``h5_object``, one dataset per sample.

    Parameters
    ----------
    h5_object : open, writable HDF5 file or group that receives the datasets.
    config : mapping providing ``config['factor_binding']['tf_binding_data']``,
        a path template formatted with ``technology`` and ``dataset_id``.
    technology : value substituted for ``technology`` in the path template.
    path_to_h5 : path of the source HDF5 file; each key other than 'IDs' is
        treated as one sample.
    window_converter : array indexed with the stored hit indices to obtain the
        values that are written.
    metadata : table indexed by sample id (as str) with ``qc`` and ``factor``
        fields; only samples with ``qc == 1`` are written.

    Samples without metadata, and samples whose data cannot be read or written
    (OSError), are reported on stdout and skipped. A missing config entry or a
    template with an unknown placeholder raises ``KeyError``.
    """

    # Looked up outside the per-sample error handling so that a configuration
    # problem is not misreported as "No metadata for sample".
    path_template = config['factor_binding']['tf_binding_data']

    with h5.File(path_to_h5, 'r') as tf_hits:

        num_samples = len(tf_hits)

        loading_bar = LoadingBar('\tCollecting binding data', num_samples, 20)

        for sample in tf_hits.keys():
            print(loading_bar, end = '')

            if sample == 'IDs':
                continue

            # Only the metadata lookup is guarded against KeyError.
            try:
                sample_metadata = metadata.loc[str(sample)]
            except KeyError:
                print('\n\tError: No metadata for sample {}'.format(str(sample)))
                continue

            if not sample_metadata.qc == 1:
                continue

            destination = path_template.format(technology = technology, dataset_id = sample)

            try:
                peaks = window_converter[tf_hits[sample][...]]
                h5_object.create_dataset(destination, data = np.asarray(peaks).astype(np.int64))
            except OSError:
                print('\n\tError saving data for sample {}, factor: {}'\
                      .format(str(sample), sample_metadata.factor))
                    


def index_binned_reads(h5_object, technology, config, path_to_h5, qc_status):
    """Write the binned reads of every QC-passing sample as its own dataset.

    Parameters
    ----------
    h5_object : open, writable HDF5 file or group that receives the datasets.
    technology : value substituted for ``technology`` in the path template.
    config : mapping providing ``config['accessibility_assay']['binned_reads']``,
        a path template formatted with ``technology`` and ``dataset_id``.
    path_to_h5 : path of an HDF5 file with 'IDs' and 'OrderCount' datasets;
        column i of 'OrderCount' belongs to the i-th id.
    qc_status : object indexed by sample id (as str) whose values are cast to
        bool; truthy means the sample is kept (presumably a pandas Series).
    """

    print('\tReading binned reads data ...')

    with h5.File(path_to_h5, 'r') as binned_reads:

        dataset_ids = binned_reads['IDs'][...].astype(str)

        # Builtin `bool` is used: the `np.bool` alias was removed in NumPy 1.24.
        dataset_subset = qc_status[dataset_ids].astype(bool).values

        # Only the selected columns are read into memory.
        reads_matrix = binned_reads['OrderCount'][:, dataset_subset]

    passing_ids = dataset_ids[dataset_subset]
    path_template = config['accessibility_assay']['binned_reads']

    loading_bar = LoadingBar('\tWriting ID-indexed data', len(passing_ids), 20)

    for i, dataset_id in enumerate(passing_ids):
        print(loading_bar, end = '')
        h5_object.create_dataset(
            path_template.format(technology = technology, dataset_id = dataset_id),
            data = reads_matrix[:, i]
        )

In [291]:
from collections import defaultdict

In [None]:
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it did read, so an empty result means the config file was not found.
if not config.read(conversion_params['config_file']):
    raise FileNotFoundError('Could not read config file: {}'.format(conversion_params['config_file']))

print('Reading metadata ...')

tad_data = pd.read_csv(conversion_params['tad_data'], encoding = "ISO-8859-1", sep = '\t')

# The gene symbol is the last ':'-separated field of each RefSeq entry.
with h5.File(conversion_params['dnase_rp'], 'r') as dnase_rp_data:
    dnase_gene_symbols = [line.split(":")[-1] for line in dnase_rp_data['RefSeq'][...].astype(str)]

with h5.File(conversion_params['acetylation_rp'], 'r') as acetylation_rp_data:
    acetyalation_gene_symbols = [line.split(":")[-1] for line in acetylation_rp_data['RefSeq'][...].astype(str)]

# Genes present in the TAD table and in both RP files.
combined_sybmols = set(tad_data.geneName) & set(dnase_gene_symbols) & set(acetyalation_gene_symbols)

tad_data = tad_data[tad_data.geneName.isin(combined_sybmols)]

gene_symbols = tad_data.geneName.values
# Gene id without the part after the first '.'.
gene_ids = tad_data.geneID.str.split('.').str.get(0).values
tad_domain = (tad_data.k4me3_order_cluster.astype(str) + ',' + tad_data.tad_order_cluster.astype(str)).values

metadata = pd.read_csv(conversion_params['metadata_file'], sep = '\t', encoding = "ISO-8859-1").set_index('id')
# Builtin `str` is used: the `np.str` alias was removed in NumPy 1.24.
metadata.index = metadata.index.astype(str)

window_converter = np.load(conversion_params['window_conversion_file'])

# Motifs have no QC column: all are marked as passing, and the 'dbd' column
# is used as the factor name.
motif_metadata = pd.read_csv(conversion_params['motif_metadata'], sep = '\t').set_index('id')
motif_metadata['qc'] = 1
motif_metadata['factor'] = motif_metadata.dbd

# One label per assay, shared by the RP and binned-reads writers so both end up
# under the same technology name (HDF5 names are case-sensitive).
dnase_technology = 'DNase'
acetylation_technology = 'H3K27ac'

with h5.File(new_dataset_name, 'w' if conversion_params['create_new'] else 'a') as data:

    print('Intersecting gene lists. Compiling gene TAD information ...')
    data.create_dataset(config['gene_info']['gene_symbols'], data = gene_symbols.astype('S'))
    data.create_dataset(config['gene_info']['gene_refseqids'], data = gene_ids.astype('S'))
    data.create_dataset(config['gene_info']['tad_domains'], data = tad_domain.astype('S'))

    print('DNase:')
    print('\tReformatting DNase regulatory potential matrix ...')
    reformat_accessibility_assay_RP(data, dnase_technology, config, conversion_params['dnase_rp'], combined_sybmols,
                                   metadata.qc)

    index_binned_reads(data, dnase_technology, config, conversion_params['dnase_reads'], metadata.qc)

    print('H3K27ac:')
    print('\tReformatting h3k27ac regulatory potential matrix ...')
    reformat_accessibility_assay_RP(data, acetylation_technology, config, conversion_params['acetylation_rp'],
                                   combined_sybmols, metadata.qc)

    index_binned_reads(data, acetylation_technology, config, conversion_params['acetylation_reads'], metadata.qc)

    print('Motifs:')
    reformat_TF_hits(data, config, 'motif_hits_1000kb', conversion_params['motif_peaks'], window_converter, motif_metadata)

    print('Chip-seq:')
    reformat_TF_hits(data, config, 'ChIP-seq_1000kb', conversion_params['chip_peaks'], window_converter, metadata)

    # do binned data next

Intersecting gene lists. Compiling gene TAD information ...
DNase:
	Reformatting DNase regulatory potential matrix ...
