In [1]:
import os, csv
import scipy, numpy as np, pandas as pd, time
from scipy import sparse
import pyBigWig, prep_metadata_labels

# Human chromosome names
chr_IDs = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
_data_dir = '../../examples/data/encode-tfbs_v1.0/'

# Prep metadata df and metadata/label array
- Metadata df contains 6400bp (window_size/2) prediction windows across the genome. Each gets a 128-bit prediction from the model.
- We store the ones that aren't fully unbound, and write these to bigwigs representing genome-wide labels.

In [4]:
prep_metadata_labels.write_label_bigwigs()

83.30138063430786
H1-hESC 100.73247504234314
HCT116 106.4023334980011
HeLa-S3 111.88021206855774
HepG2 117.56940197944641
K562 126.93423342704773
A549 138.21517205238342
GM12878 148.77391648292542
150.62964010238647
213.72714066505432


- Then read from the bigwigs to generate metadata for the bound sites.

In [7]:
stride = 6400
itime = time.time()
mdf_posamb = pd.read_csv(
    _sorted_dir, 
    sep='\t', header=None, index_col=None, names=['chr', 'start', 'stop', 'y', 'celltype']
)
celltype_mdta = []
celltype_labels = []

for ct in _all_celltypes:
    ct_labels_bw_path = _data_dir + "labels/MAX/MAX_{}.bigwig".format(ct)
    df = mdf_posamb[mdf_posamb['celltype'] == ct]
    df['window_start'] = stride*(df['start'] // stride)
    uniq_windows = np.unique(["{}:{}".format(x[0], x[1]) for x in zip(df['chr'], df['window_start'])])
    df_construction = []
    mdta_labels = []
    
    bw = pyBigWig.open(ct_labels_bw_path)
    num_reps = 0
    for u in uniq_windows:
        u_chr = u.split(':')[0]
        u_start = int(u.split(':')[1])
        u_end = u_start + stride
        x = np.nan_to_num(bw.values(u_chr, u_start, u_end, numpy=True))
        df_construction.append((u_chr, u_start, u_end))
        mdta_labels.append(x[np.arange(0, len(x), 50)])
        num_reps = num_reps + 1
    celltype_mdta_df = pd.DataFrame(df_construction, columns=['chr', 'start', 'stop'])
    celltype_mdta_df.insert(len(celltype_mdta_df.columns), 'celltype', ct)
    celltype_mdta.append(celltype_mdta_df)
    celltype_labels.append(np.stack(mdta_labels))
    print(ct, time.time() - itime)
    bw.close()
    # break
print(time.time() - itime)
# _metadata_df

pd.concat(celltype_mdta).to_csv(
    _data_dir + 'labels/MAX/metadata_df.bed', 
    sep='\t', header=False, index=False
)
np.save(_data_dir + 'labels/MAX/metadata_y.npy', np.vstack(celltype_labels))
print(time.time() - itime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['window_start'] = stride*(df['start'] // stride)


A549 63.97912120819092
GM12878 103.89278292655945
H1-hESC 182.84059262275696
HCT116 243.95744681358337
HeLa-S3 303.7187397480011
HepG2 375.8099205493927
K562 456.08897161483765
456.0923991203308
462.8749210834503


## Add the all-unbound sites

In [None]:
stride = 6400
itime = time.time()
mdf_posamb = pd.read_csv(
    _data_dir + 'labels/MAX/MAX_posamb.sorted.bed', 
    sep='\t', header=None, index_col=None, names=['chr', 'start', 'stop', 'y', 'celltype']
)
celltype_mdta = []
celltype_labels = []
_train_celltypes = ['H1-hESC', 'HCT116', 'HeLa-S3', 'HepG2', 'K562']
_val_celltype = ['A549']
_test_celltype = ['GM12878']
_all_celltypes = _train_celltypes + _val_celltype + _test_celltype

for ct in _all_celltypes:
    ct_labels_bw_path = _data_dir + "labels/MAX/MAX_{}.bigwig".format(ct)
    df_construction = []
    mdta_labels = []
    bw = pyBigWig.open(ct_labels_bw_path)
    for chrID in bw.chroms():
        chromsize = bw.chroms()[chrID]
        # Iterate over windows
        for startc in np.arange(0, chromsize, stride):
            u_end = startc + stride
            if u_end > chromsize:
                break
            x = np.nan_to_num(bw.values(chrID, startc, u_end, numpy=True))
            df_construction.append((chrID, startc, u_end))
            mdta_labels.append(x[np.arange(0, len(x), 50)])
        print(ct, chrID, time.time() - itime)
    celltype_mdta_df = pd.DataFrame(df_construction, columns=['chr', 'start', 'stop'])
    celltype_mdta_df.insert(len(celltype_mdta_df.columns), 'celltype', ct)
    celltype_mdta.append(celltype_mdta_df)
    celltype_labels.append(np.stack(mdta_labels))
    print(ct, time.time() - itime)
    bw.close()
    # break
print(time.time() - itime)

H1-hESC chr1 31.691891193389893
H1-hESC chr10 43.88507628440857
H1-hESC chr11 54.64318251609802
H1-hESC chr12 63.76666021347046
H1-hESC chr13 72.60888147354126
H1-hESC chr14 78.53658175468445
H1-hESC chr15 84.56542801856995
H1-hESC chr16 92.28407764434814
H1-hESC chr17 99.54330348968506
H1-hESC chr18 106.55353927612305
H1-hESC chr19 111.41691207885742
H1-hESC chr2 135.3123984336853
H1-hESC chr20 141.9123089313507
H1-hESC chr21 146.14480471611023
H1-hESC chr22 150.8621871471405
H1-hESC chr3 169.92432117462158
H1-hESC chr4 186.69121527671814
H1-hESC chr5 201.6394476890564
H1-hESC chr6 215.72684383392334
H1-hESC chr7 227.8461310863495
H1-hESC chr8 240.26825499534607
H1-hESC chr9 250.02118062973022
H1-hESC chrX 264.7451572418213
H1-hESC 267.0940718650818
HCT116 chr1 291.3232545852661
HCT116 chr10 304.0528976917267
HCT116 chr11 316.63377356529236
HCT116 chr12 329.559387922287
HCT116 chr13 341.52057003974915
HCT116 chr14 350.64817333221436
HCT116 chr15 359.71765422821045
HCT116 chr16 368.285

In [None]:
all_metadata_df = pd.concat(celltype_mdta)
print(time.time() - itime)
all_metadata_df.to_csv(
    _data_dir + 'labels/MAX/all_metadata_df.bed', 
    sep='\t', header=False, index=False
)
print(time.time() - itime)
np.save(_data_dir + 'labels/MAX/all_metadata_y.npy', np.vstack(celltype_labels))
print(time.time() - itime)

In [16]:
np.save(_data_dir + 'labels/MAX/all_metadata_y.npy', np.vstack(celltype_labels))
print(time.time() - itime)

1609.5078670978546


In [19]:
np.vstack(celltype_labels).shape

(169827, 128)

In [20]:
celltype_mdta_df

Unnamed: 0,chr,start,stop,celltype
0,chrX,0,6400,GM12878
1,chrX,6400,12800,GM12878
2,chrX,12800,19200,GM12878
3,chrX,19200,25600,GM12878
4,chrX,25600,32000,GM12878
...,...,...,...,...
24256,chrX,155238400,155244800,GM12878
24257,chrX,155244800,155251200,GM12878
24258,chrX,155251200,155257600,GM12878
24259,chrX,155257600,155264000,GM12878
