In [12]:
import pandas as pd
import numpy as np
import json

In [13]:
def read_bed(file):
    """Load a headerless, tab-separated BED file into a DataFrame.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file; columns are labelled 0, 1, 2, ...
        because the file is read without a header row.
    """
    bed_frame = pd.read_csv(file, header=None, sep='\t')
    return bed_frame

In [14]:
# Directory that receives every BED file written by this notebook.
OUT_PATH = 'experiment_data'
# Experiment identifier: the input is read from '<EXP_NAME>.bed' and all
# output file names are prefixed with it.
EXP_NAME = 'curax_14h_UNI_mm9'

In [15]:
# Read the experiment's peaks; the input must have exactly three columns,
# otherwise the column assignment below raises.
data = read_bed(f'{EXP_NAME}.bed')
data.columns = ['chr', 'start', 'end']
# Integer midpoint of each peak (floor division keeps coordinates integral).
data['center'] = (data['start'] + data['end']) // 2

In [16]:
# Load the shared window settings ('region_length' and 'bins_count').
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open until garbage collection.
with open('./global_data_params.json', 'r') as params_file:
    global_params = json.load(params_file)
global_params

{'region_length': 1500, 'bins_count': 100}

## Create and save final regions (discretized) and global data parameters

#### Positive class

In [17]:
# Unpack the window settings: total window length (bp) and number of bins.
region_length, bins_count = (
    global_params[key] for key in ('region_length', 'bins_count')
)
# Width of a single bin in base pairs (integer division).
bps_per_bin = region_length // bins_count

In [18]:
# Positive windows: a fixed-length region centred on every peak midpoint.
regions = pd.DataFrame(
    {
        'chr': data['chr'],
        'start': data['center'] - region_length // 2,
        'end': data['center'] + region_length // 2,
    },
    columns=['chr', 'start', 'end'],
)

In [19]:
# Snap both window edges down to the nearest multiple of the bin width.
# `assign` returns a new frame, so `regions` itself is left untouched.
regions_discretized = regions.assign(
    start=regions['start'] // bps_per_bin * bps_per_bin,
    end=regions['end'] // bps_per_bin * bps_per_bin,
)

In [20]:
# Write the positive windows as headerless, index-free, tab-separated BED files:
# first the exact windows, then the bin-aligned ones.
regions.to_csv(
    f'{OUT_PATH}/{EXP_NAME}.regions.bed',
    sep='\t',
    header=None,
    index=None,
)
regions_discretized.to_csv(
    f'{OUT_PATH}/{EXP_NAME}.regions.discrete.bed',
    sep='\t',
    header=None,
    index=None,
)

In [21]:
# Sort both positive BED files with `bedtools sort` (bedtools must be on PATH);
# each result is redirected to a sibling '*.sorted.bed' file.
!bedtools sort -i {OUT_PATH}/{EXP_NAME}.regions.bed > {OUT_PATH}/{EXP_NAME}.regions.sorted.bed
!bedtools sort -i {OUT_PATH}/{EXP_NAME}.regions.discrete.bed > {OUT_PATH}/{EXP_NAME}.regions.discrete.sorted.bed

#### Negative class

In [22]:
# Build the negative class: evenly spaced windows of `region_length` bp placed
# in the gaps between consecutive peaks of each chromosome. Despite the
# variable name, placement is deterministic (no random sampling is involved).
#
# NOTE(review): gaps are measured between the raw peak bounds in `data`, not
# between the centred positive windows, so a negative window may overlap the
# positive window of a narrow peak -- confirm this is acceptable.

# A gap must offer at least region_length + min_pad bases per window.
min_pad = region_length // 2

negative_chroms = []   # chromosome label for every generated window
negative_middles = []  # one int64 array of window centres per usable gap

for chrm in data.chr.unique():
    all_for_chr = data[data.chr == chrm]

    # Gap i runs from one base after peak i to one base before peak i + 1.
    # A chromosome with a single peak yields an empty (0, 2) array.
    free_regions = np.array([
        all_for_chr['end'].values[:-1] + 1,
        all_for_chr['start'].values[1:] - 1
    ]).T

    # check: peaks must be position-sorted and leave a positive gap between
    # neighbours, otherwise the gap arithmetic below is meaningless
    assert (free_regions[:, 0] < free_regions[:, 1]).all(), (
        f'{chrm}: peaks must be sorted by position with a gap between neighbours'
    )
    assert (free_regions[:-1, 1] < free_regions[1:, 0]).all(), (
        f'{chrm}: gaps between peaks are not in increasing order'
    )

    # generate regions
    for gap_start, gap_end in free_regions:
        length = gap_end - gap_start
        num_of_regions = length // (region_length + min_pad)
        if num_of_regions == 0:
            # gap too short to host even one padded window
            continue
        # Split the gap into equal slots and centre one window in each slot;
        # the int64 cast truncates the fractional part of every centre.
        slot = length / num_of_regions
        middles = np.array(
            gap_start + slot / 2 + np.arange(num_of_regions) * slot,
            dtype=np.int64
        )
        negative_middles.append(middles)
        negative_chroms.extend([chrm] * len(middles))

# Assemble the frame once: appending to a DataFrame inside the loop copies all
# previous rows on every call, and DataFrame.append no longer exists in pandas 2.
if negative_middles:
    all_middles = np.concatenate(negative_middles)
else:
    all_middles = np.array([], dtype=np.int64)

random_regions = pd.DataFrame(
    {
        'chr': negative_chroms,
        'start': all_middles - region_length // 2,
        'end': all_middles + region_length // 2,
    },
    columns=['chr', 'start', 'end'],
)

print(f'positive regions: {data.shape[0]}')
print(f'negative regions: {random_regions.shape[0]}')

positive regions: 1961
negative regions: 1144982


In [23]:
# Snap both edges of every negative window down to the nearest multiple of the
# bin width. `assign` returns a new frame, so `random_regions` is left untouched.
random_regions_discretized = random_regions.assign(
    start=random_regions['start'] // bps_per_bin * bps_per_bin,
    end=random_regions['end'] // bps_per_bin * bps_per_bin,
)

In [26]:
# Write the negative windows as headerless, index-free, tab-separated BED files:
# first the exact windows, then the bin-aligned ones.
random_regions.to_csv(
    f'{OUT_PATH}/{EXP_NAME}-random.regions.bed',
    sep='\t',
    header=None,
    index=None,
)
random_regions_discretized.to_csv(
    f'{OUT_PATH}/{EXP_NAME}-random.regions.discrete.bed',
    sep='\t',
    header=None,
    index=None,
)

In [27]:
# Sort both negative BED files with `bedtools sort` (bedtools must be on PATH);
# each result is redirected to a sibling '*.sorted.bed' file.
!bedtools sort -i {OUT_PATH}/{EXP_NAME}-random.regions.bed > {OUT_PATH}/{EXP_NAME}-random.regions.sorted.bed
!bedtools sort -i {OUT_PATH}/{EXP_NAME}-random.regions.discrete.bed > {OUT_PATH}/{EXP_NAME}-random.regions.discrete.sorted.bed

In [35]:
# Sanity check: (rows, columns) of the chrX subset for the positive and the
# negative windows, displayed as a pair.
(
    regions.loc[regions.chr == 'chrX'].shape,
    random_regions.loc[random_regions.chr == 'chrX'].shape,
)

((192, 3), (71817, 3))