In [1]:
#from utils.utils import seq2img2D
import numpy as np
from os.path import exists
from tqdm import tqdm
from Bio import SeqIO
import time
import random
from hilbertcurve.hilbertcurve import HilbertCurve
import itertools

def seq2img2D(in_fa, hc_p=1 , hc_n=2,
            random_choice=False,
            rand_n=None):
    """
    Function to convert dna sequence to an n x n image where each pixel value is
    determined by what 4mer is in that position.

    :param in_fa: Multifasta file to be converted
    :param hc_p: Order for the hilbert curve. Default = 1
    :param hc_n: Number of dimensions for hilbert curve. Default=2
    :param random_choice: Bool. If true, will take a random selection of
    records from the dictionary equal to size of rand_n.
    :param rand_n: Number of records to use if random_choice is True.
    :return: ndarray of shape N, H, W. Where N is the
    number of records in the multifasta file, H height 2**hc_p. W width 2**hc_p.
    """
    # Generate the Hilbert Curve
    print(f'Generating hilbert curve of order {hc_p} with {hc_n} dimensions.\n')
    HC = HilbertCurve(n=hc_n, p=hc_p)
    points = HC.points_from_distances(distances=list(range(int(2**hc_p)**2)))

    # Generate the mapping dictionary for values. Here, we are using 4mers so
    # all possible 4mers we can have is 256. 4 nucleotides to the power of
    # 4mer = 256 possible kmers.
    print(f'Generating mapping dictionary for all possible 4mers\n')
    def mapping_dict():
        nuc = [''.join(n) for n in
               itertools.product(['A', 'C', 'G', 'T'], repeat=4)]
        nuc_dict = {}
        for k, i in enumerate(nuc):
            # print(k, i)
            nuc_dict[f'{i}'] = k + 1.
        return nuc_dict


    print(f'Reading in {in_fa} to a dictionary\nRemoving records with N\'s\n')
    multi_fa = SeqIO.to_dict(SeqIO.parse(in_fa, 'fasta'))
    clean_multi_fa = {}
    for k, v in tqdm(multi_fa.items()):
        if 'N' not in str(v.seq).upper() and k not in clean_multi_fa:
            clean_multi_fa[f'{k}'] = v
        elif f'{k}' in clean_multi_fa:
            print(f'Key: {k} already in dictionary, adding ".2" to the key.')
            clean_multi_fa[f'{k}.2'] = v
        elif f'{k}.2' in clean_multi_fa:
            print(f'Key: {k} already in dictionary, adding ".3" to the key.')
            clean_multi_fa[f'{k}.3'] = v
    if random_choice is True:
        random.seed(12)
        rand_clean_idx = random.sample(list(clean_multi_fa), k=rand_n)
        clean_fa = {key: clean_multi_fa[key] for key in rand_clean_idx}
        print(f'Number of clean records taken randomly: {rand_n}')
    else:
        clean_fa = clean_multi_fa
        print(f'Number of records before N removal: {len(multi_fa)}\nNumber of '
          f'records after N removal: {len(clean_fa)}\n')
    start_time = time.time()
    nuc_dict = mapping_dict()
    print(f'Generating array dataset of shape '
          f'{len(clean_fa)}, {2**hc_p}, {2**hc_p}\n')
    img_mat = np.zeros((len(clean_fa), 2**hc_p, 2**hc_p))
    nuc_channel = list(nuc_dict.keys())
    print(f'Beginning sequencing to image conversion for {in_fa}.')
    for k, record in enumerate(tqdm(clean_fa.items())):
        for i in range(int(len(str(record[1].seq).upper())-4 + 1)):
            nc = nuc_channel.index(f'{str(record[1].seq).upper()[i:i+4]}')
            img_mat[k][points[i][1]][points[i][0]] = nc
    end_time = time.time()
    total_time = end_time - start_time
    print(f'Time taken to create image array: {total_time/60} mins')
    return img_mat / 256

In [9]:
## POSITIVE EXAMPLES ##
split = ['train', 'val', 'test']
bps = [2000] #,1000]
species = ['hg19', 'mm9']
for sp in species:
    for s in split:
        for bp in bps:
            #if exists(f'../data/Fantom5/enhancer/{sp}/pos_npy/{sp}.Fantom5.enhancers.noXY'
            #          f'.seq2img.{bp}bp.{s}.npy'):
            #    continue
            #print(f'Converting seq2img for {sp} {s} at {bp}bps.')
            if bp <= (2**4)**2:
                p = 4
            elif (2**4)**2 < bp <= (2**5)**2:
                p = 5
            elif (2**5)**2 < bp <= (2**6)**2:
                p = 6
            print(f'For seq length = {bp}, I\'m using a Hilbert Curve at order: '
                  f'{p}.')
            mat = seq2img2D(f'../data/Fantom5/enhancer/{sp}/pos_fa/{sp}.Fantom5.enhancers.noXY'
                         f'.{bp}bp.{s}.fa',
                          hc_p=p,
                          hc_n=2)
            print(f'Saving {s} at {bp}bps as seq2img numpy array.\n')
            np.save(f'../data/Fantom5/enhancer/{sp}/pos_npy/{sp}.Fantom5.enhancers.noXY'
                    f'.seq2img'
                    f'.{bp}bp'
                    f'.{s}.npy', mat)

For seq length = 2000, I'm using a Hilbert Curve at order: 6.
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/hg19/pos_fa/hg19.Fantom5.enhancers.noXY.2000bp.train.fa to a dictionary
Removing records with N's



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47421/47421 [00:00<00:00, 438831.29it/s]
  0%|                                                                                                                                                                                                                                                        | 12/47401 [00:00<06:56, 113.78it/s]

Number of records before N removal: 47421
Number of records after N removal: 47401

Generating array dataset of shape 47401, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/hg19/pos_fa/hg19.Fantom5.enhancers.noXY.2000bp.train.fa.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47401/47401 [06:54<00:00, 114.44it/s]


Time taken to create image array: 6.903181167443593 mins
Saving train at 2000bps as seq2img numpy array.



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13549/13549 [00:00<00:00, 574061.31it/s]

For seq length = 2000, I'm using a Hilbert Curve at order: 6.
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/hg19/pos_fa/hg19.Fantom5.enhancers.noXY.2000bp.val.fa to a dictionary
Removing records with N's

Number of records before N removal: 13549
Number of records after N removal: 13548

Generating array dataset of shape 13548, 64, 64




  0%|▎                                                                                                                                                                                                                                                       | 14/13548 [00:00<01:43, 130.96it/s]

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/hg19/pos_fa/hg19.Fantom5.enhancers.noXY.2000bp.val.fa.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13548/13548 [01:54<00:00, 118.26it/s]


Time taken to create image array: 1.910838536421458 mins
Saving val at 2000bps as seq2img numpy array.



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6775/6775 [00:00<00:00, 501994.62it/s]
  0%|                                                                                                                                                                                                                                                                   | 0/6775 [00:00<?, ?it/s]

For seq length = 2000, I'm using a Hilbert Curve at order: 6.
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/hg19/pos_fa/hg19.Fantom5.enhancers.noXY.2000bp.test.fa to a dictionary
Removing records with N's

Number of records before N removal: 6775
Number of records after N removal: 6775

Generating array dataset of shape 6775, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/hg19/pos_fa/hg19.Fantom5.enhancers.noXY.2000bp.test.fa.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6775/6775 [00:53<00:00, 125.49it/s]


Time taken to create image array: 0.900128964583079 mins
Saving test at 2000bps as seq2img numpy array.

For seq length = 2000, I'm using a Hilbert Curve at order: 6.
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/mm9/pos_fa/mm9.Fantom5.enhancers.noXY.2000bp.train.fa to a dictionary
Removing records with N's



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30614/30614 [00:00<00:00, 510930.20it/s]
  0%|                                                                                                                                                                                                                                                        | 13/30596 [00:00<04:04, 125.21it/s]

Number of records before N removal: 30614
Number of records after N removal: 30596

Generating array dataset of shape 30596, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/mm9/pos_fa/mm9.Fantom5.enhancers.noXY.2000bp.train.fa.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30596/30596 [04:06<00:00, 124.13it/s]


Time taken to create image array: 4.1079403003056845 mins
Saving train at 2000bps as seq2img numpy array.



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8747/8747 [00:00<00:00, 548878.34it/s]
  0%|                                                                                                                                                                                                                                                                   | 0/8739 [00:00<?, ?it/s]

For seq length = 2000, I'm using a Hilbert Curve at order: 6.
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/mm9/pos_fa/mm9.Fantom5.enhancers.noXY.2000bp.val.fa to a dictionary
Removing records with N's

Number of records before N removal: 8747
Number of records after N removal: 8739

Generating array dataset of shape 8739, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/mm9/pos_fa/mm9.Fantom5.enhancers.noXY.2000bp.val.fa.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8739/8739 [01:04<00:00, 135.70it/s]


Time taken to create image array: 1.0736771861712138 mins
Saving val at 2000bps as seq2img numpy array.



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4374/4374 [00:00<00:00, 640881.91it/s]
  0%|                                                                                                                                                                                                                                                                   | 0/4369 [00:00<?, ?it/s]

For seq length = 2000, I'm using a Hilbert Curve at order: 6.
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/mm9/pos_fa/mm9.Fantom5.enhancers.noXY.2000bp.test.fa to a dictionary
Removing records with N's

Number of records before N removal: 4374
Number of records after N removal: 4369

Generating array dataset of shape 4369, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/mm9/pos_fa/mm9.Fantom5.enhancers.noXY.2000bp.test.fa.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4369/4369 [00:31<00:00, 136.86it/s]


Time taken to create image array: 0.532866366704305 mins
Saving test at 2000bps as seq2img numpy array.



In [10]:
 ## NEGATIVE EXAMPLES ##
split = ['train', 'val', 'test']
species = ['hg19','mm9']#, 'mm9']
bps = [2000] #, 1000]
for sp in species:
    for s in split:
        for bp in bps:
            print(f'Converting {sp} {s} at {bp}bps into an image.')
            pos_X = np.load(f'../data/Fantom5/enhancer/{sp}/pos_npy/{sp}.Fantom5.enhancers.noXY'
                    f'.seq2img'
                    f'.{bp}bp'
                    f'.{s}.npy', mmap_mode='r')
            print(f'Number of positive samples:\t{pos_X.shape[0]}')
            if bp <= (2**4)**2:
                p = 4
            elif (2**4)**2 < bp <= (2**5)**2:
                p = 5
            elif (2**5)**2 < bp <= (2**6)**2:
                p = 6
            elif (2**6)**2 < bp <= (2**7)**2:
                p = 7
            mat = seq2img2D(f'../data/Fantom5/enhancer/{sp}/neg_fa/{sp}.Fantom5.non_enhancers'
                          f'.noXY.GCbalanced.{bp}bp.{s}.fa',
                          hc_p=p,
                          hc_n=2,
                          random_choice=True,
                          rand_n = pos_X.shape[0])
            print(f'Saving {s} at {bp} as seq2img numpy array.\n')
            np.save(f'../data/Fantom5/enhancer/{sp}/neg_npy/{sp}.Fantom5.non_enhancers.noXY'
                    f'.GCbalanced'
                    f'.seq2img'
                    f'.{bp}bp'
                    f'.{s}.npy', mat)


Converting hg19 train at 2000bps into an image.
Number of positive samples:	47401
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/hg19/neg_fa/hg19.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.train.fa to a dictionary
Removing records with N's



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 619671/619671 [00:01<00:00, 505102.26it/s]
  0%|                                                                                                                                                                                                                                                        | 12/47401 [00:00<06:57, 113.38it/s]

Number of clean records taken randomly: 47401
Generating array dataset of shape 47401, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/hg19/neg_fa/hg19.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.train.fa.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47401/47401 [05:48<00:00, 135.89it/s]


Time taken to create image array: 5.813872329394022 mins
Saving train at 2000 as seq2img numpy array.

Converting hg19 val at 2000bps into an image.
Number of positive samples:	13548
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/hg19/neg_fa/hg19.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.val.fa to a dictionary
Removing records with N's



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177420/177420 [00:00<00:00, 521691.60it/s]
  0%|▏                                                                                                                                                                                                                                                       | 13/13548 [00:00<01:47, 125.64it/s]

Number of clean records taken randomly: 13548
Generating array dataset of shape 13548, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/hg19/neg_fa/hg19.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.val.fa.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13548/13548 [01:39<00:00, 135.94it/s]


Time taken to create image array: 1.661734700202942 mins
Saving val at 2000 as seq2img numpy array.

Converting hg19 test at 2000bps into an image.
Number of positive samples:	6775
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/hg19/neg_fa/hg19.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.test.fa to a dictionary
Removing records with N's



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88558/88558 [00:00<00:00, 516361.79it/s]
  0%|▌                                                                                                                                                                                                                                                        | 15/6775 [00:00<00:47, 142.77it/s]

Number of clean records taken randomly: 6775
Generating array dataset of shape 6775, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/hg19/neg_fa/hg19.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.test.fa.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6775/6775 [00:49<00:00, 136.57it/s]


Time taken to create image array: 0.8271453301111857 mins
Saving test at 2000 as seq2img numpy array.

Converting mm9 train at 2000bps into an image.
Number of positive samples:	30596
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/mm9/neg_fa/mm9.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.train.fa to a dictionary
Removing records with N's



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 655792/655792 [00:01<00:00, 506193.09it/s]
  0%|                                                                                                                                                                                                                                                        | 12/30596 [00:00<04:31, 112.75it/s]

Number of clean records taken randomly: 30596
Generating array dataset of shape 30596, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/mm9/neg_fa/mm9.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.train.fa.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30596/30596 [04:06<00:00, 124.23it/s]


Time taken to create image array: 4.104665950934092 mins
Saving train at 2000 as seq2img numpy array.

Converting mm9 val at 2000bps into an image.
Number of positive samples:	8739
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/mm9/neg_fa/mm9.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.val.fa to a dictionary
Removing records with N's



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 187392/187392 [00:00<00:00, 385870.01it/s]
  0%|▎                                                                                                                                                                                                                                                          | 9/8739 [00:00<01:45, 82.96it/s]

Number of clean records taken randomly: 8739
Generating array dataset of shape 8739, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/mm9/neg_fa/mm9.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.val.fa.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8739/8739 [01:20<00:00, 108.48it/s]


Time taken to create image array: 1.343127969900767 mins
Saving val at 2000 as seq2img numpy array.

Converting mm9 test at 2000bps into an image.
Number of positive samples:	4369
Generating hilbert curve of order 6 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Fantom5/enhancer/mm9/neg_fa/mm9.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.test.fa to a dictionary
Removing records with N's



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93295/93295 [00:00<00:00, 495806.18it/s]
  0%|▋                                                                                                                                                                                                                                                        | 13/4369 [00:00<00:34, 124.51it/s]

Number of clean records taken randomly: 4369
Generating array dataset of shape 4369, 64, 64

Beginning sequencing to image conversion for ../data/Fantom5/enhancer/mm9/neg_fa/mm9.Fantom5.non_enhancers.noXY.GCbalanced.2000bp.test.fa.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4369/4369 [00:34<00:00, 126.78it/s]


Time taken to create image array: 0.5746066451072693 mins
Saving test at 2000 as seq2img numpy array.



In [11]:
split = ['train', 'val', 'test']
bps = [2000] #, 1000]
species = ['hg19','mm9']
for sp in species:
    for s in split:
        for bp in bps:

            pos_X = np.load(f'../data/Fantom5/enhancer/{sp}/pos_npy/{sp}.Fantom5.enhancers'
                        f'.noXY.seq2img.{bp}bp.{s}.npy', mmap_mode='r')
            neg_X = np.load(f'../data/Fantom5/enhancer/{sp}/neg_npy/{sp}.Fantom5.non_enhancers'
                        f'.noXY.GCbalanced.seq2img.{bp}bp.{s}.npy')
            if bp <= 2000 and s == 'train':
                assert pos_X.shape[0] == neg_X.shape[0], 'pos and neg not equal'
            if bp <= 2000 and s == 'val':
                assert pos_X.shape[0] == neg_X.shape[0], 'pos and neg not equal'
                #assert neg_X.shape[0] == 25000, 'neg shape not equal to 25,000'
            if bp <= 2000 and s == 'test':
                assert pos_X.shape[0] == neg_X.shape[0], 'pos and neg not equal'
                #assert neg_X.shape[0] == 10000, 'neg shape not equal to 10,000'

            pos_y = np.ones(pos_X.shape[0])
            neg_y = np.zeros(neg_X.shape[0])

            print(f'Merging pos and neg sets')
            dat = np.vstack((pos_X, neg_X))
            lab = np.hstack((pos_y, neg_y))

            print(f'Saving {s} at {bp}bp')
            np.save(f'../data/Fantom5/enhancer/{sp}/datasets/{sp}.balanced.Fantom5.seq2img'
                    f'.{bp}bp'
                    f'.{s}_X.npy', dat)
            np.save(f'../data/Fantom5/enhancer/{sp}/datasets/{sp}.balanced.Fantom5.seq2img'
                    f'.{bp}bp'
                    f'.{s}_y.npy', lab)


Merging pos and neg sets
Saving train at 2000bp
Merging pos and neg sets
Saving val at 2000bp
Merging pos and neg sets
Saving test at 2000bp
Merging pos and neg sets
Saving train at 2000bp
Merging pos and neg sets
Saving val at 2000bp
Merging pos and neg sets
Saving test at 2000bp


In [15]:
strength = ['strong', 'weak', 'non']
split = ['train', 'test']
for st in strength:
    for sp in split:
        print(f'Converting {st} enhancers from {sp} to an image.')
        if sp == 'test':
            mat = seq2img2D(f'.'
                        f'./data/Enhancer_strength/{sp}/originals'
                        f'/test_{st}_enhancers.txt',
                      hc_p=4,
                      hc_n=2)
        else:
            mat = seq2img2D(f'.'
                        f'./data/Enhancer_strength/{sp}/originals'
                        f'/data_{st}_enhancers.txt',
                      hc_p=4,
                      hc_n=2)
        print(f'Saving {st} enhancers from {sp} as seq2img numpy array.\n')
        np.save(f'../data/Enhancer_strength/{sp}/seq2img/data.{st}.enhancers'
                f'.seq2img.npy',
                mat)

100%|██████████| 742/742 [00:00<00:00, 355514.46it/s]
 11%|█         | 83/742 [00:00<00:00, 824.39it/s]

Converting strong enhancers from train to an image.
Generating hilbert curve of order 4 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Enhancer_strength/train/originals/data_strong_enhancers.txt to a dictionary
Removing records with N's

Number of records before N removal: 742
Number of records after N removal: 742

Generating array dataset of shape 742, 16, 16

Beginning sequencing to image conversion for ../data/Enhancer_strength/train/originals/data_strong_enhancers.txt.


100%|██████████| 742/742 [00:00<00:00, 988.03it/s] 
100%|██████████| 100/100 [00:00<00:00, 116153.53it/s]
100%|██████████| 100/100 [00:00<00:00, 811.95it/s]
100%|██████████| 742/742 [00:00<00:00, 349643.14it/s]
  0%|          | 0/742 [00:00<?, ?it/s]

Time taken to create image array: 0.012561949094136556 mins
Saving strong enhancers from train as seq2img numpy array.

Converting strong enhancers from test to an image.
Generating hilbert curve of order 4 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Enhancer_strength/test/originals/test_strong_enhancers.txt to a dictionary
Removing records with N's

Number of records before N removal: 100
Number of records after N removal: 100

Generating array dataset of shape 100, 16, 16

Beginning sequencing to image conversion for ../data/Enhancer_strength/test/originals/test_strong_enhancers.txt.
Time taken to create image array: 0.0021361311276753745 mins
Saving strong enhancers from test as seq2img numpy array.

Converting weak enhancers from train to an image.
Generating hilbert curve of order 4 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Enhancer_strength/train/originals/data_weak_enhancers.txt t

100%|██████████| 742/742 [00:00<00:00, 1144.48it/s]
100%|██████████| 100/100 [00:00<00:00, 861253.39it/s]
100%|██████████| 100/100 [00:00<00:00, 928.83it/s]
100%|██████████| 1484/1484 [00:00<00:00, 503262.22it/s]
  0%|          | 0/1484 [00:00<?, ?it/s]

Time taken to create image array: 0.010837348302205403 mins
Saving weak enhancers from train as seq2img numpy array.

Converting weak enhancers from test to an image.
Generating hilbert curve of order 4 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Enhancer_strength/test/originals/test_weak_enhancers.txt to a dictionary
Removing records with N's

Number of records before N removal: 100
Number of records after N removal: 100

Generating array dataset of shape 100, 16, 16

Beginning sequencing to image conversion for ../data/Enhancer_strength/test/originals/test_weak_enhancers.txt.
Time taken to create image array: 0.0018214702606201172 mins
Saving weak enhancers from test as seq2img numpy array.

Converting non enhancers from train to an image.
Generating hilbert curve of order 4 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Enhancer_strength/train/originals/data_non_enhancers.txt to a dictiona

100%|██████████| 1484/1484 [00:01<00:00, 1147.32it/s]
100%|██████████| 200/200 [00:00<00:00, 1030541.52it/s]
100%|██████████| 200/200 [00:00<00:00, 1353.75it/s]

Time taken to create image array: 0.02159657080968221 mins
Saving non enhancers from train as seq2img numpy array.

Converting non enhancers from test to an image.
Generating hilbert curve of order 4 with 2 dimensions.

Generating mapping dictionary for all possible 4mers

Reading in ../data/Enhancer_strength/test/originals/test_non_enhancers.txt to a dictionary
Removing records with N's

Number of records before N removal: 200
Number of records after N removal: 200

Generating array dataset of shape 200, 16, 16

Beginning sequencing to image conversion for ../data/Enhancer_strength/test/originals/test_non_enhancers.txt.
Time taken to create image array: 0.0024828314781188965 mins
Saving non enhancers from test as seq2img numpy array.






In [14]:
fa = open('../data/Enhancer_strength/test/originals/test_non_enhancers'
     '.txt', 'w')
names = []
for record in SeqIO.parse('../data/Enhancer_strength/test/originals'
                    '/original_test_non_enhancers.txt', 'fasta'):
    if record.name not in names:
        names.append(record.name)
        fa.write(f'>{record.name}\n{record.seq}\n')
    elif record.name in names:
        fa.write(f'>{record.name}.{names.count(record.name) + 1}\n{record.seq}\n')
fa.close()

In [1]:
import numpy as np
from tqdm import tqdm
split = ['train', 'val', 'test']
window = [200]#, 1000]
for s in split:
    for w in tqdm(window):
        print(f'Combining human and mouse {s} datasets at {w}bp.')
        human_X = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA'
                          f'.seq2img.{w}bp.{s}_X.npy', mmap_mode='r')
        mouse_X = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA'
                          f'.seq2img.{w}bp.{s}_X.npy', mmap_mode='r')
        human_y = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA'
                          f'.seq2img.{w}bp.{s}_y.npy', mmap_mode='r')
        mouse_y = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA'
                          f'.seq2img.{w}bp.{s}_y.npy', mmap_mode='r')

        comb_X = np.vstack((human_X, mouse_X))
        comb_y = np.hstack((human_y, mouse_y))

        np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.seq2img.{w}bp.{s}_X'
                f'.npy', comb_X)
        np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.seq2img.{w}bp.{s}_y'
                f'.npy', comb_y)


  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse train datasets at 200bp.


100%|██████████| 1/1 [00:54<00:00, 54.37s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse val datasets at 200bp.


100%|██████████| 1/1 [00:22<00:00, 22.94s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse test datasets at 200bp.


100%|██████████| 1/1 [00:09<00:00,  9.00s/it]


In [3]:
import numpy as np
from tqdm import tqdm

split = ['train', 'val', 'test']

for s in tqdm(split):
    hg19_dat = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA'
                       f'.onehot.1000bp.{s}_X.npy', mmap_mode='r')
    hg19_lab = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA'
                       f'.onehot.1000bp.{s}_y.npy', mmap_mode='r')

    mm9_dat = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA'
                       f'.onehot.1000bp.{s}_X.npy', mmap_mode='r')
    mm9_lab = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA'
                       f'.onehot.1000bp.{s}_y.npy', mmap_mode='r')

    both_dat = np.vstack((hg19_dat, mm9_dat))
    both_lab = np.hstack((hg19_lab, mm9_lab))

    np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.onehot.1000bp.{s}_X'
            f'.npy', both_dat)
    np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.onehot.1000bp.{s}_y'
            f'.npy', both_lab)

100%|██████████| 3/3 [16:34<00:00, 331.59s/it]


In [2]:
import numpy as np
#import matplotlib.pyplot as plt
from tqdm import tqdm
species = ['hg19', 'mm9']
split = ['train', 'val', 'test']

for sp in species:
    for s in tqdm(split):
        print(f'Cropping {sp} for the {s} split.')
        arr = np.load(f'../data/Fantom5/enhancer/{sp}/datasets/{sp}.balanced'
                      f'.Fantom5.seq2img.2000bp.{s}_X.npy',
                      mmap_mode='r')
        arr_crop = arr[:, :, :32]

        np.save(f'../data/Fantom5/enhancer/{sp}/datasets/{sp}.balanced.Fantom5'
                f'.seq2img'
                f'.cropped'
        f'.2000bp.{s}_X.npy', arr_crop)
print('Finished cropping')
#print(arr[0])

  0%|          | 0/3 [00:00<?, ?it/s]

Cropping hg19 for the train split.


 33%|███▎      | 1/3 [00:31<01:02, 31.38s/it]

Cropping hg19 for the val split.


 67%|██████▋   | 2/3 [00:39<00:17, 17.96s/it]

Cropping hg19 for the test split.


100%|██████████| 3/3 [00:44<00:00, 14.74s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

Cropping mm9 for the train split.


 33%|███▎      | 1/3 [00:19<00:39, 19.78s/it]

Cropping mm9 for the val split.


 67%|██████▋   | 2/3 [00:25<00:11, 11.58s/it]

Cropping mm9 for the test split.


100%|██████████| 3/3 [00:28<00:00,  9.49s/it]

Finished cropping





In [3]:
import numpy as np
from tqdm import tqdm

split = ['train', 'val', 'test']

for s in tqdm(split):
    hg19_X = np.load(f'../data/Fantom5/enhancer/hg19/datasets/hg19.balanced'
                     f'.Fantom5.seq2img.cropped.2000bp.{s}_X.npy',
                     mmap_mode='r')
    hg19_y = np.load(f'../data/Fantom5/enhancer/hg19/datasets/hg19.balanced'
                     f'.Fantom5.seq2img.2000bp.{s}_y.npy',
                     mmap_mode='r')
    mm9_X = np.load(f'../data/Fantom5/enhancer/mm9/datasets/mm9.balanced'
                    f'.Fantom5.seq2img.cropped.2000bp.{s}_X.npy',
                    mmap_mode='r')
    mm9_y = np.load(f'../data/Fantom5/enhancer/mm9/datasets/mm9.balanced'
                    f'.Fantom5.seq2img.2000bp.{s}_y.npy',
                    mmap_mode='r')

    both_X = np.vstack((hg19_X, mm9_X))
    both_y = np.hstack((hg19_y, mm9_y))

    np.save(f'../data/Fantom5/enhancer/hg19.mm9.balanced.Fantom5.seq2img.2000bp'
            f'.{s}_X.npy', both_X)
    np.save(f'../data/Fantom5/enhancer/hg19.mm9.balanced.Fantom5.seq2img.2000bp'
            f'.{s}_y.npy', both_y)

100%|██████████| 3/3 [00:10<00:00,  3.58s/it]
