In [5]:
#from utils.utils import seq2onehot
from Bio import SeqIO
import numpy as np
from tqdm import tqdm
import pandas as pd
import re
import time
import random

In [7]:
def seq2onehot(in_fa, random_choice=False, rand_n=None):
    """
    Generate a one-hot encoded matrix from the sequences of a multifasta file.

    Records whose sequence contains an 'N' (case-insensitive) are dropped
    before encoding. Sequences are upper-cased; any character other than
    A, C, G or T leaves an all-zero row at that position.

    :param in_fa: Path to multifasta file.
    :param random_choice: If true, encode a random sample of the N-free
        records instead of all of them. Default False.
    :param rand_n: Number of N-free records to sample when random_choice is
        true (required in that case, ignored otherwise).
    :return: float64 numpy array with shape N, L, 4.
    N = number of encoded records,
    L = length of the longest encoded sequence (shorter sequences are
    zero-padded at the end) and
    4 is A, C, G, T.
    :raises ValueError: if random_choice is true and rand_n is missing,
        negative, or larger than the number of N-free records.
    """
    print(f'Reading {in_fa} into dictionary  and removing N\'s')
    multi_fa = SeqIO.to_dict(SeqIO.parse(in_fa, 'fasta'))

    # Upper-case every sequence exactly once and keep only the N-free ones.
    clean_seqs = {}
    for rec_id, record in tqdm(multi_fa.items()):
        seq = str(record.seq).upper()
        if 'N' not in seq:
            clean_seqs[rec_id] = seq

    if random_choice:
        if rand_n is None:
            raise ValueError('rand_n must be given when random_choice is True')
        if not 0 <= rand_n <= len(clean_seqs):
            raise ValueError(
                f'rand_n={rand_n} is outside the valid range '
                f'0..{len(clean_seqs)} (number of N-free records)')
        # A private generator with the fixed seed yields the same draw as
        # random.seed(12) followed by random.sample, but does not reset the
        # global RNG state for the rest of the notebook.
        sampled_ids = random.Random(12).sample(list(clean_seqs), k=rand_n)
        seqs = [clean_seqs[rec_id] for rec_id in sampled_ids]
        print(f'Number of clean records taken randomly: {rand_n}')
    else:
        seqs = list(clean_seqs.values())
        print(f'Number of records before N removal: {len(multi_fa)}\nNumber of records after N removal {len(seqs)}')

    # Size the matrix from the sequences that are actually encoded, not from
    # whichever record happened to be read last (it may have been filtered
    # out, and does not exist at all for an empty file).
    seq_len = max((len(seq) for seq in seqs), default=0)
    start_time = time.time()
    one_hot_mat = np.zeros((len(seqs), seq_len, 4))

    # ASCII code -> channel index lookup; -1 marks characters with no channel.
    channel_of = np.full(256, -1, dtype=np.intp)
    for channel, base in enumerate('ACGT'):
        channel_of[ord(base)] = channel

    print(f'Beginning one-hot encoding of {in_fa}')
    for row, seq in enumerate(tqdm(seqs)):
        # 'replace' turns any non-ASCII character into '?', which has no
        # channel, so the position stays all-zero like other unknown symbols.
        codes = np.frombuffer(seq.encode('ascii', 'replace'), dtype=np.uint8)
        channels = channel_of[codes]
        positions = np.flatnonzero(channels >= 0)
        one_hot_mat[row, positions, channels[positions]] = 1.
    end_time = time.time()
    total_time = end_time - start_time
    print(f'Time taken to create one-hot matrix: {total_time/60} mins')
    return one_hot_mat


In [9]:
## Deduplicate fasta multifasta file. Seems to be some bug in the shuffle code that means certain records get put in the multifasta file > 1 times. ##
split = ['train', 'val', 'test']
# Record names already written. The collection is shared by all splits, so a
# name written for an earlier split is also skipped in the later ones.
ids = set()
for s in split:
    print(f'Deduplicating {s}')
    in_path = f'../data/gene_prediction/ensembl/pos_fa/Bos_taurus.ARS-UCD1.2.105.UTRs.exons.noXY.200bp.{s}.fa'
    out_path = f'../data/gene_prediction/ensembl/pos_fa/Bos_taurus.ARS-UCD1.2.105.UTRs.exons.DEDUP.noXY.200bp.{s}.fa'
    # The context manager closes (and therefore flushes) every output file
    # before the next split starts or a later cell reads it back.
    with open(out_path, 'w') as fa:
        for record in tqdm(SeqIO.parse(in_path, 'fasta')):
            # Hash-based membership is constant time, unlike scanning a list.
            if record.name in ids:
                continue
            ids.add(record.name)
            SeqIO.write(record, fa, 'fasta')


5812it [00:00, 58117.56it/s]

Deduplicating train


534083it [00:08, 63217.67it/s]
6071it [00:00, 60703.98it/s]

Deduplicating val


152596it [00:02, 69904.37it/s]
5992it [00:00, 59877.51it/s]

Deduplicating test


76299it [00:01, 59894.22it/s]


In [4]:
# Count, per split and window size, how many negative records are free of N.
windows = [200]
split = ['train', 'val', 'test']
n_egs = {}
for s in split:
    for w in windows:
        print(f'Working on {s} for Human.')
        fa_path = f'../data/gene_prediction/neg_fa/Homo_sapiens.GRCh38.105.non_genes.noXY.GCbalanced.{w}bp.{s}.fa'
        neg = SeqIO.to_dict(SeqIO.parse(fa_path, 'fasta'))
        # Each N-free record contributes True (== 1) to the total.
        count = sum('N' not in str(rec.seq).upper()
                    for rec in tqdm(neg.values()))
        n_egs[f'{s}_{w}'] = count

Working on train for Human.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5111044/5111044 [00:05<00:00, 932468.56it/s]


Working on train for Human.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1202748/1202748 [00:01<00:00, 767119.29it/s]


Working on train for Human.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1234082/1234082 [00:02<00:00, 522550.13it/s]


Working on val for Human.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1459567/1459567 [00:01<00:00, 1209528.51it/s]


Working on val for Human.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 343389/343389 [00:00<00:00, 775697.90it/s]


Working on val for Human.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 352017/352017 [00:00<00:00, 550244.98it/s]


Working on test for Human.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 729721/729721 [00:00<00:00, 1154409.81it/s]


Working on test for Human.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 171743/171743 [00:00<00:00, 654164.46it/s]


Working on test for Human.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 176273/176273 [00:00<00:00, 472093.04it/s]


In [5]:
# Show the '<split>_<window>' -> N-free record count mapping collected in n_egs.
print(n_egs)

{'train_200': 5111044, 'train_1000': 1202748, 'train_2000': 1234082, 'val_200': 1459567, 'val_1000': 343389, 'val_2000': 352017, 'test_200': 729721, 'test_1000': 171743, 'test_2000': 176273}


In [11]:
## POSITIVE EXAMPLES ##
split = ['test']
windows = [200]
# Annotation whose files are encoded below; used in the log line so that the
# message names the data actually being processed.
pos_anno = 'Bos_taurus.ARS-UCD1.2.105'
for s in split:
    for w in windows:
        print(f'One hot encoding {pos_anno} {s} at {w}bps.')
        # Encode every N-free record of the deduplicated positive fasta.
        mat = seq2onehot(f'../data/gene_prediction/ensembl/pos_fa/{pos_anno}.UTRs.exons.DEDUP.noXY.{w}bp.{s}.fa')
        print(f'Saving {s} at {w}bp as one-hot numpy array.\n')
        np.save(f'../data/gene_prediction/ensembl/pos_npy/{pos_anno}.UTRs.exons.DEDUP.noXY.{w}bp.{s}.npy', mat)

One hot encoding Human test at 200bps.
Reading ../data/gene_prediction/ensembl/pos_fa/Bos_taurus.ARS-UCD1.2.105.UTRs.exons.DEDUP.noXY.200bp.test.fa into dictionary  and removing N's


100%|██████████| 64400/64400 [00:00<00:00, 510750.92it/s]
  1%|          | 328/64399 [00:00<00:39, 1638.91it/s]

Number of records before N removal: 64400
Number of records after N removal 64399
Beginning one-hot encoding of ../data/gene_prediction/ensembl/pos_fa/Bos_taurus.ARS-UCD1.2.105.UTRs.exons.DEDUP.noXY.200bp.test.fa


100%|██████████| 64399/64399 [00:31<00:00, 2039.96it/s]


Time taken to create one-hot matrix: 0.5261663675308228 mins
Saving test at 200bp as one-hot numpy array.



In [12]:
## NEGATIVE EXAMPLES ##
split = ['test']
windows = [200]
for s in split:
    for w in windows:
        # Report the window actually being processed rather than a fixed 200.
        print(f'One hot encoding {s} at {w}bps.')
        # The positive array is memory-mapped read-only; only its first
        # dimension is used, to draw the same number of negative records.
        pos_npy = np.load(f'../data/gene_prediction/ensembl/pos_npy/Bos_taurus.ARS-UCD1.2.105.UTRs.exons.DEDUP.noXY.{w}bp.{s}.npy',
                          mmap_mode='r')
        n_pos = pos_npy.shape[0]
        mat = seq2onehot(f'../data/gene_prediction/ensembl/neg_fa/Bos_taurus.ARS-UCD1.2.105.non_UTRs.exons.DEDUP.noXY.GCbalanced.{w}bp.{s}.fa',
                         random_choice=True,
                         rand_n=n_pos)
        print(f'Saving {s} at {w}bp as one-hot numpy array.\n')
        np.save(f'../data/gene_prediction/ensembl/neg_npy/Bos_taurus.ARS-UCD1.2.105.non_UTRs.exons.DEDUP.noXY.GCbalanced.{w}bp.{s}.npy', mat)

One hot encoding test at 200bps.
Reading ../data/gene_prediction/ensembl/neg_fa/Bos_taurus.ARS-UCD1.2.105.non_UTRs.exons.DEDUP.noXY.GCbalanced.200bp.test.fa into dictionary  and removing N's


100%|██████████| 961642/961642 [00:01<00:00, 770311.37it/s]
  0%|          | 182/64399 [00:00<00:35, 1814.32it/s]

Number of clean records taken randomly: 64399
Beginning one-hot encoding of ../data/gene_prediction/ensembl/neg_fa/Bos_taurus.ARS-UCD1.2.105.non_UTRs.exons.DEDUP.noXY.GCbalanced.200bp.test.fa


100%|██████████| 64399/64399 [00:31<00:00, 2073.07it/s]


Time taken to create one-hot matrix: 0.5185190518697103 mins
Saving test at 200bp as one-hot numpy array.



In [13]:
# Stack the positive and negative one-hot arrays of each annotation/split into
# one data array and one label vector (1 = positive, 0 = negative).
annotation = ['Bos_taurus.ARS-UCD1.2.105']
split = ['test']
for anno in annotation:
    print(f'Working on {anno}.')
    for s in tqdm(split):
        pos_X = np.load(f'../data/gene_prediction/ensembl/pos_npy/{anno}.UTRs.exons.DEDUP.noXY.200bp.{s}.npy', mmap_mode='r')
        neg_X = np.load(f'../data/gene_prediction/ensembl/neg_npy/{anno}.non_UTRs.exons.DEDUP.noXY.GCbalanced.200bp.{s}.npy', mmap_mode='r')

        # Check the class balance first so an unbalanced pair fails before
        # any label or stacked array is allocated.
        assert pos_X.shape[0] == neg_X.shape[0], 'The dataset is not balanced.'

        pos_y = np.ones((pos_X.shape[0]))
        neg_y = np.zeros((neg_X.shape[0]))

        # Positives come first in both arrays, so rows and labels stay aligned.
        dat = np.vstack((pos_X, neg_X))
        lab = np.hstack((pos_y, neg_y))

        print(f'Saving {anno} {s} npy files.')
        # One shared prefix for both files: the label file used to be written
        # with '.UTRS.' while the data file used '.UTRs.', giving the pair
        # different names on case-sensitive filesystems.
        # NOTE(review): update any loader that still expects the old
        # '.UTRS.' label file name.
        out_prefix = f'../data/gene_prediction/ensembl/datasets/{anno}.UTRs.exons.balanced.onehot.200bp.{s}'
        np.save(f'{out_prefix}_X.npy', dat)
        np.save(f'{out_prefix}_y.npy', lab)

  0%|          | 0/1 [00:00<?, ?it/s]

Working on Bos_taurus.ARS-UCD1.2.105.
Saving Bos_taurus.ARS-UCD1.2.105 test npy files.


100%|██████████| 1/1 [00:03<00:00,  3.01s/it]


In [4]:
# Merge the human (hg38) and mouse (mm10) one-hot arrays of every split into a
# single data array and label vector, ordered as:
# human positives, mouse positives, human negatives, mouse negatives.
split = ['train', 'val', 'test']
ensembl_dir = '../data/gene_prediction/ensembl'
hg38_tag = 'Homo_sapiens.GRCh38.105'
mm10_tag = 'Mus_musculus.GRCm38.102'
pos_suffix = 'UTRs.exons.DEDUP.noXY.200bp'
neg_suffix = 'non_UTRs.exons.DEDUP.noXY.GCbalanced.200bp'
for s in tqdm(split):
    print('Stacking human and mouse datasets together.')

    # All four inputs are opened as read-only memory maps.
    pos_hg38 = np.load(f'{ensembl_dir}/pos_npy/{hg38_tag}.{pos_suffix}.{s}.npy', mmap_mode='r')
    pos_mm10 = np.load(f'{ensembl_dir}/pos_npy/{mm10_tag}.{pos_suffix}.{s}.npy', mmap_mode='r')
    neg_hg38 = np.load(f'{ensembl_dir}/neg_npy/{hg38_tag}.{neg_suffix}.{s}.npy', mmap_mode='r')
    neg_mm10 = np.load(f'{ensembl_dir}/neg_npy/{mm10_tag}.{neg_suffix}.{s}.npy', mmap_mode='r')

    # Labels: one per row, 1 for positives and 0 for negatives.
    pos_hg38_y = np.ones(len(pos_hg38))
    pos_mm10_y = np.ones(len(pos_mm10))
    neg_hg38_y = np.zeros(len(neg_hg38))
    neg_mm10_y = np.zeros(len(neg_mm10))

    # Each species must contribute as many negatives as positives.
    assert len(pos_hg38) == len(neg_hg38)
    assert len(pos_mm10) == len(neg_mm10)

    pos_X = np.concatenate((pos_hg38, pos_mm10), axis=0)
    pos_y = np.concatenate((pos_hg38_y, pos_mm10_y))

    neg_X = np.concatenate((neg_hg38, neg_mm10), axis=0)
    neg_y = np.concatenate((neg_hg38_y, neg_mm10_y))

    dat_X = np.concatenate((pos_X, neg_X), axis=0)
    dat_y = np.concatenate((pos_y, neg_y))

    print(f'Saving {s} array to file.')
    out_prefix = f'{ensembl_dir}/datasets/Homo.Mus.UTRs.exons.balanced.onehot.200bp.{s}'
    np.save(out_prefix + '_X.npy', dat_X)
    np.save(out_prefix + '_y.npy', dat_y)

  0%|          | 0/3 [00:00<?, ?it/s]

Stacking human and mouse datasets together.
Saving train array to file.


 33%|███▎      | 1/3 [02:10<04:21, 130.84s/it]

Stacking human and mouse datasets together.
Saving val array to file.


 67%|██████▋   | 2/3 [02:52<01:18, 78.63s/it] 

Stacking human and mouse datasets together.
Saving test array to file.


100%|██████████| 3/3 [03:05<00:00, 61.98s/it]


In [None]:
### MOUSE ###

In [7]:
## Create DATASETS ##
# For every species/split/window, stack the positive and negative SilencerDB
# one-hot arrays (positives first) and save them with a matching 1/0 label
# vector.
import numpy as np
split = ['train', 'val', 'test']
windows = [200]#, 1000]
species = ['hg19']
for sp in species:
    sp_dir = f'../data/SilencerDB/{sp}'
    for s in split:
        for w in windows:
            print(f'Creating {s} dataset for size {w}bps using {sp} data.')
            # Inputs are opened as read-only memory maps.
            pos_path = f'{sp_dir}/pos_npy/{sp}.SilencerDB.silencers.noXY.{w}bp.{s}.npy'
            neg_path = f'{sp_dir}/neg_npy/{sp}.SilencerDB.non_silencers.noXY.GCbalanced.{w}bp.{s}.npy'
            pos_X = np.load(pos_path, mmap_mode='r')
            neg_X = np.load(neg_path, mmap_mode='r')

            print(f'Generating labels for dataset at {w}bps')
            pos_y = np.ones(len(pos_X))
            neg_y = np.zeros(len(neg_X))

            print('Stacking examples and labels together')
            dat = np.concatenate((pos_X, neg_X), axis=0)
            lab = np.concatenate((pos_y, neg_y))

            print(f'Saving dataset at {w}bps.')
            out_prefix = f'{sp_dir}/datasets/{sp}.balanced.SilencerDB.onehot.{w}bp.{s}'
            np.save(out_prefix + '_X.npy', dat)
            np.save(out_prefix + '_y.npy', lab)

Creating train dataset for size 200bps using hg19 data.
Generating labels for dataset at 200bps
Stacking examples and labels together
Saving dataset at 200bps.
Creating val dataset for size 200bps using hg19 data.
Generating labels for dataset at 200bps
Stacking examples and labels together
Saving dataset at 200bps.
Creating test dataset for size 200bps using hg19 data.
Generating labels for dataset at 200bps
Stacking examples and labels together
Saving dataset at 200bps.


In [18]:
# Concatenate the human (hg19) and mouse (mm9) SilencerDB datasets of every
# split/window, human rows first, and save the combined data and labels.
import numpy as np
from tqdm import tqdm
split = ['train', 'val', 'test']
window = [200]
silencer_dir = '../data/SilencerDB'
for s in split:
    for w in tqdm(window):
        print(f'Combining human and mouse {s} datasets at {w}bp.')
        # NOTE(review): the mouse files are read without the '.balanced' tag
        # that the human files carry - confirm this matches the files on disk.
        human_prefix = f'{silencer_dir}/hg19/datasets/hg19.balanced.SilencerDB.onehot.{w}bp.{s}'
        mouse_prefix = f'{silencer_dir}/mm9/datasets/mm9.SilencerDB.onehot.{w}bp.{s}'

        # Inputs are opened as read-only memory maps.
        human_X = np.load(human_prefix + '_X.npy', mmap_mode='r')
        mouse_X = np.load(mouse_prefix + '_X.npy', mmap_mode='r')
        human_y = np.load(human_prefix + '_y.npy', mmap_mode='r')
        mouse_y = np.load(mouse_prefix + '_y.npy', mmap_mode='r')

        comb_X = np.vstack((human_X, mouse_X))
        comb_y = np.hstack((human_y, mouse_y))

        comb_prefix = f'{silencer_dir}/hg19.mm9.balanced.SilencerDB.onehot.{w}bp.{s}'
        np.save(comb_prefix + '_X.npy', comb_X)
        np.save(comb_prefix + '_y.npy', comb_y)

  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse train datasets at 200bp.


100%|██████████| 1/1 [10:54<00:00, 654.04s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse val datasets at 200bp.


100%|██████████| 1/1 [02:47<00:00, 167.13s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse test datasets at 200bp.


100%|██████████| 1/1 [01:31<00:00, 91.53s/it]
