In [3]:
from tqdm import tqdm
from Bio import SeqIO
import re
import pandas as pd
import numpy as np
import fasttext
import random
import time

def seqToWordVec(in_fa, in_vec, model_path, word_size=10, random_choice=False,
                 rand_n=None):
    """
    Function to convert DNA sequence to a word vector representation.
    :param in_fa: Input multifasta file.
    :param in_vec: vector file generated by FastText training.
    :param word_size: Word size used to generate the corpus. Defaults to 10
    :param model_path: Path to the trained FastText model binary.
    :param random_choice: Default False. If True, will take a random sample
    == rand_n.
    :param rand_n: Number of random samples to take.
    :return: Matrix of word vectors.
    """
    print(f'Loading model:\t{model_path}\n')
    model = fasttext.load_model(model_path)
    print(f'Loading vectors:\t{in_vec}')
    word_vec = pd.read_csv(in_vec,
                           header=None,
                           index_col=0,
                           sep=' ',
                           skiprows=1)
    word_vec = word_vec.iloc[:, :30]

    word_dict = {}
    for i in tqdm(word_vec.itertuples()):
        word_dict[f'{i[0]}'] = np.reshape(np.array(list(i[1:])),
                                          newshape=(30,))

    print(f'Reading {in_fa} into dictionary  and removing N\'s')
    multi_fa = SeqIO.to_dict(SeqIO.parse(in_fa, 'fasta'))
    clean_multi_fa = {}
    for k, v in tqdm(multi_fa.items()):
        if 'N' not in str(v.seq).upper():
            clean_multi_fa[f'{k}'] = v
    if random_choice is True:
        random.seed(12)
        rand_clean_idx = random.sample(list(clean_multi_fa), k=rand_n)
        clean_fa = {key: clean_multi_fa[key] for key in rand_clean_idx}
        print(f'Number of clean records taken randomly: {rand_n}')
    else:
        clean_fa = clean_multi_fa
        print(f'Number of records before N removal: {len(multi_fa)}\nNumber of records after N removal {len(clean_fa)}')


    start_time = time.time()


    print(f'Beginning word vector represenations of {in_fa}')
    mat = []
    for k, record in enumerate(tqdm(clean_fa.items())):
        seq_arr = []
        for i in range(0, len(str(record[1].seq).upper()), word_size):
            assert str(record[1].seq).upper()[i] != 'N', 'Something went ' \
                                                         'wrong prior. You ' \
                                                         'need to make sure ' \
                                                         'there are no Ns in ' \
                                                         'the seq'

            if str(record[1].seq.upper()[i:i+word_size]) not in \
                        word_dict.keys():
                print(f'{str(record[1].seq).upper()[i:i+word_size]} not in '
                          f'dictionary. Creating vector '
                  f'from subwords.')
                seq_arr.append(model.get_word_vector(str(record[
                                                               1].seq.upper()[
                                                         i:i+word_size])))
            else:
                seq_arr.append(word_dict[str(record[1].seq).upper()[
                                       i:i+word_size]])
        seq_arr = np.array(seq_arr)
        mat.append(seq_arr.T)

    mat = np.array(mat)

    return mat

In [2]:
# Splits fasta record on N and/or n and adds it to a dictionary value for
# that chromosome. Splits fasta on N/n.
chrs = {}
for record in tqdm(SeqIO.parse
                       ('/Users/callummacphillamy/PhD/Reference_Genomes/hg38'
                        '/bt2_index/hg38.fa', 'fasta')):
    inter_seq = re.split('N|n', str(record.seq).upper())
    chrs[f'{record.name}'] = [n for n in inter_seq if n != '']



with open('./hg38_corpus.txt', 'w') as corpus:
    for ch, seq in tqdm(chrs.items()):
        for i in seq:
            for s in range(len(i)):
                corpus.write(f'{i[s:int(s+10)]} ')


455it [00:59,  7.61it/s]


In [2]:
word_vec = pd.read_csv('/Users/callummacphillamy/PhD/Reference_Genomes/hg19'
                       '/hg19.skipgram.vec',
                       header=None,
                       index_col=0,
                       sep=' ',
                       skiprows=1)
word_vec = word_vec.iloc[:, :100]


In [5]:
word_dict = {}
for i in tqdm(word_vec.itertuples()):
    word_dict[f'{i[0]}'] = np.reshape(np.array(list(i[1:])), newshape=(100,))


1048645it [00:16, 62623.50it/s]


In [6]:
enh = 'GCCGCATCCATCTCTGATGGGTGAATTAGGATCCAAATTTTTAACTAATTGAGTACATTAAATAGAAAGTCCCTAAGAAAGAAGGGAGGAAAAGACGAATACAAGACACTTTTCAAAGCTTCTTTAATAAAAAGTGTATTTGGGATTTGACCATGATGGCTATCCTTGCTTGCCAAAACCTGACGCCCTGGCAGGTAACT'
mat = []
for i in range(0,len(enh),10):
    print(enh[i:i+10])
    print(word_dict[enh[i:i+10]])
    mat.append(word_dict[enh[i:i+10]])
x = np.array(mat).T

GCCGCATCCA
[ -0.23905   -1.8646     0.5715    -0.49265   -1.9836     9.7453
   0.31082    1.5741     1.2408    -1.2396     4.7121     4.3289
  -1.3392     0.62365    0.65655   13.181    -32.464      1.2373
 -12.854     -1.176     -1.1257    -0.24657    4.0201     4.5592
   1.1543    -0.45209   -1.0573    -0.065521  -2.9976    -6.3403
   0.54074   -4.1842     0.36288    1.9754    -0.72361   -0.049955
   0.73677    1.8241    -0.50174   -2.8962    -2.5284     1.4649
  -1.5286    -1.1881    -8.5696    -4.028      1.1655    -0.93628
  -3.4753    -0.82331    1.2747    -1.921     -1.4668    -1.4764
  -0.70042    0.69487   -0.10039   -0.42633   -1.9878    -0.9687
   1.0405     2.4644     0.40661    2.5131    -0.8824     4.5574
  -4.6334     2.8202    -1.4212    -1.4906    -8.8073    -2.3719
  -0.45941    4.1874     0.86934   -2.3771     1.444     -4.7546
   3.1737    -4.2287     2.7318     3.9114    -1.309     -4.5742
   3.2415     0.54641   -2.7689    -0.11814   -3.4759     2.0328
  -1.5878  

In [1]:
def fastaToWordVec(seq, word_size):
    """
    Converts a fasta sequence to a matrix of vectors based on the a FastText
    or similar processed corpus.
    :param seq: Fasta sequence to be converted
    :param word_size: Size of the word used to generate the corpus.
    :return: A matrix of vectors that represent the fasta sequence.
    """
    mat = []
    for i in range(0, len(seq), word_size):
        if str(seq[i:i+word_size]) not in word_dict.keys():
            print(f'{seq[i:i+word_size]} not in dictionary. Creating vector '
                  f'from subwords.')
            mat.append(model.get_word_vector(str(seq[i:i+word_size])))
        else:
            mat.append(word_dict[seq[i:i+word_size]])
    mat = np.array(mat).T
    return mat

In [4]:
### POSITIVE SET ###

#from utils.utils import seqToWordVec

split = ['train','val','test']# 'val', 'test']
window = [200, 1000, 2000]
species = ['hg19', 'mm9']



for w in window:
    for s in split:
        for sp in species:
            print(f'Converting {sp} {s} at window size {w} to word-vectors.')
            path = f'../data/VISTA/{sp}/pos_fa/{sp}.VISTA.enhancers.noXY.{w}bp.{s}.fa'
            mat = seqToWordVec(path,
                           '/Users/callummacphillamy/PhD/Reference_Genomes'
                           '/hg19/hg19.skipgram.30.vec',
                           '/Users/callummacphillamy/PhD/Reference_Genomes'
                           '/hg19/hg19.skipgram.30.bin')
            print(f'Saving {sp} {s} at window size {w} to file')
            np.save(f'../data/VISTA/{sp}/pos_npy/{sp}.VISTA.enhancers.noXY.FTvec.{w}bp.{s}.npy', mat)

Converting hg19 train at window size 200 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 132985.70it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.200bp.train.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████| 1294731/1294731 [00:01<00:00, 904415.86it/s]
  0%|                                                                                           | 1089/1294731 [00:00<01:58, 10877.92it/s]

Number of records before N removal: 1294731
Number of records after N removal 1294731
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.200bp.train.fa


100%|████████████████████████████████████████████████████████████████████████████████████████| 1294731/1294731 [01:50<00:00, 11750.15it/s]


Saving hg19 train at window size 200 to file
Converting mm9 train at window size 200 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 133188.45it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.train.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████| 1060821/1060821 [00:01<00:00, 952108.08it/s]
  0%|                                                                                           | 1181/1060821 [00:00<01:29, 11803.29it/s]

Number of records before N removal: 1060821
Number of records after N removal 1060821
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.train.fa


 12%|██████████▊                                                                              | 128911/1060821 [00:10<01:19, 11704.33it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.
CGTTACGCGT not in dictionary. Creating vector from subwords.


 15%|█████████████▊                                                                           | 164200/1060821 [00:13<01:16, 11683.94it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 18%|███████████████▋                                                                         | 186445/1060821 [00:15<01:15, 11654.80it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 22%|███████████████████▍                                                                     | 231363/1060821 [00:19<01:10, 11683.63it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 23%|████████████████████▋                                                                    | 246542/1060821 [00:20<01:10, 11556.56it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


 27%|████████████████████████▏                                                                | 287628/1060821 [00:24<01:05, 11752.99it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 30%|██████████████████████████▌                                                              | 317011/1060821 [00:27<01:03, 11725.28it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 33%|█████████████████████████████                                                            | 346366/1060821 [00:29<01:01, 11658.18it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 37%|████████████████████████████████▉                                                        | 393041/1060821 [00:33<00:58, 11471.62it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 40%|███████████████████████████████████▏                                                     | 419900/1060821 [00:35<00:54, 11718.71it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


 41%|████████████████████████████████████▌                                                    | 435146/1060821 [00:37<00:53, 11677.24it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 46%|█████████████████████████████████████████▏                                               | 490188/1060821 [00:41<00:48, 11730.89it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 49%|███████████████████████████████████████████▋                                             | 520105/1060821 [00:44<00:51, 10601.14it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 73%|████████████████████████████████████████████████████████████████▌                        | 769812/1060821 [01:06<00:24, 11701.29it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 85%|███████████████████████████████████████████████████████████████████████████▋             | 901463/1060821 [01:17<00:13, 11479.80it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 90%|████████████████████████████████████████████████████████████████████████████████▏        | 955416/1060821 [01:21<00:08, 11729.46it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 95%|███████████████████████████████████████████████████████████████████████████████████▎    | 1004682/1060821 [01:26<00:04, 11829.31it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|████████████████████████████████████████████████████████████████████████████████████████| 1060821/1060821 [01:30<00:00, 11681.38it/s]


Saving mm9 train at window size 200 to file
Converting hg19 val at window size 200 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 131390.79it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.200bp.val.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 369923/369923 [00:00<00:00, 901927.95it/s]
  0%|▎                                                                                           | 1147/369923 [00:00<00:32, 11467.96it/s]

Number of records before N removal: 369923
Number of records after N removal 369923
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.200bp.val.fa


100%|██████████████████████████████████████████████████████████████████████████████████████████| 369923/369923 [00:31<00:00, 11648.07it/s]


Saving hg19 val at window size 200 to file
Converting mm9 val at window size 200 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 129874.40it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.val.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████| 303092/303092 [00:00<00:00, 1057495.69it/s]
  0%|▎                                                                                           | 1184/303092 [00:00<00:25, 11838.37it/s]

Number of records before N removal: 303092
Number of records after N removal 303092
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.val.fa


 15%|█████████████▏                                                                             | 43997/303092 [00:03<00:22, 11647.07it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 15%|█████████████▉                                                                             | 46325/303092 [00:03<00:22, 11579.41it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


 59%|█████████████████████████████████████████████████████▎                                    | 179456/303092 [00:15<00:10, 11883.09it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 75%|███████████████████████████████████████████████████████████████████▋                      | 227992/303092 [00:19<00:06, 11327.06it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


100%|██████████████████████████████████████████████████████████████████████████████████████████| 303092/303092 [00:26<00:00, 11535.83it/s]


Saving mm9 val at window size 200 to file
Converting hg19 test at window size 200 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 121889.75it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.200bp.test.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 184962/184962 [00:00<00:00, 868407.75it/s]
  1%|▌                                                                                           | 1132/184962 [00:00<00:16, 11311.19it/s]

Number of records before N removal: 184962
Number of records after N removal 184962
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.200bp.test.fa


100%|██████████████████████████████████████████████████████████████████████████████████████████| 184962/184962 [00:16<00:00, 11441.01it/s]


Saving hg19 test at window size 200 to file
Converting mm9 test at window size 200 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 131300.24it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.test.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████| 151546/151546 [00:00<00:00, 1020800.47it/s]
  1%|▋                                                                                           | 1191/151546 [00:00<00:12, 11901.70it/s]

Number of records before N removal: 151546
Number of records after N removal 151546
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.test.fa


 80%|███████████████████████████████████████████████████████████████████████▉                  | 121222/151546 [00:10<00:02, 11729.35it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 92%|██████████████████████████████████████████████████████████████████████████████████▍       | 138912/151546 [00:11<00:01, 11730.75it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|██████████████████████████████████████████████████████████████████████████████████████████| 151546/151546 [00:12<00:00, 11828.82it/s]


Saving mm9 test at window size 200 to file
Converting hg19 train at window size 1000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 130994.09it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.1000bp.train.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 763623/763623 [00:01<00:00, 545497.17it/s]
  0%|                                                                                              | 179/763623 [00:00<07:08, 1781.66it/s]

Number of records before N removal: 763623
Number of records after N removal 763623
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.1000bp.train.fa


100%|███████████████████████████████████████████████████████████████████████████████████████████| 763623/763623 [07:32<00:00, 1687.13it/s]


Saving hg19 train at window size 1000 to file
Converting mm9 train at window size 1000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 116938.79it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.train.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 706672/706672 [00:01<00:00, 530063.37it/s]
  0%|                                                                                              | 165/706672 [00:00<07:09, 1645.17it/s]

Number of records before N removal: 706672
Number of records after N removal 706672
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.train.fa


  0%|▍                                                                                            | 3131/706672 [00:01<06:43, 1741.99it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


  1%|▉                                                                                            | 7416/706672 [00:04<06:33, 1775.74it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


  2%|█▍                                                                                          | 10602/706672 [00:06<06:35, 1759.79it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


  4%|███▉                                                                                        | 29996/706672 [00:17<06:23, 1763.17it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 13%|████████████                                                                                | 92887/706672 [00:53<06:04, 1684.69it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 21%|███████████████████▎                                                                       | 149513/706672 [01:31<05:44, 1617.99it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 22%|████████████████████▍                                                                      | 158559/706672 [01:37<06:04, 1503.71it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 23%|████████████████████▋                                                                      | 160635/706672 [01:38<06:29, 1402.60it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 23%|█████████████████████▎                                                                     | 165420/706672 [01:42<07:50, 1149.56it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 25%|██████████████████████▊                                                                    | 177392/706672 [01:51<06:49, 1293.45it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 27%|████████████████████████▌                                                                  | 190557/706672 [02:01<06:04, 1417.54it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 27%|████████████████████████▋                                                                  | 191965/706672 [02:02<06:06, 1405.81it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 29%|██████████████████████████▎                                                                | 204358/706672 [02:11<05:48, 1443.26it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 29%|██████████████████████████▋                                                                | 207295/706672 [02:13<05:48, 1433.85it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 30%|███████████████████████████▍                                                               | 212642/706672 [02:16<05:48, 1417.68it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 33%|██████████████████████████████▏                                                            | 234858/706672 [02:32<05:30, 1425.61it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 35%|███████████████████████████████▊                                                           | 247137/706672 [02:40<05:17, 1446.18it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 40%|████████████████████████████████████▏                                                      | 281171/706672 [03:04<04:55, 1440.09it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 43%|███████████████████████████████████████▏                                                   | 304245/706672 [03:20<04:57, 1353.61it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 47%|██████████████████████████████████████████▎                                                | 328786/706672 [03:36<04:18, 1461.69it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 47%|██████████████████████████████████████████▊                                                | 332085/706672 [03:39<04:26, 1406.44it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 48%|███████████████████████████████████████████▌                                               | 337869/706672 [03:43<04:18, 1424.85it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 49%|████████████████████████████████████████████▌                                              | 346005/706672 [03:49<04:36, 1306.05it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 50%|█████████████████████████████████████████████▌                                             | 354093/706672 [03:54<04:10, 1409.65it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 53%|████████████████████████████████████████████████▌                                          | 377073/706672 [04:10<03:50, 1432.97it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 54%|█████████████████████████████████████████████████▌                                         | 384418/706672 [04:15<03:44, 1433.58it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 57%|████████████████████████████████████████████████████▎                                      | 406179/706672 [04:31<03:20, 1496.85it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 59%|█████████████████████████████████████████████████████▍                                     | 414945/706672 [04:37<03:15, 1491.65it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 66%|███████████████████████████████████████████████████████████▊                               | 464545/706672 [05:09<02:20, 1724.57it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 66%|████████████████████████████████████████████████████████████                               | 466286/706672 [05:10<02:19, 1725.69it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 68%|██████████████████████████████████████████████████████████████                             | 482061/706672 [05:20<02:30, 1489.94it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 70%|████████████████████████████████████████████████████████████████                           | 497004/706672 [05:30<02:17, 1523.79it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 71%|████████████████████████████████████████████████████████████████▎                          | 499806/706672 [05:32<02:29, 1385.13it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 72%|█████████████████████████████████████████████████████████████████▍                         | 508594/706672 [05:38<02:27, 1345.60it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 75%|████████████████████████████████████████████████████████████████████▎                      | 530618/706672 [05:53<02:09, 1359.62it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 76%|█████████████████████████████████████████████████████████████████████▏                     | 537610/706672 [05:58<01:37, 1739.39it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 77%|██████████████████████████████████████████████████████████████████████▌                    | 547542/706672 [06:04<02:03, 1285.80it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 81%|█████████████████████████████████████████████████████████████████████████▋                 | 572637/706672 [06:21<01:28, 1511.03it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 81%|█████████████████████████████████████████████████████████████████████████▉                 | 573923/706672 [06:22<01:21, 1632.47it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 83%|███████████████████████████████████████████████████████████████████████████▉               | 589932/706672 [06:32<01:09, 1686.83it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 87%|███████████████████████████████████████████████████████████████████████████████▏           | 615216/706672 [06:47<01:03, 1442.56it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 93%|████████████████████████████████████████████████████████████████████████████████████▌      | 657060/706672 [07:13<00:27, 1782.76it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 95%|██████████████████████████████████████████████████████████████████████████████████████▋    | 673070/706672 [07:22<00:22, 1487.00it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 96%|███████████████████████████████████████████████████████████████████████████████████████▌   | 679582/706672 [07:27<00:15, 1693.70it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 97%|████████████████████████████████████████████████████████████████████████████████████████▋  | 688681/706672 [07:35<00:14, 1282.79it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 98%|█████████████████████████████████████████████████████████████████████████████████████████▎ | 693396/706672 [07:38<00:09, 1411.01it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 706672/706672 [07:47<00:00, 1511.98it/s]


Saving mm9 train at window size 1000 to file
Converting hg19 val at window size 1000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 147153.84it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.1000bp.val.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 218178/218178 [00:00<00:00, 653916.77it/s]
  0%|                                                                                              | 183/218178 [00:00<01:59, 1828.41it/s]

Number of records before N removal: 218178
Number of records after N removal 218178
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.1000bp.val.fa


100%|███████████████████████████████████████████████████████████████████████████████████████████| 218178/218178 [01:54<00:00, 1906.99it/s]


Saving hg19 val at window size 1000 to file
Converting mm9 val at window size 1000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:06, 150084.94it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.val.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 201906/201906 [00:00<00:00, 713900.46it/s]
  0%|                                                                                              | 190/201906 [00:00<01:46, 1896.17it/s]

Number of records before N removal: 201906
Number of records after N removal 201906
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.val.fa


  3%|██▉                                                                                          | 6314/201906 [00:03<01:41, 1929.47it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 19%|█████████████████▊                                                                          | 39114/201906 [00:20<01:18, 2073.17it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 26%|███████████████████████▋                                                                    | 51945/201906 [00:26<01:17, 1929.70it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 32%|█████████████████████████████▋                                                              | 65119/201906 [00:33<01:10, 1941.18it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 40%|████████████████████████████████████▋                                                       | 80405/201906 [00:41<01:02, 1941.71it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 41%|██████████████████████████████████████                                                      | 83511/201906 [00:43<01:01, 1934.78it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 54%|█████████████████████████████████████████████████▌                                         | 109917/201906 [00:56<00:47, 1927.83it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 69%|██████████████████████████████████████████████████████████████▊                            | 139479/201906 [01:11<00:31, 1972.72it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 80%|█████████████████████████████████████████████████████████████████████████                  | 162138/201906 [01:23<00:20, 1931.32it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 99%|█████████████████████████████████████████████████████████████████████████████████████████▊ | 199409/201906 [01:42<00:01, 1948.20it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 201906/201906 [01:44<00:00, 1938.59it/s]


Saving mm9 val at window size 1000 to file
Converting hg19 test at window size 1000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:06, 150976.30it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.1000bp.test.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 109090/109090 [00:00<00:00, 667237.27it/s]
  0%|▏                                                                                             | 191/109090 [00:00<00:57, 1903.55it/s]

Number of records before N removal: 109090
Number of records after N removal 109090
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.1000bp.test.fa


100%|███████████████████████████████████████████████████████████████████████████████████████████| 109090/109090 [00:56<00:00, 1936.52it/s]


Saving hg19 test at window size 1000 to file
Converting mm9 test at window size 1000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 148145.08it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.test.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 100954/100954 [00:00<00:00, 650698.70it/s]
  0%|▏                                                                                             | 194/100954 [00:00<00:52, 1931.46it/s]

Number of records before N removal: 100954
Number of records after N removal 100954
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.test.fa


 13%|████████████▎                                                                               | 13484/100954 [00:06<00:42, 2052.56it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 25%|██████████████████████▉                                                                     | 25121/100954 [00:12<00:36, 2062.31it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 26%|████████████████████████▏                                                                   | 26512/100954 [00:12<00:38, 1935.96it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 29%|██████████████████████████▍                                                                 | 29042/100954 [00:14<00:37, 1919.88it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 42%|██████████████████████████████████████▎                                                     | 42107/100954 [00:20<00:30, 1950.69it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 48%|████████████████████████████████████████████▌                                               | 48871/100954 [00:24<00:27, 1897.53it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 100954/100954 [00:53<00:00, 1897.54it/s]


Saving mm9 test at window size 1000 to file
Converting hg19 train at window size 2000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 134492.95it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.2000bp.train.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 343709/343709 [00:00<00:00, 482666.80it/s]
  0%|                                                                                                | 75/343709 [00:00<07:42, 743.41it/s]

Number of records before N removal: 343709
Number of records after N removal 343709
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.2000bp.train.fa


100%|████████████████████████████████████████████████████████████████████████████████████████████| 343709/343709 [07:10<00:00, 797.54it/s]


Saving hg19 train at window size 2000 to file
Converting mm9 train at window size 2000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 148876.10it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.train.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 351037/351037 [00:00<00:00, 500926.90it/s]
  0%|                                                                                               | 164/351037 [00:00<07:05, 825.14it/s]

Number of records before N removal: 351037
Number of records after N removal 351037
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.train.fa


  9%|████████                                                                                     | 30490/351037 [00:35<06:02, 883.90it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 13%|████████████▍                                                                                | 46924/351037 [00:53<06:01, 841.99it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 16%|██████████████▌                                                                              | 54833/351037 [01:03<05:42, 864.38it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 16%|███████████████                                                                              | 56754/351037 [01:05<05:46, 850.15it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 19%|█████████████████▌                                                                           | 66221/351037 [01:16<05:29, 864.03it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 19%|█████████████████▉                                                                           | 67703/351037 [01:18<05:25, 870.46it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 20%|██████████████████▎                                                                          | 68998/351037 [01:19<05:31, 851.05it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 26%|████████████████████████▎                                                                    | 91815/351037 [01:46<05:03, 855.46it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 28%|██████████████████████████▎                                                                  | 99266/351037 [01:54<04:50, 866.80it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 36%|████████████████████████████████▊                                                           | 125059/351037 [02:24<04:18, 872.79it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 37%|█████████████████████████████████▉                                                          | 129440/351037 [02:29<04:15, 868.99it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 40%|████████████████████████████████████▎                                                       | 138729/351037 [02:40<04:07, 856.34it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 42%|██████████████████████████████████████▎                                                     | 145975/351037 [02:49<04:07, 827.80it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 46%|██████████████████████████████████████████▋                                                 | 162893/351037 [03:08<03:38, 860.14it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 50%|██████████████████████████████████████████████▍                                             | 177052/351037 [03:25<03:20, 869.38it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 52%|███████████████████████████████████████████████▋                                            | 181860/351037 [03:30<03:14, 869.27it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 54%|█████████████████████████████████████████████████▉                                          | 190367/351037 [03:40<03:08, 852.93it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 55%|██████████████████████████████████████████████████▏                                         | 191409/351037 [03:41<03:10, 836.65it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.
CGTTACGCGT not in dictionary. Creating vector from subwords.


 55%|██████████████████████████████████████████████████▍                                         | 192526/351037 [03:43<03:04, 857.75it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 57%|███████████████████████████████████████████████████▉                                        | 198399/351037 [03:49<02:55, 868.78it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 58%|█████████████████████████████████████████████████████▌                                      | 204190/351037 [03:56<02:49, 868.54it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 59%|██████████████████████████████████████████████████████▌                                     | 207960/351037 [04:00<02:44, 867.75it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 62%|████████████████████████████████████████████████████████▋                                   | 216171/351037 [04:10<02:33, 876.15it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 65%|████████████████████████████████████████████████████████████▏                               | 229794/351037 [04:25<02:17, 884.29it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 66%|████████████████████████████████████████████████████████████▍                               | 230509/351037 [04:26<02:15, 888.79it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 66%|████████████████████████████████████████████████████████████▌                               | 231222/351037 [04:27<02:15, 882.28it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 67%|█████████████████████████████████████████████████████████████▎                              | 233719/351037 [04:30<02:12, 884.64it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 67%|█████████████████████████████████████████████████████████████▌                              | 234965/351037 [04:31<02:11, 885.62it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 67%|█████████████████████████████████████████████████████████████▊                              | 235680/351037 [04:32<02:10, 886.72it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 67%|█████████████████████████████████████████████████████████████▊                              | 235946/351037 [04:32<02:11, 878.53it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 68%|██████████████████████████████████████████████████████████████▋                             | 239334/351037 [04:36<02:06, 882.80it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 70%|████████████████████████████████████████████████████████████████                            | 244591/351037 [04:42<02:00, 885.45it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 70%|████████████████████████████████████████████████████████████████▋                           | 246641/351037 [04:45<01:58, 882.64it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 74%|███████████████████████████████████████████████████████████████████▊                        | 258949/351037 [04:58<01:44, 885.42it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 79%|████████████████████████████████████████████████████████████████████████▎                   | 275903/351037 [05:18<01:24, 888.32it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 80%|█████████████████████████████████████████████████████████████████████████▍                  | 280081/351037 [05:22<01:21, 867.90it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 83%|████████████████████████████████████████████████████████████████████████████▍               | 291639/351037 [05:36<01:06, 886.84it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 84%|█████████████████████████████████████████████████████████████████████████████▍              | 295561/351037 [05:40<01:02, 883.18it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 89%|█████████████████████████████████████████████████████████████████████████████████▍          | 310803/351037 [05:57<00:45, 886.21it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 94%|██████████████████████████████████████████████████████████████████████████████████████▎     | 329358/351037 [06:18<00:24, 888.44it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 98%|██████████████████████████████████████████████████████████████████████████████████████████▎ | 344515/351037 [06:35<00:07, 886.45it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 351037/351037 [06:43<00:00, 870.26it/s]


Saving mm9 train at window size 2000 to file
Converting hg19 val at window size 2000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 148724.27it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.2000bp.val.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████████| 98202/98202 [00:00<00:00, 547711.68it/s]
  0%|                                                                                                 | 81/98202 [00:00<02:02, 804.23it/s]

Number of records before N removal: 98202
Number of records after N removal 98202
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.2000bp.val.fa


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 98202/98202 [01:50<00:00, 887.89it/s]


Saving hg19 val at window size 2000 to file
Converting mm9 val at window size 2000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 149252.50it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.val.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 100296/100296 [00:00<00:00, 501926.24it/s]
  0%|                                                                                                | 87/100296 [00:00<01:56, 860.92it/s]

Number of records before N removal: 100296
Number of records after N removal 100296
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.val.fa


  3%|██▉                                                                                           | 3154/100296 [00:03<01:50, 879.26it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


  7%|██████▌                                                                                       | 7059/100296 [00:07<01:44, 888.08it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 10%|█████████▏                                                                                    | 9777/100296 [00:10<01:49, 825.84it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 20%|██████████████████▍                                                                          | 19872/100296 [00:22<01:30, 884.95it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 21%|███████████████████▌                                                                         | 21048/100296 [00:23<01:28, 898.38it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 30%|███████████████████████████▍                                                                 | 29638/100296 [00:33<01:24, 832.11it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 46%|███████████████████████████████████████████                                                  | 46504/100296 [00:52<00:59, 898.19it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 56%|███████████████████████████████████████████████████▊                                         | 55942/100296 [01:02<00:49, 892.79it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 66%|█████████████████████████████████████████████████████████████▍                               | 66285/100296 [01:14<00:38, 887.77it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 70%|█████████████████████████████████████████████████████████████████                            | 70223/100296 [01:18<00:33, 888.02it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 74%|█████████████████████████████████████████████████████████████████████                        | 74426/100296 [01:23<00:29, 888.41it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.
CGTTACGCGT not in dictionary. Creating vector from subwords.


 77%|███████████████████████████████████████████████████████████████████████▉                     | 77638/100296 [01:27<00:25, 885.78it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 79%|█████████████████████████████████████████████████████████████████████████▋                   | 79424/100296 [01:29<00:23, 876.69it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 87%|████████████████████████████████████████████████████████████████████████████████▍            | 86759/100296 [01:37<00:15, 882.24it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.
CGTTACGCGT not in dictionary. Creating vector from subwords.


 94%|███████████████████████████████████████████████████████████████████████████████████████▍     | 94249/100296 [01:45<00:06, 887.37it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|████████████████████████████████████████████████████████████████████████████████████████████| 100296/100296 [01:52<00:00, 889.01it/s]


Saving mm9 val at window size 2000 to file
Converting hg19 test at window size 2000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:06, 150814.86it/s]


Reading ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.2000bp.test.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████████| 49102/49102 [00:00<00:00, 573702.07it/s]
  0%|▏                                                                                                | 84/49102 [00:00<00:58, 831.70it/s]

Number of records before N removal: 49102
Number of records after N removal 49102
Beginning word vector represenations of ../data/VISTA/hg19/pos_fa/hg19.VISTA.enhancers.noXY.2000bp.test.fa


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 49102/49102 [00:54<00:00, 896.47it/s]


Saving hg19 test at window size 2000 to file
Converting mm9 test at window size 2000 to word-vectors.
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:06, 150137.11it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.test.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████████| 50149/50149 [00:00<00:00, 505716.54it/s]
  0%|▏                                                                                                | 87/50149 [00:00<00:57, 864.39it/s]

Number of records before N removal: 50149
Number of records after N removal 50149
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.test.fa


 34%|███████████████████████████████▌                                                              | 16856/50149 [00:18<00:37, 899.42it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 39%|████████████████████████████████████▉                                                         | 19673/50149 [00:22<00:33, 902.52it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


 53%|█████████████████████████████████████████████████▋                                            | 26515/50149 [00:29<00:26, 906.51it/s]

CGTTACGCGT not in dictionary. Creating vector from subwords.


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 50149/50149 [00:55<00:00, 898.10it/s]


Saving mm9 test at window size 2000 to file


In [7]:
## NEGATIVE HUMAN ##
split = ['train','val','test']# 'val', 'test']
window = [200, 1000, 2000]
species = ['hg19', 'mm9']

for w in window:
    for s in split:
        for sp in species:
            print(f'Converting {sp} {s} at window size {s} to its vector representations')
            pos = np.load(f'../data/VISTA/{sp}/pos_npy/{sp}.VISTA.enhancers.noXY'
                      f'.FTvec.{w}bp.{s}.npy',mmap_mode='r')
            mat = seqToWordVec(f'../data/VISTA/{sp}/neg_fa/{sp}.VISTA'
                           f'.non_enhancers.noXY.GCbalanced.{w}bp.{s}.fa',
                           '/Users/callummacphillamy/PhD/Reference_Genomes'
                           '/hg19/hg19.skipgram.30.vec',
                           '/Users/callummacphillamy/PhD/Reference_Genomes'
                           '/hg19/hg19.skipgram.30.bin',
                           random_choice=True,
                           rand_n=pos.shape[0])
            print(f'Saving {sp} {s} at window size {w} to file')
            np.save(f'../data/VISTA/{sp}/neg_npy/{sp}.VISTA.non_enhancers.noXY'
                f'..GCbalanced.{w}bp'
                f'.FTvec.{s}.npy', mat)

Converting hg19 train at window size train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec






1048645it [00:07, 145362.12it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████| 4661733/4661733 [00:05<00:00, 887080.52it/s]
  0%|                                                                                             | 893/1294731 [00:00<02:24, 8924.46it/s]

Number of clean records taken randomly: 1294731
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa


100%|████████████████████████████████████████████████████████████████████████████████████████| 1294731/1294731 [01:50<00:00, 11716.40it/s]


Saving hg19 train at window size 200 to file
Converting mm9 train at window size train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 135615.05it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████| 3753423/3753423 [00:04<00:00, 857242.60it/s]
  0%|                                                                                             | 783/1060821 [00:00<02:15, 7825.85it/s]

Number of clean records taken randomly: 1060821
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa


 11%|█████████▍                                                                               | 112098/1060821 [00:08<01:06, 14231.00it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 31%|███████████████████████████▎                                                             | 325362/1060821 [00:22<00:50, 14459.30it/s]

TACGCGATCG not in dictionary. Creating vector from subwords.


100%|████████████████████████████████████████████████████████████████████████████████████████| 1060821/1060821 [01:14<00:00, 14190.88it/s]


Saving mm9 train at window size 200 to file
Converting hg19 val at window size val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec






1048645it [00:07, 148020.06it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.200bp.val.fa into dictionary  and removing N's


100%|██████████████████████████████████████████████████████████████████████████████████████| 1334093/1334093 [00:01<00:00, 1015182.33it/s]
  0%|▎                                                                                           | 1132/369923 [00:00<00:32, 11315.59it/s]

Number of clean records taken randomly: 369923
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.200bp.val.fa


100%|██████████████████████████████████████████████████████████████████████████████████████████| 369923/369923 [00:29<00:00, 12610.18it/s]


Saving hg19 val at window size 200 to file
Converting mm9 val at window size val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec






1048645it [00:07, 141734.58it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.val.fa into dictionary  and removing N's


100%|██████████████████████████████████████████████████████████████████████████████████████| 1073599/1073599 [00:01<00:00, 1025073.10it/s]
  0%|▎                                                                                           | 1126/303092 [00:00<00:26, 11252.58it/s]

Number of clean records taken randomly: 303092
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.val.fa


 22%|███████████████████▋                                                                       | 65606/303092 [00:05<00:19, 12438.91it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 45%|████████████████████████████████████████▊                                                 | 137548/303092 [00:10<00:13, 12447.90it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


100%|██████████████████████████████████████████████████████████████████████████████████████████| 303092/303092 [00:24<00:00, 12559.53it/s]


Saving mm9 val at window size 200 to file
Converting hg19 test at window size test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 148614.24it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.200bp.test.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████| 667237/667237 [00:00<00:00, 1039844.87it/s]
  1%|█▏                                                                                          | 2297/184962 [00:00<00:15, 11623.95it/s]

Number of clean records taken randomly: 184962
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.200bp.test.fa


100%|██████████████████████████████████████████████████████████████████████████████████████████| 184962/184962 [00:14<00:00, 12664.94it/s]


Saving hg19 test at window size 200 to file
Converting mm9 test at window size test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec






1048645it [00:07, 143615.97it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.test.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████| 536737/536737 [00:00<00:00, 1036398.15it/s]
  1%|▋                                                                                           | 1131/151546 [00:00<00:13, 11308.29it/s]

Number of clean records taken randomly: 151546
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.test.fa


 42%|█████████████████████████████████████▉                                                     | 63150/151546 [00:04<00:07, 12531.32it/s]

CGATCGTCGG not in dictionary. Creating vector from subwords.


100%|██████████████████████████████████████████████████████████████████████████████████████████| 151546/151546 [00:11<00:00, 12632.82it/s]


Saving mm9 test at window size 200 to file
Converting hg19 train at window size train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 146936.16it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.1000bp.train.fa into dictionary  and removing N's


100%|███████████████████████████████████████████████████████████████████████████████████████| 1164229/1164229 [00:01<00:00, 657918.59it/s]
  0%|                                                                                              | 364/763623 [00:00<06:55, 1838.08it/s]

Number of clean records taken randomly: 763623
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.1000bp.train.fa


 43%|██████████████████████████████████████▋                                                    | 324943/763623 [03:09<05:13, 1400.72it/s]

CGACGCGATA not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 763623/763623 [07:02<00:00, 1808.32it/s]


Saving hg19 train at window size 1000 to file
Converting mm9 train at window size train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 144161.40it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.train.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 769121/769121 [00:01<00:00, 642335.67it/s]
  0%|                                                                                              | 169/706672 [00:00<06:59, 1685.85it/s]

Number of clean records taken randomly: 706672
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.train.fa


  0%|▍                                                                                            | 3380/706672 [00:01<06:44, 1738.83it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


  2%|██                                                                                          | 15790/706672 [00:08<06:11, 1859.43it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 22%|███████████████████▋                                                                       | 152881/706672 [01:21<04:55, 1876.51it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 27%|████████████████████████▍                                                                   | 187673/706672 [01:43<14:47, 584.71it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 32%|████████████████████████████▊                                                              | 224086/706672 [02:10<04:09, 1934.56it/s]

CGATCGTCGG not in dictionary. Creating vector from subwords.


 38%|██████████████████████████████████▊                                                        | 269876/706672 [02:34<03:49, 1906.53it/s]

TACGCGATCG not in dictionary. Creating vector from subwords.


 70%|███████████████████████████████████████████████████████████████▊                           | 495313/706672 [04:38<02:06, 1672.18it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 90%|██████████████████████████████████████████████████████████████████████████████████▏        | 638179/706672 [06:03<00:42, 1608.48it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


 92%|███████████████████████████████████████████████████████████████████████████████████▉       | 651705/706672 [06:10<00:29, 1878.69it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 96%|███████████████████████████████████████████████████████████████████████████████████████▏   | 677350/706672 [06:26<00:16, 1817.40it/s]

CGTTCGACGG not in dictionary. Creating vector from subwords.


 98%|█████████████████████████████████████████████████████████████████████████████████████████▎ | 693662/706672 [06:36<00:07, 1645.68it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 99%|█████████████████████████████████████████████████████████████████████████████████████████▊ | 697323/706672 [06:38<00:05, 1652.41it/s]

CGTACGCGAT not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 706672/706672 [06:44<00:00, 1747.87it/s]


Saving mm9 train at window size 1000 to file
Converting hg19 val at window size val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 125248.55it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.1000bp.val.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 332259/332259 [00:00<00:00, 704694.43it/s]
  0%|                                                                                              | 209/218178 [00:00<01:44, 2089.87it/s]

Number of clean records taken randomly: 218178
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.1000bp.val.fa


100%|███████████████████████████████████████████████████████████████████████████████████████████| 218178/218178 [02:12<00:00, 1647.00it/s]


Saving hg19 val at window size 1000 to file
Converting mm9 val at window size val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:11, 88896.39it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.val.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 219523/219523 [00:00<00:00, 463852.92it/s]
  0%|                                                                                              | 127/201906 [00:00<02:40, 1256.22it/s]

Number of clean records taken randomly: 201906
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.val.fa


  6%|█████▎                                                                                      | 11674/201906 [00:08<02:20, 1353.63it/s]

CGTTCGACGG not in dictionary. Creating vector from subwords.


 25%|███████████████████████                                                                     | 50736/201906 [00:38<01:49, 1379.77it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 28%|█████████████████████████▍                                                                  | 55848/201906 [00:42<01:45, 1384.36it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 29%|██████████████████████████▍                                                                 | 58054/201906 [00:43<01:42, 1402.83it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 62%|████████████████████████████████████████████████████████▏                                  | 124632/201906 [01:34<00:59, 1294.00it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 90%|██████████████████████████████████████████████████████████████████████████████████▌         | 181096/201906 [02:07<00:21, 965.30it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 201906/201906 [02:19<00:00, 1442.26it/s]


Saving mm9 val at window size 1000 to file
Converting hg19 test at window size test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:21, 49568.53it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.1000bp.test.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 166064/166064 [00:00<00:00, 680878.36it/s]
  0%|▏                                                                                             | 176/109090 [00:00<01:02, 1754.91it/s]

Number of clean records taken randomly: 109090
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.1000bp.test.fa


100%|███████████████████████████████████████████████████████████████████████████████████████████| 109090/109090 [00:55<00:00, 1948.82it/s]


Saving hg19 test at window size 1000 to file
Converting mm9 test at window size test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 129676.67it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.test.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 109994/109994 [00:00<00:00, 664508.44it/s]
  0%|▏                                                                                             | 179/100954 [00:00<00:56, 1786.98it/s]

Number of clean records taken randomly: 100954
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.test.fa


 84%|████████████████████████████████████████████████████████████████████████████▉               | 84475/100954 [00:44<00:09, 1817.51it/s]

TACGCGATCG not in dictionary. Creating vector from subwords.


 87%|████████████████████████████████████████████████████████████████████████████████▎           | 88193/100954 [00:46<00:06, 1837.70it/s]

CGATCGTCGG not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████| 100954/100954 [00:54<00:00, 1857.03it/s]


Saving mm9 test at window size 1000 to file
Converting hg19 train at window size train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 134582.97it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.2000bp.train.fa into dictionary  and removing N's


100%|█████████████████████████████████████████████████████████████████████████████████████████| 614015/614015 [00:01<00:00, 498883.25it/s]
  0%|                                                                                                | 69/343709 [00:00<08:20, 687.07it/s]

Number of clean records taken randomly: 343709
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.2000bp.train.fa


100%|████████████████████████████████████████████████████████████████████████████████████████████| 343709/343709 [08:03<00:00, 710.35it/s]


Saving hg19 train at window size 2000 to file
Converting mm9 train at window size train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 135689.22it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.train.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 364425/364425 [00:00<00:00, 499671.70it/s]
  0%|                                                                                                                                                                                                                                                       | 71/351037 [00:00<08:17, 705.04it/s]

Number of clean records taken randomly: 351037
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.train.fa


 15%|████████████████████████████████████▋                                                                                                                                                                                                               | 52806/351037 [01:12<06:35, 754.60it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 19%|███████████████████████████████████████████████▌                                                                                                                                                                                                    | 68451/351037 [01:33<06:28, 727.17it/s]

TACGCGATCG not in dictionary. Creating vector from subwords.


 43%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                           | 149793/351037 [03:24<04:19, 775.99it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 52%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                   | 183983/351037 [04:10<03:33, 782.80it/s]

CGTACGCGAT not in dictionary. Creating vector from subwords.


 61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 212625/351037 [04:48<02:54, 791.79it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 223505/351037 [05:02<02:40, 794.15it/s]

CGTTCGACGG not in dictionary. Creating vector from subwords.


 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 252717/351037 [05:39<02:04, 789.21it/s]

CGATCGTCGG not in dictionary. Creating vector from subwords.


 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                  | 255620/351037 [05:43<02:00, 790.69it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 269721/351037 [06:02<01:59, 682.99it/s]

CGTTCGACGG not in dictionary. Creating vector from subwords.


 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 270372/351037 [06:03<01:51, 726.61it/s]

CGATCGTCGG not in dictionary. Creating vector from subwords.


 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 271750/351037 [06:05<01:50, 718.05it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 273845/351037 [06:08<01:44, 740.95it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 319075/351037 [07:13<00:48, 656.59it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 351037/351037 [07:59<00:00, 731.77it/s]


Saving mm9 train at window size 2000 to file
Converting hg19 val at window size val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 129785.27it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.2000bp.val.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 175561/175561 [00:00<00:00, 478448.84it/s]
  0%|▏                                                                                                                                                                                                                                                       | 71/98202 [00:00<02:18, 709.23it/s]

Number of clean records taken randomly: 98202
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.2000bp.val.fa


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 98202/98202 [02:11<00:00, 749.36it/s]


Saving hg19 val at window size 2000 to file
Converting mm9 val at window size val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:08, 123433.91it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.val.fa into dictionary  and removing N's


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104560/104560 [00:00<00:00, 495000.88it/s]
  0%|▏                                                                                                                                                                                                                                                      | 73/100296 [00:00<02:17, 726.46it/s]

Number of clean records taken randomly: 100296
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.val.fa


 10%|████████████████████████▋                                                                                                                                                                                                                           | 10170/100296 [00:14<02:21, 638.22it/s]

TACGCGATCG not in dictionary. Creating vector from subwords.


 23%|███████████████████████████████████████████████████████▉                                                                                                                                                                                            | 22969/100296 [00:30<02:00, 641.36it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 44%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                        | 44351/100296 [00:59<01:16, 731.93it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                   | 53076/100296 [01:11<01:00, 785.68it/s]

CGTACGCGAT not in dictionary. Creating vector from subwords.


 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 67464/100296 [01:29<00:42, 766.87it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 91489/100296 [01:59<00:10, 806.64it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100296/100296 [02:10<00:00, 767.93it/s]


Saving mm9 val at window size 2000 to file
Converting hg19 test at window size test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 135122.54it/s]


Reading ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.2000bp.test.fa into dictionary  and removing N's


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 87872/87872 [00:00<00:00, 526101.35it/s]
  0%|▍                                                                                                                                                                                                                                                       | 79/49102 [00:00<01:02, 784.80it/s]

Number of clean records taken randomly: 49102
Beginning word vector represenations of ../data/VISTA/hg19/neg_fa/hg19.VISTA.non_enhancers.noXY.GCbalanced.2000bp.test.fa


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49102/49102 [01:03<00:00, 777.22it/s]


Saving hg19 test at window size 2000 to file
Converting mm9 test at window size test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.30.vec


1048645it [00:07, 131726.66it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.test.fa into dictionary  and removing N's


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52435/52435 [00:00<00:00, 497940.41it/s]
  0%|▍                                                                                                                                                                                                                                                       | 76/50149 [00:00<01:06, 756.56it/s]

Number of clean records taken randomly: 50149
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.test.fa


 24%|██████████████████████████████████████████████████████████▋                                                                                                                                                                                          | 12003/50149 [00:15<00:50, 755.43it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 41%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                               | 20740/50149 [00:26<00:38, 771.90it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50149/50149 [01:05<00:00, 769.55it/s]


Saving mm9 test at window size 2000 to file


In [8]:
## MERGE DATASETS ##
split = ['train','val','test']# 'val', 'test']
window = [200, 1000, 2000]
species = ['hg19', 'mm9']

for w in window:
    for s in split:
        for sp in species:
            print(f'Merging {sp} {s} at window size {w} into dataset.')
            pos_X = np.load(f'../data/VISTA/{sp}/pos_npy/{sp}.VISTA.enhancers.noXY'
                            f'.FTvec.{w}bp.{s}.npy', mmap_mode='r')
            neg_X = np.load(f'../data/VISTA/{sp}/neg_npy/{sp}.VISTA.non_enhancers.noXY'
                            f'..GCbalanced.{w}bp'
                            f'.FTvec.{s}.npy', mmap_mode='r')
            pos_y = np.ones(pos_X.shape[0])
            neg_y = np.zeros(neg_X.shape[0])
            
            dat = np.vstack((pos_X, neg_X))
            lab = np.hstack((pos_y, neg_y))
            
            np.save(f'../data/VISTA/{sp}/datasets/{sp}.balanced.VISTA.FTvec.{w}bp.{s}_X.npy', dat)
            np.save(f'../data/VISTA/{sp}/datasets/{sp}.balanced.VISTA.FTvec.{w}bp.{s}_y.npy', lab)

print('Creating combined hg19 and mm9 datasets.')
for w in window:
    for s in split:
        hg19_X = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA.FTvec.{w}bp.{s}_X.npy', mmap_mode='r')
        mm9_X = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA.FTvec.{w}bp.{s}_X.npy', mmap_mode='r')
        hg19_y = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA.FTvec.{w}bp.{s}_y.npy', mmap_mode='r')
        mm9_y = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA.FTvec.{w}bp.{s}_y.npy', mmap_mode='r')
        
        both_dat = np.vstack((hg19_X, mm9_X))
        both_lab = np.hstack((hg19_y, mm9_y))
        
        np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.FTvec.{w}bp.{s}_X.npy', both_dat)
        np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.FTvec.{w}bp.{s}_y.npy', both_lab)
print('Finished combining hg19 and mm9 datasets.')
            
# hg19.balanced.VISTA.onehot.1000bp.test_y.npy

Merging hg19 train at window size 200 into dataset.
Merging mm9 train at window size 200 into dataset.
Merging hg19 val at window size 200 into dataset.
Merging mm9 val at window size 200 into dataset.
Merging hg19 test at window size 200 into dataset.
Merging mm9 test at window size 200 into dataset.
Merging hg19 train at window size 1000 into dataset.
Merging mm9 train at window size 1000 into dataset.
Merging hg19 val at window size 1000 into dataset.
Merging mm9 val at window size 1000 into dataset.
Merging hg19 test at window size 1000 into dataset.
Merging mm9 test at window size 1000 into dataset.
Merging hg19 train at window size 2000 into dataset.
Merging mm9 train at window size 2000 into dataset.
Merging hg19 val at window size 2000 into dataset.
Merging mm9 val at window size 2000 into dataset.
Merging hg19 test at window size 2000 into dataset.
Merging mm9 test at window size 2000 into dataset.
Creating combined hg19 and mm9 datasets.
Finished combining hg19 and mm9 datase

In [3]:
### POSITIVE SET ###
## MOUSE ##

from utils.utils import seqToWordVec

params = {'train':100000,
          'val':25000,
          'test':10000}

window = [200, 400, 1000, 1500, 2000]#, 4000, 8000]

for w in window:
    for k, v in params.items():
        print(f'Converting {w} {k} to its vector representations')
        mat = seqToWordVec(in_fa=f'../data/VISTA/mm9/pos_fa/mm9.VISTA'
                                 f'.enhancers.noXY.{w}bp.{k}.fa',
                           in_vec='/Users/callummacphillamy/Software/fastText'
                                  '-0.9.2/result/mm9.skipgram.vec',
                           model_path='/Users/callummacphillamy/Software'
                                      '/fastText-0.9.2/result/mm9.skipgram'
                                      '.bin',
                           word_size=10,
                           random_choice=True,
                           rand_n=v)
        print(f'Saving {w} {k} to file')
        np.save(f'../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers.noXY.{w}bp'
                f'.FTvec.{k}.npy', mat)

Converting 200 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62930.85it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.train.fa into dictionary  and removing N's


100%|██████████| 1060821/1060821 [00:01<00:00, 991248.39it/s]
  1%|          | 969/100000 [00:00<00:10, 9683.22it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.train.fa


100%|██████████| 100000/100000 [00:08<00:00, 11271.76it/s]


Saving 200 train to file
Converting 200 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 58778.24it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.val.fa into dictionary  and removing N's


100%|██████████| 303092/303092 [00:00<00:00, 949944.21it/s]
  9%|▉         | 2242/25000 [00:00<00:02, 11285.57it/s]

Number of clean records taken randomly: 25000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.val.fa


100%|██████████| 25000/25000 [00:02<00:00, 11448.40it/s]


Saving 200 val to file
Converting 200 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:19, 54185.80it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.test.fa into dictionary  and removing N's


100%|██████████| 151546/151546 [00:00<00:00, 1028099.96it/s]
 22%|██▏       | 2205/10000 [00:00<00:00, 11030.15it/s]

Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.200bp.test.fa


100%|██████████| 10000/10000 [00:00<00:00, 11618.69it/s]


Saving 200 test to file
Converting 400 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 61679.56it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.400bp.train.fa into dictionary  and removing N's


100%|██████████| 970710/970710 [00:01<00:00, 905839.21it/s] 
  0%|          | 496/100000 [00:00<00:20, 4958.86it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.400bp.train.fa


100%|██████████| 100000/100000 [00:18<00:00, 5507.14it/s]


Saving 400 train to file
Converting 400 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62127.10it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.400bp.val.fa into dictionary  and removing N's


100%|██████████| 277346/277346 [00:00<00:00, 1050600.71it/s]
  2%|▏         | 544/25000 [00:00<00:04, 5437.39it/s]

Number of clean records taken randomly: 25000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.400bp.val.fa


100%|██████████| 25000/25000 [00:04<00:00, 5577.92it/s]


Saving 400 val to file
Converting 400 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 60532.58it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.400bp.test.fa into dictionary  and removing N's


100%|██████████| 138673/138673 [00:00<00:00, 970915.63it/s]
  6%|▌         | 576/10000 [00:00<00:01, 5756.49it/s]

Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.400bp.test.fa


100%|██████████| 10000/10000 [00:01<00:00, 5767.53it/s]


Saving 400 test to file
Converting 1000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 63525.76it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.train.fa into dictionary  and removing N's


100%|██████████| 706672/706672 [00:01<00:00, 655605.68it/s]
  0%|          | 362/100000 [00:00<00:54, 1829.88it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.train.fa


100%|██████████| 100000/100000 [00:55<00:00, 1815.34it/s]


Saving 1000 train to file
Converting 1000 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62774.42it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.val.fa into dictionary  and removing N's


100%|██████████| 201906/201906 [00:00<00:00, 678249.18it/s]
  1%|          | 170/25000 [00:00<00:14, 1696.93it/s]

Number of clean records taken randomly: 25000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.val.fa


100%|██████████| 25000/25000 [00:12<00:00, 1935.97it/s]


Saving 1000 val to file
Converting 1000 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 61867.89it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.test.fa into dictionary  and removing N's


100%|██████████| 100954/100954 [00:00<00:00, 706323.52it/s]
  2%|▏         | 183/10000 [00:00<00:05, 1820.84it/s]

Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1000bp.test.fa


100%|██████████| 10000/10000 [00:04<00:00, 2000.75it/s]


Saving 1000 test to file
Converting 1500 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62687.53it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1500bp.train.fa into dictionary  and removing N's


100%|██████████| 513490/513490 [00:00<00:00, 575478.26it/s]
  0%|          | 211/100000 [00:00<01:34, 1057.09it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1500bp.train.fa


100%|██████████| 100000/100000 [01:32<00:00, 1078.03it/s]


Saving 1500 train to file
Converting 1500 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 60013.43it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1500bp.val.fa into dictionary  and removing N's


100%|██████████| 146712/146712 [00:00<00:00, 585678.96it/s]
  1%|          | 213/25000 [00:00<00:23, 1067.35it/s]

Number of clean records taken randomly: 25000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1500bp.val.fa


100%|██████████| 25000/25000 [00:21<00:00, 1173.09it/s]


Saving 1500 val to file
Converting 1500 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62897.42it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1500bp.test.fa into dictionary  and removing N's


100%|██████████| 73356/73356 [00:00<00:00, 589252.04it/s]
  1%|          | 110/10000 [00:00<00:09, 1096.47it/s]

Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.1500bp.test.fa


100%|██████████| 10000/10000 [00:08<00:00, 1134.53it/s]


Saving 1500 test to file
Converting 2000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec






1048724it [00:16, 64182.66it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.train.fa into dictionary  and removing N's


100%|██████████| 351037/351037 [00:00<00:00, 501696.36it/s]
  0%|          | 69/100000 [00:00<02:26, 681.97it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.train.fa


100%|██████████| 100000/100000 [02:13<00:00, 751.86it/s]


Saving 2000 train to file
Converting 2000 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62007.22it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.val.fa into dictionary  and removing N's


100%|██████████| 100296/100296 [00:00<00:00, 528538.74it/s]
  0%|          | 69/25000 [00:00<00:36, 689.72it/s]

Number of clean records taken randomly: 25000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.val.fa


100%|██████████| 25000/25000 [00:31<00:00, 797.79it/s]


Saving 2000 val to file
Converting 2000 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62745.58it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.test.fa into dictionary  and removing N's


100%|██████████| 50149/50149 [00:00<00:00, 479288.68it/s]
  1%|▏         | 146/10000 [00:00<00:13, 732.45it/s]

Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.2000bp.test.fa


100%|██████████| 10000/10000 [00:11<00:00, 844.23it/s]


Saving 2000 test to file
Converting 4000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 61634.08it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.train.fa into dictionary  and removing N's


100%|██████████| 39754/39754 [00:00<00:00, 377828.75it/s]
  0%|          | 30/10000 [00:00<00:34, 292.56it/s]

Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.train.fa


100%|██████████| 10000/10000 [00:33<00:00, 296.97it/s]


Saving 4000 train to file
Converting 4000 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 63013.13it/s]
100%|██████████| 11358/11358 [00:00<00:00, 339033.16it/s]
  0%|          | 0/10000 [00:00<?, ?it/s]

Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.val.fa into dictionary  and removing N's
Number of clean records taken randomly: 10000
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.val.fa


100%|██████████| 10000/10000 [00:31<00:00, 313.68it/s]


Saving 4000 val to file
Converting 4000 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 60187.60it/s]
100%|██████████| 5680/5680 [00:00<00:00, 336093.43it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.test.fa into dictionary  and removing N's


ValueError: Sample larger than population or is negative

In [9]:
window = [4000, 8000]
split = ['train', 'val', 'test']
for w in window:
    for s in split:
        print(f'Converting {w} {s} to its vector representations')
        mat = seqToWordVec(in_fa=f'../data/VISTA/mm9/pos_fa/mm9.VISTA'
                                 f'.enhancers.noXY.{w}bp.{s}.fa',
                           in_vec='/Users/callummacphillamy/Software/fastText'
                                  '-0.9.2/result/mm9.skipgram.vec',
                           model_path='/Users/callummacphillamy/Software'
                                      '/fastText-0.9.2/result/mm9.skipgram'
                                      '.bin',
                           word_size=10)
        print(f'Saving {w} {s} to file')
        np.save(f'../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers.noXY.{w}bp'
                f'.FTvec.{s}.npy', mat)

Converting 4000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 59753.29it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.train.fa into dictionary  and removing N's


100%|██████████| 39754/39754 [00:00<00:00, 378840.85it/s]
  0%|          | 31/39754 [00:00<02:08, 308.66it/s]

Number of records before N removal: 39754
Number of records after N removal 39754
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.train.fa


100%|██████████| 39754/39754 [02:20<00:00, 283.93it/s]


Saving 4000 train to file
Converting 4000 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 60058.65it/s]


Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.val.fa into dictionary  and removing N's


100%|██████████| 11358/11358 [00:00<00:00, 401017.76it/s]
  0%|          | 32/11358 [00:00<00:35, 315.47it/s]

Number of records before N removal: 11358
Number of records after N removal 11358
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.val.fa


100%|██████████| 11358/11358 [00:39<00:00, 284.93it/s]


Saving 4000 val to file
Converting 4000 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 59059.01it/s]
100%|██████████| 5680/5680 [00:00<00:00, 281846.59it/s]
  0%|          | 0/5680 [00:00<?, ?it/s]

Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.test.fa into dictionary  and removing N's
Number of records before N removal: 5680
Number of records after N removal 5680
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.4000bp.test.fa


100%|██████████| 5680/5680 [00:19<00:00, 288.20it/s]


Saving 4000 test to file
Converting 8000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 60793.14it/s]
100%|██████████| 457/457 [00:00<00:00, 180898.16it/s]
  2%|▏         | 10/457 [00:00<00:04, 98.30it/s]

Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.8000bp.train.fa into dictionary  and removing N's
Number of records before N removal: 457
Number of records after N removal 457
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.8000bp.train.fa


100%|██████████| 457/457 [00:04<00:00, 95.04it/s] 


Saving 8000 train to file
Converting 8000 val to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:18, 57903.97it/s]
100%|██████████| 130/130 [00:00<00:00, 242553.17it/s]
  7%|▋         | 9/130 [00:00<00:01, 87.68it/s]

Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.8000bp.val.fa into dictionary  and removing N's
Number of records before N removal: 130
Number of records after N removal 130
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.8000bp.val.fa


100%|██████████| 130/130 [00:01<00:00, 90.31it/s]


Saving 8000 val to file
Converting 8000 test to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:17, 59986.56it/s]
100%|██████████| 66/66 [00:00<00:00, 149958.86it/s]
 14%|█▎        | 9/66 [00:00<00:00, 86.88it/s]

Reading ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.8000bp.test.fa into dictionary  and removing N's
Number of records before N removal: 66
Number of records after N removal 66
Beginning word vector represenations of ../data/VISTA/mm9/pos_fa/mm9.VISTA.enhancers.noXY.8000bp.test.fa


100%|██████████| 66/66 [00:00<00:00, 87.60it/s]


Saving 8000 test to file


In [4]:
### NEGATIVE SET ###
model = fasttext.load_model('/Users/callummacphillamy/PhD/Reference_Genomes'
                            '/hg19/hg19.skipgram.bin')
split = ['train', 'val', 'test']
window = [200]#, 400, 1000, 1500, 2000, 4000, 8000]
for s in split:
    for w in window:
        print(f'Converting {w} {s} to its vector representations')
        pos = np.load(f'../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers.noXY'
                      f'.{w}bp.FTvec.{s}.npy', mmap_mode='r')
        mat = seqToWordVec(f'../data/VISTA/mm9/neg_fa/mm9.VISTA'
                           f'.non_enhancers.noXY.GCbalanced.{w}bp.{s}.fa',
                           '/Users/callummacphillamy/PhD/Reference_Genomes'
                           '/hg19/hg19.skipgram.vec',
                           '/Users/callummacphillamy/PhD/Reference_Genomes'
                           '/hg19/hg19.skipgram.bin',
                           random_choice=True,
                           rand_n=pos.shape[0])
        print(f'Saving {w} {s} to file\n')
        np.save(f'../data/VISTA/mm9/neg_npy/mm9.VISTA.non_enhancers.noXY'
                f'.GCbalanced.{w}bp.FTvec.{s}.npy', mat)



Converting 200 train to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.vec


1048645it [00:19, 55039.58it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa into dictionary  and removing N's


100%|██████████| 3753423/3753423 [00:04<00:00, 896541.09it/s] 
  0%|          | 2188/1060821 [00:00<01:35, 11096.85it/s]

Number of clean records taken randomly: 1060821
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa


 11%|█         | 112105/1060821 [00:09<01:24, 11292.78it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 31%|███       | 324073/1060821 [00:30<01:06, 11002.22it/s]

TACGCGATCG not in dictionary. Creating vector from subwords.


100%|██████████| 1060821/1060821 [01:39<00:00, 10647.18it/s]


Saving 200 train to file

Converting 200 val to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.bin

Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.vec






1048645it [00:20, 50072.34it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.val.fa into dictionary  and removing N's


100%|██████████| 1073599/1073599 [00:01<00:00, 1019093.13it/s]
  0%|          | 986/303092 [00:00<00:30, 9858.62it/s]

Number of clean records taken randomly: 303092
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.val.fa


 22%|██▏       | 65532/303092 [00:05<00:20, 11385.83it/s]

ATTCGACGCG not in dictionary. Creating vector from subwords.


 45%|████▌     | 136903/303092 [00:12<00:15, 10991.72it/s]

CGCGAATCGA not in dictionary. Creating vector from subwords.


100%|██████████| 303092/303092 [00:27<00:00, 11086.16it/s]


Saving 200 val to file

Converting 200 test to its vector representations
Loading model:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.bin





Loading vectors:	/Users/callummacphillamy/PhD/Reference_Genomes/hg19/hg19.skipgram.vec


1048645it [00:18, 56690.13it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.test.fa into dictionary  and removing N's


100%|██████████| 536737/536737 [00:00<00:00, 974115.81it/s] 
  1%|▏         | 2200/151546 [00:00<00:13, 10987.16it/s]

Number of clean records taken randomly: 151546
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.test.fa


 41%|████      | 62102/151546 [00:06<00:13, 6450.73it/s] 

CGATCGTCGG not in dictionary. Creating vector from subwords.


100%|██████████| 151546/151546 [00:15<00:00, 9820.36it/s] 


Saving 200 test to file



In [7]:
### NEGATIVE SET ###
## MOUSE ##
split = ['train', 'val', 'test']
window = [200, 400, 1000, 1500, 2000, 4000, 8000]
for s in split:
    for w in window:
        print(f'Converting {w} {s} to its vector representations')
        pos = np.load(f'../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers.noXY'
                      f'.{w}bp.FTvec.{s}.npy', mmap_mode='r')
        mat = seqToWordVec(f'../data/VISTA/mm9/neg_fa/mm9.VISTA'
                           f'.non_enhancers.noXY.GCbalanced.{w}bp.{s}.fa',
                           '/Users/callummacphillamy/Software/fastText-0.9'
                           '.2/result/mm9.skipgram.vec',
                           '/Users/callummacphillamy/Software/fastText-0.9'
                           '.2/result/mm9.skipgram.bin',
                           random_choice=True,
                           rand_n=pos.shape[0])
        print(f'Saving {w} {s} to file\n')
        np.save(f'../data/VISTA/mm9/neg_npy/mm9.VISTA.non_enhancers.noXY'
                f'.GCbalanced.{w}bp.FTvec.{s}.npy', mat)

Converting 200 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin





Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62719.52it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa into dictionary  and removing N's


100%|██████████| 3753423/3753423 [00:03<00:00, 964156.39it/s] 
  2%|▏         | 2358/100000 [00:00<00:08, 11952.96it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.200bp.train.fa


100%|██████████| 100000/100000 [00:08<00:00, 11728.07it/s]


Saving 200 train to file

Converting 400 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin





Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec


1048724it [00:16, 62824.35it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.400bp.train.fa into dictionary  and removing N's


100%|██████████| 1919431/1919431 [00:02<00:00, 896636.54it/s]
  1%|          | 519/100000 [00:00<00:19, 5186.68it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.400bp.train.fa


100%|██████████| 100000/100000 [00:17<00:00, 5622.49it/s]


Saving 400 train to file

Converting 1000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec






1048724it [00:16, 62586.21it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.train.fa into dictionary  and removing N's


100%|██████████| 769121/769121 [00:01<00:00, 646786.51it/s]
  0%|          | 161/100000 [00:00<01:02, 1606.85it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1000bp.train.fa


  2%|▏         | 2495/100000 [00:01<00:50, 1924.02it/s]

ACGCGTCGAT not in dictionary. Creating vector from subwords.


 47%|████▋     | 46972/100000 [00:24<00:29, 1798.16it/s]

TTTCGCGACG not in dictionary. Creating vector from subwords.


100%|██████████| 100000/100000 [00:52<00:00, 1905.95it/s]


Saving 1000 train to file

Converting 1500 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec






1048724it [00:16, 62811.81it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1500bp.train.fa into dictionary  and removing N's


100%|██████████| 500622/500622 [00:00<00:00, 573310.10it/s]
  0%|          | 215/100000 [00:00<01:32, 1075.13it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.1500bp.train.fa


 91%|█████████ | 91081/100000 [01:21<00:07, 1131.08it/s]

ACGCGTCGAT not in dictionary. Creating vector from subwords.


100%|██████████| 100000/100000 [01:29<00:00, 1111.50it/s]


Saving 1500 train to file

Converting 2000 train to its vector representations
Loading model:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.bin

Loading vectors:	/Users/callummacphillamy/Software/fastText-0.9.2/result/mm9.skipgram.vec






1048724it [00:18, 55558.52it/s]


Reading ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.train.fa into dictionary  and removing N's


100%|██████████| 364425/364425 [00:00<00:00, 506396.84it/s]
  0%|          | 150/100000 [00:00<02:11, 759.06it/s]

Number of clean records taken randomly: 100000
Beginning word vector represenations of ../data/VISTA/mm9/neg_fa/mm9.VISTA.non_enhancers.noXY.GCbalanced.2000bp.train.fa


 39%|███▉      | 39203/100000 [00:50<01:24, 717.70it/s]

TTTCGCGACG not in dictionary. Creating vector from subwords.


100%|██████████| 100000/100000 [02:12<00:00, 754.56it/s]


Saving 2000 train to file

Converting 4000 train to its vector representations


FileNotFoundError: [Errno 2] No such file or directory: '../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers.noXY.4000bp.FTvec.train.npy'

In [5]:
### CREATE DATASET ###
import numpy as np
split = ['train', 'val', 'test']
windows = [200]
for s in split:
    for w in windows:
        print(f'Creating {s} dataset for size {w}bps')
        pos_X = np.load(f'../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers'
                        f'.noXY.{w}bp.FTvec.{s}.npy',
                        mmap_mode='r')
        neg_X = np.load(f'../data/VISTA/mm9/neg_npy/mm9.VISTA.non_enhancers'
                        f'.noXY.GCbalanced.{w}bp.FTvec.{s}.npy', mmap_mode='r')

        pos_y = np.ones(pos_X.shape[0])
        neg_y = np.zeros(neg_X.shape[0])

        print('Stacking positive and negative arrays together.')
        dat = np.vstack((pos_X, neg_X))
        lab = np.hstack((pos_y, neg_y))
        print(f'Saving {s} data set fpr size {w}bps.')
        np.save(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA.FTvec.{w}bp'
                f'.{s}_X'
                f'.npy', dat)
        np.save(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA.FTvec.{w}bp'
                f'.{s}_y'
                f'.npy', lab)

Creating train dataset for size 200bps
Stacking positive and negative arrays together.
Saving train data set fpr size 200bps.
Creating val dataset for size 200bps
Stacking positive and negative arrays together.
Saving val data set fpr size 200bps.
Creating test dataset for size 200bps
Stacking positive and negative arrays together.
Saving test data set fpr size 200bps.


In [10]:
split = ['train', 'val', 'test']
windows = [200, 400, 1000, 1500, 2000, 4000, 8000]
for s in split:
    for w in windows:
        print(f'Creating dataset for size {w}bps')
        pos_X = np.load(f'../data/VISTA/mm9/pos_npy/mm9.VISTA.enhancers'
                        f'.noXY.{w}bp.FTvec.{s}.npy',
                        mmap_mode='r')
        neg_X = np.load(f'../data/VISTA/mm9/neg_npy/mm9.VISTA.non_enhancers'
                        f'.noXY'
                 f'.GCbalanced.{w}bp.FTvec.{s}.npy', mmap_mode='r')

        pos_y = np.ones(pos_X.shape[0])
        neg_y = np.zeros(neg_X.shape[0])

        dat = np.vstack((pos_X, neg_X))
        lab = np.hstack((pos_y, neg_y))
        np.save(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA.FTvec.{w}bp'
                f'.{s}_X'
                f'.npy', dat)
        np.save(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA.FTvec.{w}bp'
                f'.{s}_y'
                f'.npy', lab)

Creating dataset for size 200bps
Creating dataset for size 400bps
Creating dataset for size 1000bps
Creating dataset for size 1500bps
Creating dataset for size 2000bps
Creating dataset for size 4000bps


FileNotFoundError: [Errno 2] No such file or directory: '../data/VISTA/mm9/neg_npy/mm9.VISTA.non_enhancers.noXY.GCbalanced.4000bp.FTvec.train.npy'

In [1]:
import numpy as np
from tqdm import tqdm
split = ['train', 'val', 'test']
window = [200]#, 1000]
for s in split:
    for w in tqdm(window):
        print(f'Combining human and mouse {s} datasets at {w}bp.')
        human_X = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA'
                          f'.FTvec.{w}bp.{s}_X.npy', mmap_mode='r')
        mouse_X = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA'
                          f'.FTvec.{w}bp.{s}_X.npy', mmap_mode='r')
        human_y = np.load(f'../data/VISTA/hg19/datasets/hg19.balanced.VISTA'
                          f'.FTvec.{w}bp.{s}_y.npy', mmap_mode='r')
        mouse_y = np.load(f'../data/VISTA/mm9/datasets/mm9.balanced.VISTA'
                          f'.FTvec.{w}bp.{s}_y.npy', mmap_mode='r')

        comb_X = np.vstack((human_X, mouse_X))
        comb_y = np.hstack((human_y, mouse_y))

        np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.FTvec.{w}bp.{s}_X'
                f'.npy', comb_X)
        np.save(f'../data/VISTA/hg19.mm9.balanced.VISTA.FTvec.{w}bp.{s}_y'
                f'.npy', comb_y)

  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse train datasets at 200bp.


100%|██████████| 1/1 [15:35<00:00, 935.02s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse val datasets at 200bp.


100%|██████████| 1/1 [03:26<00:00, 206.58s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Combining human and mouse test datasets at 200bp.


100%|██████████| 1/1 [01:19<00:00, 79.17s/it]
