In [1]:
import os
import glob
import ast
import pickle
import torch
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord

In [2]:
# Folder holding the input FASTA file; the encoded arrays are written here too.
output_folder_path = "/home/ubuntu/data/dmel_seq"
file = os.path.join(output_folder_path, "dmel_only_with_length.fa")
# NOTE: the buffer file need not be created beforehand. Just give the path
#       and the file name here; the file will be created by the system.

In [3]:
# One-hot code for each nucleotide symbol: A, C, G, T take slots 0..3 (upper
# and lower case alike), while the unknown base n/N maps to the all-zero vector.
# Every key gets its own list object, so no two entries alias each other.
base_pairs = {
    **{
        symbol: [int(slot == position) for slot in range(4)]
        for alphabet in ('ACGT', 'acgt')
        for position, symbol in enumerate(alphabet)
    },
    **{unknown: [0, 0, 0, 0] for unknown in 'nN'},
}

In [4]:
# Parse the whole FASTA file into an in-memory list of records.
data = list(SeqIO.parse(file,"fasta"))

In [5]:
# Inspect the first parsed record.
data[0]

SeqRecord(seq=Seq('GTGCTGGTGCAATAACTTGTTCTCATATCTGATTGTAACAGAGAATCTAGTTTT...GGA', SingleLetterAlphabet()), id='VT0002|0|dmel|-|642', name='VT0002|0|dmel|-|642', description='VT0002|0|dmel|-|642', dbxrefs=[])

In [6]:
# Show only the sequence of the first record.
data[0].seq

Seq('GTGCTGGTGCAATAACTTGTTCTCATATCTGATTGTAACAGAGAATCTAGTTTT...GGA', SingleLetterAlphabet())

In [7]:
# Number of sequence records parsed from the FASTA file.
len(data)

7890

In [8]:
# Split the first record's header on '|' to show the fields it carries.
seq = data[0]
seq.description.split('|') # fields: regionid, expression, species id, strand, length

['VT0002', '0', 'dmel', '-', '642']

### Find duplicates

In [9]:
# The region id is the text before the first '|' in each record header.
regionids = np.array([record.description.partition('|')[0] for record in data])
# Distinct region ids together with the number of records carrying each one.
unique_regions, counts = np.unique(regionids, return_counts=True)

len(unique_regions)

7131

In [10]:
# Positions (into unique_regions / counts) of region ids that occur more than once.
rep_regions_idx = [position for position, n_records in enumerate(counts) if n_records > 1]

In [11]:
# How many region ids are duplicated.
len(rep_regions_idx)

759

In [12]:
# List the region ids that occur more than once.
[unique_regions[position] for position in rep_regions_idx]

['VT10855',
 'VT10857',
 'VT10858',
 'VT10859',
 'VT10860',
 'VT10861',
 'VT10862',
 'VT10865',
 'VT10908',
 'VT10909',
 'VT10915',
 'VT10916',
 'VT14200',
 'VT14204',
 'VT14246',
 'VT14250',
 'VT14255',
 'VT14256',
 'VT14257',
 'VT14260',
 'VT14264',
 'VT14266',
 'VT14267',
 'VT14273',
 'VT14328',
 'VT14329',
 'VT14340',
 'VT14341',
 'VT14345',
 'VT14347',
 'VT14348',
 'VT14349',
 'VT14844',
 'VT14971',
 'VT14973',
 'VT14974',
 'VT14975',
 'VT14976',
 'VT14977',
 'VT14978',
 'VT14979',
 'VT15970',
 'VT15971',
 'VT15972',
 'VT15982',
 'VT15991',
 'VT16054',
 'VT16055',
 'VT16114',
 'VT16116',
 'VT16117',
 'VT16119',
 'VT16122',
 'VT16124',
 'VT16127',
 'VT17872',
 'VT17873',
 'VT17874',
 'VT17876',
 'VT17877',
 'VT18480',
 'VT18481',
 'VT18873',
 'VT18874',
 'VT18875',
 'VT18876',
 'VT18877',
 'VT18883',
 'VT19643',
 'VT19647',
 'VT19648',
 'VT19651',
 'VT19652',
 'VT19653',
 'VT19654',
 'VT19655',
 'VT19656',
 'VT19657',
 'VT19658',
 'VT19708',
 'VT19711',
 'VT19713',
 'VT19715',
 'VT

In [13]:
# Occurrence count of every duplicated region id.
[n_records for n_records in counts if n_records > 1]

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [14]:
# Gather every record whose region id is VT15991 to inspect a duplicated entry.
# A comprehension leaves the notebook-level `seq` and `_` names untouched, and
# reading only the first header field avoids depending on the field count.
VT15991 = [record for record in data if record.description.split('|')[0] == 'VT15991']

VT15991

[SeqRecord(seq=Seq('ACTATATATTTTCCACATTTTTTCCAAACTTTTTTCTTATATTTTTTTTTGCGA...GCC', SingleLetterAlphabet()), id='VT15991|0|dmel|-|2113', name='VT15991|0|dmel|-|2113', description='VT15991|0|dmel|-|2113', dbxrefs=[]),
 SeqRecord(seq=Seq('ACTATATATTTTCCACATTTTTTCCAAACTTTTTTCTTATATTTTTTTTTGCGA...GCC', SingleLetterAlphabet()), id='VT15991|0|dmel|-|2113', name='VT15991|0|dmel|-|2113', description='VT15991|0|dmel|-|2113', dbxrefs=[])]

In [15]:
# Gather every record whose region id is VT15971 to inspect a duplicated entry.
# A comprehension leaves the notebook-level `seq` and `_` names untouched, and
# reading only the first header field avoids depending on the field count.
VT15971 = [record for record in data if record.description.split('|')[0] == 'VT15971']

VT15971

[SeqRecord(seq=Seq('CATTGCTCAGCCTAGCGCGAAAAAGCCAGAAAGTAAGTTCCGATTGCCTAACGA...CGC', SingleLetterAlphabet()), id='VT15971|0|dmel|-|2232', name='VT15971|0|dmel|-|2232', description='VT15971|0|dmel|-|2232', dbxrefs=[]),
 SeqRecord(seq=Seq('CATTGCTCAGCCTAGCGCGAAAAAGCCAGAAAGTAAGTTCCGATTGCCTAACGA...CGC', SingleLetterAlphabet()), id='VT15971|0|dmel|-|2232', name='VT15971|0|dmel|-|2232', description='VT15971|0|dmel|-|2232', dbxrefs=[])]

### Remove duplicates

In [16]:
# Keep only the first record seen for each region id, preserving file order.
# Seen ids are tracked in a dedicated set so that the `counts` array produced
# by np.unique in the duplicate search is not replaced by a dict, which would
# break those cells if they were re-run.
seen_regions = set()
unique_data = []
for record in data:
    regionid = record.description.split('|')[0]
    if regionid not in seen_regions:
        seen_regions.add(regionid)
        unique_data.append(record)

In [17]:
# Number of records left after removing duplicates.
len(unique_data)

7131

In [18]:
# Sanity check on the de-duplicated records: recompute the region ids from
# unique_data (not from the raw `regionids`) and confirm each occurs once.
unique_regionids = np.array([seq.description.split('|')[0] for seq in unique_data])
unique_regions, counts = np.unique(unique_regionids, return_counts=True)
assert (counts == 1).all(), "duplicate region ids remain in unique_data"

len(unique_regions)

7131

### Split data and make sure that validation and training sets have no overlapping regions

In [19]:
# min and max len
min_len = min([int(seq.description.split('|')[-1]) for seq in unique_data])
max_len = max([int(seq.description.split('|')[-1]) for seq in unique_data])
min_len, max_len

(116, 3388)

In [20]:
# Random 80/20 split over record indices. unique_data holds one record per
# region id, so disjoint index sets mean the two splits share no region.
np.random.seed(42)  # fixed seed so the split is reproducible
all_idx = np.arange(len(unique_data))
train_idx = np.random.choice(all_idx, int(0.8*len(unique_data)), replace=False)
# Validation indices are the sorted indices not drawn for training. setdiff1d
# replaces a per-index membership scan of train_idx (quadratic overall).
val_idx = np.setdiff1d(all_idx, train_idx)
assert len(train_idx) + len(val_idx) == len(unique_data)
train_idx.shape, val_idx.shape

((5704,), (1427,))

In [21]:
# Materialise both splits as lists of records, following the index order.
train_data = [unique_data[position] for position in train_idx]
val_data = [unique_data[position] for position in val_idx]
len(train_data), len(val_data)

(5704, 1427)

In [22]:
# One-hot encode the training records into fixed-size arrays. Each record
# yields two consecutive rows: the stored sequence and its base-wise
# complement. Both are zero-padded on the right up to max_len and carry the
# record's expression label (second '|'-separated header field).
# The arrays are pre-allocated and filled in place instead of growing nested
# Python lists one padding entry at a time; for non-empty input the resulting
# values, row order, shapes and integer dtype are the same as before.
# NOTE(review): complement() does not reverse the sequence; if the second row
# is meant to be the opposite strand read 5'->3', reverse_complement() may be
# intended -- confirm before changing.
train_x_ex = np.zeros((2 * len(train_data), max_len, 4), dtype=int)
train_y_ex = np.zeros(2 * len(train_data), dtype=int)
for row, record in enumerate(train_data):
    expressed = record.description.split('|')[1]
    for offset, bases in enumerate((record.seq, record.seq.complement())):
        if len(bases) > 0:  # an empty right-hand side would not broadcast
            # base_pairs raises KeyError for symbols other than A/C/G/T/N;
            # a sequence longer than max_len raises ValueError here.
            train_x_ex[2 * row + offset, :len(bases)] = [base_pairs[char] for char in bases]
    train_y_ex[2 * row:2 * row + 2] = int(expressed)

In [23]:
# One-hot encode the validation records into fixed-size arrays. Each record
# yields two consecutive rows: the stored sequence and its base-wise
# complement. Both are zero-padded on the right up to max_len and carry the
# record's expression label (second '|'-separated header field).
# The arrays are pre-allocated and filled in place instead of growing nested
# Python lists one padding entry at a time; for non-empty input the resulting
# values, row order, shapes and integer dtype are the same as before.
# NOTE(review): complement() does not reverse the sequence; if the second row
# is meant to be the opposite strand read 5'->3', reverse_complement() may be
# intended -- confirm before changing.
val_x_ex = np.zeros((2 * len(val_data), max_len, 4), dtype=int)
val_y_ex = np.zeros(2 * len(val_data), dtype=int)
for row, record in enumerate(val_data):
    expressed = record.description.split('|')[1]
    for offset, bases in enumerate((record.seq, record.seq.complement())):
        if len(bases) > 0:  # an empty right-hand side would not broadcast
            # base_pairs raises KeyError for symbols other than A/C/G/T/N;
            # a sequence longer than max_len raises ValueError here.
            val_x_ex[2 * row + offset, :len(bases)] = [base_pairs[char] for char in bases]
    val_y_ex[2 * row:2 * row + 2] = int(expressed)

In [24]:
# Shapes of the encoded inputs and labels for both splits.
train_x_ex.shape, train_y_ex.shape, val_x_ex.shape, val_y_ex.shape

((11408, 3388, 4), (11408,), (2854, 3388, 4), (2854,))

In [25]:
# Pickle the training inputs and labels into the output folder.
# The files are pickles despite the '.np' suffix; read them back with pickle.load.
for filename, array in (('train_x_ex.np', train_x_ex), ('train_y_ex.np', train_y_ex)):
    with open(os.path.join(output_folder_path, filename), mode='wb') as output:
        pickle.dump(array, output)

In [26]:
# Pickle the validation inputs and labels into the output folder.
# The files are pickles despite the '.np' suffix; read them back with pickle.load.
for filename, array in (('val_x_ex.np', val_x_ex), ('val_y_ex.np', val_y_ex)):
    with open(os.path.join(output_folder_path, filename), mode='wb') as output:
        pickle.dump(array, output)