In [1]:
import os
import glob
import ast
import pickle
import pandas as pd
import numpy as np
from Bio import SeqIO

In [2]:
# Data folder: the input FASTA file is read from it, and the encoded arrays
# are pickled back into it at the end of the notebook.
output_folder_path = "/home/ubuntu/data/dmel_seq"
file = os.path.join(output_folder_path, "dmel_only_with_length.fa")
# NOTE: the buffer file need not be created beforehand. Just write the path
#       and the file name here. The file will be created by the system.

In [3]:
# Load every FASTA record. Each description is unpacked below into five
# '|'-separated fields, the first of which is the region id.
data = list(SeqIO.parse(file, "fasta"))
regionids = np.array([seq.description.split('|')[0] for seq in data])
unique_regions = np.unique(regionids)

print(len(unique_regions))

# Keep only the first record encountered for each region id; later records
# that repeat an already-seen region id are dropped.
seen_regions = set()
unique_data = []
for seq in data:
    # Unpacking into exactly five names doubles as a format check: a
    # description with any other number of fields raises ValueError here.
    regionid, _, _, _, _ = seq.description.split('|')
    if regionid not in seen_regions:
        seen_regions.add(regionid)
        unique_data.append(seq)

7131


In [4]:
# Integer token for each nucleotide symbol. Upper- and lower-case letters of
# the same base share a token, and 'n'/'N' map to 0 -- the same value that the
# encoding cells further down use to pad sequences.
base_pairs = dict(zip("ACGTacgtnN", (1, 2, 3, 4, 1, 2, 3, 4, 0, 0)))

In [5]:
# Shortest and longest sequence length, taken from the last '|'-separated
# field of each record description (parsed as an integer).
seq_lengths = [int(record.description.split('|')[-1]) for record in unique_data]
min_len, max_len = min(seq_lengths), max(seq_lengths)
min_len, max_len

(116, 3388)

In [6]:
# Reproducible 80/20 split over the positions of `unique_data`.
np.random.seed(42)
n_records = len(unique_data)
# Training positions: 80% of the records, drawn without replacement.
train_idx = np.random.choice(np.arange(n_records), int(0.8 * n_records), replace=False)
# Validation positions: everything not drawn for training. setdiff1d returns
# them in ascending order, matching the previous list comprehension, but
# avoids rescanning `train_idx` once per candidate index (quadratic work).
val_idx = np.setdiff1d(np.arange(n_records), train_idx)
train_idx.shape, val_idx.shape

((5704,), (1427,))

In [7]:
# Materialise the two splits by looking up the selected positions in
# `unique_data`, preserving the order of each index array.
train_data = [unique_data[position] for position in train_idx]
val_data = [unique_data[position] for position in val_idx]
len(train_data), len(val_data)

(5704, 1427)

In [8]:
# Encode the training records. Every record contributes two rows -- the
# sequence itself and its complement -- each mapped through `base_pairs` and
# right-padded with 0 up to `max_len`. Both rows carry the same label.
train_x_ex, train_y_ex = [], []
for record in train_data:
    # Five '|'-separated fields are expected; only `expressed` is used here.
    _regionid, expressed, _speciesid, _strand, _ = record.description.split('|')
    label = int(expressed)
    # A non-positive difference yields an empty list, i.e. no padding.
    padding = [0] * (max_len - len(record.seq))
    forward = [base_pairs[base] for base in record.seq] + padding
    complement = [base_pairs[base] for base in record.seq.complement()] + padding
    train_x_ex.extend((forward, complement))
    train_y_ex.extend((label, label))

train_x_ex, train_y_ex = np.array(train_x_ex), np.array(train_y_ex)

In [9]:
# Encode the validation records. Every record contributes two rows -- the
# sequence itself and its complement -- each mapped through `base_pairs` and
# right-padded with 0 up to `max_len`. Both rows carry the same label.
val_x_ex, val_y_ex = [], []
for record in val_data:
    # Five '|'-separated fields are expected; only `expressed` is used here.
    _regionid, expressed, _speciesid, _strand, _ = record.description.split('|')
    label = int(expressed)
    # A non-positive difference yields an empty list, i.e. no padding.
    padding = [0] * (max_len - len(record.seq))
    forward = [base_pairs[base] for base in record.seq] + padding
    complement = [base_pairs[base] for base in record.seq.complement()] + padding
    val_x_ex.extend((forward, complement))
    val_y_ex.extend((label, label))

val_x_ex, val_y_ex = np.array(val_x_ex), np.array(val_y_ex)

In [10]:
# Sanity check on the encoded arrays: show the shape of each one, in the
# order train inputs, train labels, validation inputs, validation labels.
tuple(encoded.shape for encoded in (train_x_ex, train_y_ex, val_x_ex, val_y_ex))

((11408, 3388), (11408,), (2854, 3388), (2854,))

In [11]:
# Persist the training inputs and labels into the output folder. Despite the
# '.np' extension these files are written with pickle, not with np.save.
for filename, array in (('train_tk_x_ex.np', train_x_ex),
                        ('train_tk_y_ex.np', train_y_ex)):
    with open(os.path.join(output_folder_path, filename), mode='wb') as output:
        pickle.dump(array, output)

In [12]:
# Persist the validation inputs and labels into the output folder. Despite the
# '.np' extension these files are written with pickle, not with np.save.
for filename, array in (('val_tk_x_ex.np', val_x_ex),
                        ('val_tk_y_ex.np', val_y_ex)):
    with open(os.path.join(output_folder_path, filename), mode='wb') as output:
        pickle.dump(array, output)