In [1]:
import os
import glob
import ast
import pickle
import torch
import pandas as pd
import numpy as np
from Bio import SeqIO

In [2]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
# The GPU name is printed only in the CUDA case.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

Tesla K80


In [3]:
# The input FASTA file lives in the same folder that receives the generated arrays.
data_dir = "/home/ubuntu/data/dmel_seq"
file = os.path.join(data_dir, "dmel_only_with_length.fa")
output_folder_path = data_dir
# NOTE: the output files need not be created beforehand. Only the folder path
#       is given here; the files themselves are created when they are written.

In [4]:
# One-hot rows in A, C, G, T channel order. Upper-case keys come first, then
# lower-case, and finally 'n' / 'N', which map to the all-zero row.
# Every key owns its own list object.
base_pairs = {
    base: [int(channel == position) for channel in range(4)]
    for base, position in zip('ACGTacgt', (0, 1, 2, 3, 0, 1, 2, 3))
}
base_pairs['n'] = [0, 0, 0, 0]
base_pairs['N'] = [0, 0, 0, 0]

In [8]:
# Show the FASTA path configured above. The name `input_folder_path` is never
# assigned in this notebook, so referencing it fails on a fresh kernel.
file

'/home/ubuntu/data/dmel_seq/dmel_only_with_length.fa'

In [6]:
# Read the FASTA file and keep the parsed records in a list so they can be
# indexed and counted in the cells below.
data = list(SeqIO.parse(file,"fasta"))

In [20]:
# Inspect the first parsed record.
data[0]

SeqRecord(seq=Seq('GTGCTGGTGCAATAACTTGTTCTCATATCTGATTGTAACAGAGAATCTAGTTTT...GGA', SingleLetterAlphabet()), id='VT0002|0|dmel|-|642', name='VT0002|0|dmel|-|642', description='VT0002|0|dmel|-|642', dbxrefs=[])

In [21]:
# Inspect only the sequence of the first record.
data[0].seq

Seq('GTGCTGGTGCAATAACTTGTTCTCATATCTGATTGTAACAGAGAATCTAGTTTT...GGA', SingleLetterAlphabet())

In [13]:
# Number of sequences (one entry in `data` per parsed record)
len(data)

7890

In [24]:
# Keep the first record as a working example and split its description on '|'.
# Later cells read the second field as the integer target `y` and the last
# field as the sequence length.
seq = data[0]
seq.description.split('|')

['VT0002', '0', 'dmel', '-', '642']

In [7]:
# Shortest and longest sequence length, read from the last '|'-separated field
# of each record description (parsed once and reused for both extremes).
# NOTE(review): this is the length declared in the header, not len(seq.seq);
#               presumably the two agree — confirm.
declared_lengths = [int(record.description.split('|')[-1]) for record in data]
min_len = min(declared_lengths)
max_len = max(declared_lengths)
min_len, max_len

(116, 3388)

In [37]:
# Print every base of the example record next to its one-hot row.
for base in seq.seq:
    print(base, base_pairs[base], sep=': ')

G: [0, 0, 1, 0]
T: [0, 0, 0, 1]
G: [0, 0, 1, 0]
C: [0, 1, 0, 0]
T: [0, 0, 0, 1]
G: [0, 0, 1, 0]
G: [0, 0, 1, 0]
T: [0, 0, 0, 1]
G: [0, 0, 1, 0]
C: [0, 1, 0, 0]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
T: [0, 0, 0, 1]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
C: [0, 1, 0, 0]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
G: [0, 0, 1, 0]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
C: [0, 1, 0, 0]
T: [0, 0, 0, 1]
C: [0, 1, 0, 0]
A: [1, 0, 0, 0]
T: [0, 0, 0, 1]
A: [1, 0, 0, 0]
T: [0, 0, 0, 1]
C: [0, 1, 0, 0]
T: [0, 0, 0, 1]
G: [0, 0, 1, 0]
A: [1, 0, 0, 0]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
G: [0, 0, 1, 0]
T: [0, 0, 0, 1]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
C: [0, 1, 0, 0]
A: [1, 0, 0, 0]
G: [0, 0, 1, 0]
A: [1, 0, 0, 0]
G: [0, 0, 1, 0]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
T: [0, 0, 0, 1]
C: [0, 1, 0, 0]
T: [0, 0, 0, 1]
A: [1, 0, 0, 0]
G: [0, 0, 1, 0]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
T: [0, 0, 0, 1]
C: [0, 1, 0, 0]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
T: [0, 0, 0, 1]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
A: [1, 0, 0, 0]
A: [1, 0

In [36]:
# One-hot rows for the example record, in sequence order.
list(map(base_pairs.__getitem__, seq.seq))

[[0, 0, 1, 0],
 [0, 0, 0, 1],
 [0, 0, 1, 0],
 [0, 1, 0, 0],
 [0, 0, 0, 1],
 [0, 0, 1, 0],
 [0, 0, 1, 0],
 [0, 0, 0, 1],
 [0, 0, 1, 0],
 [0, 1, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 1, 0, 0],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 1, 0],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 1, 0, 0],
 [0, 0, 0, 1],
 [0, 1, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [0, 1, 0, 0],
 [0, 0, 0, 1],
 [0, 0, 1, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 1, 0],
 [0, 0, 0, 1],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 1, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 1, 0],
 [1, 0, 0, 0],
 [0, 0, 1, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [0, 1, 0, 0],
 [0, 0, 0, 1],
 [1, 0, 0, 0],
 [0, 0, 1, 0],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 1, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 1, 0,

In [199]:
# Build the model inputs. Every record contributes two samples: its sequence
# and the complement of that sequence, both one-hot encoded and zero-padded on
# the right up to `max_len`. The integer parsed from the second header field is
# stored as the target for both samples.
# NOTE(review): complement() does not reverse the sequence. If the opposite
#               strand read 5'->3' was intended, this should be
#               reverse_complement() — confirm before changing the data.
n_records = len(data)

# Allocate the padded arrays once. All-zero rows double as padding, so no
# per-position padding loop (or huge nested Python list) is needed.
data_x_nlc = np.zeros((2 * n_records, max_len, 4), dtype=int)
data_y = np.zeros(2 * n_records, dtype=int)

# The loop variable is deliberately not called `seq`, so the example record
# kept in `seq` by an earlier cell is not overwritten.
for i, record in enumerate(data):
    # Unpacking all five '|'-separated fields keeps the check that the header
    # is well formed; only the second field is used here.
    _region_id, expressed, _species_id, _strand, _length = record.description.split('|')

    # reshape(-1, 4) keeps an empty sequence as a (0, 4) array.
    forward = np.array([base_pairs[char] for char in record.seq],
                       dtype=int).reshape(-1, 4)
    complement = np.array([base_pairs[char] for char in record.seq.complement()],
                          dtype=int).reshape(-1, 4)
    if len(forward) > max_len:
        raise ValueError(
            'sequence {!r} has {} bases, more than max_len={}'.format(
                record.id, len(forward), max_len))

    # Sequence at the even row, its complement at the following odd row.
    data_x_nlc[2 * i, :len(forward)] = forward
    data_x_nlc[2 * i + 1, :len(complement)] = complement
    data_y[2 * i:2 * i + 2] = int(expressed)

# Same data with the last two axes swapped: (samples, 4, max_len).
data_x_ncl = data_x_nlc.transpose([0, 2, 1])

In [200]:
# Check the array shapes: the padded one-hot array, the same array with its
# last two axes swapped, and the targets.
data_x_nlc.shape, data_x_ncl.shape, data_y.shape

((15780, 3388, 4), (15780, 4, 3388), (15780,))

In [204]:
# Pickle each array into its own file inside the output folder.
for file_name, array in (('data_x_nlc.np', data_x_nlc),
                         ('data_x_ncl.np', data_x_ncl),
                         ('data_y.np', data_y)):
    with open(os.path.join(output_folder_path, file_name), mode='wb') as output:
        pickle.dump(array, output)