Read nucleotide sequences and convert them to a 3D one-hot encoded matrix. The first dimension gives the sequence index, the second gives the nucleotide position within that sequence, and the third gives the nucleotide identity.

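$X_{ijk} = 1$ if the nucleotide at position $j$ of sequence $i$ is nucleotide $k$ (with $k$ ordered A, T, G, C), and $X_{ijk} = 0$ otherwise.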

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np

In [2]:
def random_seqs(num_seqs, length=1000):
    """Build a list of `num_seqs` random nucleotide sequences for testing.

    Each sequence is a `Seq` of `length` characters drawn with replacement
    from A, T, G and C via `np.random.choice`.
    """
    alphabet = ['A', 'T', 'G', 'C']
    sequences = []
    for _ in range(num_seqs):
        letters = np.random.choice(alphabet, length, replace=True)
        sequences.append(Seq(''.join(letters)))
    return sequences

From Quon

"Assume the text file input is going to be a set of N sequences (one per line, all of the same length, with the characters A,C,G, and T)."

In [3]:
def read_seq_file(filepath):
    """Read nucleotide sequences from a text file, one sequence per line.

    Each line is stripped of surrounding whitespace and upper-cased. Lines
    that are empty after stripping (e.g. a trailing blank line) are skipped so
    they do not turn into zero-length sequences.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.

    Returns
    -------
    list of str
        The sequences, in file order.
    """
    with open(filepath) as handle:
        stripped_lines = (line.strip() for line in handle)
        return [seq.upper() for seq in stripped_lines if seq]

In [4]:
def nuc_to_one_hot(nuc):
    """Return the one-hot channel index (the slot set to 1) for a nucleotide.

    The lookup is case-insensitive and uses the channel order
    A=0, T=1, G=2, C=3. Any other symbol raises KeyError.
    """
    channel_of = {base: index for index, base in enumerate('ATGC')}
    return channel_of[nuc.upper()]

In [5]:
def make_matrix(seqs):
    """One-hot encode equal-length nucleotide sequences into a 3D array.

    Parameters
    ----------
    seqs : iterable of sequences
        Each item must support len() and iteration over single nucleotides
        (e.g. str). Any iterable is accepted, including generators.

    Returns
    -------
    numpy.ndarray
        Float array of shape (num_seqs, length, 4) in which
        matrix[i, j, k] == 1 when nucleotide j of sequence i maps to channel k
        according to nuc_to_one_hot, and 0 elsewhere. Empty input gives an
        array of shape (0, 0, 4).

    Raises
    ------
    ValueError
        If the sequences do not all have the same length.
    """
    seqs = list(seqs)  # materialize so generators can be measured and re-read
    num_seqs = len(seqs)
    length = len(seqs[0]) if seqs else 0
    # A shorter sequence would silently leave all-zero rows and a longer one
    # would index past the array, so reject ragged input before filling.
    for i, each_seq in enumerate(seqs):
        if len(each_seq) != length:
            raise ValueError(
                'Sequence {} has length {}, expected {}'.format(
                    i, len(each_seq), length
                )
            )
    matrix = np.zeros((num_seqs, length, 4))
    for i, each_seq in enumerate(seqs):
        for j, each_nuc in enumerate(each_seq):
            matrix[i, j, nuc_to_one_hot(each_nuc)] = 1
    return matrix

In [6]:
# Small demo set: 10 random sequences, each 10 nucleotides long
rand_seqs = random_seqs(num_seqs=10, length=10)
rand_seqs

[Seq('TCTGGGCCAG'),
 Seq('CACTTAAATG'),
 Seq('CGACTACGAT'),
 Seq('AACAGGGAAG'),
 Seq('CTGTCTGTAT'),
 Seq('AACCAGCATG'),
 Seq('ATGACACGTG'),
 Seq('CATGTTGGGC'),
 Seq('ACTCATTGAC'),
 Seq('CTCGGACGAA')]

In [7]:
# One-hot encode the in-memory sequences; the result is indexed as
# (sequence, position, nucleotide)
matrix = make_matrix(rand_seqs)
matrix

array([[[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]],

       [[0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]],

       [[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
      

Test when reading sequences from a text file.

In [8]:
# Persist the random sequences to a plain-text file, one sequence per line
random_seqs_file = 'random.txt'
with open(random_seqs_file, 'w') as handle:
    for each_seq in rand_seqs:
        # print() applies str() and appends the newline itself
        print(each_seq, file=handle)

Read sequences.

In [9]:
# Load the sequences back from disk; read_seq_file returns plain strings
rand_seqs_from_file = read_seq_file(random_seqs_file)
rand_seqs_from_file

['TCTGGGCCAG',
 'CACTTAAATG',
 'CGACTACGAT',
 'AACAGGGAAG',
 'CTGTCTGTAT',
 'AACCAGCATG',
 'ATGACACGTG',
 'CATGTTGGGC',
 'ACTCATTGAC',
 'CTCGGACGAA']

Compute matrix.

In [10]:
# Encode the file-derived strings with the same routine used for rand_seqs
matrix_from_file = make_matrix(rand_seqs_from_file)

Check to make sure it is the same as the matrix computed before reading from file.

In [11]:
# Compare element-wise. `a.all() == b.all()` collapses each array to a single
# boolean first, so it only checks that those two booleans agree and would
# pass even for matrices that differ.
np.testing.assert_array_equal(matrix_from_file, matrix)
print('Matrices match')

Matrices match
