Read nucleotide sequences and convert them to a 3D one-hot encoded matrix. The first dimension gives the sequence index, the second gives the nucleotide position within that sequence, and the third gives the nucleotide identity.

$X_{ijk}$

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np

In [2]:
def random_seqs(num_seqs, length=1000):
    """Build a list of random nucleotide sequences to use as test data.

    Each of the ``num_seqs`` entries is a ``Seq`` made of ``length`` letters
    drawn with replacement from A, T, G and C via ``np.random.choice``.
    """
    alphabet = ['A', 'T', 'G', 'C']
    sequences = []
    for _ in range(num_seqs):
        letters = np.random.choice(alphabet, length, replace=True)
        sequences.append(Seq(''.join(letters)))
    return sequences

In [3]:
def nuc_to_one_hot(nuc):
    """Return the position that should be hot (== 1) for a nucleotide.

    The lookup is case-insensitive: A -> 0, T -> 1, G -> 2, C -> 3.
    Any other value raises ``KeyError``.
    """
    index_by_base = {base: position for position, base in enumerate('ATGC')}
    return index_by_base[nuc.upper()]

In [4]:
def make_matrix(seqs):
    """One-hot encode a collection of nucleotide sequences into a 3D array.

    Parameters
    ----------
    seqs
        A sized, indexable collection of sequences; iterating a sequence
        must yield single nucleotide characters. All sequences are assumed
        to have the same length as the first one.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seqs), len(seqs[0]), 4)`` filled with 0 and 1.
        Axis 0 is the sequence index, axis 1 the position within the
        sequence, and axis 2 the nucleotide slot chosen by
        ``nuc_to_one_hot``. An empty ``seqs`` gives shape ``(0, 0, 4)``.

    Notes
    -----
    The equal-length assumption is not checked: a sequence shorter than the
    first leaves its trailing positions all zero, and a longer one raises
    ``IndexError`` when it runs past the second axis.
    """
    num_seqs = len(seqs)
    # An empty collection has no first sequence to take the length from.
    length = len(seqs[0]) if num_seqs else 0
    matrix = np.zeros((num_seqs, length, 4))
    for i, each_seq in enumerate(seqs):
        for j, each_nuc in enumerate(each_seq):
            # One tuple index instead of three chained lookups per nucleotide.
            matrix[i, j, nuc_to_one_hot(each_nuc)] = 1
    return matrix

In [5]:
# Small smoke-test input: 10 random sequences, each 10 nucleotides long.
rand_seqs = random_seqs(10, 10)
rand_seqs

[Seq('TCTGCGTATT'),
 Seq('TCACCGTAGT'),
 Seq('TATAATTAAG'),
 Seq('GGGACAGCTA'),
 Seq('GGAGCTTCAT'),
 Seq('TCTACATACG'),
 Seq('TCCCACTTAA'),
 Seq('CAAAAGGTGT'),
 Seq('GTCGGAGCAC'),
 Seq('GTACGTTTAT')]

In [6]:
# One-hot encode the random sequences; with the input above the result
# has shape (10, 10, 4).
matrix = make_matrix(rand_seqs)
matrix

array([[[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       [[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]],

       [[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]],

       [[0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
      