# Part 3: EM implementation

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
def read_data(path='sequence.padded.txt'):
    """Read one sequence per line from a text file.

    Parameters
    ----------
    path : str
        File to read. Defaults to 'sequence.padded.txt', the file the
        notebook has always used.

    Returns
    -------
    list of str
        Each line with surrounding whitespace stripped. Blank lines are
        skipped so that a trailing newline does not produce an empty
        sequence that would break the fixed-length windowing downstream.
    """
    # TODO (original author's note): these may not be the final sequences for
    # the assignment; they are being used as a placeholder for now.
    with open(path) as handle:
        stripped_lines = (line.strip() for line in handle)
        return [sequence for sequence in stripped_lines if sequence]

data = read_data()
data

['ATACCCCTGGCTGGGTCATGGTGACCTGGAGGAAGCGT',
 'CATATATGGCCAGGGTCAGTGTGACCTCCATTTCCCAT',
 'AGCAGCTGGCCTGGGTCACAGTGACCTGACCTCAAACC',
 'AGGCTGTGTACAAGGTCAGAGTGACCTCTAGAAGCTCT',
 'TACTCTAGTTCCAGGTCATGGTGACCTGTGAAAAATCT',
 'AGGACTGTTTCAAGGTCACGGTGACCCTCGTGGGCTGT',
 'GCAGGAAGTTTTGGGTCACGGTGACCTCTAGTTGTTGA',
 'CAAGTGCTTCAAAGGTCATGGTGCCCTGGGGCCGAGAG',
 'ACCAACATGGCAGGGTCAAGTTGACCTCCCTGGCCACT',
 'TCTCTCTCTAGTAGGTCATGGTGACCTGTACACATTAT',
 'TCAGACCACAGAGGGTCAAGGTGACCTGAGAGATCAGT',
 'AGGCAATTCACTAGGTCAGGATGCCCTGGGGCAACAGT',
 'TAGTCCTGAAAAGGGTCATGTTGACCTGATTGTCATGT',
 'ATTAACTCTTCTAGGTCAGTGTGACCTAAACTCATCGG',
 'GGACAATTATTGGGGTCACGGTGACCTGCCTGTTTCAG',
 'GGTCCATAATATAGGTCATGTTGACCTGGGACAACTGG',
 'CTCCAGGAGCAGGGGTCAGGGTGACCTCCAGCTCCTCA',
 'GAGCCCATCTCTGGGTCATGTTGCCCTCTTACAGCACA',
 'TGGGTTAAACCTGGGTCATGTTGACCTAGATACATCTC',
 'GTGACATCCCCAGGGTCAAAGTGCCCTGAGTCTGGAGA',
 'GCCTTCTAGGTCAGCATGACCTGGTCCTCAGAGGGGGG',
 'GGCAATGAATCAAGGTCAGGCTAACCTGGCTTACTGCA',
 'CCTACTAGCCCTGGGTCAACGTGCCCTGTAAGAGCATG',
 'GGCGCAGCC

Create matrix $X_{i,j,p,k}$ using one-hot encoding scheme.

In [3]:
# Window geometry: each sequence is scanned with a sliding window of
# motif_length bases, giving number_motifs candidate start positions.
# seq_length is taken from the first sequence only; this assumes every
# sequence has that same length (the file name says "padded") — TODO confirm.
seq_length = len(data[0])
motif_length = 8
number_motifs = seq_length - motif_length + 1
# X[i, j, p, k]: sequence i, window start j, offset p inside the window,
# nucleotide channel k. Starts as all zeros and is filled in below.
X = np.zeros((len(data), number_motifs, motif_length, 4))

def nuc_to_one_hot(nuc):
    """Return the one-hot channel index for a nucleotide letter.

    The lookup is case-insensitive and uses the channel order
    A=0, T=1, G=2, C=3. Any other character raises KeyError.
    """
    channel_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    return channel_of[nuc.upper()]

# Mark the observed nucleotide for every (sequence, window start, offset).
# The outer loop must run over the sequences themselves: looping over
# range(seq_length) only filled the first seq_length rows of X and left
# the remaining sequences as all-zero windows.
for i, sequence in enumerate(data):
    for j in range(number_motifs):
        for p in range(motif_length):
            X[i, j, p, nuc_to_one_hot(sequence[j + p])] = 1.0

# Exactly one hot entry per (sequence, window, position).
assert X.sum() == len(data) * number_motifs * motif_length
X.shape

(357, 31, 8, 4)

Randomly initialize the model parameters.

In [4]:
def init_EM(seq_length, motif_length, seed=None):
    """Randomly initialise the EM parameters.

    Parameters
    ----------
    seq_length : int
        Length of each sequence.
    motif_length : int
        Length of the motif window.
    seed : int or None, optional
        When given, a private ``np.random.RandomState(seed)`` is used so the
        initialisation is reproducible. When None (the default) the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    lambda_j_norm : ndarray, shape (seq_length - motif_length + 1,)
        Prior over window start positions, normalised to sum to 1.
    psi_0, psi_1 : ndarray, shape (motif_length, 4)
        Per-position nucleotide distributions; each row sums to 1.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    number_motifs_per_sequence = seq_length - motif_length + 1

    lambda_j = rng.uniform(0, 1, size=(number_motifs_per_sequence,))
    lambda_j_norm = lambda_j / lambda_j.sum()

    # Drawn as (4, motif_length) so that the column-wise normalisation makes
    # each position a distribution over the 4 nucleotides, then transposed to
    # (motif_length, 4) to line up with a window X[i][j].
    psi_0 = rng.uniform(0, 1, size=(4, motif_length))
    psi_1 = rng.uniform(0, 1, size=(4, motif_length))
    psi_0 = (psi_0 / psi_0.sum(axis=0)).T
    psi_1 = (psi_1 / psi_1.sum(axis=0)).T

    return lambda_j_norm, psi_0, psi_1

In [5]:
lambda_j, psi_0, psi_1 = init_EM(seq_length, motif_length)

## E step

![](e.png)

### Log trick implementation questions

How does the log trick actually work and how do we implement that?

$\ln P(C_{i} = j \mid X_{i}, \theta) = \left[\ln(\lambda_{j}) + \sum_{p} \sum_{k} X_{i,j,p,k} \ln(\psi^{1}_{p,k})\right] + \left[\sum_{j' \neq j} \sum_{p} \sum_{k} X_{i,j',p,k} \ln(\psi^{0}_{p,k})\right] - \ln(\textrm{denominator})$

Where does the divide by smallest number come into play? What is the smallest number? The smallest number of what variable?

First calculate the numerator of the equation.

### ELBO calculation

Pretty lost on how to actually calculate ELBO as I am unsure what terms actually correspond to the ELBO term.

In [6]:
def e_numerator(i, j, X, lambda_j, psi_0, psi_1):
    """Log of the E-step numerator for sequence ``i`` with the motif at window ``j``.

    i        : index of the current sequence
    j        : index of the window assumed to be the binding site
    X        : one-hot data, indexed as X[i][j][p][k]
    lambda_j : prior over window positions
    psi_0    : background nucleotide probabilities, same shape as a window
    psi_1    : motif nucleotide probabilities, same shape as a window

    Returns log(lambda_j[j]) plus the summed log-probabilities of window j
    under psi_1 and of every other window under psi_0.
    """
    windows = X[i]

    # Multiplying by the one-hot window zeroes every probability except the
    # one for the observed base. Dropping the zeros stands in for raising to
    # the power X_{i,j,p,k}, where cold entries would contribute a factor of 1.
    motif_probs = (windows[j] * psi_1).ravel()
    motif_log_sum = np.log(motif_probs[motif_probs != 0]).sum()

    # All other windows (j' != j) are scored as if they are not the binding
    # site, i.e. under psi_0.
    is_other_window = np.arange(len(windows)) != j
    background_probs = (windows[is_other_window] * psi_0).flatten()
    background_log_sum = np.log(background_probs[background_probs != 0]).sum()

    # Log of a product is the sum of the logs.
    return np.log(lambda_j[j]) + background_log_sum + motif_log_sum

In [30]:
# Log-space numerators for sequence 0, one per candidate window start j.
one_i = np.array([e_numerator(0, j, X, lambda_j, psi_0, psi_1) for j in range(X.shape[1])])
# Log-sum-exp trick: shift by the LARGEST log value before exponentiating.
# Every exponent is then <= 0, so np.exp cannot overflow, and the shift
# cancels in the ratio below. (Shifting by the minimum makes every exponent
# >= 0 and can overflow when the values are far apart.)
one_i_shifted = one_i - one_i.max()
one_i_shifted_e = np.exp(one_i_shifted)
# Normalised posteriors over j; their sum should be 1 up to rounding.
(one_i_shifted_e / one_i_shifted_e.sum()).sum()

0.9999999999999998

In [8]:
# make vector of all numerators 
#numerators = [e_numerator(i, j X, lambda_j, psi_0, psi_1) for j in X.shape[1] for i in X.shape]

In [9]:
def e_denominator(i, j, X, lambda_j, psi_0, psi_1):
    """Log of the E-step normaliser for sequence ``i``.

    The normaliser is the sum, over EVERY window position j' (including j),
    of lambda_j[j'] * P(window j' | psi_1) * prod_{j'' != j'} P(window j'' | psi_0).
    It is evaluated with the log-sum-exp trick so the per-window products
    never have to be formed in probability space.

    Parameters
    ----------
    i : int
        Index of the sequence.
    j : int
        Unused; kept so existing callers keep working. The normaliser is the
        same for every j of a given sequence.
    X : ndarray, shape (n_sequences, n_windows, motif_length, 4)
        One-hot encoded windows.
    lambda_j : ndarray, shape (n_windows,)
        Prior over window positions.
    psi_0, psi_1 : ndarray, shape (motif_length, 4)
        Background and motif nucleotide probabilities. Assumed strictly
        positive wherever a base is observed.

    Returns
    -------
    float
        log of the normalising constant.
    """
    windows = np.asarray(X[i])
    observed = windows != 0

    # Per-window log-likelihood under each model: pick log(psi) at the
    # observed base of every position and sum over positions.
    log_background = np.where(observed, np.log(psi_0), 0.0).sum(axis=(1, 2))
    log_motif = np.where(observed, np.log(psi_1), 0.0).sum(axis=(1, 2))

    # For the motif at j': window j' under psi_1, all the others under psi_0.
    log_joint = np.log(lambda_j) + log_motif + (log_background.sum() - log_background)

    # Log-sum-exp: shift by the max so every exponent is <= 0.
    peak = log_joint.max()
    return peak + np.log(np.exp(log_joint - peak).sum())

In [None]:
# The function is named e_denominator; calling `denominator` raised NameError.
e_denominator(0, 0, X, lambda_j, psi_0, psi_1)

In [None]:
def e_step_ij(i, j, X, lambda_j, psi_0, psi_1):
    """Log-posterior for a single sequence ``i`` and window ``j``.

    e_numerator and e_denominator both return logs, so their difference is
    the LOG of P(C_i = j | X_i, theta). Exponentiate the result to get the
    probability itself.
    """
    log_numerator = e_numerator(i, j, X, lambda_j, psi_0, psi_1)
    log_denominator = e_denominator(i, j, X, lambda_j, psi_0, psi_1)
    return log_numerator - log_denominator

Function to do one iteration of the E step.

In [None]:
def e_step(X, lambda_j, psi_0, psi_1):
    """Run e_step_ij for every sequence and every window position.

    Returns an array with one row per sequence (X.shape[0]) and one column
    per window start (X.shape[1]); each entry is the value returned by
    e_step_ij for that pair.
    """
    n_sequences, n_windows = X.shape[0], X.shape[1]
    per_sequence = [
        [e_step_ij(seq, start, X, lambda_j, psi_0, psi_1) for start in range(n_windows)]
        for seq in range(n_sequences)
    ]
    return np.array(per_sequence)

e_step_1 = e_step(X, lambda_j, psi_0, psi_1)
e_step_1[1]

Calculate ELBO. What exactly is ELBO here?

#### ELBO lecture notes

- 

In [None]:
def loglikelihood(XXss, lambda_j, psi_0, psi_1, posteriors):
    """Marginal log-likelihood of the data under the current parameters.

    For each sequence i this is
        log sum_j lambda_j[j] * P(window j | psi_1) * prod_{j' != j} P(window j' | psi_0)
    and the return value is the sum over sequences. Each per-sequence sum is
    evaluated with the log-sum-exp trick.

    Parameters
    ----------
    XXss : ndarray, shape (n_sequences, n_windows, motif_length, 4)
        One-hot encoded windows.
    lambda_j : ndarray, shape (n_windows,)
        Prior over window positions.
    psi_0, psi_1 : ndarray, shape (motif_length, 4)
        Background and motif nucleotide probabilities. Assumed strictly
        positive wherever a base is observed.
    posteriors : any
        Accepted for interface compatibility but not used.
        NOTE(review): when the posteriors are the exact E-step posteriors the
        ELBO should coincide with this marginal log-likelihood, so this value
        can be tracked across iterations — confirm against the lecture notes.

    Returns
    -------
    float
        Sum over sequences of the log marginal likelihood.
    """
    windows = np.asarray(XXss)
    observed = windows != 0

    # (n_sequences, n_windows) log-likelihood of each window under each model.
    log_background = np.where(observed, np.log(psi_0), 0.0).sum(axis=(2, 3))
    log_motif = np.where(observed, np.log(psi_1), 0.0).sum(axis=(2, 3))

    # Motif at j: window j under psi_1, every other window under psi_0.
    log_other_windows = log_background.sum(axis=1, keepdims=True) - log_background
    log_joint = np.log(lambda_j) + log_motif + log_other_windows

    # Log-sum-exp over window positions, one value per sequence.
    peak = log_joint.max(axis=1, keepdims=True)
    per_sequence = peak[:, 0] + np.log(np.exp(log_joint - peak).sum(axis=1))
    return per_sequence.sum()

In [None]:
sum(np.e ** e_step_1[2])

## M step

Quon says $\boldsymbol{E}[C_{i,j}] = P(C_{i} = j | X_{i}, \theta)$

He also gives what $\boldsymbol{E}[C_{i,j}] = P(C_{i} = j | X_{i}, \theta)$ is equal to in the E step (shown in the image below).

### $\lambda_{j}$

Take sum of all values at each index $i$ over vector $C_{i, j}$ which would store prob each $j$ (each motif) being the transcription factor binding site and divide this value by the number of sequences $N$.
- Would this not always just sum to 1? And therefore $\lambda_{j}$ is basically fixed at 1 over the number of sequences?

In [None]:
# Make practice posterior to troubleshoot M-step
lambda_j, psi_0, psi_1 = init_EM(seq_length, motif_length)
# Stand-in posterior: every sequence gets the same row (the freshly drawn
# prior), which gives an array with one row per sequence to exercise the
# M-step functions.
practice_posterior = np.array([lambda_j for _ in range(X.shape[0])])

In [None]:
def lambda_j_m_step(E_Cij):
    """M-step update for lambda: column sums of E_Cij divided by its row count."""
    n_sequences = E_Cij.shape[0]
    column_totals = E_Cij.sum(axis=0)
    return column_totals / n_sequences

In [None]:
# Compare with a tolerance: the sum is accumulated in floating point, so an
# exact `== 1.0` can fail on rounding alone.
assert np.isclose(lambda_j_m_step(practice_posterior).sum(), 1.0)
assert lambda_j_m_step(practice_posterior).shape == lambda_j.shape

### $\psi^{1}_{p, k}$

- How does taking a sum over $i$ for $C_{i, j}$ look compared to taking a sum over $i$ and $j$?

Product of indicator variables for a given motif (for example the matrix at `X[0][0]`) and the expectation at that motif calculated during the E step. Then take a sum over all those values and divide by the number of sequences.

In [None]:
psi_1

In [None]:
def psi_1_m_step(E_Cij, psi_1, X=None):
    """M-step update for the motif distribution psi_1.

    new_psi_1[p, k] = sum_{i, j} X[i, j, p, k] * E_Cij[i, j] / n_sequences

    Parameters
    ----------
    E_Cij : ndarray, shape (n_sequences, n_windows)
        Posterior probability that window j is the binding site of sequence i.
    psi_1 : ndarray, shape (motif_length, 4)
        Current motif parameters; only the shape is used.
    X : ndarray, shape (n_sequences, n_windows, motif_length, 4), optional
        One-hot data. When omitted, the notebook-level ``X`` is used, which is
        what the function always read before.

    Returns
    -------
    ndarray, same shape as ``psi_1``
        Updated motif parameters. Rows sum to 1 when every row of ``E_Cij``
        sums to 1 and every window position is one-hot.
    """
    windows = globals()['X'] if X is None else X
    weights = np.asarray(E_Cij)
    n_positions, n_nucleotides = psi_1.shape

    # Posterior-weighted base counts, summed over sequences and windows in
    # one contraction instead of four nested Python loops.
    soft_counts = np.einsum(
        'ij,ijpk->pk', weights, windows[:, :, :n_positions, :n_nucleotides]
    )
    return soft_counts / weights.shape[0]

psi_1_pract = psi_1_m_step(practice_posterior, psi_1)

In [None]:
X[X != 0].shape

In [None]:
seq_length * X.shape[0]

In [None]:
psi_1_pract.sum(axis=1)

In [None]:
practice_posterior.shape

In [None]:
psi_1.sum(axis=1)

In [None]:
# Hand-check of the psi_1 update for a single entry (p = 0, k = 0):
# indicator times posterior for every (sequence, window) pair.
products = [
    X[i][j][0][0] * practice_posterior[i][j]
    for i in range(practice_posterior.shape[0])
    for j in range(practice_posterior.shape[1])
]
sum(products) / X.shape[0]

In [None]:
sum(products) / X.shape[0]

### $\psi^{0}_{p, k}$

Seems pretty much like other $\psi$ but add some subtractions and change the denominator to the number of sequences times the number of possible motifs.

In [None]:
# Use the real sequence length rather than a hard-coded 100.
seq_len = seq_length
def psi_0_m_step(E_Cij, psi_0, seq_len, num_seqs, X=None):
    """M-step update for the background distribution psi_0.

    new_psi_0[p, k] = sum_{i, j} X[i, j, p, k] * (1 - E_Cij[i, j])
                      / ((n_windows - 1) * num_seqs)

    where n_windows = seq_len - motif_length + 1. Each sequence has exactly
    one motif window, so n_windows - 1 of its windows are background.

    Parameters
    ----------
    E_Cij : ndarray, shape (n_sequences, n_windows)
        Posterior PROBABILITIES (not logs) that window j is the binding site.
    psi_0 : ndarray, shape (motif_length, 4)
        Current background parameters; only the shape is used.
    seq_len : int
        Length of each sequence.
    num_seqs : int
        Number of sequences.
    X : ndarray, shape (n_sequences, n_windows, motif_length, 4), optional
        One-hot data. When omitted, the notebook-level ``X`` is used.

    Returns
    -------
    ndarray, same shape as ``psi_0``
        Updated background parameters; the inputs are not modified.

    Raises
    ------
    ValueError
        If there are no background windows (seq_len <= motif_length) or no
        sequences.
    """
    windows = globals()['X'] if X is None else X
    n_positions, n_nucleotides = psi_0.shape

    # (seq_len - n_positions + 1) windows per sequence, minus the motif one.
    background_windows_per_seq = seq_len - n_positions
    if background_windows_per_seq <= 0 or num_seqs <= 0:
        raise ValueError('need at least one background window and one sequence')

    # Weight of each window being background is 1 - P(window is the motif).
    background_weight = 1.0 - np.asarray(E_Cij)
    soft_counts = np.einsum(
        'ij,ijpk->pk', background_weight, windows[:, :, :n_positions, :n_nucleotides]
    )
    return soft_counts / (background_windows_per_seq * num_seqs)

# e_step_1 holds log-posteriors, so exponentiate before the M step.
psi_0_m_step(np.exp(e_step_1), psi_0, seq_len, X.shape[0])

In [None]:
X[0][0]

In [None]:
# psi_1 is already (motif_length, 4), the same layout as a window X[i][j];
# transposing it to (4, motif_length) makes the shapes incompatible.
X[0][0] * psi_1