# 11-mer
The goal of this notebook is to reproduce the 11-mer model.
While investigating the plotting notebook in the original repository, it was found that the 11-mer model is actually the best Markov model. The config file for the best Markov model in the results folder shows that it is a bidirectional Markov model of order 5, which is consistent with an 11-mer window (5 bases of context on each side of the predicted base).

In this notebook no splitting of the data will be performed, i.e. the whole dataset is used for both training and testing.

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
# prepend '../..' to the module search path before anything else is imported,
# so the project packages imported below (helpers, encoding_utils, models) resolve
sys.path.insert(0, '../..')

import gc
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pysam
import torch
from Bio import SeqIO
from torch.utils.data import DataLoader, Dataset

import helpers.train_eval as train_eval    #train and evaluation
import helpers.misc as misc                #miscellaneous functions

import encoding_utils.sequence_encoders as sequence_encoders
import encoding_utils.sequence_utils as sequence_utils
from models.spec_dss import DSSResNet, DSSResNetEmb, SpecAdd
from models.baseline.markov_model import *

# Data

In [4]:
# Load the sequence table from the local pickle cache, building the cache from
# the FASTA file on the first run. No train/validation split is made: every
# sequence in the FASTA file ends up in `train_df`.
file_path = 'train_df.pickle'
if os.path.exists(file_path):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(file_path, 'rb') as f:
        train_df = pickle.load(f)
else:
    fasta_file = "../../../test/Homo_sapiens_3prime_UTR.fa"
    # store every record's sequence as an upper-case string
    sequences = [str(record.seq).upper() for record in SeqIO.parse(fasta_file, "fasta")]
    if not sequences:
        # refuse to cache an empty table, which later runs would silently reuse
        raise ValueError(f"no sequences found in {fasta_file}")
    # one row per sequence, in FASTA order
    train_df = pd.DataFrame({'3-UTR': sequences})
    with open(file_path, 'wb') as f:
        pickle.dump(train_df, f)
train_df

Unnamed: 0,3-UTR
0,ATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATAT...
1,GGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCAC...
2,GGCAGCCCATCTGGGGGGCCTGTAGGGGCTGCCGGGCTGGTGGCCA...
3,CCCACCTACCACCAGAGGCCTGCAGCCTCCCACATGCCTTAAGGGG...
4,TGGCCGCGGTGAGGTGGGTTCTCAGGACCACCCTCGCCAAGCTCCA...
...,...
18129,AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...
18130,AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...
18131,GCCTACTTCATCTCAGGACCCGCCCAAGAGTGGCCGCGGCTTTGGG...
18132,TTGTCAGTCTGTCTGCTCAGGACACAAGAACTAAGGGGCAACAAAT...


# Model

In [5]:
# "Training" here refers to calculating the 11-mer frequencies.
# The counts are cached on disk because computing them is slow
# (about 7 minutes in the recorded run).
file_path = 'kmer_train.pickle'
if os.path.exists(file_path):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(file_path, 'rb') as f:
        kmer_train = pickle.load(f)
else:
    # get the frequency counts of all motifs up to 11-mers
    kmer_train = KmerCount(11, pseudocount=0.1)
    kmer_train.compute_counts(train_df['3-UTR'])

    # write to the same path the load branch reads, so the next run finds the cache
    with open(file_path, 'wb') as f:
        pickle.dump(kmer_train, f)

100%|██████████| 18134/18134 [07:09<00:00, 42.18it/s] 


In [6]:
# Set up the bidirectional Markov model of order 5 on top of the 11-mer counts.
# The pickled sequence table is also passed as the test set, since this
# notebook makes no train/test split.
markov_model = MarkovModel(
    kmer_train,
    order=5,
    bidirectional=True,
    markov_matrix_path="markov_model.npy",
    test_df_path='train_df.pickle',
)

In [7]:
# Calculate the Markov matrix from the 11-mer counts in `kmer_train`
# (handed to MarkovModel when `markov_model` was created).
# NOTE(review): the recorded output of this cell shows the row-normalisation
# statement from inside the model code, which looks like a NumPy runtime warning
# whose message text was cut off — confirm the resulting matrix has no NaN/inf rows.
markov_model.model.compile_from_counts()

  self.markov_matrix[order,:,:] = self.markov_matrix[order,:,:]/np.sum(self.markov_matrix[order,:,:],axis=1)[:,np.newaxis]


In [8]:
# Generate the result files needed for plotting.
# The "test" data is the same table the counts were computed from: the model was
# created with test_df_path='train_df.pickle' and this notebook makes no split.
markov_model.test()