In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio import Seq
from Bio.Data import CodonTable
import seq2seq
from seq2seq.models import SimpleSeq2seq
import keras
from keras.models import Sequential

Using Theano backend.


In [2]:
#1. Create a list of all coding proteins in E. coli based on standard coding table

#Source: MetaCYC - SmartTable: E.coli MG1655 protein coding genes -- http://metacyc.org/group?id=biocyc11-15281-3657402612
handle = open("E.coli_MG1655_protein_coding_genes.fa", "r")
geneList = list(SeqIO.parse(handle, "fasta"))
geneListTrunc = []
handle.close()
print(len(geneList))

#Get standard codon table
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
print(standard_table)

#For now, just remove genes that don't have a seq length divisible by 3 or have no sequence ("NIL"); later need to impute/investigate.
geneList[:] = [gene for gene in geneList if len(gene.seq)%3==0 and gene.seq!="NIL"]


#The standard codon table doesn't include GTG as a start codon, so convert them to ATG (quick-fix; better would be to figure out how to alter codon table)
for gene in geneList:
    if gene.seq[0:3]== 'GTG':
        gene.seq = 'ATG' + gene.seq[3:]

proteinList = [Seq.translate(gene.seq) for gene in geneList]
print(len(proteinList))


4351
Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+----

In [7]:
#2. Next, train a "translation" memory-inclusive neural network for single amino acid -> nucleotide triplet. This will
#theoretically lead to a stronger codon table by taking into account more features than current rule based codon-optimization
#algorithms. This reduces to a language translation problem. Check out: https://www.tensorflow.org/versions/r0.10/tutorials/seq2seq/index.html

#Using Seq2Seq (Sequence to sequence learning add-on for python deep learning library Keras)
model = SimpleSeq2seq(input_dim=5, hidden_dim=10, output_length=8, output_dim=8)
model.compile(loss='mse', optimizer='rmsprop')




In [None]:
#2. This time trying w/ TensorFlow seq2seq library:



In [3]:
#print(geneList[0].seq[0:3])
#print(geneList[0].seq)
#geneList[0].seq = "XXX" + geneList[0].seq[3:-1]

#for i,gene in enumerate(geneList):
 #   if len(gene.seq)%3!=0: 
        #print(gene.id," ", len(gene.seq))
  #      del geneList[i]
   #     a+=1
    #if length(gene): print(gene.id,"\n",gene.seq)
    #Seq.translate(gene.seq)
    #elif len(gene.seq)%3 == 2:
     #   b+=1
    #elif len(gene.seq)%3 == 1:
     #   c+=1
        

In [6]:
from keras.models import Sequential
import numpy as np
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import TimeDistributedDense, Activation

n_in_out = 1
n_hidden = 100
n_samples = 2297
n_timesteps = 400

model = Sequential()
# `return_sequences` controls whether to copy the input automatically
model.add(GRU( n_hidden, input_dim = n_in_out, return_sequences=True))
model.add(TimeDistributedDense(n_in_out, input_dim = n_hidden))
model.compile(loss='mse', optimizer='rmsprop')

X = np.random.random((n_samples, n_timesteps, n_in))
Y = np.random.random((n_samples, n_timesteps, n_out))

# learning the hidden states from source sentences

Xp = model._predict(X)
print(Xp.shape)
print(Y.shape)

model.fit(X, Y, nb_epoch=10)



NameError: name 'n_out' is not defined