In [None]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from matplotlib import pyplot
import re

In [None]:
def getPadSequences(v,max_features):
    """Tokenize the 'Sequence' column of ``v`` one character at a time and pad the result.

    Args:
        v: DataFrame with a 'Sequence' column whose entries are strings
            (or other iterables of strings).
        max_features: forwarded to the Keras ``Tokenizer`` as ``num_words``.

    Returns:
        The result of ``pad_sequences`` applied to the tokenized sequences.
    """
    # Insert a space between consecutive characters so that the tokenizer,
    # which splits on ' ', treats every character as its own token.
    # The spaced strings are built in a fresh list rather than written back
    # through ``iterrows()``: pandas does not guarantee that such writes reach
    # the DataFrame, and this also leaves the caller's frame unmodified.
    spaced = [' '.join(seq) for seq in v['Sequence'].values]

    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(spaced)
    x_ = tokenizer.texts_to_sequences(spaced)
    return pad_sequences(x_)

def letter_to_index(letter):
    """Return the position of ``letter`` in the fixed symbol table, or 29 if absent.

    The table is 'ABCDEFGHIJKLMNOPQRSTUVXWYZ*'. In it 'X' comes before 'W',
    so 'X' -> 22 and 'W' -> 23; '*' -> 26. Matching is an exact comparison
    against each single symbol, so lowercase letters, empty strings and
    multi-character strings all yield 29.

    NOTE(review): 'A' maps to 0, which is also the default fill value of Keras
    ``pad_sequences`` - confirm that this overlap is intended.
    """
    symbols = 'ABCDEFGHIJKLMNOPQRSTUVXWYZ*'
    for position, symbol in enumerate(symbols):
        if symbol == letter:
            return position
    # Fallback code for anything that is not in the table
    return 29

def create_lstm(embed_dim, lstm_out, vocab_size=None):
    """Build and compile the LSTM classifier.

    Layers, in order: Embedding -> SpatialDropout1D(0.4) ->
    LSTM(dropout=0.2, recurrent_dropout=0.2) -> Dense(2, softmax).

    Args:
        embed_dim: output dimension of the embedding layer.
        lstm_out: number of units in the LSTM layer.
        vocab_size: input dimension of the embedding layer. When omitted, the
            module-level ``max_features`` is used, which must then already be
            defined at call time.

    Returns:
        The compiled ``Sequential`` model (adam optimizer,
        'binary_crossentropy' loss, 'acc' metric).
    """
    if vocab_size is None:
        # Backward-compatible default: fall back to the notebook-level setting
        vocab_size = max_features

    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics=['acc'])
    return model 


Mounted at /content/gdrive


In [None]:
# ccbh sequences: encode every character as an integer index, then pad
url = '../../data/seqs/ccbh4851_seq.csv'
data = pd.read_csv(url)
data['Sequence'] = data['Sequence'].apply(lambda x: [int(letter_to_index(e)) for e in x])
ccbh = pad_sequences(data['Sequence'])

# ecoli
url = '../../data/seqs/ecoli-sequencias-d.csv'
data = pd.read_csv(url)

# pao1: stacked below the ecoli rows.
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same
# row-wise concatenation and keeps each frame's original index.
url = '../../data/seqs/pao1-sequencias-d.csv'
data = pd.concat([data, pd.read_csv(url)])

# Same character -> integer encoding as applied to the ccbh sequences
data['Sequence'] = data['Sequence'].apply(lambda x: [int(letter_to_index(e)) for e in x])

# NOTE(review): X and ccbh are padded by separate pad_sequences calls with no
# explicit maxlen, so their widths may differ - confirm this is intended.
X = pad_sequences(data['Sequence'])
# Labels come from the 'essencial' column
y = data.loc[ : , 'essencial' ]

In [None]:
# Shape of the rows labelled 1 and of the rows labelled 0 in 'essencial'
print(data[ data['essencial'] == 1].shape)
print(data[ data['essencial'] == 0].shape)
max_features=2000
model = create_lstm(128, 196)
batch_size = 32

# Stratified split: one third of the rows is held out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, stratify = y)

# Training expects the labels to be one-hot encoded
y_train_encoded = to_categorical(y_train)
# Held-out labels are encoded with the same number of columns so they can be
# used as validation data during fit
y_test_encoded = to_categorical(y_test, num_classes=y_train_encoded.shape[1])


# `classes` and `y` must be passed by keyword in current scikit-learn releases
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(y_train),
                                                 y=y_train)


# Early stopping / checkpointing
path = "./genome_sequence_best_model.h5" 
# NOTE: `es` is created but not passed to fit below; with patience=60 and
# epochs=50 it could not trigger in this run anyway.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=60)
# Saves the model whenever the training loss reaches a new minimum
mc = ModelCheckpoint(path, monitor='loss', mode='min', verbose=1, save_best_only=True)


# np.unique returns the classes sorted, so position i holds the weight of class i
d_class_weights = dict(enumerate(class_weights))
# `y_pred` holds the History object returned by fit (the name is kept as is).
# validation_data is required for the 'val_acc' series plotted below; without
# it the history has no 'val_acc' key and the plot raises KeyError.
y_pred = model.fit(X_train, 
                   y_train_encoded,
                   epochs = 50, 
                   batch_size = batch_size,
                   verbose = 2, 
                   shuffle = True,
                   class_weight = d_class_weights,
                   validation_data = (X_test, y_test_encoded),
                   callbacks = [mc])


# summarize history for accuracy
pyplot.plot(y_pred.history['acc'], label='train')
pyplot.plot(y_pred.history['val_acc'], label='test')
pyplot.xlabel('epoch')
pyplot.ylabel('accuracy')
pyplot.legend()
pyplot.show()


(1190, 2)
(9164, 2)
Epoch 1/50
 - 1080s - loss: 0.6914 - acc: 0.4991

Epoch 00001: loss improved from inf to 0.69139, saving model to /content/gdrive/My Drive/best_model.h5
Epoch 2/50
 - 1072s - loss: 0.6823 - acc: 0.6058

Epoch 00002: loss improved from 0.69139 to 0.68229, saving model to /content/gdrive/My Drive/best_model.h5
Epoch 3/50
 - 1072s - loss: 0.6825 - acc: 0.5888

Epoch 00003: loss did not improve from 0.68229
Epoch 4/50
 - 1070s - loss: 0.6788 - acc: 0.5824

Epoch 00004: loss improved from 0.68229 to 0.67877, saving model to /content/gdrive/My Drive/best_model.h5
Epoch 5/50
 - 1067s - loss: 0.7132 - acc: 0.5462

Epoch 00005: loss did not improve from 0.67877
Epoch 6/50
 - 1071s - loss: 0.7034 - acc: 0.5269

Epoch 00006: loss did not improve from 0.67877
Epoch 7/50
 - 1073s - loss: 0.6923 - acc: 0.5233

Epoch 00007: loss did not improve from 0.67877
Epoch 8/50
 - 1065s - loss: 0.6925 - acc: 0.5594

Epoch 00008: loss did not improve from 0.67877
Epoch 9/50
 - 1069s - loss: 

In [None]:
# ccbh sequences: run them through the model
ccbh_inputs = np.array(ccbh)
predictions = model.predict(
    ccbh_inputs,
    batch_size=batch_size,
    verbose = 2,
)

# Write the prediction array to a comma-delimited text file
np.savetxt("ccbh_pred.csv", predictions, delimiter=",")

# Print every prediction, one per line
for row in predictions:
    print(row)