In [1]:
from __future__ import print_function

from hyperopt import Trials, STATUS_OK, tpe, rand
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from keras import optimizers
import numpy as np
from keras.datasets import mnist

Using TensorFlow backend.


In [2]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from keras.models import Sequential

from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras.layers.recurrent import GRU
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package punkt to /home/anna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def data():
    def prepare_data(filename):
        data = pd.read_csv(filename, sep="\t")
        data = data[['text', 'subj']]
        data['subj'] = data['subj'].apply(lambda subj: subj.split('\\'))
        data = data.sample(n=10000)
        mlb = MultiLabelBinarizer()
        encoded_subjects = pd.DataFrame(mlb.fit_transform(data.pop('subj')), columns=mlb.classes_, index=data.index)
        data = data.join(encoded_subjects)
        return data, mlb.classes_

    train, categories = prepare_data('learn.txt')
    test, _ = prepare_data('test.txt')
    
    subjects = pd.read_csv('subjects.txt', sep="\t", header=None, names=['code', 'desc_rus', 'description'])[['code', 'description']]
    
    X_train = train.text
    X_test = test.text
    Y_train = train[categories]
    Y_test = test[categories]
    
    xLengths = [len(word_tokenize(x)) for x in X_train]
    h = sorted(xLengths)  #sorted lengths

    
    maxLength = h[int(len(h) * 0.70)]
    
    max_vocab_size = 200000
    input_tokenizer = Tokenizer(max_vocab_size)
    input_tokenizer.fit_on_texts(X_train)
    input_vocab_size = len(input_tokenizer.word_index) + 1
    
    X_train = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_train), maxlen=maxLength))
    X_test = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_test), maxlen=maxLength))
    return X_train, Y_train, X_test, Y_test

In [4]:
def create_model(X_train, Y_train, X_test, Y_test):
    
    embedding_dim = 256
    num_categories = len(categories)
 
    pool_length = 4
    lstm_output_size = 100
    batch_size = 200
    nb_epoch = 10
    
    model = Sequential()
    model.add(Embedding(input_vocab_size, embedding_dim,input_length = maxLength))
    
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Conv1D({{choice([64, 128])}},
                            {{choice([6, 8])}},
                            padding='valid',
                            activation='relu',
                            strides=1))
    model.add(MaxPooling1D(pool_size=pool_length))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(num_categories))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
              
    print('Train...')
    result = model.fit(X_train, Y_train, 
              batch_size=batch_size, 
              epochs=nb_epoch,
              verbose=2,
              validation_split=0.1)
    
    #get the highest validation accuracy of the training epochs
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [5]:
best_run, best_model = optim.minimize(model=create_model,data=data,algo=tpe.suggest,max_evals=5,trials=Trials(),notebook_name='rnn_optimizer')
X_train, Y_train, X_test, Y_test = data()
print("Evalutation of best performing model:")
print(best_model.evaluate(X_test, Y_test))
print("Best performing model chosen hyper-parameters:")
print(best_run)

>>> Imports:
#coding=utf-8

from __future__ import print_function

try:
    from hyperopt import Trials, STATUS_OK, tpe, rand
except:
    pass

try:
    from keras.layers.core import Dense, Dropout, Activation
except:
    pass

try:
    from keras.layers.advanced_activations import LeakyReLU
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.utils import np_utils
except:
    pass

try:
    from sklearn.metrics import accuracy_score
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform, conditional
except:
    pass

try:
    from keras import optimizers
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from keras.datasets import mnist
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    from sklearn.preprocessing import MultiLabelBinarizer
except:
    pass

try:
    from sklearn.pipeline import Pipeline
except:
    pass

Epoch 4/10
 - 40s - loss: 0.1556 - acc: 0.9490 - val_loss: 0.1716 - val_acc: 0.9448
Epoch 5/10
 - 40s - loss: 0.1347 - acc: 0.9585 - val_loss: 0.1641 - val_acc: 0.9461
Epoch 6/10
 - 39s - loss: 0.1159 - acc: 0.9650 - val_loss: 0.1587 - val_acc: 0.9468
Epoch 7/10
 - 40s - loss: 0.1011 - acc: 0.9690 - val_loss: 0.1555 - val_acc: 0.9491
Epoch 8/10
 - 40s - loss: 0.0886 - acc: 0.9727 - val_loss: 0.1567 - val_acc: 0.9490
Epoch 9/10
 - 41s - loss: 0.0767 - acc: 0.9774 - val_loss: 0.1561 - val_acc: 0.9501
Epoch 10/10
 - 40s - loss: 0.0652 - acc: 0.9827 - val_loss: 0.1586 - val_acc: 0.9488
Best validation acc of epoch: 0.9501179695129395
Train...
Train on 9000 samples, validate on 1000 samples
Epoch 1/10
 - 32s - loss: 0.3019 - acc: 0.9153 - val_loss: 0.2040 - val_acc: 0.9289
Epoch 2/10
 - 30s - loss: 0.2004 - acc: 0.9301 - val_loss: 0.2025 - val_acc: 0.9289
Epoch 3/10
 - 29s - loss: 0.1969 - acc: 0.9309 - val_loss: 0.1942 - val_acc: 0.9309
Epoch 4/10
 - 29s - loss: 0.1817 - acc: 0.9417 - val_

ValueError: Error when checking input: expected embedding_5_input to have shape (143,) but got array with shape (144,)

In [6]:
print(best_run)

{'Conv1D': 0, 'Conv1D_1': 1, 'Dropout': 0.8713141896816126}
