In [34]:
!pip install --quiet optuna

In [35]:
import pandas as pd
import tensorflow as tf
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import optuna

In [36]:
data = pd.read_csv('./sentences.csv')
data.head()

Unnamed: 0,sentence_number,word,tag
0,1,como,HOW
1,1,usar,USE
2,1,lucros,PROFIT
3,1,e,CONJ
4,1,reinvestir,INVEST


In [37]:
vocab = set(data['word'])
vocab_list = list(vocab)

word2idx = {}
idx2word = {}

for idx, word in enumerate(vocab_list):
    word2idx[word] = idx
    idx2word[idx] = word

tags = set(data['tag'])
tags_list = list(tags)

tag2idx = {}
idx2tag = {}

for idx, tag in enumerate(tags_list):
    tag2idx[tag] = idx
    idx2tag[idx] = tag

In [38]:
converted_data = data.copy()

converted_data['word'] = converted_data['word'].transform(lambda word: word2idx[word])
converted_data['tag'] = converted_data['tag'].transform(lambda tag: tag2idx[tag])
converted_data.head()

Unnamed: 0,sentence_number,word,tag
0,1,103,3
1,1,42,22
2,1,65,70
3,1,22,34
4,1,113,28


In [39]:
list_data = converted_data.groupby(['sentence_number'])['word', 'tag'].agg(lambda i: list(i))
list_data.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,word,tag
sentence_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[103, 42, 65, 22, 113, 44, 66, 80, 11, 54]","[3, 22, 70, 34, 28, 56, 54, 16, 31, 52]"
2,"[103, 39, 17, 15, 5, 122, 53, 117, 73, 12, 16,...","[3, 65, 12, 33, 24, 44, 40, 67, 6, 48, 54, 12,..."
3,"[121, 30, 104, 83, 114, 111]","[41, 41, 47, 25, 25, 25]"
4,"[30, 50, 19, 78, 108, 88, 87, 122]","[41, 5, 26, 68, 41, 29, 41, 44]"
5,"[92, 16, 108, 37, 2, 112, 79, 102, 51, 68, 89]","[11, 54, 41, 12, 0, 24, 64, 50, 28, 24, 25]"


In [40]:
[idx2word[idx] for idx in list_data['word'][5]]

['quais',
 'são',
 'os',
 'melhores',
 'recursos',
 'para',
 'aprender',
 'sobre',
 'investimento',
 'em',
 'títulos']

In [41]:
max_len = list_data['word'].map(len).max()
max_len

28

In [42]:
pad_tokens = tf.keras.preprocessing.sequence.pad_sequences(list_data['word'])
pad_tags = tf.keras.preprocessing.sequence.pad_sequences(list_data['tag'])
pad_tags = [tf.keras.utils.to_categorical(tag, num_classes=len(tags)) for tag in pad_tags]

train_tokens, test_tokens, train_tags, test_tags = train_test_split(pad_tokens, pad_tags)

In [43]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from tensorflow.keras import optimizers

In [44]:
input_dim = len(vocab)
input_length = max_len

def objective(trial):
    embedding_output_dim = trial.suggest_int("embedding_output_dim", 16, 256)

    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=embedding_output_dim, input_length=input_length),
        Bidirectional(LSTM(units=embedding_output_dim, return_sequences=True), merge_mode='concat'),
        LSTM(units=embedding_output_dim, return_sequences=True),
        TimeDistributed(Dense(len(tags), activation='relu'))
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    num_epochs = trial.suggest_int("num_epochs", 5, 150)

    model.fit(train_tokens, np.array(train_tags), verbose=1, epochs=num_epochs)

    return model.evaluate(test_tokens, np.array(test_tags), return_dict=True)['accuracy']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=6)

[32m[I 2021-08-15 15:21:07,878][0m A new study created in memory with name: no-name-b1b23e11-6489-4631-bde6-a29e50c5cc6b[0m


Epoch 1/82
Epoch 2/82
Epoch 3/82
Epoch 4/82
Epoch 5/82
Epoch 6/82
Epoch 7/82
Epoch 8/82
Epoch 9/82
Epoch 10/82
Epoch 11/82
Epoch 12/82
Epoch 13/82
Epoch 14/82
Epoch 15/82
Epoch 16/82
Epoch 17/82
Epoch 18/82
Epoch 19/82
Epoch 20/82
Epoch 21/82
Epoch 22/82
Epoch 23/82
Epoch 24/82
Epoch 25/82
Epoch 26/82
Epoch 27/82
Epoch 28/82
Epoch 29/82
Epoch 30/82
Epoch 31/82
Epoch 32/82
Epoch 33/82
Epoch 34/82
Epoch 35/82
Epoch 36/82
Epoch 37/82
Epoch 38/82
Epoch 39/82
Epoch 40/82
Epoch 41/82
Epoch 42/82
Epoch 43/82
Epoch 44/82
Epoch 45/82
Epoch 46/82
Epoch 47/82
Epoch 48/82
Epoch 49/82
Epoch 50/82
Epoch 51/82
Epoch 52/82
Epoch 53/82
Epoch 54/82
Epoch 55/82
Epoch 56/82
Epoch 57/82
Epoch 58/82
Epoch 59/82
Epoch 60/82
Epoch 61/82
Epoch 62/82
Epoch 63/82
Epoch 64/82
Epoch 65/82
Epoch 66/82
Epoch 67/82
Epoch 68/82
Epoch 69/82
Epoch 70/82
Epoch 71/82
Epoch 72/82
Epoch 73/82
Epoch 74/82
Epoch 75/82
Epoch 76/82
Epoch 77/82
Epoch 78/82
Epoch 79/82
Epoch 80/82
Epoch 81/82
Epoch 82/82


[32m[I 2021-08-15 15:21:40,247][0m Trial 0 finished with value: 0.6734693646430969 and parameters: {'embedding_output_dim': 243, 'num_epochs': 82}. Best is trial 0 with value: 0.6734693646430969.[0m


Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


[32m[I 2021-08-15 15:21:48,578][0m Trial 1 finished with value: 0.6326530575752258 and parameters: {'embedding_output_dim': 54, 'num_epochs': 64}. Best is trial 0 with value: 0.6734693646430969.[0m


Epoch 1/111
Epoch 2/111
Epoch 3/111
Epoch 4/111
Epoch 5/111
Epoch 6/111
Epoch 7/111
Epoch 8/111
Epoch 9/111
Epoch 10/111
Epoch 11/111
Epoch 12/111
Epoch 13/111
Epoch 14/111
Epoch 15/111
Epoch 16/111
Epoch 17/111
Epoch 18/111
Epoch 19/111
Epoch 20/111
Epoch 21/111
Epoch 22/111
Epoch 23/111
Epoch 24/111
Epoch 25/111
Epoch 26/111
Epoch 27/111
Epoch 28/111
Epoch 29/111
Epoch 30/111
Epoch 31/111
Epoch 32/111
Epoch 33/111
Epoch 34/111
Epoch 35/111
Epoch 36/111
Epoch 37/111
Epoch 38/111
Epoch 39/111
Epoch 40/111
Epoch 41/111
Epoch 42/111
Epoch 43/111
Epoch 44/111
Epoch 45/111
Epoch 46/111
Epoch 47/111
Epoch 48/111
Epoch 49/111
Epoch 50/111
Epoch 51/111
Epoch 52/111
Epoch 53/111
Epoch 54/111
Epoch 55/111
Epoch 56/111
Epoch 57/111
Epoch 58/111
Epoch 59/111
Epoch 60/111
Epoch 61/111
Epoch 62/111
Epoch 63/111
Epoch 64/111
Epoch 65/111
Epoch 66/111
Epoch 67/111
Epoch 68/111
Epoch 69/111
Epoch 70/111
Epoch 71/111
Epoch 72/111
Epoch 73/111
Epoch 74/111
Epoch 75/111
Epoch 76/111
Epoch 77/111
Epoch 78

[32m[I 2021-08-15 15:21:58,721][0m Trial 2 finished with value: 0.6326530575752258 and parameters: {'embedding_output_dim': 42, 'num_epochs': 111}. Best is trial 0 with value: 0.6734693646430969.[0m


Epoch 1/95
Epoch 2/95
Epoch 3/95
Epoch 4/95
Epoch 5/95
Epoch 6/95
Epoch 7/95
Epoch 8/95
Epoch 9/95
Epoch 10/95
Epoch 11/95
Epoch 12/95
Epoch 13/95
Epoch 14/95
Epoch 15/95
Epoch 16/95
Epoch 17/95
Epoch 18/95
Epoch 19/95
Epoch 20/95
Epoch 21/95
Epoch 22/95
Epoch 23/95
Epoch 24/95
Epoch 25/95
Epoch 26/95
Epoch 27/95
Epoch 28/95
Epoch 29/95
Epoch 30/95
Epoch 31/95
Epoch 32/95
Epoch 33/95
Epoch 34/95
Epoch 35/95
Epoch 36/95
Epoch 37/95
Epoch 38/95
Epoch 39/95
Epoch 40/95
Epoch 41/95
Epoch 42/95
Epoch 43/95
Epoch 44/95
Epoch 45/95
Epoch 46/95
Epoch 47/95
Epoch 48/95
Epoch 49/95
Epoch 50/95
Epoch 51/95
Epoch 52/95
Epoch 53/95
Epoch 54/95
Epoch 55/95
Epoch 56/95
Epoch 57/95
Epoch 58/95
Epoch 59/95
Epoch 60/95
Epoch 61/95
Epoch 62/95
Epoch 63/95
Epoch 64/95
Epoch 65/95
Epoch 66/95
Epoch 67/95
Epoch 68/95
Epoch 69/95
Epoch 70/95
Epoch 71/95
Epoch 72/95
Epoch 73/95
Epoch 74/95
Epoch 75/95
Epoch 76/95
Epoch 77/95
Epoch 78/95
Epoch 79/95
Epoch 80/95
Epoch 81/95
Epoch 82/95
Epoch 83/95
Epoch 84/95
E

[32m[I 2021-08-15 15:22:09,985][0m Trial 3 finished with value: 0.6479591727256775 and parameters: {'embedding_output_dim': 87, 'num_epochs': 95}. Best is trial 0 with value: 0.6734693646430969.[0m


Epoch 1/72
Epoch 2/72
Epoch 3/72
Epoch 4/72
Epoch 5/72
Epoch 6/72
Epoch 7/72
Epoch 8/72
Epoch 9/72
Epoch 10/72
Epoch 11/72
Epoch 12/72
Epoch 13/72
Epoch 14/72
Epoch 15/72
Epoch 16/72
Epoch 17/72
Epoch 18/72
Epoch 19/72
Epoch 20/72
Epoch 21/72
Epoch 22/72
Epoch 23/72
Epoch 24/72
Epoch 25/72
Epoch 26/72
Epoch 27/72
Epoch 28/72
Epoch 29/72
Epoch 30/72
Epoch 31/72
Epoch 32/72
Epoch 33/72
Epoch 34/72
Epoch 35/72
Epoch 36/72
Epoch 37/72
Epoch 38/72
Epoch 39/72
Epoch 40/72
Epoch 41/72
Epoch 42/72
Epoch 43/72
Epoch 44/72
Epoch 45/72
Epoch 46/72
Epoch 47/72
Epoch 48/72
Epoch 49/72
Epoch 50/72
Epoch 51/72
Epoch 52/72
Epoch 53/72
Epoch 54/72
Epoch 55/72
Epoch 56/72
Epoch 57/72
Epoch 58/72
Epoch 59/72
Epoch 60/72
Epoch 61/72
Epoch 62/72
Epoch 63/72
Epoch 64/72
Epoch 65/72
Epoch 66/72
Epoch 67/72
Epoch 68/72
Epoch 69/72
Epoch 70/72
Epoch 71/72
Epoch 72/72


[32m[I 2021-08-15 15:22:21,506][0m Trial 4 finished with value: 0.6530612111091614 and parameters: {'embedding_output_dim': 112, 'num_epochs': 72}. Best is trial 0 with value: 0.6734693646430969.[0m


Epoch 1/82
Epoch 2/82
Epoch 3/82
Epoch 4/82
Epoch 5/82
Epoch 6/82
Epoch 7/82
Epoch 8/82
Epoch 9/82
Epoch 10/82
Epoch 11/82
Epoch 12/82
Epoch 13/82
Epoch 14/82
Epoch 15/82
Epoch 16/82
Epoch 17/82
Epoch 18/82
Epoch 19/82
Epoch 20/82
Epoch 21/82
Epoch 22/82
Epoch 23/82
Epoch 24/82
Epoch 25/82
Epoch 26/82
Epoch 27/82
Epoch 28/82
Epoch 29/82
Epoch 30/82
Epoch 31/82
Epoch 32/82
Epoch 33/82
Epoch 34/82
Epoch 35/82
Epoch 36/82
Epoch 37/82
Epoch 38/82
Epoch 39/82
Epoch 40/82
Epoch 41/82
Epoch 42/82
Epoch 43/82
Epoch 44/82
Epoch 45/82
Epoch 46/82
Epoch 47/82
Epoch 48/82
Epoch 49/82
Epoch 50/82
Epoch 51/82
Epoch 52/82
Epoch 53/82
Epoch 54/82
Epoch 55/82
Epoch 56/82
Epoch 57/82
Epoch 58/82
Epoch 59/82
Epoch 60/82
Epoch 61/82
Epoch 62/82
Epoch 63/82
Epoch 64/82
Epoch 65/82
Epoch 66/82
Epoch 67/82
Epoch 68/82
Epoch 69/82
Epoch 70/82
Epoch 71/82
Epoch 72/82
Epoch 73/82
Epoch 74/82
Epoch 75/82
Epoch 76/82
Epoch 77/82
Epoch 78/82
Epoch 79/82
Epoch 80/82
Epoch 81/82
Epoch 82/82


[32m[I 2021-08-15 15:22:37,668][0m Trial 5 finished with value: 0.6581632494926453 and parameters: {'embedding_output_dim': 139, 'num_epochs': 82}. Best is trial 0 with value: 0.6734693646430969.[0m


In [51]:
input_dim = len(vocab)
input_length = max_len

best_trial_params = study.best_trial.params

embedding_output_dim = best_trial_params['embedding_output_dim']
#num_epochs = best_trial_params['num_epochs']
num_epochs=30

model = Sequential([
    Embedding(input_dim=input_dim, output_dim=embedding_output_dim, input_length=input_length),
    Bidirectional(LSTM(units=embedding_output_dim, return_sequences=True, dropout=0.01), merge_mode='concat'),
    LSTM(units=embedding_output_dim, return_sequences=True, dropout=0.01),
    TimeDistributed(Dense(len(tags), activation='relu'))
])

optimizer = optimizers.Adam(clipvalue=0.5)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.fit(train_tokens, np.array(train_tags), epochs=num_epochs)

model.evaluate(test_tokens, np.array(test_tags))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


[3.474992513656616, 0.6326530575752258]

In [52]:
t = 'como usar lucros e reinvestir sem ser esmagado por impostos'.split(' ')
t2 = [word2idx[word] for word in t if word in vocab]
t3 = model.predict(t2)
[idx2tag[np.argmax(cat)] for cat in t3]



['RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES',
 'RESOURCES']