In [None]:
import pandas as pd
import tensorflow as tf
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
import optuna

In [None]:
# Location of the corpus, relative to the notebook's working directory.
SENTENCES_CSV = './sentences.csv'

# Load the corpus; later cells read its 'word', 'tag' and 'sentence_number'
# columns.
data = pd.read_csv(SENTENCES_CSV)
data.head()

In [None]:
# Index 0 is reserved for padding: `pad_sequences` fills short sentences with 0,
# so no real word or tag may share that index.
PAD_TOKEN = '<PAD>'

# `dict.fromkeys` de-duplicates while keeping first-appearance order, so the
# string -> index mappings are identical after every kernel restart. Iterating
# a plain `set` of strings is not stable because of hash randomisation.
vocab_list = list(dict.fromkeys([PAD_TOKEN, *data['word']]))
vocab = set(vocab_list)  # kept as a set; later cells only use len(vocab)

# Forward and reverse lookup tables for words.
word2idx = {word: idx for idx, word in enumerate(vocab_list)}
idx2word = {idx: word for word, idx in word2idx.items()}

tags_list = list(dict.fromkeys([PAD_TOKEN, *data['tag']]))
tags = set(tags_list)  # kept as a set; later cells only use len(tags)

# Forward and reverse lookup tables for tags.
tag2idx = {tag: idx for idx, tag in enumerate(tags_list)}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [None]:
# Replace every word and tag string with its integer id. `assign` returns a new
# frame, so `data` keeps the original strings.
converted_data = data.assign(
    word=data['word'].map(lambda token: word2idx[token]),
    tag=data['tag'].map(lambda label: tag2idx[label]),
)
converted_data.head()

In [None]:
# Collect each sentence's word ids and tag ids into two parallel lists, one row
# per sentence. The columns are selected with a list (`[['word', 'tag']]`):
# the bare-tuple form `['word', 'tag']` was deprecated in pandas 1.0 and raises
# in pandas 2.0+.
list_data = (
    converted_data
    .groupby(['sentence_number'])[['word', 'tag']]
    .agg(list)
)
list_data.head()

In [None]:
# Length, in tokens, of the longest sentence in the corpus.
sentence_lengths = list_data['word'].apply(len)
max_len = sentence_lengths.max()
max_len

In [None]:
# Fixed seed so the train/test partition is the same on every run.
SPLIT_SEED = 42

# Pad every sentence with 0 up to the longest one. `pad_sequences` pads on the
# left by default, so real tokens sit at the end of each row.
pad_tokens = tf.keras.preprocessing.sequence.pad_sequences(list_data['word'])
pad_tags = tf.keras.preprocessing.sequence.pad_sequences(list_data['tag'])

# One-hot encode all tag ids in a single call:
# (n_sentences, seq_len) -> (n_sentences, seq_len, n_tags).
pad_tags = tf.keras.utils.to_categorical(pad_tags, num_classes=len(tags))

train_tokens, test_tokens, train_tags, test_tags = train_test_split(
    pad_tokens, pad_tags, random_state=SPLIT_SEED
)

In [None]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [None]:
# The embedding table needs one row per entry in the vocabulary.
input_dim = len(vocab)
input_length = max_len

# Sequence tagger: embedding -> bidirectional LSTM -> LSTM -> per-time-step
# classifier over the tag set.
model = Sequential([
    Embedding(input_dim=input_dim, output_dim=64, input_length=input_length),
    Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'),
    LSTM(units=64, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    # softmax, not relu: categorical_crossentropy expects each time step's
    # output to be a probability distribution over the tags.
    TimeDistributed(Dense(len(tags), activation="softmax"))
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
def objective(trial):
    """Train one candidate tagger and return its accuracy on the test split.

    Two hyperparameters are sampled from ``trial``:

    * ``embedding_output_dim`` (16-256): used both as the embedding size and
      as the number of units in each LSTM layer.
    * ``num_epochs`` (5-150): number of training epochs.

    Args:
        trial: the Optuna trial object used to sample the hyperparameters.

    Returns:
        The ``'accuracy'`` metric reported by ``model.evaluate`` on
        ``test_tokens`` / ``test_tags``.
    """
    embedding_output_dim = trial.suggest_int("embedding_output_dim", 16, 256)

    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=embedding_output_dim, input_length=input_length),
        Bidirectional(LSTM(units=embedding_output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'),
        LSTM(units=embedding_output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
        # softmax, not relu: categorical_crossentropy expects each time step's
        # output to be a probability distribution over the tags.
        TimeDistributed(Dense(len(tags), activation='softmax'))
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    num_epochs = trial.suggest_int("num_epochs", 5, 150)

    model.fit(train_tokens, np.array(train_tags), verbose=1, epochs=num_epochs)

    return model.evaluate(test_tokens, np.array(test_tags), return_dict=True)['accuracy']


# The search trains up to N_TRIALS full models, so it is off by default; set the
# flag to True to run it.
RUN_HYPERPARAMETER_SEARCH = False
N_TRIALS = 100

study = optuna.create_study(direction='maximize')
if RUN_HYPERPARAMETER_SEARCH:
    study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Train `model` for five epochs on the training split; the one-hot tags are
# converted to a single array first.
model.fit(
    x=train_tokens,
    y=np.array(train_tags),
    epochs=5,
    verbose=1,
)

In [None]:
# Tag a sample sentence with the trained model.
t = 'Como conseguir dinheiro'.split(' ')

# Report every out-of-vocabulary word at once instead of failing with a bare
# KeyError on the first one.
unknown_words = [word for word in t if word not in word2idx]
if unknown_words:
    raise KeyError(f'words not in the training vocabulary: {unknown_words}')
t2 = [word2idx[word] for word in t]

# `predict` expects a batch of sequences, and the model was trained on
# sentences left-padded with 0 to `max_len`. Passing the flat id list would be
# read as three separate scalar samples, so wrap it in a one-row batch padded
# the same way as the training data.
sentence_batch = tf.keras.preprocessing.sequence.pad_sequences([t2], maxlen=max_len)
t3 = model.predict(sentence_batch)

# Row 0 is the sentence; with left padding its real tokens occupy the last
# len(t2) time steps.
token_probs = t3[0][-len(t2):]
[idx2tag[int(np.argmax(cat))] for cat in token_probs]