In [266]:
import numpy as np
from torchtext.vocab import vocab
from sklearn.preprocessing import OrdinalEncoder, LabelBinarizer
from collections import OrderedDict, Counter
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, RepeatVector, TimeDistributed, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
import torchtext
import fasttext

from torchtext.vocab import vocab

# Input files of the "ES" data set, relative to the notebook's working directory.
# NOTE: despite the `_dir` suffix both values are file paths, not directories.
train_dir = "data/ES/train"  # opened by the training cell; parsed there as one "<word> <label>" pair per line
test_dir = "data/ES/dev.in"  # not read by the code shown in this notebook; predict_y takes its own `test_dir` argument

def get_vecs_by_tokens(model, word_lst):
    """Return ``model[token]`` for every token of ``word_lst``, in the same order.

    :param model: any object supporting ``model[token]`` lookups (in this
        notebook: the loaded fastText model; per the original author's note the
        lookup yields a numpy vector for in-vocabulary and out-of-vocabulary words)
    :param word_lst: iterable of tokens to look up
    :return: list with one lookup result per token
    :rtype: list
    """
    return [model[token] for token in word_lst]

# Load the pre-trained fastText binary once; later cells index it as `model[word]`
# to obtain the embedding vector of a word.
model = fasttext.load_model('data/ES/embeddings-l-model.bin')
# Disabled alternative: fetch pre-trained vectors through torchtext instead.
# vec = torchtext.vocab.FastText(language='es')





In [277]:
def simple_bidirectional_lstm(batchsize, maxlen, embedding_size, lstm_units=10, n_classes=6):
    """Build and compile a single-layer bidirectional LSTM sequence tagger.

    The network consumes pre-computed word embeddings (no Embedding layer) and
    emits one ``n_classes``-way softmax distribution per time step.

    :param batchsize: unused; kept so existing calls keep working. Keras'
        ``input_shape`` never includes the batch dimension, and the batch size
        is chosen when calling ``fit``/``predict``.
    :param maxlen: number of time steps (padded sentence length) per sample
    :param embedding_size: length of each word vector
    :param lstm_units: hidden units per LSTM direction (default 10, as before)
    :param n_classes: width of the per-token output layer (default 6, as before)
    :return: compiled Keras ``Sequential`` model
    """
    model = Sequential()
    # `input_shape` must be given to the outermost layer (the Bidirectional
    # wrapper) and must exclude the batch axis. Passing it to the wrapped LSTM,
    # as before, had no effect on the Sequential model.
    model.add(
        Bidirectional(
            LSTM(lstm_units, return_sequences=True),
            input_shape=(maxlen, embedding_size),
        )
    )

    # Dense on a 3-D input is applied independently at every time step.
    model.add(Dense(n_classes, activation='softmax'))
    # NOTE(review): softmax is normally paired with 'categorical_crossentropy'.
    # The loss is left unchanged here because the training cell encodes the 'O'
    # tag as an all-zero target row; switching the loss would make those tokens
    # contribute nothing. Revisit together with the label encoding.
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [276]:
counter = Counter()  # word frequencies over the training file
sentence_lst, word_lst, label_sentence_lst, label_lst = [], [], [], []

# Tag <-> integer id: {'O': 0, 'B-positive': 1, ..., 'I-neutral': 6}
true_label_lst = ['O', 'B-positive', 'B-negative', 'B-neutral', 'I-positive', 'I-negative', 'I-neutral']
label_dic = {tag: idx for idx, tag in enumerate(true_label_lst)}

# One-hot encoder fitted on ids 1..6 only, so column k stands for id k + 1:
#   id 1 -> [1, 0, 0, 0, 0, 0], ..., id 6 -> [0, 0, 0, 0, 0, 1]
# Id 0 ('O') is not a fitted class and is encoded as the all-zero row.
# NOTE(review): padding is also all-zero, and a softmax output can never be
# all-zero, so an argmax decoder cannot emit 'O' with this encoding.
lb = LabelBinarizer()
lb = lb.fit(range(1, 7))
# Encode each tag once instead of calling lb.transform for every token.
onehot_by_tag = {tag: lb.transform([idx])[0] for tag, idx in label_dic.items()}

with open(train_dir, "r", encoding="utf8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank line = sentence boundary; repeated blank lines add nothing.
            if word_lst:
                sentence_lst.append(word_lst)
                label_sentence_lst.append(label_lst)
                word_lst, label_lst = [], []
            continue
        # Split on the LAST space so a token that itself contains spaces is not
        # mistaken for a sentence boundary.
        word, sep, label = line.rpartition(" ")
        if not sep:
            raise ValueError(f"{train_dir}:{line_no}: expected '<word> <label>', got {line!r}")
        counter[word] += 1  # Counter.update(word) would count characters
        word_lst.append(word)
        label_lst.append(onehot_by_tag[label])

# Keep the final sentence when the file does not end with a blank line.
if word_lst:
    sentence_lst.append(word_lst)
    label_sentence_lst.append(label_lst)
    word_lst, label_lst = [], []

print(len(sentence_lst))
print(max(len(labels) for labels in label_sentence_lst))

# Replace every word by its embedding without mutating the parsed sentences.
embedded_sentences = [[model[word] for word in sentence] for sentence in sentence_lst]

# pad_sequences defaults to dtype='int32', which would truncate the embedding
# values to integers; request float32 explicitly. Padding/truncation stay at the
# Keras defaults ('pre').
sentence_lst = pad_sequences(embedded_sentences, dtype="float32")
label_sentence_lst = pad_sequences(label_sentence_lst, dtype="float32")
maxlen = sentence_lst.shape[1]
embedding_size = sentence_lst.shape[-1]
print(label_sentence_lst.shape)
print(sentence_lst.shape)
print(maxlen,embedding_size)

batch_size = 1
bilstm = simple_bidirectional_lstm(batch_size,maxlen,embedding_size)
bilstm.fit(sentence_lst, label_sentence_lst, batch_size=batch_size, shuffle=False)

2065
163
(2065, 163, 6)
(2065, 163, 300)
163 300

KeyboardInterrupt: 

In [271]:
def predict_y(model,label_lst, word2vec_model, test_dir="data/ES/dev.in", output_dir="data/ES/dev.p4.out", seq_len=None):
    ''' Tags every word of ``test_dir`` with ``model`` and writes the result to ``output_dir``.

    The output mirrors the input line by line: a word line becomes
    ``"<word> <label>"`` and a blank line stays blank.

    :param model: trained model for sequence prediction; ``model.predict`` must
        return an array shaped (sentences, time steps, outputs)
    :type model: Keras

    :param label_lst: label names. If it has exactly one entry more than the
        model has outputs, output column k is decoded as ``label_lst[k + 1]``
        (the notebook's encoding: the first label has no output column);
        otherwise column k is decoded as ``label_lst[k]``.
    :type label_lst: lst

    :param word2vec_model: pre-trained w2v model, indexed as ``word2vec_model[word]``
    :type word2vec_model: .bin

    :param test_dir: our test file in either ES or RU (one word per line,
        blank line between sentences)
    :type test_dir: str

    :param output_dir: our output file for either ES or RU
    :type output_dir: str

    :param seq_len: number of time steps fed to the model. Defaults to the
        notebook-level ``maxlen`` set by the training cell, or, if that is not
        defined, to the longest sentence of the test file.
    :type seq_len: int

    :Output: "dev.p4.out" by default to the directory given
    :rtype: .out file

    :raises ValueError: if the resolved sequence length is smaller than 1
    '''

    # Read the file once; the same lines drive both prediction and output so the
    # two passes can never disagree about what counts as a word.
    with open(test_dir, "r", encoding="utf8") as f:
        raw_lines = [line.rstrip("\n") for line in f]

    test_sentence_lst = []
    test_word_lst = []
    for text in raw_lines:
        if text.strip():
            test_word_lst.append(text)
        elif test_word_lst:
            test_sentence_lst.append(test_word_lst)
            test_word_lst = []
    # The last sentence is not necessarily followed by a blank line.
    if test_word_lst:
        test_sentence_lst.append(test_word_lst)

    word_labels = []  # one label per word, in file order
    if test_sentence_lst:
        if seq_len is None:
            seq_len = globals().get("maxlen")
        if seq_len is None:
            seq_len = max(len(words) for words in test_sentence_lst)
        seq_len = int(seq_len)
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")

        # Left-pad / left-truncate like pad_sequences' defaults (used for the
        # training data), but in float32: pad_sequences defaults to int32,
        # which would truncate the embedding values.
        emb_shape = np.asarray(word2vec_model[test_sentence_lst[0][0]]).shape
        batch = np.zeros((len(test_sentence_lst), seq_len) + emb_shape, dtype=np.float32)
        for row, words in enumerate(test_sentence_lst):
            kept = words[-seq_len:]
            batch[row, seq_len - len(kept):] = np.asarray(
                [word2vec_model[word] for word in kept], dtype=np.float32
            )

        y_pred = np.asarray(model.predict(batch))
        best = y_pred.argmax(axis=-1)  # (sentences, seq_len)
        offset = 1 if len(label_lst) == y_pred.shape[-1] + 1 else 0

        for row, words in enumerate(test_sentence_lst):
            n_kept = min(len(words), seq_len)
            # Words truncated from the front were never seen by the model; they
            # get the first label (assumed to be the "outside" tag -- confirm).
            word_labels.extend([label_lst[0]] * (len(words) - n_kept))
            # Real words occupy the LAST n_kept time steps; skip padding slots.
            word_labels.extend(
                label_lst[int(k) + offset] for k in best[row, seq_len - n_kept:]
            )

    labels_iter = iter(word_labels)
    with open(output_dir,'w', encoding="utf-8") as f:
        for text in raw_lines:
            if text.strip():
                f.write(f"{text} {next(labels_iter)}\n") # Write in our original word
            else:
                f.write("\n")

In [272]:
# Tag data/ES/dev.in with the trained tagger and write the predictions to data/ES/dev.p4.out.
# Requires `bilstm` and `true_label_lst` from the training cell and the fastText `model`
# from the first cell, so those cells must have been run first.
predict_y(bilstm, true_label_lst, model, test_dir="data/ES/dev.in", output_dir="data/ES/dev.p4.out")