In [98]:
import numpy as np
from torchtext.vocab import vocab
from sklearn.preprocessing import OrdinalEncoder, LabelBinarizer
from collections import OrderedDict, Counter
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, RepeatVector, TimeDistributed, Flatten, Dropout, BatchNormalization 
from keras.layers.embeddings import Embedding
from keras import regularizers, initializers
from keras.preprocessing.sequence import pad_sequences
from sklearn.svm import SVC
import torchtext
import fasttext

from torchtext.vocab import vocab

train_dir = "data/ES/train"
test_dir = "data/ES/dev.in"

def get_vecs_by_tokens(model, word_lst):
    temp_lst = []
    for word in word_lst:
        print("word:", word, "result:", model[word])
        temp_lst.append(model[word]) # numpy vector from vocab or out-of-vocab vector:
    return temp_lst

model = fasttext.load_model('data/ES/embeddings-s-model.bin')
# Get pre-trained model vector
# vec = torchtext.vocab.FastText(language='es')




In [99]:
def simple_bidirectional_lstm(batchsize, maxlen, embedding_size):
    model = Sequential()
    # model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=n_timesteps, trainable=False))
    model.add(Bidirectional(LSTM(20, return_sequences=True, input_shape=(batchsize, maxlen, embedding_size),kernel_regularizer=regularizers.l1(0.01),kernel_initializer=initializers.HeNormal())))
    model.add(Dense(6, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def sentence_word_to_vectors(sentence_lst):
    vector_sentence_lst, vector_word_lst = [], []
    for sentence in sentence_lst:
        for word in sentence:
            vector_word_lst.append(model[word])
        vector_sentence_lst.append(vector_word_lst)
        vector_word_lst = []
    return vector_sentence_lst
    

In [100]:
counter = Counter()
sentence_lst, word_lst, label_sentence_lst, label_lst = [], [], [], []

# {'O': 0, 'B-positive': 1, ..., 'I-neutral': 6} 

lb = LabelBinarizer()
lb = lb.fit(range(1,7)) # Default value is [0, 0, 0, 0, 0, 0] = 0, [1, 0, 0, 0, 0, 0] = 1, [0, 0, 0, 0, 0, 1] = 6
true_label_lst = ['O', 'B-positive', 'B-negative', 'B-neutral', 'I-positive', 'I-negative', 'I-neutral']
label_dic = dict((v,k) for k,v in enumerate(true_label_lst))

with open(train_dir, "r", encoding="utf8") as f:
    for line in f:
        # Parse each line
        if len(line.split(" ")) == 2:
            word, label = line.replace("\n","").split(" ")
            counter.update(word)
            word_lst.append(word)   
            label_lst.append(lb.transform([label_dic[label]])[0])
        else:
            sentence_lst.append(word_lst)
            label_sentence_lst.append(label_lst)
            word_lst, label_lst = [], []

print(len([len(i) for i in sentence_lst]))
print(max([len(i) for i in label_sentence_lst]))

vector_sentence_lst = sentence_word_to_vectors(sentence_lst)
sentence_lst = pad_sequences(vector_sentence_lst, padding='post',dtype=np.float32)
label_sentence_lst = pad_sequences(label_sentence_lst, padding='post',dtype=np.float32)
maxlen = sentence_lst.shape[1]
embedding_size = sentence_lst.shape[-1]
print(label_sentence_lst.shape)
print(sentence_lst.shape)
print(maxlen,embedding_size)

batch_size = 16
bilstm = simple_bidirectional_lstm(batch_size,maxlen,embedding_size)
bilstm.fit(sentence_lst, label_sentence_lst, batch_size=batch_size, shuffle=True, epochs= 4)

2065
163
(2065, 163, 6)
(2065, 163, 30)
163 30
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x198369c78e0>

In [101]:
def predict_y(model,label_lst, word2vec_model, test_dir="data/ES/dev.in", output_dir="data/ES/dev.p4.out"):
    ''' Finds our predicted_y with our model 

    :param model: trained model for sequence prediction
    :type model: Keras

    :param label_lst: mapping from Ordinal to our actual labels
    :type label_lst: lst

    :param word2vec_model: pre-trained w2v model
    :type word2vec_model: .bin

    :param test_dir: our test file in either ES or RU
    :type test_dir: str

    :param output_dir: our output file for either ES or RU
    :type test_dir: str

    :Output: "dev.p3.out" by default to the directory given
    :rtype: .out file
    '''

    test_sentence_lst = []
    test_word_lst = []

    with open(test_dir, "r", encoding="utf8") as f:
        for line in f:
            if not line.strip():
                test_sentence_lst.append(test_word_lst)
                test_word_lst = []
            else:
                test_word_lst.append(line.replace("\n",""))
    # print(test_sentence_lst)
    test_vector_sentence_lst, test_vector_word_lst = [], []
    for sentence in test_sentence_lst:
        for word in sentence:
            test_vector_word_lst.append(word2vec_model[word])
        test_vector_sentence_lst.append(test_vector_word_lst)
        test_vector_word_lst = []
    test_sentence_lst = pad_sequences(test_vector_sentence_lst, padding='post',maxlen=maxlen,dtype=np.float32)
    # print(test_sentence_lst)
    y_pred = model.predict(test_sentence_lst)
    predictions = []
    for sentence in y_pred:
        for word in sentence:
            # print(word)
            predictions.append(np.argmax(word))
    i = 0
    # print(predictions)
    with open(output_dir,'w', encoding="utf-8") as f:
        with open(test_dir,'r',encoding="utf-8") as file:
            for line in file:
                if len(line.replace("\n","")) > 0:
                    word = line.replace("\n","")
                else:
                    f.write("\n")                    
                    continue
                predicted_y = label_lst[predictions[i]] # Convert argmax index to predicted name
                i += 1
                print(predicted_y, predictions[i], i)
                f.write(f"{word} {predicted_y}\n") # Write in our original word

In [102]:
predict_y(bilstm, true_label_lst, model, test_dir="data/ES/dev.in", output_dir="data/ES/dev.p4.out")

I-negative 5 1
I-negative 4 2
I-positive 4 3
I-positive 4 4
I-positive 4 5
I-positive 4 6
I-positive 4 7
I-positive 4 8
I-positive 4 9
I-positive 4 10
I-positive 4 11
I-positive 4 12
I-positive 4 13
I-positive 4 14
I-positive 4 15
I-positive 4 16
I-positive 4 17
I-positive 4 18
I-positive 4 19
I-positive 4 20
I-positive 4 21
I-positive 4 22
I-positive 4 23
I-positive 4 24
I-positive 4 25
I-positive 4 26
I-positive 4 27
I-positive 4 28
I-positive 4 29
I-positive 4 30
I-positive 4 31
I-positive 4 32
I-positive 4 33
I-positive 4 34
I-positive 4 35
I-positive 4 36
I-positive 4 37
I-positive 4 38
I-positive 4 39
I-positive 4 40
I-positive 4 41
I-positive 4 42
I-positive 4 43
I-positive 4 44
I-positive 4 45
I-positive 4 46
I-positive 4 47
I-positive 4 48
I-positive 4 49
I-positive 4 50
I-positive 4 51
I-positive 4 52
I-positive 4 53
I-positive 4 54
I-positive 4 55
I-positive 4 56
I-positive 4 57
I-positive 4 58
I-positive 4 59
I-positive 4 60
I-positive 4 61
I-positive 4 62
I-positive 4 63
I