In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.python.client import device_lib
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow
import numpy as np

In [54]:
def readDatasets(path):
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per line of the file; line terminators are
        kept, exactly as ``readlines()`` returns them.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when reading fails midway.
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()

# Load the three dataset splits from disk (train / test / dev files).
text_train = readDatasets('HerMajestySpeechesDataset/train.txt')
text_test = readDatasets('HerMajestySpeechesDataset/test.txt')
text_val = readDatasets('HerMajestySpeechesDataset/dev.txt')

In [56]:
# The vocabulary is learned from the training split only. The test and dev
# splits are then encoded with that same vocabulary, so their ids agree with
# the ones the models are trained on; words never seen in training map to
# the '<unk>' token.
tokenizer_train = Tokenizer(oov_token='<unk>')
tokenizer_train.fit_on_texts(text_train)

# The original fitted three separate tokenizers on the same training text,
# which yields three identical vocabularies. The names are kept as aliases
# so that later cells referencing them keep working.
tokenizer_test = tokenizer_train
tokenizer_val = tokenizer_train

# Each split is converted from its OWN text (previously all three were
# built from text_train, so "test" and "val" were copies of the train ids).
texts2ids_train = tokenizer_train.texts_to_sequences(text_train)
texts2ids_test = tokenizer_train.texts_to_sequences(text_test)
texts2ids_val = tokenizer_train.texts_to_sequences(text_val)

# Round trip of the training ids back to text (sanity check of the mapping).
ids2texts = tokenizer_train.sequences_to_texts(texts2ids_train)

In [52]:
def get_ngrams(frase, size=2):
    """Split a token-id sequence into (context, next token) pairs.

    The sequence is first prefixed with ``size`` zeros so that the very
    first tokens also get a full-length context.

    Args:
        frase: List of token ids for one sentence.
        size: Number of tokens in each context tuple.

    Returns:
        A list of ``(context, target)`` pairs, where ``context`` is a tuple
        of ``size`` ids and ``target`` is the id that follows it.
    """
    padded = frase
    if size > 0:
        # Same result as prepending a single 0, `size` times.
        padded = [0] * size + frase

    n_pairs = len(padded) - size
    return [
        (tuple(padded[start:start + size]), padded[start + size])
        for start in range(n_pairs)
    ]

# Gather the (context, next token) pairs of every training sentence into
# one flat list.
all_ngrams = []
for ids_frase in texts2ids_train:
    all_ngrams.extend(get_ngrams(ids_frase))

In [7]:
def co_table(lista_ocurrencias):
    """Build a nested count table from (context, target) occurrences.

    Args:
        lista_ocurrencias: Iterable of items whose element 0 is a hashable
            context and whose element 1 is the hashable value observed
            after that context.

    Returns:
        A dict mapping each context to a dict ``{target: count}`` with the
        number of times that target was seen after the context.
    """
    conteos = {}
    for ocurrencia in lista_ocurrencias:
        contexto, siguiente = ocurrencia[0], ocurrencia[1]
        # Create the inner dict the first time a context shows up.
        fila = conteos.setdefault(contexto, {})
        fila[siguiente] = fila.get(siguiente, 0) + 1
    return conteos

# Count table over all collected n-grams: context -> {next token: count}.
table = co_table(all_ngrams)

In [9]:
def generate_toyLM_ngram_a(table, context='aleatorio', n=15, tokenizer=None):
    """Generate text greedily from an n-gram count table.

    Starting from ``context``, the most frequent successor of the current
    context is appended repeatedly. Generation stops once ``n`` ids have
    been produced or the current context is not present in ``table``.

    Args:
        table: Dict mapping a context tuple to a dict ``{next id: count}``.
        context: Tuple of ids to start from, or the string ``'aleatorio'``
            to pick one of the table's contexts at random.
        n: Maximum total number of ids in the result, including the
            starting context.
        tokenizer: Object with a ``sequences_to_texts`` method used to turn
            the ids into text. When omitted, the notebook-level
            ``tokenizer_train`` is used.

    Returns:
        Whatever ``tokenizer.sequences_to_texts`` returns for the single
        generated id sequence.

    Raises:
        ValueError: If a random context is requested from an empty table.
    """
    if tokenizer is None:
        # Fall back to the global tokenizer so existing calls are unaffected.
        tokenizer = tokenizer_train

    if context == 'aleatorio':
        contextos = list(table)
        if not contextos:
            raise ValueError('cannot pick a random context from an empty table')
        context = contextos[np.random.randint(len(contextos))]

    cadena = list(context)
    context_size = len(context)
    for _ in range(n - context_size):
        # Slice from an explicit start index: `cadena[-0:]` would return the
        # whole list and break tables built with empty (size 0) contexts.
        context = tuple(cadena[len(cadena) - context_size:])
        if context not in table:
            break
        sucesores = table[context]
        cadena.append(max(sucesores, key=sucesores.get))
    return tokenizer.sequences_to_texts([cadena])

# Greedy generation with the default arguments (random start, n=15).
generate_toyLM_ngram_a(table)


In [11]:
def generate_toyLM_ngram_b(table, context='aleatorio', n=15, tokenizer=None):
    """Generate text by sampling successors from an n-gram count table.

    Starting from ``context``, the next id is drawn at random with
    probability proportional to its count after the current context.
    Generation stops once ``n`` ids have been produced or the current
    context is not present in ``table``.

    Args:
        table: Dict mapping a context tuple to a dict ``{next id: count}``.
        context: Tuple of ids to start from, or the string ``'aleatorio'``
            to pick one of the table's contexts at random.
        n: Maximum total number of ids in the result, including the
            starting context.
        tokenizer: Object with a ``sequences_to_texts`` method used to turn
            the ids into text. When omitted, the notebook-level
            ``tokenizer_train`` is used.

    Returns:
        Whatever ``tokenizer.sequences_to_texts`` returns for the single
        generated id sequence.

    Raises:
        ValueError: If a random context is requested from an empty table.
    """
    if tokenizer is None:
        # Fall back to the global tokenizer so existing calls are unaffected.
        tokenizer = tokenizer_train

    if context == 'aleatorio':
        contextos = list(table)
        if not contextos:
            raise ValueError('cannot pick a random context from an empty table')
        context = contextos[np.random.randint(len(contextos))]

    cadena = list(context)
    context_size = len(context)
    for _ in range(n - context_size):
        # Slice from an explicit start index: `cadena[-0:]` would return the
        # whole list and break tables built with empty (size 0) contexts.
        context = tuple(cadena[len(cadena) - context_size:])
        if context not in table:
            break
        # Repeat every successor as many times as it was observed, so a
        # uniform draw over this list is a count-weighted draw.
        candidatos = [
            sucesor
            for sucesor, veces in table[context].items()
            for _rep in range(veces)
        ]
        cadena.append(candidatos[np.random.randint(len(candidatos))])
    return tokenizer.sequences_to_texts([cadena])

# Sampled generation starting from an explicit two-id context.
generate_toyLM_ngram_b(table, (424, 1166))

# toyLM_LSTM

In [57]:
def train_generate(text, max_seq_length=10):
    """Turn id sequences into (padded prefix -> next id) examples.

    For each sentence, every position below ``max_seq_length`` yields one
    example: the ids before that position, passed through ``pad_sequences``
    with ``maxlen=max_seq_length``, keyed to the id at that position.

    Args:
        text: Iterable of sentences, each a list of token ids.
        max_seq_length: Length the prefixes are padded to, and the number
            of leading positions used per sentence.

    Returns:
        A dict mapping the padded prefix (as a tuple) to the next id. Since
        prefixes are dict keys, a prefix seen more than once keeps only the
        last target assigned to it.
    """
    ejemplos = {}
    for frase in text:
        n_prefijos = min(len(frase), max_seq_length)
        for corte in range(n_prefijos):
            prefijo = pad_sequences([frase[:corte]], maxlen=max_seq_length)[0]
            ejemplos[tuple(prefijo)] = frase[corte]
    return ejemplos
    
# Build the prefix -> next-id examples for each split.
train_set = train_generate(texts2ids_train)
test_set = train_generate(texts2ids_test)
val_set = train_generate(texts2ids_val)

In [58]:
x_train = np.array(list(train_set.keys()))
y_train = to_categorical(list(train_set.values()), num_classes=len(tokenizer_train.word_index))

x_test = np.array(list(test_set.keys()))
y_test = to_categorical(list(test_set.values()), num_classes=len(tokenizer_test.word_index))

x_val = np.array(list(val_set.keys()))
y_val = to_categorical(list(val_set.values()), num_classes=len(tokenizer_val.word_index))

In [59]:
model = Sequential([
    Embedding(5614, 20, input_length=10),
    LSTM(64),
    Dense(5614, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [60]:
import keras.backend as K

def perplexity(y_true, y_pred):
    """Per-sample perplexity of predicted distributions.

    Args:
        y_true: One-hot (or probability) targets, one row per sample.
        y_pred: Predicted probabilities with the same shape as ``y_true``.

    Returns:
        A tensor with one perplexity value per sample, computed as
        ``exp(cross_entropy)``.
    """
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    # The backend cross-entropy uses the natural logarithm (nats), so the
    # matching exponent base is e; raising 2 to it under-reports perplexity.
    return K.exp(cross_entropy)

In [68]:
y_pred = model.predict(x_test)

# perplexity() yields one value per sample. Perplexity over a whole set is
# the exponential of the MEAN cross-entropy, which equals the geometric mean
# of the per-sample values; an arithmetic mean would let a few badly
# predicted samples dominate the figure.
per_sample_ppl = np.asarray(perplexity(y_test, y_pred))
np.exp(np.mean(np.log(per_sample_ppl)))