In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.python.client import device_lib
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow
import numpy as np

In [2]:
def readDatasets(path):
    f = open(path, 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    return texts

text_train = readDatasets('HerMajestySpeechesDataset/train.txt')
text_test = readDatasets('HerMajestySpeechesDataset/test.txt')
text_val = readDatasets('HerMajestySpeechesDataset/dev.txt')



In [3]:
tokenizer_train = Tokenizer(oov_token='<unk>', num_words = 2000)
tokenizer_train.fit_on_texts(text_train) 
texts2ids_train = tokenizer_train.texts_to_sequences(text_train)

texts2ids_test = tokenizer_train.texts_to_sequences(text_test)

texts2ids_val = tokenizer_train.texts_to_sequences(text_val)

In [4]:
np.max(np.concatenate(texts2ids_train))

1999.0

In [5]:
print(len(texts2ids_train[0]), len(texts2ids_test[0]), len(texts2ids_val[0]))

12 8 13


In [6]:
for i in texts2ids_train:
    i.append(2000)

for i in texts2ids_test:
    i.append(2000)

for i in texts2ids_val:
    i.append(2000)

In [7]:
print(len(texts2ids_train[0]), len(texts2ids_test[0]), len(texts2ids_val[0]))


13 9 14


In [8]:
def get_ngrams(frase, size=2):
    frase = np.concatenate((np.zeros(size), frase))

    ngrams_list = []
    for i in range(len(frase)-size):
        ngrams_list.append((tuple(frase[i:i+size]),frase[i+size]))
    return ngrams_list

all_ngrams = []
for i in texts2ids_train:
    all_ngrams += get_ngrams(i)


In [9]:
def co_table(lista_ocurrencias):
    table = {}
    for i in lista_ocurrencias:
        if i[0] in table:
            if i[1] in table[i[0]]:
                table[i[0]][i[1]] += 1
            else:
                table[i[0]][i[1]] = 1
        else:
            table[i[0]] = {}
            table[i[0]][i[1]] = 1
    return table

table = co_table(all_ngrams)

In [10]:
def generate_toyLM_ngram_a(table, context='aleatorio', n=15):
    if context == 'aleatorio':
        tmp = list(table.keys())
        context = tmp[np.random.randint(len(tmp))]

    cadena = list(context)
    for _ in range(n-len(context)):
        context = tuple(cadena[-len(context):])
        if context not in table.keys():
            break
        else:
            new = max(table[context], key=table[context].get)
            if new == 2000:
                break
            cadena.append(new)
    return tokenizer_train.sequences_to_texts([cadena])

generate_toyLM_ngram_a(table)

['glasgow when over seventy nations and territories are gathered here in the <unk> of the']

In [11]:
def perplexity_ngrams(frases):
    tmp = []
    for frase in frases:
        n_grams = get_ngrams(frase)
        perplexity = 1
        for context, following in n_grams:
            if context in table.keys():
                denominador = sum(table[context].values())
                if following in table[context].keys():
                    numerador = table[context][following]
                else:
                    numerador = 0
            else:
                denominador, numerador = 0, 0

            numerador += 1
            denominador += 2000

            perplexity *= 1/(numerador/denominador)
        tmp.append(perplexity**(1/len(n_grams)))

    return np.mean(tmp)

In [12]:
perplexity_ngrams(texts2ids_test)

992.1306822885472

In [13]:
def generate_toyLM_ngram_b(table, context='aleatorio', n=15):
    if context == 'aleatorio':
        tmp = list(table.keys())
        context = tmp[np.random.randint(len(tmp))]

    cadena = list(context)
    for i in range(n-len(context)):
        context = tuple(cadena[-len(context):])
        if context not in table.keys():
            break
        else:
            lista_tmp = list()
            for i in table[context]:
                for j in range(table[context][i]):
                    lista_tmp.append(i)
            new = np.random.randint(len(lista_tmp))
            if new == 2000:
                break
            cadena.append(lista_tmp[new])
    return tokenizer_train.sequences_to_texts([cadena])

generate_toyLM_ngram_b(table)


['in holyrood you have the faith to ask <unk>']

# toyLM_LSTM

In [14]:
def train_generate(text, max_seq_length=10):
    train_set = dict()
    for frase in text:
        for word_index in range(len(frase)):
            if word_index < max_seq_length:
                train_set[tuple(pad_sequences([frase[:word_index]], maxlen=max_seq_length)[0])] = frase[word_index]
    return train_set
    
train_set = train_generate(texts2ids_train)
test_set = train_generate(texts2ids_test)
val_set = train_generate(texts2ids_val)

In [15]:
def train_generate(text, size=2):
    x = []
    y = []
    for phrase in text:
        for context, following in get_ngrams(phrase,size):
            x.append(list(context))
            y.append(following)
    return np.array(x), to_categorical(np.array(y))

In [16]:
x_train, y_train = train_generate(texts2ids_train, 2)
x_test, y_test = train_generate(texts2ids_test, 2)
x_val, y_val = train_generate(texts2ids_val, 2)

print(x_val.shape, y_val.shape)

(12971, 2) (12971, 2001)


In [17]:
x_train = np.array(list(train_set.keys()))
x_train
# y_train = to_categorical(list(train_set.values()), num_classes=len(tokenizer_train.word_index))
#
# x_test = np.array(list(test_set.keys()))
# y_test = to_categorical(list(test_set.values()), num_classes=len(tokenizer_test.word_index))
#
# x_val = np.array(list(val_set.keys()))
# y_val = to_categorical(list(val_set.values()), num_classes=len(tokenizer_val.word_index))

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   2],
       [  0,   0,   0, ...,   0,   2, 181],
       ...,
       [  0,   0,   0, ...,   5,   2, 123],
       [  0,   0, 132, ...,   2, 123, 168],
       [  0, 132, 131, ..., 123, 168,   4]])

In [18]:
model = Sequential([
    Embedding(2001, 20),
    LSTM(64),
    Dense(2001, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train, batch_size=64, epochs=10)

ValueError: Data cardinality is ambiguous:
  x sizes: 16043
  y sizes: 55078
Make sure all arrays contain the same number of samples.

In [None]:
def perplexity_lstm(model, frases):
    tmp = []
    for frase in frases:
        n_grams = get_ngrams(frase)
        perplexity = 1
        for context, following in n_grams:
            prob = model.predict(np.array([context]))

            perplexity *= 1/prob[0][int(following)]
        tmp.append(perplexity**(1/len(n_grams)))

    return np.mean(tmp)

In [None]:
perplexity_lstm(model, texts2ids_test)

130.3956923574683

In [23]:
def generate_toyLM_lstm_a(model, context='aleatorio', n=15):
    if context=='aleatorio':
        context = list(np.random.randint(0, 5616,2))

    cadena = context
    for _ in range(n-len(context)):
        context = cadena[-len(context):]
        new = np.argmax(model.predict(np.array([context])))
        if new == 5615:
            break
        cadena.append(new)
    return tokenizer_train.sequences_to_texts([cadena])

generate_toyLM_lstm_a(model)

['line <unk> tenth legislation constant officers answer fashion october fifth fifth wider fifth already far']

In [None]:
def generate_toyLM_lstm_b(model, context='aleatorio', n=15):
    if context=='aleatorio':
        context = list(np.random.randint(0, 2001,2))

    cadena = context
    for _ in range(n-len(context)):
        context = cadena[-len(context):]
        probs = model.predict(np.array([context]))
        new = np.random.choice(range(2001), p=probs[0])
        if new == 5615:
            break
        cadena.append(new)
    return tokenizer_train.sequences_to_texts([cadena])

generate_toyLM_lstm_b(model)


['given memory in the commonwealth and views that in glasgow africa is our symbol of']