Recurrent Neural Network

In [None]:
%pip install keras tensorflow

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, Input, TextVectorization, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Toy corpus: three hand-written sequences over the symbols A/B/C, repeated
# 1000 times so there are enough examples to train on.
corpus = [
    'A A A A B A A B A A B B C A A B',
    'A A B B B C A A B A A B B B A B B C A',
    'A A A B A B B C A A',
] * 1000

# Sizes shared by the tokenizer, the padding step and the model.
vocab_size = 4
max_len = 20

# The tokenizer is fitted on the whole corpus (train and dev together).
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(corpus)

# First 4/5 of the corpus is used for training, the remainder for validation.
# Each part is converted to integer ids and padded/truncated to max_len.
split_point = (len(corpus) * 4) // 5
train_set, dev_set = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)
    for texts in (corpus[:split_point], corpus[split_point:])
)

def create_model(max_len, vocab_size, embedding_dim, hidden_dim):
    """Build, compile and summarise a per-time-step token classifier.

    The network is Input -> Embedding -> SimpleRNN -> TimeDistributed(Dense)
    and is compiled with the Adam optimizer and sparse categorical
    cross-entropy loss.

    Args:
        max_len: Padded sequence length. The model itself consumes
            ``max_len - 1`` token ids per example.
        vocab_size: Number of rows in the embedding table and number of
            softmax outputs produced at every time step.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Number of units in the SimpleRNN layer.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    model = Sequential()
    stack = [
        # One int32 token id per time step.
        Input(shape=(max_len-1,), dtype='int32'),
        # mask_zero=True: id 0 is treated as padding by downstream layers.
        Embedding(vocab_size, embedding_dim, mask_zero=True),
        # return_sequences=True keeps an output for every time step.
        SimpleRNN(hidden_dim, return_sequences=True),
        # Same softmax classifier applied independently at each time step.
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ]
    for layer in stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    model.summary()
    return model

# Toy-corpus model: 100-dimensional embeddings, 128 recurrent units.
model = create_model(
    max_len=max_len,
    vocab_size=vocab_size,
    embedding_dim=100,
    hidden_dim=128,
)

In [ ]:
# Inputs are every position except the last; targets are the same sequences
# shifted one step left, with a trailing axis of size 1 added by `None`.
history = model.fit(
    x=train_set[:, :-1],
    y=train_set[:, 1:, None],
    validation_data=(dev_set[:, :-1], dev_set[:, 1:, None]),
    epochs=4,
    batch_size=64,
)

In [ ]:
import matplotlib.pyplot as plt
def plot_history(history):
    """Plot the training and validation loss curves of a fit run.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            ``'loss'`` and ``'val_loss'`` series (one value per epoch), such
            as the value returned by ``model.fit`` with validation data.
    """
    plt.style.use('ggplot')
    # Plot order matters: the legend labels below are matched by position.
    for series in ('loss', 'val_loss'):
        plt.plot(history.history[series])
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['train', 'dev'], loc='best')
    plt.show()
# Draw the train/dev loss curves recorded in `history`.
plot_history(history)

In [ ]:
# Inspect the first integer-encoded, padded training sequence.
train_set[0]

In [ ]:
# Encode a short prompt the same way as the training inputs
# (integer ids, padded to max_len - 1 positions).
prompt = 'B B'
prompt_idx = pad_sequences(
    tokenizer.texts_to_sequences([prompt]),
    maxlen=max_len - 1,
)
# Softmax output at the last time step of the single batch item.
model.predict(prompt_idx)[0][-1]

In [ ]:
# Show the padded token ids that were fed to the model for the prompt.
prompt_idx

In [ ]:
# Show the tokenizer's lookup from integer token id back to the word.
tokenizer.index_word

In [ ]:
import random
from numpy.random import choice

def generate(prompt, tokenizer, predictor_model, end_token=None):
    """Extend ``prompt`` by one token sampled from the model's prediction.

    The prompt is tokenized, padded to ``max_len - 1`` positions (``max_len``
    is the module-level padding length used for training), and passed to the
    model. The next token is sampled from the probabilities predicted at the
    last time step.

    Args:
        prompt: Space-separated tokens to continue.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``,
            ``word_index`` and ``index_word``.
        predictor_model: Model whose ``predict`` returns per-time-step
            probabilities over token ids.
        end_token: Optional stop word. If it is sampled, the prompt is
            returned unchanged.

    Returns:
        ``prompt`` with the sampled word appended after a space, or the
        unchanged ``prompt`` if padding id 0 or ``end_token`` was sampled.

    Raises:
        KeyError: If ``end_token`` is given but is not in the tokenizer's
            vocabulary.
    """
    prompt_idx = tokenizer.texts_to_sequences([prompt])
    prompt_idx = pad_sequences(prompt_idx, maxlen=max_len-1)

    # Row 0 is the only batch item; -1 selects the final time step.
    probs = predictor_model.predict(prompt_idx)[0][-1]
    # Cast to float64 and renormalise so that float32 rounding in the softmax
    # cannot trip numpy's "probabilities do not sum to 1" check.
    probs = probs.astype('float64')
    probs /= probs.sum()

    # Fixed: the original wrote `len[probs]`, which raised TypeError.
    word_index = int(choice(a=len(probs), p=probs))

    # Id 0 is the padding id and has no entry in index_word, so stop there.
    if word_index == 0:
        return prompt
    if end_token is not None and word_index == tokenizer.word_index[end_token]:
        return prompt

    next_word = tokenizer.index_word[word_index]
    return prompt + ' ' + next_word

In [ ]:
# Try the sampler on the toy model with the prompt 'B B'.
generate('B B', tokenizer, model)

Using actual data, generate mixed English and Spanish names
https://github.com/smashew/NameDatabases

In [ ]:
# Read one name per line. Each name becomes a space-separated sequence of its
# lower-cased characters followed by the stop token 'end'.
# The `with` block closes the file (the original left the handle open), and
# blank lines are skipped so they do not become examples holding only 'end'.
with open('names.txt') as names_file:
    names = [
        ' '.join(line.lower().strip()) + ' end'
        for line in names_file
        if line.strip()
    ]

import random
random.shuffle(names)

# 90% of the shuffled names are used for training, the rest for validation.
split_point = len(names) * 9//10
train_set = names[:split_point]
dev_set = names[split_point:]

# NOTE(review): 33 is assumed to cover every distinct character plus 'end'
# and padding id 0 -- compare against the printed word_index to confirm.
vocab_size = 33
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(names)
print(tokenizer.word_index)
train_set = tokenizer.texts_to_sequences(train_set)
dev_set = tokenizer.texts_to_sequences(dev_set)

max_len = 20 # sequence length to pad the output to
train_set = pad_sequences(train_set, maxlen=max_len)
dev_set = pad_sequences(dev_set, maxlen=max_len)

In [ ]:
# Character-level name model: 40-dimensional embeddings, 128 recurrent units.
new_model = create_model(
    max_len=max_len,
    vocab_size=vocab_size,
    embedding_dim=40,
    hidden_dim=128,
)

In [ ]:
# Inputs are every position except the last; targets are the same sequences
# shifted one step left, with a trailing axis of size 1 added by `None`.
history = new_model.fit(
    x=train_set[:, :-1],
    y=train_set[:, 1:, None],
    validation_data=(dev_set[:, :-1], dev_set[:, 1:, None]),
    epochs=5,
    batch_size=64,
)

In [ ]:
# Draw the train/dev loss curves recorded in `history`.
plot_history(history)

In [ ]:
# Sample a continuation of the prompt 's t', using 'end' as the stop token.
new_name = generate('s t', tokenizer, new_model, end_token='end')
# Capitalise the string, then remove the spaces that separate the characters.
new_name.capitalize().replace(' ','')