In [None]:
import requests
# Download the name list as plain text. A github.com "/blob/" address serves
# the HTML file-viewer page, not the file itself, so the raw-content host is
# used instead.
url = 'https://raw.githubusercontent.com/am1tyadav/superhero/master/superheroes.txt'
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of tokenizing an error page.
response.raise_for_status()
data = response.text
print(data[:100])

In [None]:
import tensorflow as tf

In [121]:
# Build the tokenizer whose vocabulary is fitted in the next cell.
# The filter string lists punctuation only and contains no tab character;
# generate_names further down treats '\t' as its stop character.
# Tokens are separated on newlines rather than on spaces.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    split='\n',
)

In [122]:
# Fit the vocabulary on the downloaded text. NOTE(review): `data` is a single
# str, not a list of texts; presumably the tokenizer then walks it one
# character at a time, which would explain why word_index is used below as a
# character -> index map. TODO confirm against the Tokenizer documentation.
tokenizer.fit_on_texts(data)

In [None]:
# Forward lookup (token -> integer id) is taken straight from the tokenizer;
# the reverse lookup (integer id -> token) is its inversion.
char_to_index = tokenizer.word_index
index_to_char = {idx: ch for ch, idx in char_to_index.items()}
print(index_to_char)

In [None]:
# One list entry per line of the downloaded text; preview the first ten.
names = data.splitlines()
names[:10]

In [None]:
# Show what the tokenizer returns when handed the first name as a bare str.
tokenizer.texts_to_sequences(names[0])

In [126]:
def name_to_seq(name):
  """Encode a name as a list of integer character ids.

  Args:
    name: The string to encode.

  Returns:
    A list holding one id for every character that the fitted ``tokenizer``
    produces a token for. Characters that yield no token (for example
    filtered punctuation, or symbols absent from the fitted vocabulary) are
    skipped instead of raising ``IndexError``.
  """
  # Handing over the str itself lets the tokenizer treat each character as
  # its own text, so a single call yields one (possibly empty) id list per
  # character instead of one tokenizer call per character.
  per_char = tokenizer.texts_to_sequences(name)
  return [ids[0] for ids in per_char if ids]

In [None]:
# Sanity check: integer encoding of the first name.
name_to_seq(names[0])

In [128]:
def seq_to_name(seq):
  """Decode a sequence of character ids back into a string.

  Id 0 is skipped (presumably the padding value); every other id is looked
  up in ``index_to_char`` and the resulting characters are concatenated.
  """
  chars = []
  for idx in seq:
    if idx == 0:
      continue
    chars.append(index_to_char[idx])
  return ''.join(chars)

In [None]:
# Round trip: encode the first name, then decode it again.
seq_to_name(name_to_seq(names[0]))

In [130]:
# Expand every encoded name into all of its prefixes of length two or more;
# the x/y split further down uses the last id of each prefix as the target.
sequences = []
for name in names:
  seq = name_to_seq(name)
  # range() is empty when fewer than two ids exist, so such names add nothing.
  sequences.extend(seq[:end] for end in range(2, len(seq) + 1))

In [None]:
# Preview the first ten prefix sequences.
sequences[:10]

In [None]:
# Length of the longest prefix; every sequence is padded to this length below.
max_len = max(map(len, sequences))
print(max_len)

In [None]:
# Pad at the front so every prefix has length max_len and the real ids sit
# at the end of each row.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
)
print(padded_sequences[0])

In [None]:
# Dimensions of the padded array.
padded_sequences.shape

In [None]:
# The final column is the id to predict; all earlier columns are the input.
x = padded_sequences[:, :-1]
y = padded_sequences[:, -1]
print(x.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Fix the shuffle seed so the same rows land in the training and validation
# sets on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

In [None]:
# One class per vocabulary entry, plus one extra slot for id 0 (the id that
# seq_to_name skips; presumably padding).
num_chars = len(char_to_index) + 1
print(num_chars)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, LSTM
from tensorflow.keras.layers import Bidirectional, Dense

# num_chars and max_len are the values computed from the data in the cells
# above. They are deliberately not re-assigned to literals here, so the
# embedding size, input width and output width always match x_train/y_train.

# Next-character classifier: the max_len - 1 input ids are embedded, passed
# through a causal 1-D convolution and pooling, summarised by an LSTM, and
# mapped to one softmax score per character id.
model = Sequential([
    Embedding(num_chars, 8, input_length=max_len - 1, input_shape=(max_len - 1,)),
    Conv1D(64, 5, strides=1, activation='tanh', padding='causal'),
    MaxPool1D(2),
    LSTM(32),
    Dense(num_chars, activation='softmax')
])

# Sparse loss because y holds integer ids rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()

In [None]:
# Train for at most 50 epochs; the callback ends training early once
# validation accuracy has gone three epochs without improving.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)
h = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    verbose=2,
    callbacks=[early_stopping],
)

In [None]:
from matplotlib import pyplot as plt
# Draw validation and training accuracy per epoch on the same axes.
epochs_ran = len(h.history['loss'])
epoch_axis = range(epochs_ran)
for key, label in (('val_accuracy', 'Validation'), ('accuracy', 'Training')):
  plt.plot(epoch_axis, h.history[key], label=label)
plt.legend()
plt.show()

In [145]:
def generate_names(seed, max_chars=40):
  """Extend ``seed`` one predicted character at a time and print the result.

  Args:
    seed: Starting text of the name.
    max_chars: Upper bound on the number of characters appended. Defaults to
      40.

  Generation stops early when the model predicts the tab character, or when
  it predicts an id that has no entry in ``index_to_char`` (such as id 0),
  which would otherwise raise ``KeyError``.

  Returns:
    None. The generated text is printed.
  """
  for _ in range(max_chars):
    seq = name_to_seq(seed)
    # Pad or truncate at the front to max_len-1 ids, the width the model
    # expects, keeping the most recent characters.
    padded = tf.keras.preprocessing.sequence.pad_sequences([seq], padding='pre', maxlen=max_len-1, truncating='pre')
    pred = model.predict(padded)[0]
    pred_index = int(tf.argmax(pred).numpy())
    pred_char = index_to_char.get(pred_index)

    if pred_char is None:
      # No character exists for this id, so there is nothing to append.
      break

    seed += pred_char

    if pred_char == '\t':
      break

  print(seed)

In [None]:
# Generate one name for each starting letter.
for seed in ('b', 'c'):
  generate_names(seed)