In [111]:
import pandas as pd
import numpy as np
import spacy

# One row per nickname, alongside the real name it refers to
raw = pd.read_csv('cleaned.nicknames.csv')

# spaCy English pipeline (small model)
model = spacy.load('en_core_web_sm')

# A trailing period marks the end of every nickname so training sees
# an explicit end-of-name character
nicknames = [str(name) + '.' for name in raw['fake name']]
realnames = [str(name) for name in raw['real name']]

raw.head()

Unnamed: 0,fake name,real name,len fake,len real,category,notes,count
0,dumbo,randolph tex alles,1,3,domestic political figures,director of the united states secret service,1
1,wheres hunter,hunter biden,2,2,domestic political figures,american lawyer and lobbyist who is the second...,1
2,1% joe,joe biden,2,2,domestic political figures,47th vice president of the united states; form...,1
3,basement joe,joe biden,3,2,domestic political figures,47th vice president of the united states; form...,1
4,beijing joe,joe biden,3,2,domestic political figures,47th vice president of the united states; form...,1


In [112]:
# pos tagging
def nlp(name):
    """Tag every token of ``name`` with its coarse part-of-speech label.

    The string is run through the global ``model`` and the result is a dict
    that maps each token to its ``pos_`` attribute.

    NOTE: ``model`` is rebound to a Keras network in a later cell, so this
    function only works while ``model`` is still the spaCy pipeline.
    """
    doc = model(name)
    tags = {}
    for token in doc:
        tags[token] = token.pos_
    return tags

# POS-tag every nickname, then preview the first three results
test = list(map(nlp, raw['fake name']))

test[:3]

[{dumbo: 'PROPN'},
 {where: 'ADV', s: 'PROPN', hunter: 'VERB'},
 {1: 'NUM', %: 'NOUN', joe: 'PROPN'}]

In [113]:
# Sanity check: first three period-terminated nicknames next to their real names
print(nicknames[:3], realnames[:3])

['dumbo.', 'wheres hunter.', '1% joe.'] ['randolph tex alles', 'hunter biden', 'joe biden']


In [181]:
# vectorize names
##########################
# Every nickname must have exactly one real name to pair with
assert len(nicknames) == len(realnames), "nicknames and realnames must align"

allnames = nicknames + realnames

# longest name in characters; every sample is zero-padded to this many rows
max_chars = max(len(name) for name in allnames)
# total number of (nickname, real name) training pairs
n = len(nicknames)
# nickname -> real name lookup; a repeated nickname keeps its last real name
nick2real = dict(zip(nicknames, realnames))

# character -> index, numbered in order of first appearance across all names.
# The vocabulary is built straight from the data: the previous version
# enumerated a `dictionary` variable that no visible cell defines, which
# raises NameError on a fresh kernel.
char2i = {}
for name in allnames:
    for char in name:
        char2i.setdefault(char, len(char2i))
# index -> character (inverse of char2i)
i2char = {index: char for char, index in char2i.items()}
char_dimensions = len(i2char)

# one-hot nicknames, shape (n, max_chars, char_dimensions);
# despite the name, this array is what gets fed to the network
output = np.zeros((n, max_chars, char_dimensions))
# one-hot real names, same shape; these are the training targets
label = np.zeros((n, max_chars, char_dimensions))

# Fill both tensors: [sample, character position, vocabulary index] = 1.
# Rows are paired positionally (nicknames[i] <-> realnames[i]) rather than
# through nick2real, so duplicated nicknames keep their own real name.
for i, (nickname, realname) in enumerate(zip(nicknames, realnames)):
    for row, ch in enumerate(nickname):
        output[i, row, char2i[ch]] = 1

    for row, ch in enumerate(realname):
        label[i, row, char2i[ch]] = 1

In [182]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import LambdaCallback

# Character-level network: an LSTM that emits one vector per character
# position, followed by a softmax over the vocabulary at each position.
# NOTE: this rebinds `model`, which held the spaCy pipeline until now.
model = Sequential([
    LSTM(128, input_shape=(max_chars, char_dimensions), return_sequences=True),
    Dense(char_dimensions, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [359]:
def generate_name(model, limit, input):
    """Sample a ``limit``-character name from ``model`` and print it.

    Generation starts from an all-zero input tensor. At each position the
    model's distribution for that position is sampled, and the sampled
    character is one-hot written into the *next* row of the input before the
    following position is predicted. The last character is always ``'.'``.

    Parameters
    ----------
    model
        Object whose ``predict`` accepts an array of shape
        ``(1, max_chars, char_dimensions)`` and returns per-position
        probabilities over the vocabulary.
    limit : int
        Number of characters to produce, including the closing period.
        Must not exceed the global ``max_chars``.
    input : str
        Label printed in front of the generated text. It is only echoed;
        it is not fed to the network.

    Raises
    ------
    ValueError
        If ``limit`` is larger than ``max_chars`` (the sampled character
        could not be written into the fixed-size input tensor).
    """
    if limit > max_chars:
        raise ValueError(
            f'limit ({limit}) must not exceed max_chars ({max_chars})'
        )

    # running one-hot input; filled in one row at a time as characters are drawn
    word_vec = np.zeros((1, max_chars, char_dimensions))

    def predict(index):
        # The final position is always the terminator, so skip the forward pass
        if index == limit - 1:
            return '.'
        # distribution over the vocabulary for this character position
        probabilities = np.asarray(model.predict(word_vec)[0, index], dtype=np.float64)
        # renormalise in float64 so the probabilities sum to 1 for np.random.choice
        probabilities = probabilities / probabilities.sum()
        # draw a vocabulary index
        guess = np.random.choice(char_dimensions, p=probabilities)
        # feed the drawn character back in at the next position
        word_vec[0, index + 1, guess] = 1
        return i2char[guess]

    gen_name = ''.join([predict(i) for i in range(limit)])
    print(f'{input}: {gen_name}')

In [332]:
def generate_name_loop(epoch, _, every=1000):
    """Epoch-end hook that prints sample names every ``every`` epochs.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index passed in by the callback.
    _
        Second positional argument passed in by the callback; unused.
    every : int, default 1000
        Sampling period. The previous hard-coded period of 999 fires at
        epochs 0, 999, 1998, ... whereas the recorded training output shows
        samples at 0, 1000, 2000, ...
    """
    if epoch % every != 0:
        return

    print('Names generated after epoch %d:' % epoch)

    for name in ['alex kahanek', 'joe biden', 'barack obama']:
        generate_name(model, limit=13, input=name)

    print()
      
# Hook the sampler into training so progress can be inspected along the way
name_generator = LambdaCallback(on_epoch_end=generate_name_loop)

# `output` holds the one-hot nicknames (network input) and `label` the
# one-hot real names (targets); Keras logging is silenced with verbose=0
model.fit(
    output,
    label,
    batch_size=64,
    epochs=10000,
    callbacks=[name_generator],
    verbose=0,
)

# Persist the trained network so it can be reloaded without retraining
model.save("model.output")

Names generated after epoch 0:
alex kahanek: aee  azinnss.
joe biden: joe bidensee.
barack obama: beraiiie sns.

Names generated after epoch 1000:
alex kahanek: alt  oonnsss.
joe biden: joe bidensee.
barack obama: gerrii e sns.

Names generated after epoch 2000:
alex kahanek: aet  reensss.
joe biden: joe bidensee.
barack obama: jer iiiessns.

Names generated after epoch 3000:
alex kahanek: clt  oensiss.
joe biden: joe biden se.
barack obama: jerniiee sns.

Names generated after epoch 4000:
alex kahanek: al   orrssss.
joe biden: joe bidensbe.
barack obama: bor iiae sns.

Names generated after epoch 5000:
alex kahanek: al   oenesss.
joe biden: joe biden ee.
barack obama: jor iiiessns.

Names generated after epoch 6000:
alex kahanek: pe   rednnss.
joe biden: joe bidennke.
barack obama: jeriiiee snr.

Names generated after epoch 7000:
alex kahanek: be   odknnss.
joe biden: joe bidenzbd.
barack obama: jorriiee snn.

Names generated after epoch 8000:
alex kahanek: ce   oednsss.
joe biden: jo

In [333]:
from tensorflow import keras
# Restore the network written by model.save(...) in the training cell
saved_model_path = "model.output"
model = keras.models.load_model(saved_model_path)

In [414]:
# Draw a single 13-character sample; the string is only the printed label
generate_name(model, limit=13, input='alex kahanek')

alex kahanek: hnannyy  coo.
