# Generation of dinosaur names

## 1. Statistical generation

The task:
* Implement a function that collects ngrams from the corpus. The length of the ngrams should be one of the parameters.
* Implement random sampling for picking the next letter based on the previous context.
* Generate new dino species. :)

In [1]:
import random
from collections import Counter, defaultdict

In [2]:
# Boundary markers wrapped around every name: START is prepended, END is appended
START = "^"
END = "$"

In [3]:
# Read one name per line, trim surrounding whitespace and wrap it in the
# START/END markers
with open("dinosaur-names.txt", "r") as names_file:
    dino_names = [START + raw_line.strip() + END for raw_line in names_file]

In [4]:
# Peek at the first ten wrapped names
dino_names[:10]

['^aachenosaurus$',
 '^aardonyx$',
 '^abdallahsaurus$',
 '^abelisaurus$',
 '^abrictosaurus$',
 '^abrosaurus$',
 '^abydosaurus$',
 '^acanthopholis$',
 '^achelousaurus$',
 '^acheroraptor$']

In [5]:
# Find most frequent ngrams just for fun :)

def collect_ngrams(corpus, n):
    """Count every run of ``n`` consecutive symbols found in ``corpus``.

    Args:
        corpus: iterable of sequences (for example strings).
        n: length of the n-grams to count.

    Returns:
        A ``Counter`` mapping each n-gram, stored as a tuple of symbols,
        to the number of times it occurs across the whole corpus.
    """
    counts = Counter()
    for entry in corpus:
        symbols = tuple(entry)
        last_start = len(symbols) - n
        # Slide a window of width n over the entry, one position at a time
        counts.update(symbols[pos:pos + n] for pos in range(last_start + 1))
    return counts

In [6]:
# Count all 5-grams in the corpus and show the ten most frequent ones
ngrams = collect_ngrams(dino_names, 5)
ngrams.most_common(10)

[(('u', 'r', 'u', 's', '$'), 721),
 (('a', 'u', 'r', 'u', 's'), 719),
 (('s', 'a', 'u', 'r', 'u'), 717),
 (('o', 's', 'a', 'u', 'r'), 443),
 (('a', 's', 'a', 'u', 'r'), 105),
 (('n', 'o', 's', 'a', 'u'), 88),
 (('i', 's', 'a', 'u', 'r'), 72),
 (('c', 'e', 'r', 'a', 't'), 72),
 (('e', 'r', 'a', 't', 'o'), 72),
 (('r', 'o', 's', 'a', 'u'), 68)]

In [7]:
# Compute next letter frequencies based on the collected ngrams

def collect_ngram_dist(corpus, n):
    """Count which symbol follows each (n-1)-symbol context in ``corpus``.

    Args:
        corpus: iterable of strings.
        n: n-gram order; the first ``n - 1`` symbols of every n-gram form
            the context and the last symbol is the continuation.

    Returns:
        A ``defaultdict(dict)`` mapping each context string to a plain dict
        of ``{next symbol: number of occurrences}``.

    Raises:
        ValueError: if ``n`` is smaller than 1, because no context and
            continuation can be sliced out of such an n-gram.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    ngram_dist = defaultdict(dict)
    for line in corpus:
        symbols = list(line)
        for i in range(len(symbols) - n + 1):
            key = "".join(symbols[i:i + n - 1])
            val = symbols[i + n - 1]
            followers = ngram_dist[key]
            # .get() replaces the former bare try/except, which could also
            # hide errors unrelated to a missing key
            followers[val] = followers.get(val, 0) + 1
    return ngram_dist

In [8]:
# Build trigram statistics and inspect the first context with its follower counts
ngram_freqs = collect_ngram_dist(dino_names, 3)
list(ngram_freqs.items())[0]

('^a',
 {'a': 2,
  'b': 5,
  'c': 9,
  'd': 4,
  'e': 6,
  'f': 1,
  'g': 7,
  'h': 1,
  'i': 1,
  'j': 2,
  'l': 25,
  'm': 13,
  'n': 26,
  'o': 2,
  'p': 4,
  'q': 1,
  'r': 23,
  's': 8,
  't': 6,
  'u': 12,
  'v': 7,
  'z': 1})

In [9]:
# Compute next letter probabilities based on the collected ngrams

def freqs_to_probs(ngram_freqs):
    """Turn per-context follower counts into relative frequencies.

    Args:
        ngram_freqs: mapping of context -> ``{symbol: count}``.

    Returns:
        A ``defaultdict(dict)`` with the same contexts, where every count
        has been divided by the sum of the counts of its context.
    """
    ngram_probs = defaultdict(dict)
    for context, counts in ngram_freqs.items():
        denominator = sum(counts.values())
        ngram_probs[context] = {
            symbol: count / denominator for symbol, count in counts.items()
        }
    return ngram_probs

In [10]:
# Normalise the trigram counts and inspect the first context's distribution
ngram_probs = freqs_to_probs(ngram_freqs)
list(ngram_probs.items())[0]

('^a',
 {'a': 0.012048192771084338,
  'b': 0.030120481927710843,
  'c': 0.05421686746987952,
  'd': 0.024096385542168676,
  'e': 0.03614457831325301,
  'f': 0.006024096385542169,
  'g': 0.04216867469879518,
  'h': 0.006024096385542169,
  'i': 0.006024096385542169,
  'j': 0.012048192771084338,
  'l': 0.15060240963855423,
  'm': 0.0783132530120482,
  'n': 0.1566265060240964,
  'o': 0.012048192771084338,
  'p': 0.024096385542168676,
  'q': 0.006024096385542169,
  'r': 0.13855421686746988,
  's': 0.04819277108433735,
  't': 0.03614457831325301,
  'u': 0.07228915662650602,
  'v': 0.04216867469879518,
  'z': 0.006024096385542169})

In [11]:
# Pick a random element based on the collected ngrams and their probabilities

def generate_letter(context, ngram_probs):
    """Sample the letter that follows ``context``.

    Args:
        context: string of preceding characters used as the lookup key.
        ngram_probs: mapping of context -> ``{letter: weight}``.

    Returns:
        One letter drawn at random in proportion to its weight, or ``END``
        when ``context`` has no recorded continuations.
    """
    # .get() does not insert an empty entry into a defaultdict on a miss
    options = ngram_probs.get(context)
    if not options:
        return END
    letters = list(options)
    weights = [options[letter] for letter in letters]
    # random.choices scales the draw by the actual total weight, so a float
    # sum slightly below 1 can no longer fall through and return None
    return random.choices(letters, weights=weights)[0]

In [12]:
# Draw ten independent continuations for the context 'ab'
[generate_letter('ab', ngram_probs) for i in range(10)]

['u', 'e', 'l', 'a', 'e', 'u', 'd', 'e', 'r', 'o']

In [13]:
# Generate a new dino species

def generate_dino(start, ngram_probs, n, end_sym):
    """Grow a name letter by letter until ``end_sym`` is sampled.

    Args:
        start: initial string; it is also the context for the first letter.
        ngram_probs: mapping of context -> ``{letter: probability}``.
        n: n-gram order; after the first letter, the last ``n - 1``
            characters of the name are used as context.
        end_sym: symbol that terminates generation.

    Returns:
        ``start`` followed by every sampled letter, ``end_sym`` included.
    """
    context_len = n - 1
    dino_name = start
    context = start
    while True:
        next_letter = generate_letter(context, ngram_probs)
        dino_name += next_letter
        if next_letter == end_sym:
            return dino_name
        # dino_name[-0:] would be the whole name, so an empty context
        # (n == 1) has to be handled explicitly
        context = dino_name[-context_len:] if context_len > 0 else ""

In [14]:
N = 4
ngram_freqs = collect_ngram_dist(dino_names, N)
ngram_probs = freqs_to_probs(ngram_freqs)
# Name-initial contexts are the ones that begin with the START marker
possible_begs = [ctx for ctx in ngram_probs if ctx.startswith(START)]

random.shuffle(possible_begs)

for start in possible_begs[:30]:
    generated = generate_dino(start, ngram_probs, N, END)
    # Drop the leading START and trailing END before printing
    print(generated[1:-1])

europhosaurus
nyorosaurus
fosaurus
wendromeus
breuillosaurus
velocosaurus
padros
unaashetris
elorosaurus
colong
koreadnouchus
oohkotasaurus
spinosaurus
itemodromeykosaurus
shansutodraciliubanjaffia
walker
glisaurus
clepisabeipiaosaurus
microsaurus
nanusor
dromimus
bellynashosaurus
utabatitanyasus
hippodromeus
jobathus
abrosaurus
don
airasaurus
augus
uintarchasillus


## 2. Neural generation

The task:
* Prepare the training data: input is the given context and output is the character that has to be predicted.
* Map characters to integers and do one-hot encoding.
* Train the model.
* Call the model to predict the next character and implement random sampling.
* Generate new dino species. :)

References: https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

In [15]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.preprocessing.sequence import pad_sequences
from numpy import array

Using TensorFlow backend.


In [16]:
# Prepare the training data

N = 5
# Slide a window of N characters over every name: the window is the input,
# the character right after it is the target
X_str = [
    name[stop - N:stop]
    for name in dino_names
    for stop in range(N, len(name))
]
y_str = [
    name[stop]
    for name in dino_names
    for stop in range(N, len(name))
]

# Show the first twenty (context, target) pairs
for context, target in zip(X_str[:20], y_str[:20]):
    print(context, target)

^aach e
aache n
achen o
cheno s
henos a
enosa u
nosau r
osaur u
sauru s
aurus $
^aard o
aardo n
ardon y
rdony x
donyx $
^abda l
abdal l
bdall a
dalla h
allah s


In [17]:
# Map characters to integers

# Every distinct character in the corpus, plus those of the boundary markers
alphabet = {char for word in dino_names + [START, END] for char in word}
chars = sorted(alphabet)
# Each character's index is its position in the sorted alphabet
mapping = {char: index for index, char in enumerate(chars)}

print(mapping)

{'$': 0, '^': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}


In [18]:
# Replace every character by its integer index
X_int = [[mapping[char] for char in context] for context in X_str]
# Each target stays wrapped in its own one-element list
y_int = [[mapping[char] for char in target] for target in y_str]

# Show the first twenty encoded (context, target) pairs
for encoded_context, encoded_target in zip(X_int[:20], y_int[:20]):
    print(encoded_context, encoded_target)

[1, 2, 2, 4, 9] [6]
[2, 2, 4, 9, 6] [15]
[2, 4, 9, 6, 15] [16]
[4, 9, 6, 15, 16] [20]
[9, 6, 15, 16, 20] [2]
[6, 15, 16, 20, 2] [22]
[15, 16, 20, 2, 22] [19]
[16, 20, 2, 22, 19] [22]
[20, 2, 22, 19, 22] [20]
[2, 22, 19, 22, 20] [0]
[1, 2, 2, 19, 5] [16]
[2, 2, 19, 5, 16] [15]
[2, 19, 5, 16, 15] [26]
[19, 5, 16, 15, 26] [25]
[5, 16, 15, 26, 25] [0]
[1, 2, 3, 5, 2] [13]
[2, 3, 5, 2, 13] [13]
[3, 5, 2, 13, 13] [2]
[5, 2, 13, 13, 2] [9]
[2, 13, 13, 2, 9] [20]


In [19]:
# Do one-hot encoding

# The number of classes equals the number of distinct characters in the mapping
vocab_size = len(mapping)
print("The vocab size is", vocab_size)

# Encode each context separately, giving one one-hot row per context character
X = array([to_categorical(x, num_classes=vocab_size) for x in X_int])
# Encode all targets at once, giving one one-hot vector per target character
y = to_categorical(y_int, num_classes=vocab_size)

# Show the first encoded context and its encoded target
print(X[0])
print(y[0])

The vocab size is 28
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0.]]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [20]:
# Build the model

model = Sequential()
# X is our input; adding one LSTM hidden layer with 100 memory cells
# input_shape is read from X: (characters per context, vocab size)
model.add(LSTM(100, input_shape=(X.shape[1], X.shape[2])))
# adding a fully connected output layer that outputs one vector
# with a probability distribution across all characters
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               51600     
_________________________________________________________________
dense_1 (Dense)              (None, 28)                2828      
Total params: 54,428
Trainable params: 54,428
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Fit the model

# Categorical cross-entropy pairs with the one-hot targets and the softmax output
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train on all (context, target) pairs for 100 epochs; verbose=2 logs one line per epoch
model.fit(X, y, epochs=100, verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 3s - loss: 2.1971 - accuracy: 0.3729
Epoch 2/100
 - 4s - loss: 1.6802 - accuracy: 0.5192
Epoch 3/100
 - 3s - loss: 1.5389 - accuracy: 0.5570
Epoch 4/100
 - 3s - loss: 1.4559 - accuracy: 0.5856
Epoch 5/100
 - 3s - loss: 1.3982 - accuracy: 0.5981
Epoch 6/100
 - 3s - loss: 1.3498 - accuracy: 0.6119
Epoch 7/100
 - 3s - loss: 1.3057 - accuracy: 0.6263
Epoch 8/100
 - 3s - loss: 1.2682 - accuracy: 0.6356
Epoch 9/100
 - 3s - loss: 1.2350 - accuracy: 0.6456
Epoch 10/100
 - 3s - loss: 1.2017 - accuracy: 0.6585
Epoch 11/100
 - 3s - loss: 1.1693 - accuracy: 0.6643
Epoch 12/100
 - 3s - loss: 1.1402 - accuracy: 0.6744
Epoch 13/100
 - 3s - loss: 1.1104 - accuracy: 0.6812
Epoch 14/100
 - 3s - loss: 1.0817 - accuracy: 0.6879
Epoch 15/100
 - 2s - loss: 1.0531 - accuracy: 0.6917
Epoch 16/100
 - 2s - loss: 1.0248 - accuracy: 0.7038
Epoch 17/100
 - 3s - loss: 0.9981 - accuracy: 0.7110
Epoch 18/100
 - 2s - loss: 0.9694 - accuracy: 0.7157
Epoch 1

<keras.callbacks.callbacks.History at 0x13315de10>

In [22]:
# model.save('model.h5')
# dump(mapping, open('mapping.pkl', 'wb'))

In [23]:
# Reverse lookup table: integer index -> character
mapping_inv = {index: char for char, index in mapping.items()}
print(mapping_inv)

{0: '$', 1: '^', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z'}


In [24]:
def generate_letter_nn(start_str, model, mapping, mapping_inv):
    """Sample the next character from the model's prediction for ``start_str``.

    Args:
        start_str: context string; every character must be a key of
            ``mapping``.
        model: object whose ``predict`` returns, for a batch holding one
            one-hot encoded context, one row of per-class weights.
        mapping: character -> integer index.
        mapping_inv: integer index -> character.

    Returns:
        The sampled character, drawn in proportion to the predicted weights.
    """
    # encode the context as a batch holding a single one-hot sequence
    start_int = [mapping[char] for char in start_str]
    start = to_categorical(start_int, num_classes=len(mapping))
    start = start.reshape(1, start.shape[0], start.shape[1])
    # predict the next-letter distribution
    options = model.predict(start, verbose=0)[0]
    weights = [float(p) for p in options]
    # random.choices scales the draw by the actual total, so predicted
    # weights that sum to slightly under 1 can no longer fall through
    # the sampling step and return None
    index = random.choices(range(len(weights)), weights=weights)[0]
    return mapping_inv[index]

In [25]:
# Sample a single continuation for the context "^acri"
generate_letter_nn("^acri", model, mapping, mapping_inv)

's'

In [26]:
# Sample ten independent continuations for the same context
[generate_letter_nn('^acri', model, mapping, mapping_inv) for i in range(10)]

['s', 's', 't', 's', 'o', 's', 's', 's', 's', 's']

In [27]:
# Generate a new dino species

def generate_dino_nn(start, model, n, end_sym, mapping, mapping_inv):
    """Extend ``start`` one sampled character at a time until ``end_sym``.

    Args:
        start: initial string; it is also the context for the first letter.
        model: passed through to ``generate_letter_nn``.
        n: number of trailing characters used as context after the first
            letter.
        end_sym: symbol that terminates generation.
        mapping: character -> integer index.
        mapping_inv: integer index -> character.

    Returns:
        ``start`` followed by every sampled letter, ``end_sym`` included.
    """
    dino_name = start
    context = start
    while True:
        letter = generate_letter_nn(context, model, mapping, mapping_inv)
        dino_name += letter
        if letter == end_sym:
            return dino_name
        # The next prediction sees only the last n characters
        context = dino_name[-n:]

In [28]:
# Generate one full name (markers included) from the seed "^acri"
generate_dino_nn("^acri", model, 5, END, mapping, mapping_inv)

'^acristatusaurus$'

In [30]:
# Seed generation with the first N characters of real names, where N is the
# context length used to build the training pairs. Names no longer than N are
# skipped: their prefix would be shorter than N or already contain END, and
# they produced no training pairs either.
possible_begs = [name[:N] for name in dino_names if len(name) > N]
random.shuffle(possible_begs)

for start in possible_begs[:30]:
    # Drop the leading START and trailing END before printing
    print(generate_dino_nn(start, model, N, END, mapping, mapping_inv)[1:-1])

rhinoceratops
indosaurus
ruehleia
magnapaulicheirosaurus
eoceratophoneus
chingkankousaurus
sinotyrannus
velocipes
daptasaurus
saichania
niobrarasaurus
owenodon
poekolophus
pukyongosaurus
deuterosaurus
elachiosaurus
griphosaurus
boreonykus
salimosaurus
actiosaurus
nemegtosaurus
velocimamosaurus
texasetes
paleosaurus
stegos
balauiuscur
acheroraptor
triassolestes
zuoyunlong
ozraptor
