<a href="https://colab.research.google.com/github/CaQtiml/DeepLearning_Practice/blob/main/Dinosaur_Name_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Start

In [None]:
# Check for GPU: list the GPU(s) attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla K80 (UUID: GPU-28452d29-fc7c-127f-174a-f73c5c0a1dde)


In [None]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
import string

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, LSTM, GRU, TimeDistributed, Bidirectional, Dense

In [None]:
# Read the whole corpus and lower-case it. The `with` block closes the file
# handle deterministically instead of leaving it open for the kernel's lifetime.
with open('dinos.txt', 'r') as corpus_file:
    data = corpus_file.read().lower()

# Distinct characters in the corpus (set iteration order is arbitrary).
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 19909 total characters and 27 unique characters in your data.


In [None]:
# Preview only the start of the corpus; displaying the full string floods the
# notebook with tens of thousands of characters.
data[:300]

'aachenosaurus\naardonyx\nabdallahsaurus\nabelisaurus\nabrictosaurus\nabrosaurus\nabydosaurus\nacanthopholis\nachelousaurus\nacheroraptor\nachillesaurus\nachillobator\nacristavus\nacrocanthosaurus\nacrotholus\nactiosaurus\nadamantisaurus\nadasaurus\nadelolophus\nadeopapposaurus\naegyptosaurus\naeolosaurus\naepisaurus\naepyornithomimus\naerosteon\naetonyxafromimus\nafrovenator\nagathaumas\naggiosaurus\nagilisaurus\nagnosphitys\nagrosaurus\nagujaceratops\nagustinia\nahshislepelta\nairakoraptor\najancingenia\najkaceratops\nalamosaurus\nalaskacephale\nalbalophosaurus\nalbertaceratops\nalbertadromeus\nalbertavenator\nalbertonykus\nalbertosaurus\nalbinykus\nalbisaurus\nalcovasaurus\nalectrosaurus\naletopelta\nalgoasaurus\nalioramus\naliwalia\nallosaurus\nalmas\nalnashetri\nalocodon\naltirhinus\naltispinax\nalvarezsaurus\nalwalkeria\nalxasaurus\namargasaurus\namargastegos\namargatitanis\namazonsaurus\nammosaurus\nampelosaurus\namphicoelias\namphicoelicaudia\namphisaurus\namtocephale\namtosaur

In [None]:
# Put the vocabulary into alphabetical order, then show it followed by its size.
chars = sorted(chars)
print(chars, len(chars), sep="\n")

['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
27


In [None]:
# Length of every newline-separated name, then the 98th percentile of those lengths.
len_name = np.array(list(map(len, data.split("\n"))))
np.percentile(np.sort(len_name), 98)

17.0

In [None]:
# One array entry per newline-separated name in the corpus.
names = np.array(data.split("\n"))

In [None]:
# Character-level tokenizer: every character of a name becomes one token.
tokenizer = Tokenizer(char_level=True)
# Build the character vocabulary from all names.
tokenizer.fit_on_texts(names)

In [None]:
# Inspect the fitted tokenizer: settings, id <-> character tables and per-character counts.
tokenizer.get_config()

{'char_level': True,
 'document_count': 1536,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'index_docs': '{"4": 1133, "5": 1225, "8": 712, "2": 1199, "1": 1374, "6": 830, "12": 479, "13": 482, "3": 1105, "15": 309, "21": 80, "17": 258, "10": 544, "18": 162, "7": 756, "9": 686, "11": 479, "20": 108, "16": 292, "14": 320, "25": 35, "23": 53, "19": 131, "24": 40, "22": 56, "26": 23}',
 'index_word': '{"1": "a", "2": "s", "3": "u", "4": "o", "5": "r", "6": "n", "7": "i", "8": "e", "9": "t", "10": "l", "11": "p", "12": "h", "13": "c", "14": "g", "15": "d", "16": "m", "17": "y", "18": "b", "19": "k", "20": "v", "21": "x", "22": "z", "23": "j", "24": "w", "25": "f", "26": "q"}',
 'lower': True,
 'num_words': None,
 'oov_token': None,
 'split': ' ',
 'word_counts': '{"a": 2487, "c": 539, "h": 548, "e": 913, "n": 1081, "o": 1710, "s": 2285, "u": 2123, "r": 1704, "d": 341, "y": 266, "x": 85, "b": 171, "l": 617, "i": 944, "t": 852, "p": 552, "v": 111, "m": 328, "g": 360, "f": 37, "j": 55,

In [None]:
# Lookup tables in both directions between characters and their token ids.
char_to_index = tokenizer.word_index
index_to_char = {idx: ch for ch, idx in char_to_index.items()}

In [None]:
# Round-trip demo: encode a name, then decode the ids that were just produced.
# Decoding the actual encoding (rather than hand-typed ids) keeps the second
# line in sync with whatever vocabulary the tokenizer was fitted on.
encoded_demo = tokenizer.texts_to_sequences(["aachenosaurus"])
print(encoded_demo)
print(tokenizer.sequences_to_texts(encoded_demo))

[[1, 1, 13, 12, 8, 6, 4, 2, 1, 3, 5, 3, 2]]
['s s d g l e n u s o r o u']


In [None]:
def name_to_seq(name):
    """Encode a single name as a list of character ids using the fitted tokenizer."""
    # The tokenizer works on batches; wrap the name and unpack the single result.
    (encoded,) = tokenizer.texts_to_sequences([name])
    return encoded

In [None]:
# Sanity check: encode a short string made of four vocabulary characters.
name_to_seq("asuo")

[1, 2, 3, 4]

In [None]:
def seq_to_name(seq):
    """Decode a sequence of character ids into a string, ignoring 0 (padding) entries."""
    decoded = []
    for idx in seq:
        if idx != 0:
            decoded.append(index_to_char[idx])
    return "".join(decoded)

In [None]:
# Sanity check: decode a list of character ids back into text.
seq_to_name([1, 1, 13, 12, 8, 6, 4, 2, 1, 3, 5, 3, 2])

'aachenosaurus'

In [None]:
max_id = len(tokenizer.word_index) # number of distinct characters in the vocabulary
dataset_size = tokenizer.document_count # number of names (documents) the tokenizer was fitted on, not characters
print(max_id,dataset_size)

26 1536


In [None]:
# Expand every name into all of its prefixes of length >= 2. After padding,
# the last id of each prefix serves as the target and the rest as the input.
names_augmented = []
for name in names:
    name_seq = name_to_seq(name)
    # The range is empty for sequences shorter than 2, so nothing is added for them.
    names_augmented.extend(name_seq[:end] for end in range(2, len(name_seq) + 1))

In [None]:
# Preview the first few prefixes only; the full list has one entry per
# (name, prefix length) pair and is far too long to display.
names_augmented[:10]

In [None]:
# Left-pad every prefix with zeros so all rows share the length of the longest one.
seq = pad_sequences(names_augmented, padding="pre")

In [None]:
# First ten padded rows: zeros on the left, the growing prefix on the right.
seq[0:10]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  1,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  1,  1, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  1,  1, 13, 12],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  1,  1, 13, 12,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  1,  1, 13, 12,  8,  6],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  1,  1, 13, 12,  8,  6,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  1,  1, 13, 12,  8,  6,  4,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1,  1, 13, 12,  8,  6,  4,  2,  1],
       [ 0,  0,  0,  0,  0,  0, 

In [None]:
# Inputs are every id except the last one in each padded row; the target is that last id.
X, y = seq[:, :-1], seq[:, -1]

In [None]:
# Confirm the input matrix and target vector have the same number of rows.
print(X.shape, y.shape)

(16838, 25) (16838,)


In [None]:
# Hold out 15% of the (prefix -> next character) examples for validation.
# `train_test_split` is already imported in the imports cell at the top.
# A fixed random_state makes the split, and everything trained on it,
# reproducible across Restart & Run All.
# NOTE(review): rows are split independently, so prefixes of the same name can
# end up in both sets; split by name first if a leak-free estimate is needed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, random_state=42
)

In [None]:
# Shapes of the training and held-out partitions.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(14312, 25) (14312,)
(2526, 25) (2526,)


In [None]:
# Show one training example: the decoded input prefix, then the character to predict.
print(seq_to_name(x_train[520]))
print(seq_to_name([y_train[520]]))

compsos
u


In [None]:
# Number of output classes: every vocabulary character plus id 0 used for padding.
num_chars = len(char_to_index) + 1
# Model input length: the number of columns in the padded input matrix.
max_len = x_train.shape[1]
print(num_chars, max_len)

27 25


# Model

In [None]:
# Next-character classifier: embedded character ids -> two stacked GRUs ->
# softmax over all character ids.
model = Sequential()
model.add(Embedding(num_chars, 8, input_length=max_len))
# Optional convolutional front-end, left disabled. "causal" padding means each
# output depends only on the current and earlier timesteps.
# model.add(Conv1D(64, 5, strides=1, activation="tanh", padding="causal"))
# model.add(MaxPool1D(2))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(GRU(64, return_sequences=True, dropout=0.2))
model.add(GRU(32, dropout=0.2))
model.add(Dense(num_chars, activation="softmax"))

# Targets are integer ids rather than one-hot vectors, hence the sparse loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 25, 8)             216       
                                                                 
 gru_12 (GRU)                (None, 25, 64)            14208     
                                                                 
 gru_13 (GRU)                (None, 32)                9408      
                                                                 
 dense_9 (Dense)             (None, 27)                891       
                                                                 
Total params: 24,723
Trainable params: 24,723
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Stop on the held-out loss rather than on training accuracy: training accuracy
# keeps improving while the model overfits, so it is a poor stopping signal when
# validation data is available. restore_best_weights rolls the model back to
# the best validation epoch instead of keeping the last (worse) one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

h = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the trained model.

    Args:
        text: Seed string; it is encoded with `name_to_seq` and left-padded
            (or truncated) to `max_len` ids.
        temperature: Positive sampling temperature. Lower values concentrate
            the choice on the most likely characters (more plausible names);
            higher values make the choice more random.

    Returns:
        A one-character string.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0, got %r" % (temperature,))

    seq = name_to_seq(text)
    padded = pad_sequences([seq], maxlen=max_len)
    # verbose=0: this is called once per generated character, so the default
    # progress bar would flood the cell output.
    y_proba = model.predict(padded, verbose=0)[0]

    # Id 0 is the padding id and has no character in the tokenizer vocabulary,
    # so it is excluded from sampling. The small epsilon avoids log(0) = -inf.
    rescaled_logits = tf.math.log(y_proba[1:] + 1e-9) / temperature
    # Shift by one to map the sampled position back to a real character id.
    char_id = tf.random.categorical([rescaled_logits], num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [None]:
def complete_text(text, n_chars=10, temperature=1):
    """Extend `text` by `n_chars` sampled characters and return the result.

    Each step feeds everything generated so far back into `next_char`, using
    the given sampling `temperature`.
    """
    generated = text
    for _step in range(n_chars):
        sampled = next_char(generated, temperature)
        generated = generated + sampled
    return generated

In [None]:
# Generate 9 more characters after the seed "ice" at temperature 0.75.
complete_text("ice",9,0.75)

'iceratopsaur'