<a href="https://colab.research.google.com/github/CaQtiml/Kaggle_Practice/blob/main/superhero.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the dataset repository into ./superhero; the names file is read from there below.
!git clone https://github.com/am1tyadav/superhero

Cloning into 'superhero'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 0), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


In [6]:
# Load the whole names file into a single string. The encoding is pinned so
# the read does not depend on the platform's locale default.
with open("superhero/superheroes.txt", encoding="utf-8") as f:
    data = f.read()

In [8]:
# Peek at the raw text: one name per line, each followed by a tab character.
data[:100]

'jumpa\t\ndoctor fate\t\nstarlight\t\nisildur\t\nlasher\t\nvarvara\t\nthe target\t\naxel\t\nbattra\t\nchangeling\t\npyrrh'

In [2]:
import tensorflow as tf

In [3]:
# Tokenizer used here as a character encoder.
# `filters` lists the punctuation to strip; the tab is not in it, so the "\t"
# that ends every name survives as a token.
# `split='\n'` makes the newline the token separator.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    split='\n',
)

In [7]:
# `data` is one string rather than a list of texts, so the fitted vocabulary
# ends up with one entry per character (see the mapping printed further down).
tokenizer.fit_on_texts(data)

In [9]:
# Forward lookup (character -> integer id) taken straight from the fitted
# tokenizer, and the reverse lookup (integer id -> character) derived from it.
char_to_index = tokenizer.word_index
index_to_char = {index: char for char, index in char_to_index.items()}

In [38]:
# Show both lookup tables; ids start at 1.
print(char_to_index)
print(index_to_char)

{'\t': 1, 'a': 2, 'e': 3, 'r': 4, 'o': 5, 'n': 6, 'i': 7, ' ': 8, 't': 9, 's': 10, 'l': 11, 'm': 12, 'h': 13, 'd': 14, 'c': 15, 'u': 16, 'g': 17, 'k': 18, 'b': 19, 'p': 20, 'y': 21, 'w': 22, 'f': 23, 'v': 24, 'j': 25, 'z': 26, 'x': 27, 'q': 28}
{1: '\t', 2: 'a', 3: 'e', 4: 'r', 5: 'o', 6: 'n', 7: 'i', 8: ' ', 9: 't', 10: 's', 11: 'l', 12: 'm', 13: 'h', 14: 'd', 15: 'c', 16: 'u', 17: 'g', 18: 'k', 19: 'b', 20: 'p', 21: 'y', 22: 'w', 23: 'f', 24: 'v', 25: 'j', 26: 'z', 27: 'x', 28: 'q'}


# Names and Sequences

In [11]:
# One entry per line of the file; each name keeps its trailing tab.
names = data.splitlines()
names[:10]

['jumpa\t',
 'doctor fate\t',
 'starlight\t',
 'isildur\t',
 'lasher\t',
 'varvara\t',
 'the target\t',
 'axel\t',
 'battra\t',
 'changeling\t']

In [12]:
# Given a bare string, the tokenizer handles every character as its own text,
# hence one single-element list per character in the result.
tokenizer.texts_to_sequences(names[0])

[[25], [16], [12], [20], [2], [1]]

In [16]:
def name_to_seq(name):
    """Encode a name as a list of integer character ids.

    Each character is lowercased and looked up in the fitted tokenizer's
    vocabulary. Characters the vocabulary does not contain (digits,
    filtered punctuation, newlines, ...) are skipped; the previous
    per-character ``texts_to_sequences(...)[0][0]`` lookup raised
    ``IndexError`` on them.

    Args:
        name: The text to encode.

    Returns:
        A list of ints, one per character found in the vocabulary.
    """
    vocab = tokenizer.word_index
    # Lowercase per character to mirror the tokenizer's own normalisation.
    lowered = (char.lower() for char in name)
    return [vocab[char] for char in lowered if char in vocab]

In [17]:
# Sanity check: encode the first name.
name_to_seq(names[0])

[25, 16, 12, 20, 2, 1]

In [20]:
def seq_to_name(seq):
    """Decode a sequence of character ids back into text.

    Zeros are treated as padding and dropped; every other id is looked up
    in ``index_to_char`` and the characters are concatenated in order.
    """
    chars = []
    for index in seq:
        if index == 0:
            continue
        chars.append(index_to_char[index])
    return "".join(chars)

In [21]:
# Round trip: encoding then decoding should give back the original name.
seq_to_name(name_to_seq(names[0]))

'jumpa\t'

# Creating Examples

In [22]:
# Training examples: for every encoded name, every prefix of length >= 2.
# The last id of each prefix is the character to predict from the ones
# before it. Names shorter than two characters contribute nothing because
# their range of prefix lengths is empty.
sequences = [
    encoded[:end]
    for encoded in map(name_to_seq, names)
    for end in range(2, len(encoded) + 1)
]

In [26]:
# Compare the first name, its encoding, and the first prefixes generated from it.
print(names[0])
print(name_to_seq(names[0]))
print(sequences[:4])

jumpa	
[25, 16, 12, 20, 2, 1]
[[25, 16], [25, 16, 12], [25, 16, 12, 20], [25, 16, 12, 20, 2]]


In [27]:
# Length of the longest prefix; every example is padded to this width.
max_len = max(map(len, sequences))

In [28]:
# Longest example length, counted in character ids.
print(max_len)

33


In [29]:
# Left-pad every prefix with zeros up to max_len so that the character to
# predict always sits in the last column.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=max_len, padding="pre"
)

In [32]:
# Inspect one padded example (zeros on the left) and the overall array shape.
print(padded_sequences[1])
print(padded_sequences.shape)

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 25 16 12]
(88279, 33)


# Training and Validation Set

In [33]:
# Features and labels for the whole dataset (the train/validation split happens later).
x = padded_sequences[:,:-1] # every column but the last: the characters seen so far
y = padded_sequences[:,-1] # last column: the next character the model must predict

In [34]:
# x has one column fewer than the padded array; y is one label per example.
print(x.shape,y.shape)

(88279, 32) (88279,)


In [35]:
from sklearn.model_selection import train_test_split

# Hold out the default 25% of the examples for validation. random_state is
# fixed so the split (and therefore the reported metrics) is reproducible
# across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=42)

In [37]:
# Sizes of the training and held-out portions.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(66209, 32) (66209,)
(22070, 32) (22070,)


In [39]:
# Number of distinct ids the model deals with: one per known character plus
# id 0, which is reserved for padding.
num_chars = 1 + len(char_to_index)
print(num_chars)

29


# Creating the Model

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, LSTM
from tensorflow.keras.layers import Bidirectional, Dense

In [41]:
# Next-character classifier: embeds the padded prefix, extracts local patterns
# with a causal convolution, summarises them with an LSTM and outputs one
# probability per character id.
model = Sequential([
                    # One 8-dimensional vector per character id; inputs are max_len-1 ids long.
                    Embedding(num_chars, 8, input_length = max_len-1),
                    # 64 filters of width 5; "causal" padding keeps the sequence length unchanged.
                    Conv1D(64,5, strides = 1, activation = "tanh", padding = "causal"),
                    # Pool size 2 halves the time dimension (32 -> 16 in the summary).
                    MaxPool1D(2),
                    LSTM(32),
                    # One softmax probability per id, padding id 0 included.
                    Dense(num_chars, activation="softmax")
])

model.compile(
    # Labels are integer ids rather than one-hot vectors, hence the sparse loss.
    loss = "sparse_categorical_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)

model.summary()

# "causal" means each output step depends only on the current and earlier inputs, never on later ones.

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 8)             232       
_________________________________________________________________
conv1d (Conv1D)              (None, 32, 64)            2624      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 16, 64)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 29)                957       
Total params: 16,229
Trainable params: 16,229
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Stop training once validation accuracy has gone 3 epochs without improving.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = "val_accuracy", patience = 3)

# Train for at most 50 epochs, validating on the held-out split after each one.
h = model.fit(
    x_train, y_train,
    epochs = 50,
    validation_data = (x_test, y_test),
    callbacks = [early_stopping],
    verbose = 2,
)

# Generate Name

In [70]:
# Single prediction step: encode a one-letter seed, left-pad it to the model's
# input length and look up the most probable next character.
seq = name_to_seq("M")
padded = tf.keras.preprocessing.sequence.pad_sequences(
    [seq], maxlen = max_len-1, padding = "pre", truncating = "pre"
)
pred = model.predict(padded)[0]
next_index = tf.argmax(pred).numpy()
print(next_index)
pred_char = index_to_char[next_index]

2


In [77]:
def generate_names(seed, max_chars=40):
    """Extend ``seed`` one character at a time with the trained model and print it.

    At every step the current text is encoded, left-padded/truncated to the
    model's input length, and the most probable next character is appended.
    Generation stops when the end-of-name tab is produced, when the model
    predicts the padding index, or after ``max_chars`` characters.

    Args:
        seed: Starting text of the name.
        max_chars: Upper bound on the number of characters appended
            (defaults to 40, the previously hard-coded limit).
    """
    for _ in range(max_chars):
        seq = name_to_seq(seed)
        padded = tf.keras.preprocessing.sequence.pad_sequences([seq], padding = "pre",
                                                               maxlen = max_len-1, truncating = "pre") # -1 as we want to predict the last character
        # verbose=0 keeps predict() from printing a progress bar on every step.
        pred = model.predict(padded, verbose=0)[0]
        pred_index = int(tf.argmax(pred).numpy())
        # Index 0 is the padding slot and has no character; the output layer
        # can still select it, which used to raise KeyError here.
        pred_char = index_to_char.get(pred_index)
        if pred_char is None:
            break
        seed += pred_char

        if pred_char == "\t":
            break
    print(seed)


In [79]:
# Generate a name starting with "f"; the printed text includes the terminating tab.
generate_names("f")

freeker	
