<a href="https://colab.research.google.com/github/Codehackerone/nlp-pytorch/blob/main/Text%20Generator%20with%20character-level%20LSTM/Pytorch_name_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import os
import requests

# Back4App Parse endpoint for the `Complete_List_Names` class; only the `Name` key is requested.
url = 'https://parseapi.back4app.com/classes/Complete_List_Names?limit=100000000000&keys=Name'

# Credentials can be supplied through the environment so they do not have to be edited in the notebook.
# NOTE(review): the fallbacks are the keys committed with this notebook -- confirm they are meant
# to be public; if not, rotate them and drop the defaults.
headers = {
    'X-Parse-Application-Id': os.environ.get(
        'BACK4APP_APPLICATION_ID', 'zsSkPsDYTc2hmphLjjs9hz2Q3EXmnSxUyXnouj1I'),
    'X-Parse-Master-Key': os.environ.get(
        'BACK4APP_MASTER_KEY', '4LuCXgPPXXO2sU5cXm6WwpwzaKyZpo3Wpj4G4xXK'),
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP failure directly instead of a KeyError on 'results' below.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()

data = json.loads(response.content.decode('utf-8'))
names = [i['Name'] for i in data['results']]

# One name per line; this file is the training corpus read later in the notebook.
with open('names.txt', 'w') as f:
    f.write('\n'.join(names))

In [3]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 29.9 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4


In [7]:
# Imports

import torch
import torch.nn as nn
import string
import random
import sys
import unidecode
from torch.utils.tensorboard import SummaryWriter

# Device configuration: use CUDA when torch reports it as available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Vocabulary: the characters of string.printable; a character's id is its index in this string
all_characters = string.printable
n_characters = len(all_characters)

# Read the training corpus and pass it through unidecode.
# The context manager closes the file handle as soon as the text has been read
# (the bare open(...).read() left it open until garbage collection).
with open('./names.txt') as names_file:
    file = unidecode.unidecode(names_file.read())


# Model
class RNN(nn.Module):
    """Embedding -> stacked LSTM -> linear layer, stepped one token at a time.

    Args:
        input_size: number of rows in the embedding table (distinct input ids).
        hidden_size: width of the embedding and of the LSTM hidden/cell state.
        num_layers: number of stacked LSTM layers.
        output_size: number of scores produced per step by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Advance the network by one time step.

        Args:
            x: 1-D tensor of ids, one per batch element.
            hidden: LSTM hidden state, as returned by `init_hidden` or a previous call.
            cell: LSTM cell state, same layout as `hidden`.

        Returns:
            `(out, (hidden, cell))` where `out` has one row of `output_size` scores
            per batch element and `(hidden, cell)` is the updated LSTM state.
        """
        out = self.embed(x)
        # unsqueeze(1) inserts the length-1 sequence axis expected with batch_first=True
        out, (hidden, cell) = self.lstm(out.unsqueeze(1), (hidden, cell))
        # Collapse the length-1 sequence axis again before the linear layer
        out = self.fc(out.reshape(out.shape[0], -1))
        return out, (hidden, cell)

    def init_hidden(self, batch_size):
        """Return zero-filled `(hidden, cell)` states of shape (num_layers, batch_size, hidden_size).

        The states are allocated directly on the device that holds this module's
        parameters, so they always match the weights wherever the module was moved.
        """
        state_device = next(self.parameters()).device
        shape = (self.num_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=state_device)
        cell = torch.zeros(shape, device=state_device)
        return hidden, cell


class Generator():
    """Trains the character-level RNN on the corpus and samples new text from it.

    The class reads these module-level names defined elsewhere in the notebook:
    `file` (the corpus string), `all_characters` / `n_characters` (the vocabulary),
    `device`, `RNN` and `SummaryWriter`.
    """

    def __init__(self):
        self.chunk_len = 250     # characters per training sequence
        self.num_epochs = 5000   # training iterations; each one uses a single random batch
        self.batch_size = 1
        self.print_every = 50    # print the loss and a generated sample every N iterations
        self.hidden_size = 256
        self.num_layers = 2
        self.lr = 0.003

    # Take a string and map every char to its index in all_characters
    def char_tensor(self, string):
        """Encode `string` as a 1-D long tensor of indices into `all_characters`.

        Raises:
            ValueError: if a character is not part of `all_characters`.
        """
        return torch.tensor([all_characters.index(ch) for ch in string], dtype=torch.long)

    def get_random_batch(self):
        """Sample `batch_size` random windows of the corpus.

        Returns:
            `(text_input, text_target)`, two long tensors of shape
            (batch_size, chunk_len); the target is the input shifted by one character.

        Raises:
            ValueError: if the corpus is shorter than `chunk_len + 1` characters.
        """
        # Each sample needs chunk_len inputs plus one extra character for the last target.
        window = self.chunk_len + 1
        # random.randint is inclusive on both ends, so the last valid start is len - window;
        # using len - chunk_len could yield a window that is one character short.
        last_start = len(file) - window
        if last_start < 0:
            raise ValueError(
                f'corpus has {len(file)} characters but chunk_len={self.chunk_len} '
                f'needs at least {window}'
            )

        text_input = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)
        text_target = torch.zeros(self.batch_size, self.chunk_len, dtype=torch.long)

        # Draw an independent window for every row instead of repeating one chunk
        for i in range(self.batch_size):
            start_idx = random.randint(0, last_start)
            encoded = self.char_tensor(file[start_idx:start_idx + window])
            text_input[i, :] = encoded[:-1]
            text_target[i, :] = encoded[1:]

        return text_input, text_target

    def generate(self, initial_str='A', prediction_len=100, temperature=0.85):
        """Sample `prediction_len` characters that continue `initial_str`.

        Args:
            initial_str: non-empty seed text; it is included at the start of the result.
            prediction_len: number of characters to sample.
            temperature: positive scale applied to the scores before sampling;
                lower values make the sampling more conservative.

        Returns:
            `initial_str` followed by the sampled characters.
        """
        initial_input = self.char_tensor(initial_str)
        predicted = initial_str

        # Sampling needs no gradients, so skip building the autograd graph
        with torch.no_grad():
            # Characters are fed one at a time, so the state is for a batch of one
            # regardless of the batch size used for training.
            hidden, cell = self.rnn.init_hidden(batch_size=1)

            # Warm up the state on all seed characters but the last
            for p in range(len(initial_str) - 1):
                _, (hidden, cell) = self.rnn(initial_input[p].view(1).to(device), hidden, cell)

            last_char = initial_input[-1]
            # Actual predictions
            for _ in range(prediction_len):
                output, (hidden, cell) = self.rnn(last_char.view(1).to(device), hidden, cell)
                # softmax gives the same distribution as exp() followed by normalisation,
                # without overflowing on large scores
                output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
                # Sample from the distribution rather than taking the most likely character
                top_char = torch.multinomial(output_dist, 1)[0].item()
                predicted_char = all_characters[top_char]
                predicted += predicted_char
                last_char = self.char_tensor(predicted_char)

        return predicted

    def train(self):
        """Create a fresh RNN in `self.rnn` and train it for `num_epochs` iterations.

        Every iteration draws one random batch, sums the cross-entropy over the
        `chunk_len` steps and applies an Adam update. The per-character loss is
        logged to TensorBoard under `runs/names0`, and every `print_every`
        iterations the loss and a generated sample are printed.
        """
        self.rnn = RNN(n_characters, self.hidden_size, self.num_layers, n_characters).to(device)
        optimizer = torch.optim.Adam(self.rnn.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        writer = SummaryWriter('runs/names0')  # for tensorboard

        print("=> Start training")

        try:
            for epoch in range(1, self.num_epochs + 1):
                inp, target = self.get_random_batch()
                inp = inp.to(device)
                target = target.to(device)
                hidden, cell = self.rnn.init_hidden(batch_size=self.batch_size)

                optimizer.zero_grad()
                loss = 0

                # Step through the sequence one character at a time, accumulating the loss
                for c in range(self.chunk_len):
                    output, (hidden, cell) = self.rnn(inp[:, c], hidden, cell)
                    loss += criterion(output, target[:, c])

                loss.backward()
                optimizer.step()
                avg_loss = loss.item() / self.chunk_len

                if epoch % self.print_every == 0:
                    print(f'Loss: {avg_loss}')
                    print(self.generate())

                writer.add_scalar('Training Loss', avg_loss, global_step=epoch)
        finally:
            # Flush and close the event file even when training is interrupted
            writer.close()

# Seed the RNGs used by training so that runs are repeatable (as far as the backend allows):
# `random` picks the training windows, torch initialises the weights and samples characters.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

gennames = Generator()
gennames.train()

=> Start training
Loss: 2.271778076171875
AFgerd
Karla
Wuverne
Mucin
Dcannne
Evela
Elene
Jornne
Leynnmieten
Mlalin
Ssalin
Haole
Claiy
Wala
Ella
Loss: 2.401146240234375
Aucies
Cildel
Jomeg
irea
Albiy
Joache
Hereclre
Fhelle
Kaorfta
Brex
Tandie
Noici
Jana
Jirrel
Rilcie
De
Loss: 2.134099853515625
Allo
Aan
Jadey
Saney
Jacand
Lilly
Vincen
Adiste
Ollieth
Relerr
Lua
Naca
Jonega
Bery
Geloyt
Toyrer
Shi
Loss: 2.059052734375
Adie
Ceze
Ronne
Damdall
Karo
Andaurey
Wangie
Lene
Amorela
Adrie
Samina
Carner
Dela
Jeason
Oris
Juline
Loss: 1.8622960205078125
Aluse
Emcyl
Matuy
Hoenall
Tince
Laret
Emlynd
Danoly
Frianne
Willes
Orart
Riliette
alla
Syralina
Wicti
Loss: 1.7917508544921874
Azine
Charla
Beverl
Danna
Emirie
Juffub
Olleyne
LaLey
Lidia
Mider
Riginer
Bendabge
Dadie
Sherty
Pelev
Loss: 2.088212890625
Aberne
Clille
Becel
Eu
Berid
Miney
Vade
Saunie
Pelle
Chirlina
Rosey
Kirgara
Erneg
Marian
Daria
Morgia
Loss: 2.1648720703125
Auman
Alanne
Edy
Ellinie
Jowis
Lema
BaKgura
Chod
Benta
Maline
Charlene
Camonie
Ele

KeyboardInterrupt: ignored