In [1]:
import numpy as np
import torch
from torch import nn
from torch import optim

# prepare data

In [2]:
# build the vocabulary tables: position 0 is the space character,
# positions 1..26 are the lowercase letters a..z
import string

idx2char = [" ", *string.ascii_lowercase]
char2idx = dict(zip(idx2char, range(len(idx2char))))


def get_max_seq_length(filename="data/names/English.txt"):
    """
    return the length of the longest line in a text file, measured after
    lower-casing the line and stripping surrounding whitespace
    :param filename: path of the text file to scan
    :return: longest normalised line length, 0 if the file has no lines
    """
    with open(filename, "r") as handle:
        lengths = [len(raw.lower().strip()) for raw in handle.readlines()]
    return max(lengths, default=0)


def prepare_rnn_data(filename="data/names/English.txt", padding=True) -> list:
    """
    read a text file line by line and encode every line as a list of
    character indices using the module-level ``char2idx`` table

    Each line is lower-cased and stripped first. Lines containing a character
    that is missing from ``char2idx`` are skipped.
    :param filename: path of the text file to read
    :param padding: when True, right-pad every line with spaces up to the
        length reported by ``get_max_seq_length`` so all rows have equal length
    :return: list of index lists, one per encodable line
    """
    # the longest line is only needed for padding; skip the extra pass otherwise
    max_len = get_max_seq_length(filename) if padding else 0

    x_data = []
    with open(filename, "r") as fp:
        for line in fp:
            line = line.lower().strip()
            if padding:
                # pad strings with extra spaces to make them of equal length
                line = line.ljust(max_len)
            try:
                line_idx = [char2idx[ch] for ch in line]
            except KeyError:
                # character outside the vocabulary: drop this line, but let any
                # other kind of error surface instead of hiding it
                continue
            x_data.append(line_idx)
    return x_data


def idx2onehot(idx: int, num_classes: int = 27) -> list:
    """
    return one hot encoding for a given index
    :param idx: position to set to 1, must satisfy 0 <= idx < num_classes
    :param num_classes: length of the returned vector (default 27)
    :return: list of ``num_classes`` ints, all 0 except a 1 at ``idx``
    :raises IndexError: if ``idx`` is outside ``[0, num_classes)``
    """
    if not 0 <= idx < num_classes:
        # a negative idx would otherwise wrap around and silently set the wrong slot
        raise IndexError(f"index {idx} out of range for {num_classes} classes")
    one_hot = [0] * num_classes
    one_hot[idx] = 1
    return one_hot


def convert_data_to_onehot(x_data: list) -> list:
    """
    map every index in a nested list to its one hot vector
    :param x_data: list of sequences, each a list of integer indices
    :return: list with the same nesting, every index replaced by ``idx2onehot(index)``
    """
    return [[idx2onehot(index) for index in sequence] for sequence in x_data]


rnn_data = prepare_rnn_data(filename="data/names/English.txt")

# inputs: leave out the last character as we will be predicting the next character
x_data = [seq[:-1] for seq in rnn_data]
x_one_hot = convert_data_to_onehot(x_data)

# targets: leave out the first character, so y_data[i][t] is the character
# that follows x_data[i][t]
y_data = [seq[1:] for seq in rnn_data]
assert len(x_one_hot) == len(y_data)

# one-hot inputs as a float tensor, target character indices as int64
inputs = torch.tensor(x_one_hot, dtype=torch.float32)
y_data = torch.tensor(y_data, dtype=torch.long)
# y_data is already an int64 tensor; no second conversion is needed
labels = y_data


# hyperparameters

In [3]:
# hyperparameters
# longest normalised line in the names file, as computed by get_max_seq_length
seq_len = get_max_seq_length(filename="data/names/English.txt")

input_size = 27  # one-hot size: space + 26 lowercase letters
batch_size = 5  # NOTE(review): not read by the training cell, which feeds all of `inputs` in one forward call
num_layers = 1  # number of recurrent layers, passed to the model constructor
num_classes = 27  # number of distinct characters the model can predict
hidden_size = 128  # size of the LSTM hidden state

# model architecture

In [4]:
class LSTM(nn.Module):
    """
    Character-level sequence model: an LSTM followed by a linear layer,
    i.e. a fully-connected layer, applied to every time step.
    """

    def __init__(self, seq_len, num_classes, input_size, hidden_size, num_layers):
        """
        :param seq_len: nominal sequence length; stored only, ``forward``
            reads the actual length from its input
        :param num_classes: number of outputs of the linear layer
        :param input_size: size of each input vector
        :param hidden_size: size of the LSTM hidden state
        :param num_layers: number of stacked LSTM layers
        """
        super().__init__()
        self.seq_len = seq_len
        self.num_layers = num_layers
        self.input_size = input_size
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        # batch_first = true means forward will take input of shape (batch_size, seq_len, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)

        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """
        :param x: tensor of shape (batch_size, seq_len, input_size)
        :return: logits of shape (batch_size * seq_len, num_classes), one row
            per time step
        """
        # assuming batch_first = True for RNN cells
        batch_size, seq_len = x.size(0), x.size(1)
        hidden = self._init_hidden(batch_size)
        lstm_out, (h_n, c_n) = self.lstm(x, hidden)
        # shape checks use the instance's own sizes, not notebook globals, so
        # the model works for any hyperparameters it was constructed with
        assert lstm_out.shape == (batch_size, seq_len, self.hidden_size)
        assert h_n.shape == (self.num_layers, batch_size, self.hidden_size)
        # flatten batch and time so the linear layer scores every time step
        linear_out = self.linear(lstm_out.contiguous().view(-1, self.hidden_size))
        assert linear_out.shape == (batch_size * seq_len, self.num_classes)
        return linear_out

    def _init_hidden(self, batch_size):
        """
        Initialize hidden and cell states with zeros, assuming
        batch_first = True for RNN cells
        :param batch_size: number of sequences in the batch
        :return: tuple ``(h_0, c_0)``, each of shape
            (num_layers, batch_size, hidden_size)
        """
        # new_zeros copies dtype and device from an existing parameter
        reference = self.linear.weight
        shape = (self.num_layers, batch_size, self.hidden_size)
        return reference.new_zeros(shape), reference.new_zeros(shape)

In [6]:
# Set loss, optimizer and the LSTM model
torch.manual_seed(777)
lstm_model = LSTM(seq_len, num_classes, input_size, hidden_size, num_layers=num_layers)
print('network architecture:\n', lstm_model)

# train the model
num_epochs = 500

# This criterion computes the cross entropy loss between input logits and target.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.025)

# the model returns logits flattened to (batch * seq_len, num_classes), so the
# targets are flattened once, outside the loop, to line up row by row
targets = labels.view(-1).long()

lstm_model.train()
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()
    # full-batch training: each epoch is one forward/backward pass over all of `inputs`
    outputs = lstm_model(inputs)
    assert outputs.shape[0] == targets.shape[0]
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print('epoch: {}, loss: {:1.3f}'.format(epoch, loss.item()))
        # decode the current prediction only when reporting, not on every epoch;
        # max gives the maximum value and its corresponding index, we only
        # need the index
        _, idx = outputs.max(dim=1)
        idx = idx.detach().numpy()
        result_str = [idx2char[c] for c in idx]

network architecture:
 LSTM(
  (lstm): LSTM(27, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=27, bias=True)
)
epoch: 100, loss: 0.841
epoch: 200, loss: 0.549
epoch: 300, loss: 0.439
epoch: 400, loss: 0.412
epoch: 500, loss: 0.404


In [7]:
# save pytorch model
# only the state dict (the parameter tensors) is written, not the module
# object; the hyperparameters are encoded in the file name
model_path = f"lm_lstm_model_{seq_len}_{num_classes}_{input_size}_{hidden_size}.pth"
torch.save(lstm_model.state_dict(), model_path)

In [8]:
# Load the model from the file
model_path = f"lm_lstm_model_{seq_len}_{num_classes}_{input_size}_{hidden_size}.pth"
# the file holds only a state dict, so first rebuild the architecture with the
# same hyperparameters that were used for saving
loaded_model = LSTM(seq_len, num_classes, input_size, hidden_size,
                   num_layers=num_layers)
loaded_model.eval()  # Set the model to evaluation mode
# NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
# map_location="cpu" lets a checkpoint saved on another device load on CPU.
loaded_model.load_state_dict(torch.load(model_path, map_location="cpu"))


<All keys matched successfully>

In [9]:
# This function takes in the model and a sequence of characters and returns the predicted next character
def predict(model, character):
    """
    greedily predict the character that follows a sequence
    :param model: callable applied to a float tensor of shape
        (1, len(character), one-hot size); the last row of its output is
        treated as the scores for the next character
    :param character: sequence of characters, each a key of ``char2idx``
    :return: the highest-scoring next character, looked up in ``idx2char``
    :raises KeyError: if a character is not in ``char2idx``
    """
    # One-hot encoding our input to fit into the model
    encoded = [idx2onehot(char2idx[c]) for c in character]
    batch = torch.tensor([encoded], dtype=torch.float32)

    # inference only: no need to record operations for autograd
    with torch.no_grad():
        out = model(batch)

    # softmax is monotonic, so the argmax of the raw scores of the last row is
    # already the class with the highest probability
    char_ind = out[-1].argmax(dim=0).item()

    return idx2char[char_ind]

In [10]:
# This function takes the desired output length and input characters as arguments, returning the produced sentence
def sample(model, out_len, start='ab'):
    """
    extend a starting string one predicted character at a time
    :param model: model handed to ``predict``; switched to eval mode first
    :param out_len: total length of the returned string, start included
    :param start: seed text, lower-cased before use
    :return: the lower-cased seed followed by the predicted characters; when
        ``out_len`` does not exceed ``len(start)`` the seed is returned as is
    """
    model.eval()  # eval mode
    # First off, run through the starting characters
    chars = list(start.lower())
    # Now pass in the previous characters and get a new one
    for _ in range(out_len - len(chars)):
        # predict returns the next character directly
        chars.append(predict(model, chars))

    return ''.join(chars)




In [26]:
# generate a string of total length 8 that starts with 'z', using the reloaded model
sample(loaded_model, 8, 'z')

'zaoui   '