# Character-based LSTM Syriac Text Generation

## Create the Model

### Import and load data

In [6]:
import numpy as np
 
# Load the UTF-8 corpus and convert it to lowercase.
# The context manager guarantees the file handle is closed after reading.
filename = "basic.txt"
with open(filename, 'r', encoding='utf-8') as f:
    raw_text = f.read()
raw_text = raw_text.lower()

# Map every distinct character to an integer id; sorting makes the ids
# deterministic for a given corpus.
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}

# Summarize the loaded data.
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  14887
Total Vocab:  25


### Prepare dataset of samples

In [7]:
# Build (input window -> next character) training pairs as integer ids.
seq_length = 100    # number of characters per sample

# Encode the corpus once, then slice windows out of the encoded list.
encoded_text = [char_to_int[ch] for ch in raw_text]
window_starts = range(n_chars - seq_length)

# dataX[k] holds the ids of seq_length consecutive characters;
# dataY[k] holds the id of the character that immediately follows them.
dataX = [encoded_text[s:s + seq_length] for s in window_starts]
dataY = [encoded_text[s + seq_length] for s in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  14787


### Convert the dataset samples into PyTorch tensors

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
 
# Shape the inputs as [samples, time steps, features] and scale the integer
# ids by the vocabulary size.
X = (
    torch.tensor(dataX, dtype=torch.float32)
    .reshape(n_patterns, seq_length, 1)
    .div(float(n_vocab))
)
# Targets stay as integer class ids.
y = torch.tensor(dataY)
print(X.shape, y.shape)

torch.Size([14787, 100, 1]) torch.Size([14787])


### Define the LSTM model

In [9]:
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
 
class CharModel(nn.Module):
    """Single-layer LSTM that predicts the next character from a window of characters.

    Args:
        vocab_size: Number of output classes (distinct characters). When
            omitted, the notebook-level ``n_vocab`` is used, so ``CharModel()``
            behaves exactly as before.
        hidden_size: Width of the LSTM hidden state and of the input to the
            final linear layer. Defaults to 256.
    """

    def __init__(self, vocab_size=None, hidden_size=256):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible default: fall back to the global vocabulary size.
            vocab_size = n_vocab
        # Each time step carries a single scalar feature (the scaled character id).
        # A deeper variant would pass num_layers=2 and dropout=0.2 to nn.LSTM.
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-character logits for a batch of sequences.

        Args:
            x: Float tensor of shape (batch, time steps, 1).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalized scores.
        """
        x, _ = self.lstm(x)
        # take only the output of the last time step
        x = x[:, -1, :]
        # dropout, then project onto the vocabulary
        x = self.linear(self.dropout(x))
        return x

### Train the model

In [10]:
import copy

n_epochs = 5   # number of times the model passes through the whole dataset
batch_size = 128     # number of samples per batch
model = CharModel()

optimizer = optim.Adam(model.parameters())
# Summed (not averaged) cross-entropy, so per-batch losses can be added up
# into a total over the dataset.
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)

best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
    # ---- training pass ----
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- evaluation pass ----
    # There is no held-out split: the loss is re-measured on the same loader
    # (i.e. the training data) with dropout disabled and gradients off.
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            loss += loss_fn(y_pred, y_batch).item()
    if loss < best_loss:
        best_loss = loss
        # state_dict() returns references to the live parameter tensors, so a
        # deep copy is required; otherwise later epochs would silently
        # overwrite the "best" snapshot with the latest weights.
        best_model = copy.deepcopy(model.state_dict())
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))

# Persist the best weights together with the character table needed to
# encode prompts and decode predictions later.
torch.save([best_model, char_to_int], "single-char.pth")

Epoch 0: Cross-entropy: 42373.5430
Epoch 1: Cross-entropy: 42201.7539
Epoch 2: Cross-entropy: 41685.7109
Epoch 3: Cross-entropy: 40886.8945
Epoch 4: Cross-entropy: 40563.7656


# Testing the Model

### Import modules once again (run if 'Create the Model' is skipped)

In [18]:
import numpy as np
import torch
import torch.nn as nn

### Reload model and other variables

In [20]:
# Restore the checkpoint: the saved model weights plus the
# character -> integer table written alongside them.
best_model, char_to_int = torch.load("single-char.pth")

# Vocabulary size and the reverse (integer -> character) lookup for decoding.
n_vocab = len(char_to_int)
int_to_char = {idx: ch for ch, idx in char_to_int.items()}
 
# reload the model
class CharModel(nn.Module):
    """Single-layer LSTM that predicts the next character from a window of characters.

    Redefined here so the testing section can run without the training
    section; the layer names must stay the same so the saved state dict loads.

    Args:
        vocab_size: Number of output classes (distinct characters). When
            omitted, the notebook-level ``n_vocab`` is used, so ``CharModel()``
            behaves exactly as before.
        hidden_size: Width of the LSTM hidden state and of the input to the
            final linear layer. Defaults to 256.
    """

    def __init__(self, vocab_size=None, hidden_size=256):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible default: fall back to the global vocabulary size.
            vocab_size = n_vocab
        # Each time step carries a single scalar feature (the scaled character id).
        # A deeper variant would pass num_layers=2 and dropout=0.2 to nn.LSTM.
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-character logits for a batch of sequences.

        Args:
            x: Float tensor of shape (batch, time steps, 1).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalized scores.
        """
        x, _ = self.lstm(x)
        # take only the output of the last time step
        x = x[:, -1, :]
        # dropout, then project onto the vocabulary
        x = self.linear(self.dropout(x))
        return x
# Build a fresh network and copy the reloaded weights into it
# (best_model is the state dict unpacked from "single-char.pth").
model = CharModel()
# Left as the cell's final expression so the notebook displays the
# key-matching report returned by load_state_dict.
model.load_state_dict(best_model)

<All keys matched successfully>

### Generate and evaluate a prompt

In [26]:
# Pick a random seq_length-character window of the corpus as the prompt.
filename = "basic.txt"
seq_length = 100
# The context manager guarantees the file handle is closed after reading.
with open(filename, 'r', encoding='utf-8') as f:
    raw_text = f.read()
raw_text = raw_text.lower()
start = np.random.randint(0, len(raw_text)-seq_length)
prompt = raw_text[start:start+seq_length]
# To use a hand-written prompt instead, uncomment the next line; every
# character in it must be a key of char_to_int.
#prompt = 'ܡܢ ܕܫܩܠ'
pattern = [char_to_int[c] for c in prompt]

model.eval()
print('Prompt: "%s"' % prompt)
with torch.no_grad():
    for i in range(1000):
        # format the window of integer ids as a (1, length, 1) tensor, scaled
        # by the vocabulary size exactly as the training inputs were
        x = np.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
        x = torch.tensor(x, dtype=torch.float32)
        # generate logits as output from the model
        prediction = model(x)
        # greedy decoding: take the highest-scoring character
        index = int(prediction.argmax())
        result = int_to_char[index]
        print(result, end="")
        # start a new output line after every generated space
        if result == ' ':
            print()
        # slide the window: append the new character and drop the oldest one
        pattern.append(index)
        pattern = pattern[1:]
print()
print("Done.")

Prompt: "ܝܗ ܥܠ ܫܬܐܣܬܐ ܘܚܘܝ. ܐܡܪ ܓܝܪ ܗܟܢܐ. ܕܫܬܐܣܬܐ.ܐܚܪܬܐ ܣܛܪ ܡܢ ܗܕܐ ܕܣܝܡܐ. ܐܢܫ ܠܐ ܡܫܟܚ ܠܡܣܡ. ܕܗܘܝܘ ܝܫܘܥ ܡܫܝܚܐ."
 
ܘܝ 
 
ܘܐ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 
ܘܝ 
 
ܘܘ 
 