# Next Character Generator: LSTM

In [58]:
import numpy as np
import pandas as pd
import torch
import pickle

### Create a unique character dictionary

In [44]:
dictionary = ['\n', '\r', ' ', '!', '"', "'", '(', ')', '‘', '’', '*', '”', '“', ',', '—', '-', '.', ':', ';', '?', '[', ']',
'_', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xbb', '\xbf', '\xef']

### Load data

In [48]:
with open('/content/Alice’s Adventures in Wonderland.txt', 'r') as f:
    text = f.read()

In [49]:
# convert into lower case
text = text.lower()

In [50]:
# summarize the data
n_chars = len(text)
n_vocab = len(dictionary)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  144581
Total Vocab:  62


In [52]:
# Unique characters and mapping
chars = sorted(set(text))  # Collect unique characters
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = {idx: char for idx, char in enumerate(chars)}

In [53]:
len(char_to_idx)

48

### Create an Input Output Dataset

In [54]:
import pickle

# Define sequence length (window size)
SEQ_LENGTH = 40  # Adjust based on your dataset

# Create input-output pairs
input_sequences = []
target_chars = []

for i in range(len(text) - SEQ_LENGTH):
    input_sequences.append([char_to_idx[c] for c in text[i:i+SEQ_LENGTH]])
    target_chars.append(char_to_idx[text[i+SEQ_LENGTH]])

# Convert to numpy arrays
X = np.array(input_sequences)
y = np.array(target_chars)

# Save processed data to avoid re-processing
with open("processed_data.pkl", "wb") as f:
    pickle.dump((X, y, char_to_idx, idx_to_char), f)

print("Data processing complete. Saved as 'processed_data.pkl'")


Data processing complete. Saved as 'processed_data.pkl'


In [55]:
text[:100]

'alice’s adventures in wonderland\n\nby lewis carroll\n\nthe millennium fulcrum edition 3.0\n\ncontents\n\n c'

### Load the dataset

In [72]:
# Load the processed data
with open("processed_data.pkl", "rb") as f:
    X, y, char_to_idx, idx_to_char = pickle.load(f)

In [73]:
# Check the shapes of loaded arrays
print("Input shape (X):", X.shape)
print("Output shape (y):", y.shape)

Input shape (X): (144541, 40)
Output shape (y): (144541,)


In [74]:
# convert these into PyTorch tensors
X = torch.from_numpy(X).type(torch.float)
y = torch.from_numpy(y)

In [75]:
n_patterns = X.shape[0]
n_patterns

144541

LSTM layer is going to be used in the model, thus the input tensor should be of dimension (sample, time steps, features).

In [76]:
X = X.reshape(n_patterns, SEQ_LENGTH, 1)

In [77]:
print(X.shape, y.shape)

torch.Size([144541, 40, 1]) torch.Size([144541])


### Create LSTM Model

In [65]:
import torch.optim as optim
import torch.utils.data as data
import torch.nn as nn

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [69]:
class LSTMCharModel(nn.Module):

  def __init__(self):
      super(LSTMCharModel, self).__init__()
      self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
      self.dropout = nn.Dropout(0.2)
      self.linear = nn.Linear(256, n_vocab)

  def forward(self, x):
      x, _ = self.lstm(x)
      # take only the last output
      x = x[:, -1, :]
      # produce output
      x = self.linear(self.dropout(x))
      return x

In [80]:
# Set Parameters
n_epochs = 40
batch_size = 128
model = LSTMCharModel()
model.to(device)

optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)

In [84]:
# Training loop
for epoch in range(n_epochs):

  for X_batch, y_batch in loader:

    # Data to GPU
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    optimizer.zero_grad()  # Clear previous gradients

    output = model(X_batch)  # Forward pass

    loss = loss_fn(output, y_batch) # Compute loss
    loss.backward()  # Backpropagation

    optimizer.step()  # Update weights

  # if epoch % 100 == 0:
  print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}")

Epoch [1/40], Loss: 61.6631
Epoch [2/40], Loss: 62.6703
Epoch [3/40], Loss: 48.3693
Epoch [4/40], Loss: 68.8345
Epoch [5/40], Loss: 55.2626
Epoch [6/40], Loss: 59.4498
Epoch [7/40], Loss: 59.6884
Epoch [8/40], Loss: 50.5259
Epoch [9/40], Loss: 43.8814
Epoch [10/40], Loss: 55.5586
Epoch [11/40], Loss: 61.1853
Epoch [12/40], Loss: 57.0465
Epoch [13/40], Loss: 46.3952
Epoch [14/40], Loss: 51.5665
Epoch [15/40], Loss: 49.9568
Epoch [16/40], Loss: 63.7949
Epoch [17/40], Loss: 46.7754
Epoch [18/40], Loss: 45.6253
Epoch [19/40], Loss: 63.5476
Epoch [20/40], Loss: 53.0863
Epoch [21/40], Loss: 38.5105
Epoch [22/40], Loss: 45.8743
Epoch [23/40], Loss: 58.8801
Epoch [24/40], Loss: 63.0137
Epoch [25/40], Loss: 55.3525
Epoch [26/40], Loss: 48.4465
Epoch [27/40], Loss: 39.4393
Epoch [28/40], Loss: 46.2169
Epoch [29/40], Loss: 42.3917
Epoch [30/40], Loss: 40.3730
Epoch [31/40], Loss: 43.9227
Epoch [32/40], Loss: 34.2238
Epoch [33/40], Loss: 35.7325
Epoch [34/40], Loss: 33.8011
Epoch [35/40], Loss: 54

### Generating the next character

In [114]:
top_k = 1

def predict_next_word(text):

  #  lower case
  text = next_char_text.lower()

  # tokenize the text
  tokenize = [char_to_idx[c] for c in text]

  # convert into tensor
  input_tensor = torch.tensor(tokenize).type(torch.float).reshape(1, -1, 1)

  model.eval()

  with torch.no_grad():  # No gradient computation needed

        # Shift to GPU
        input_tensor = input_tensor.to(device)

        # Forward pass
        output = model(input_tensor)

        # Get top-k predictions
        predicted_indices = torch.topk(output, top_k, dim=1).indices.squeeze(0)

        # get char from indicea
        predicted_chars = [idx_to_char[idx.item()] for idx in predicted_indices]

        return predicted_chars


In [111]:
next_char_text = "I wonder what L"

predicted_output = predict_next_word(next_char_text)
print("Input:", next_char_text)
print("Output", next_char_text + predicted_output[0])

torch.Size([1, 15, 1])
Input: I wonder what L
Output I wonder what Lt


In [124]:
next_char_text = "Alice was not a bit hurt, and"

for _ in range(50):
  predicted_output = predict_next_word(next_char_text)
  print(f"{next_char_text + predicted_output[0]}")
  next_char_text += predicted_output[0]

Alice was not a bit hurt, and 
Alice was not a bit hurt, and t
Alice was not a bit hurt, and th
Alice was not a bit hurt, and the
Alice was not a bit hurt, and the 
Alice was not a bit hurt, and the m
Alice was not a bit hurt, and the mi
Alice was not a bit hurt, and the mit
Alice was not a bit hurt, and the mitt
Alice was not a bit hurt, and the mittl
Alice was not a bit hurt, and the mittle
Alice was not a bit hurt, and the mittle 
Alice was not a bit hurt, and the mittle g
Alice was not a bit hurt, and the mittle go
Alice was not a bit hurt, and the mittle gor
Alice was not a bit hurt, and the mittle gorr
Alice was not a bit hurt, and the mittle gorre
Alice was not a bit hurt, and the mittle gorre 
Alice was not a bit hurt, and the mittle gorre o
Alice was not a bit hurt, and the mittle gorre of
Alice was not a bit hurt, and the mittle gorre of 
Alice was not a bit hurt, and the mittle gorre of t
Alice was not a bit hurt, and the mittle gorre of th
Alice was not a bit hurt, and the 

In [119]:
!git clone https://github.com/AbbasKothari1552/Next-Word-Generator.git

Cloning into 'Next-Word-Generator'...


In [120]:
!mv processed_data.pkl Next-Word-Generator/

In [121]:
!mv "/content/Alice’s Adventures in Wonderland.txt" Next-Word-Generator/


In [123]:
%cd Next-Word-Generator
!git config --global user.email "ask50405808@gmail.com.com"
!git config --global user.name "AbbasKothari1552"

!git add .
!git commit -m "Added processed data and Text data"
!git push origin main  # Change `main` to the correct branch if needed


[Errno 2] No such file or directory: 'Next-Word-Generator'
/content/Next-Word-Generator
On branch main
Your branch is based on 'origin/main', but the upstream is gone.
  (use "git branch --unset-upstream" to fixup)

nothing to commit, working tree clean
fatal: could not read Username for 'https://github.com': No such device or address
