# Summer 2025 project - StoryGenerator: Creative text generation using a Recurrent Neural Network (RNN)


By Ashar Ali


In [None]:
 # RUN THIS BOX ONCE (SO THERE IS NO "TORCHTEXT MODULE NOT FOUND" ERROR IN PART 1 - IT INSTALLS COMPATIBLE VERSIONS)
!pip install torch==2.0.1 torchvision==0.15.2 torchtext==0.15.2 --index-url https://download.pytorch.org/whl/cpu


In [None]:
# PART 0 - IMPORTING REQUIRED LIBRARIES
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader,random_split

import numpy  # For numericals
import re # For regular expressions use
import matplotlib.pyplot as plt # For visiualsations

import torchtext
""
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


In [None]:
# PART 1 - LOADING AND READING THE DATASET

# Download Grimm's Fairy Tales data set (plain text, UTF-8 encoded)
!wget https://www.gutenberg.org/files/2591/2591-0.txt -O grimms_fairy_tales.txt

# Load the dataset into a list of text lines to read
# For simplicity, removes punctuation and uses lowercase
def read_tales(path="grimms_fairy_tales.txt"):
    """Read the corpus and return one cleaned string per line of the file.

    Every run of characters that is not an ASCII letter (punctuation, digits,
    whitespace, non-ASCII symbols) is collapsed into a single space; the line
    is then stripped of leading/trailing whitespace and lower-cased.

    Args:
        path: Location of the UTF-8 encoded text file. Defaults to the file
            name the download step writes to.

    Returns:
        A list of cleaned lines, in file order. Lines that contain no letters
        become empty strings.
    """
    # The '+' must directly follow the character class: the previous pattern
    # '[^A-Za-z] + ' only matched a non-letter followed by two or more spaces,
    # so punctuation was never actually removed.
    non_letters = re.compile(r'[^A-Za-z]+')
    # The corpus is UTF-8; do not rely on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        return [non_letters.sub(' ', line).strip().lower() for line in f]

# Report how many text lines the corpus contains
lines = read_tales()
print(f'# text lines: {len(lines)}')

# Show the first 120 lines as a quick preview of the cleaned text
for line in lines[:120]:
    print(line)




In [None]:
# PART 2 - PREPARING THE DATA (Tokenisation etc)

# Ignore traceback error that may occur

# Tokenisation: use torchtext to break text into smaller pieces (tokens)
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_lines):
    """Lazily yield ``tokenizer(line)`` for every line in ``data_lines``."""
    yield from (tokenizer(text) for text in data_lines)

# Create the vocab that maps tokens to indices
vocab = build_vocab_from_iterator(yield_tokens(lines), specials=["<unk>", "<pad>"])
# Tokens missing from the vocab are looked up as <unk>
vocab.set_default_index(vocab["<unk>"])


# Create input/target sequences using a sliding window:
# each run of `sequence_length` token ids is an input, the id right after it is the target
sequence_length = 5
tokenised_lines = list(map(tokenizer, lines))
encoded_lines = [vocab(tokens) for tokens in tokenised_lines if tokens]
input_sequences, target_words = [], []

for encoded in encoded_lines:
    # range() is empty when a line has no more than `sequence_length` tokens,
    # so lines that are too short contribute nothing
    for start in range(len(encoded) - sequence_length):
        stop = start + sequence_length
        input_sequences.append(encoded[start:stop])
        target_words.append(encoded[stop])

X = torch.tensor(input_sequences, dtype=torch.long)
y = torch.tensor(target_words, dtype=torch.long)

print(f"Input shape: {X.shape}, Target shape: {y.shape}")



In [None]:
# PART 3 - MODEL DEFINITION


class StoryLSTM(nn.Module):
    """RNN model for creative text generation: embedding -> LSTM -> linear layer.

    Args:
        vocab_size: Number of embedding rows and number of output logits.
        embed_dim: Size of each token embedding (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,  # tensors are laid out as (batch, sequence, feature)
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one row of ``vocab_size`` logits per input sequence.

        ``x`` holds token indices with the batch on dimension 0 and the
        sequence position on dimension 1.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Only the LSTM output at the final sequence position feeds the classifier
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step)





In [None]:
# PART 4 - MODEL TRAINING (will take some time - use a T4 GPU on Google Colab)
# NOTE(review): nothing in this cell moves the model or the batches to a CUDA
# device (there are no .to()/.cuda() calls) - confirm whether that is intended.

# Hyperparameters and model initialisation
vocab_size = len(vocab)
embed_dim, hidden_dim, num_layers = 64, 128, 1
num_epochs, batch_size = 25, 64

model = StoryLSTM(vocab_size, embed_dim, hidden_dim, num_layers)


# Wrap the tensors in a dataset and split it 80/20 into train and validation
dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Data loaders (only the training loader shuffles), loss and optimiser
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early-stopping state and per-epoch loss history
best_val_loss = float('inf')
patience = 3  # epochs without improvement tolerated before stopping (tweakable)
counter = 0
train_losses, val_losses = [], []


# Training loop
for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    train_total = 0
    for batch_inputs, batch_targets in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average
        train_total += batch_loss.item() * batch_inputs.size(0)
    train_loss = train_total / train_size

    # --- validation pass (no gradients, no parameter updates) ---
    model.eval()
    with torch.no_grad():
        val_total = sum(
            criterion(model(batch_inputs), batch_targets).item() * batch_inputs.size(0)
            for batch_inputs, batch_targets in val_loader
        )
    val_loss = val_total / val_size

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

    # Early stopping: checkpoint on improvement, otherwise count towards patience
    if val_loss < best_val_loss:
        best_val_loss, counter = val_loss, 0
        torch.save(model.state_dict(), 'best_model.pth')  # Save best model
        continue

    counter += 1
    if counter >= patience:
        print("Early stopping triggered")
        break

# Plot training and validation loss per completed epoch
# (both histories are appended together, so they always have the same length)
epochs_run = range(1, len(train_losses) + 1)
plt.plot(epochs_run, train_losses, label='Train Loss')
plt.plot(epochs_run, val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:

# PART 5 - EVALUATING MODEL PERFORMANCE

# Restore the weights saved by early stopping and switch to evaluation mode
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

# Hardcoded prompt to give model (CHANGE IT HERE FOR DIFFERENT OUTPUTS)
prompt = "The princess "

# Controls randomness: lower = more predictable, higher = more creative
temperature = 1.0

# Encode the lower-cased prompt as a single-row tensor of token ids
tokens = tokenizer(prompt.lower())
ids = torch.tensor(vocab(tokens), dtype=torch.long).unsqueeze(0)

# Predict next 40 words, sampling one token at a time
with torch.no_grad():
    for _ in range(40):
        # The model only sees the most recent `sequence_length` tokens
        context = ids[:, -sequence_length:]
        logits = model(context) / temperature
        probs = torch.nn.functional.softmax(logits, dim=1)
        # Sample from the distribution instead of taking the argmax
        next_id = torch.multinomial(probs, num_samples=1)
        ids = torch.cat([ids, next_id], dim=1)

# Map every id (prompt + generated) back to its token and print the text
generated = vocab.lookup_tokens(ids[0].tolist())
print("Generated:\n", " ".join(generated))
