<a href="https://colab.research.google.com/github/Alaaokaly/nlp-foundations/blob/main/RNNTextGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch

In [3]:
# Raw training text for the character-level model: two paragraphs about
# weather in Asia, stored inline as a single multi-line string literal.
corpus = """Weather in Asia: A Diverse Tapestry of Climates Asia, the largest continent on Earth, is renowned for its immense diversity, not only in culture and geography but also in climate. From the icy tundras of Siberia to the tropical rainforests of Southeast Asia, the weather across this vast expanse varies dramatically. Understanding the climatic conditions in different regions of Asia is essential for agriculture, tourism, and daily life, influencing everything from crop cycles to travel plans.
In northern Asia, particularly in Siberia, winters are harsh and long. Temperatures can plummet to as low as -40°C in some areas, making it one of the coldest places on Earth. The vast taiga forest is covered in snow for much of the year, creating a winter wonderland that attracts adventurous tourists. However, summers are brief and can be surprisingly warm, with temperatures reaching 30°C."""



In [4]:
# Normalise case so upper- and lower-case letters share one vocabulary entry.
data = corpus.lower()

# Build the vocabulary in sorted order. Iterating a bare set of strings gives
# an order that can change from one interpreter run to the next (string hash
# randomisation), which would silently re-map every character id after a
# kernel restart and make saved weights / earlier outputs irreproducible.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))

# Lookup tables in both directions between characters and integer ids.
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = dict(enumerate(chars))


data has 889 characters, 33 unique.


In [5]:
import torch
import torch.nn as nn

# --- Hyperparameters ---
# One-hot inputs and output logits both span the character vocabulary.
input_size = output_size = len(chars)
hidden_n = 37          # width of the hidden state in every recurrent layer
num_layers = 5         # number of stacked RNN layers
sequence_length = 5    # characters of context per prediction (author's note: shorter windows performed better)
epochs = 120           # number of full passes over the text

class RNN(nn.Module):
    """Multi-layer ``nn.RNN`` followed by a linear layer on the last time step.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of features in each recurrent layer's hidden state.
        output_size: Number of values produced by the final linear layer.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs/outputs are (batch, sequence, features).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h_0=None):
        """Return the linear-layer output for the final time step of ``x``.

        Args:
            x: Batched input of shape (batch_size, sequence_length, input_size).
            h_0: Optional initial hidden state of shape
                (num_layers, batch_size, hidden_size). When omitted, ``nn.RNN``
                starts from an all-zero state, so callers need not build one.

        Returns:
            Tensor of shape (batch_size, output_size).
        """
        out, _ = self.rnn(x, h_0)    # out: (batch_size, sequence_length, hidden_size)
        last_step = out[:, -1, :]    # keep only the final time step: (batch_size, hidden_size)
        return self.fc(last_step)

# Build the network from the hyperparameters defined earlier in this cell.
model = RNN(
    input_size=input_size,
    hidden_size=hidden_n,
    output_size=output_size,
    num_layers=num_layers,
)

# Optional GPU placement (left disabled):
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Display the layer structure.
print(model)



RNN(
  (rnn): RNN(33, 37, num_layers=5, batch_first=True)
  (fc): Linear(in_features=37, out_features=33, bias=True)
)


In [6]:

# Loss: cross-entropy between the model's output scores and the target character id.
criterion = nn.CrossEntropyLoss()
# Optimiser: Adam over every parameter of the model.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [7]:
# Train a next-character predictor: slide a window of `sequence_length`
# characters over the lower-cased text and learn to predict the character that
# follows each window (one window per optimiser step, batch size 1).
#
# Window positions are counted from `data` -- the lower-cased text that is
# actually indexed below -- not from `corpus`: str.lower() is not guaranteed to
# preserve length for every Unicode character, so the two lengths can differ.
num_windows = len(data) - sequence_length
if num_windows <= 0:
    # Without this guard the inner loop would be skipped and the average-loss
    # computation below would divide by zero (or by a negative count).
    raise ValueError('text must be longer than sequence_length to build training windows')

# The initial hidden state is always all zeros, so build it once.
h_0 = torch.zeros(num_layers, 1, hidden_n)   # Shape (num_layers, batch_size, hidden_size)

for epoch in range(epochs):
    print('Epoch : ', epoch)
    epoch_loss = 0
    for i in range(num_windows):

        # Input window and the id of the character that follows it
        x_seq = [char_to_idx[ch] for ch in data[i:i + sequence_length]]
        y_seq = char_to_idx[data[i + sequence_length]]

        # One-hot encode the window: (batch=1, sequence_length, vocab_size)
        x_tensor = torch.zeros(1, sequence_length, vocab_size)
        for j, idx in enumerate(x_seq):
            x_tensor[0, j, idx] = 1

        # Forward pass
        optimizer.zero_grad()  # clear gradients left over from the previous step
        y_pred = model(x_tensor, h_0)

        # Cross-entropy against the single target id, then update the weights
        loss = criterion(y_pred, torch.tensor([y_seq]))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean per-window loss every 10th epoch
    if epoch % 10 == 0:
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss / num_windows:.4f}')

Epoch :  0
Epoch 1, Loss: 3.0162
Epoch :  1
Epoch :  2
Epoch :  3
Epoch :  4
Epoch :  6
Epoch :  7
Epoch :  8
Epoch :  9
Epoch :  10
Epoch 11, Loss: 2.1141
Epoch :  11
Epoch :  12
Epoch :  13
Epoch :  14
Epoch :  15
Epoch :  16
Epoch :  17
Epoch :  18
Epoch :  19
Epoch :  20
Epoch 21, Loss: 1.5414
Epoch :  21
Epoch :  22
Epoch :  23
Epoch :  24
Epoch :  25
Epoch :  26
Epoch :  27
Epoch :  28
Epoch :  29
Epoch :  30
Epoch 31, Loss: 1.1605
Epoch :  31
Epoch :  32
Epoch :  33
Epoch :  34
Epoch :  35
Epoch :  36
Epoch :  37
Epoch :  38
Epoch :  39
Epoch :  40
Epoch 41, Loss: 0.8175
Epoch :  41
Epoch :  42
Epoch :  43
Epoch :  44
Epoch :  45
Epoch :  46
Epoch :  47
Epoch :  48
Epoch :  49
Epoch :  50
Epoch 51, Loss: 0.6469
Epoch :  51
Epoch :  52
Epoch :  53
Epoch :  54
Epoch :  55
Epoch :  56
Epoch :  57
Epoch :  58
Epoch :  59
Epoch :  60
Epoch 61, Loss: 0.5618
Epoch :  61
Epoch :  62
Epoch :  63
Epoch :  64
Epoch :  65
Epoch :  66
Epoch :  67
Epoch :  68
Epoch :  69
Epoch :  70
Epoch 71,

In [14]:
# Sample one next character for a prompt, with gradients disabled.
with torch.no_grad():
    test_input = "weath"
    # Every prompt character must be in the vocabulary, otherwise this raises KeyError.
    x_seq = [char_to_idx[ch] for ch in test_input]

    # Size the one-hot tensor from the prompt itself rather than from
    # `sequence_length`: a longer prompt would otherwise index past the tensor
    # (IndexError) and a shorter one would feed trailing all-zero time steps.
    x_tensor = torch.zeros(1, len(x_seq), vocab_size)
    for j, idx in enumerate(x_seq):
        x_tensor[0, j, idx] = 1  # One-hot encoding

    h_0 = torch.zeros(num_layers, 1, hidden_n)   # zero initial state, batch size 1
    predicted_output = model(x_tensor, h_0)

    # Temperature sampling: dividing the scores by a value below 1 sharpens
    # the distribution before a single character id is drawn from it.
    temperature = 0.8
    probabilities = torch.softmax(predicted_output / temperature, dim=-1)
    predicted_char_idx = torch.multinomial(probabilities, num_samples=1).item()
    predicted_char = idx_to_char[predicted_char_idx]

    print(f"Input: '{test_input}' -> Predicted next character: '{predicted_char}'")


Input: 'weath' -> Predicted next character: 'e'


In [None]:
# TODO: evaluate the model using perplexity