# Starter Notebook
<a target="_blank" href="https://colab.research.google.com/github/patrick-batman/Mosaic-24/blob/main/Mosaic%20PS2/char_level_rnn.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

This task can be modeled as masked language modelling whereby we need to predict the next the next character; so one of the approaches can be to use character level masked language model:

- <b>RNN</b>
- <b>LSTM</b>
- <b>Transformer</b> (Encoder only)
<br />



However since the game requires us only to predict one output at a time rather than fill all the character masks at once, we fill the letter with maximum probability.

In [1]:
## imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np

Take batches of words, pass them through simple RNN; use last output to predict probability distribution

In [2]:
# Define the model architecture
class MaskedLanguageModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MaskedLanguageModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, input_size)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, input):
        embedded = self.embedding(input)
        output, _ = self.rnn(embedded)
        output = self.fc(output)
        output = output[:,-1,:]
        output = self.softmax(output)
        return output

Now while making dataset; I have implement `soft-cross entropy loss`; that is suppose you have word '___le' to be modelled from 'apple' now since here 'a' as well as 'p' both a missing we assing weights to both of them s/t it is proportional to their missing frequency hoping that this will enable our model to capture language syntactics as well. 
- `a`: 0.33
- `p`: 0.66

In [3]:
# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, masked_words, original_words, char_to_index):
        self.masked_words = masked_words
        self.original_words = original_words
        self.char_to_index = char_to_index

    def __len__(self):
        return len(self.masked_words)

    def __getitem__(self, idx):
        masked_word = self.masked_words[idx]
        original_word = self.original_words[idx]
        
        # Convert characters to indices
        masked_indices = [self.char_to_index[c] for c in masked_word]
        
        # Convert original word to soft encoding
        weights = [0]*len(self.char_to_index)
        total_diff = 0
        for i in range(len(original_word)):
            if original_word[i] != masked_word[i]:
                weights[self.char_to_index[original_word[i]]] += 1 
                total_diff += 1
        for i in range(len(weights)):
            weights[i] = weights[i]/total_diff
        
        original_indices = torch.tensor(weights)

        
        return torch.tensor(masked_indices), original_indices

When dealing with batches with different lengths of individual words; we need to pad them so that every tensor in a batch can be stacked. For this we have `collate_fn`.

In [4]:
# Collate function for DataLoader
def collate_fn(batch):
    masked_words, original_words = zip(*batch)
    max_len = max(len(word) for word in masked_words)
    padded_masked_words = torch.stack([torch.nn.functional.pad(word, (0, max_len - len(word)), value=0) for word in masked_words])
    stacked_orignal_words = torch.stack(original_words)
    # padded_original_words = torch.stack([torch.nn.functional.pad(word, (0, max_len - len(word)), value=0) for word in original_words])
    return padded_masked_words, stacked_orignal_words

The gool-ol' training loop.



PS:This is going to perform extremely poorly since its trained on few words and only for 2 epochs but this will help you get started.

In [5]:
# Training function
def train(model, dataloader, criterion, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for masked_words, weights_chars in tqdm(dataloader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch'):
            optimizer.zero_grad()
            output = model(masked_words)
            loss = criterion(output, weights_chars)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(dataloader)}")


Now just to give an example I have just created a list of how you would want to sample the words for language(character) modelling.
Ideally you would want to use 70-80% training set for this and leave rest for validation.

In [6]:
# Sample data
masked_words = ['a__l_', 'p__', 'l__r___g', 'bott_e']
original_words = ['apple', 'pen', 'learning', 'bottle']

## storing how words are mapped so that we can convert them back again
char_to_index = {chr(i): i - 96 for i in range(97, 123)}
char_to_index.update({'_': 27})
char_to_index.update({'-': 0})
index_to_char = {i: char for char, i in char_to_index.items()}

In [7]:
# Hyperparameters
input_size = len(char_to_index)
hidden_size = 128
batch_size = 3
epochs = 2
learning_rate = 0.001

# Initialize model, criterion, optimizer
model = MaskedLanguageModel(input_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create DataLoader
dataset = CustomDataset(masked_words, original_words, char_to_index)
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Train the model
train(model, dataloader, criterion, optimizer, epochs)


Epoch 1/2: 100%|██████████| 2/2 [00:00<00:00, 23.47batch/s]


Epoch 1/2, Loss: 3.333022952079773


Epoch 2/2: 100%|██████████| 2/2 [00:00<00:00, 745.39batch/s]

Epoch 2/2, Loss: 3.2866896390914917





Setting up a testing function

In [8]:
# Testing function
def test(model, dataloader, char_to_index, index_to_char):
    model.eval()
    with torch.no_grad():
        for masked_words, original_words in dataloader:
            output = model(masked_words)
            char_ind = [np.argmax(output[i].detach().numpy()) for i in range(output.shape[0])]
            predictions = [index_to_char[index] for index in char_ind]
            print(predictions)
            
            

In [9]:
# Sample data
masked_words_test = ['c__k', 'b_t']
original_words_test = ['cook', 'bat']

# Create DataLoader
dataset = CustomDataset(masked_words_test, original_words_test, char_to_index)
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
# Test the model
test(model, dataloader, char_to_index, index_to_char)

['c', 'n']


Saving the model

In [10]:
# Save the model
torch.save(model.state_dict(), 'model.pth')

Incase you need to unload the model uncomment the below lines

In [None]:
# # Load the model from the .pth file
# loaded_model = MaskedLanguageModel(input_size, hidden_size)
# loaded_model.load_state_dict(torch.load('model.pth'))
