### Text classification using LSTM

In this coding exercise, you will create a simple LSTM model using PyTorch to perform text classification on a dataset of short phrases. You will perform the following steps:

- Create a vocabulary to represent words as indices.
- Tokenize, encode, and pad the phrases.
- Convert the phrases and categories to PyTorch tensors.
- Instantiate the LSTM model with the vocabulary size, embedding dimensions, hidden dimensions, and output dimensions.
- Define the loss function and optimizer.
- Train the model for a number of epochs.
- Test the model on new phrases and print the category predictions.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
"""
Phrases (textual data) and their category labels (0 for sports, 1 for technology, 2 for food)
This data is extremely less for realistically training an LSTM model. Feel free to use
a relevant data source or create your own dummy data for this exercise.

Phrases (textual data) and their category labels (0 for sports, 1 for technology, 2 for food)
This model might overfit as the data is less. Feel free to use any other data source for training
or create your own dummy data
"""

phrases = ["great goal scored", "amazing touchdown", "new phone release", "latest laptop model", "tasty pizza", "delicious burger"]
categories = [0, 0, 1, 1, 2, 2]

"""
Create a vocabulary to represent words as indices
"""
vocab = {"<PAD>": 0, "great": 1, "goal": 2, "scored": 3, "amazing": 4, "touchdown": 5, "new": 6, "phone": 7, "release": 8, "latest": 9, "laptop": 10, "model": 11, "tasty": 12, "pizza": 13, "delicious": 14, "burger": 15}

"""
Tokenize, encode, and pad phrases
"""
encoded_phrases = [[vocab[word] for word in phrase.split()] for phrase in phrases]
max_length = max([len(phrase) for phrase in encoded_phrases])
padded_phrases = [phrase + [vocab["<PAD>"]] * (max_length - len(phrase)) for phrase in encoded_phrases]

"""
Convert phrases and categories to PyTorch tensors
"""
inputs = torch.LongTensor(padded_phrases)
labels = torch.LongTensor(categories)

In [None]:
class PhraseClassifier(nn.Module):
    """
    Embedding -> LSTM -> Linear classifier over sequences of token indices.

    Token indices are looked up in an embedding table, the embedded sequence
    is run through a single LSTM, and the LSTM's final hidden state is mapped
    to one logit per class by a linear layer.

    Attributes:
        embedding (nn.Embedding): Lookup table from token index to vector
        lstm (nn.LSTM): Recurrent layer applied to the embedded sequence
        fc (nn.Linear): Projects the final hidden state to class logits

    Args:
        vocab_size (int): Number of rows in the embedding table
        embedding_dim (int): Width of each embedding vector
        hidden_dim (int): Width of the LSTM hidden state
        output_dim (int): Number of classes (logits) produced
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """
        Build the three layers in embedding -> lstm -> fc order.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        # nn.LSTM defaults to sequence-first input: (seq_len, batch, features).
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """
        Compute class logits for a batch of token-index sequences.

        Args:
            x (torch.Tensor): Token indices of shape (seq_len, batch_size)

        Returns:
            torch.Tensor: Logits of shape (batch_size, output_dim)
        """
        token_vectors = self.embedding(x)  # (seq_len, batch_size, embedding_dim)
        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (final_hidden, _) = self.lstm(token_vectors)  # (1, batch_size, hidden_dim)
        last_state = final_hidden.squeeze(0)  # (batch_size, hidden_dim)
        return self.fc(last_state)

In [7]:
# Build the classifier (3 output classes), its loss, and its optimizer.
model = PhraseClassifier(
    vocab_size=len(vocab),
    embedding_dim=10,
    hidden_dim=20,
    output_dim=3,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 100

# Full-batch training: every epoch runs one forward/backward pass over all
# phrases at once.
for epoch in range(epochs):
    optimizer.zero_grad()
    # `inputs` is (batch, seq_len); the model expects (seq_len, batch).
    predictions = model(inputs.transpose(0, 1))
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    # Report the loss once every 100 epochs.
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

Epoch: 100, Loss: 0.34661993384361267


In [9]:
# Inference block for testing the trained LSTM model on new phrases
model.eval()  # Set model to evaluation mode (affects dropout, batch norm etc.)

# Sample phrases for testing model predictions
test_phrases = ["incredible match", "newest gadget", "yummy cake"]

pad_index = vocab["<PAD>"]
tokenized_test_phrases = [phrase.split() for phrase in test_phrases]

# The vocabulary has no unknown-word token, so out-of-vocabulary words fall
# back to <PAD>. Report them instead of hiding it: a phrase made up entirely
# of unknown words is encoded as pure padding, and all such phrases receive
# the same prediction.
unknown_words = sorted(
    {word for words in tokenized_test_phrases for word in words if word not in vocab}
)
if unknown_words:
    print("Warning: out-of-vocabulary words mapped to <PAD>:", unknown_words)

# Encode phrases using the vocabulary. Phrases longer than the training
# length are truncated so that every row ends up with max_length entries.
encoded_test_phrases = [
    [vocab.get(word, pad_index) for word in words][:max_length]
    for words in tokenized_test_phrases
]

# Pad sequences to match training data length
padded_test_phrases = [
    encoded + [pad_index] * (max_length - len(encoded))
    for encoded in encoded_test_phrases
]

# Convert to PyTorch tensor (shape: [batch_size, seq_len])
test_inputs = torch.LongTensor(padded_test_phrases)

# Get model predictions (transpose inputs to [seq_len, batch_size]).
# Gradients are not needed for inference; torch.argmax picks the class with
# the highest logit for each phrase.
with torch.no_grad():
    test_predictions = torch.argmax(model(test_inputs.t()), dim=1)

# Print predicted class indices
print("Test predictions:", test_predictions)

Test predictions: tensor([2, 2, 2])
