In [43]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence


In [44]:
mdb_dataset = load_dataset('imdb')

In [45]:
print(mdb_dataset['train'][0])
print(mdb_dataset["test"][0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [46]:
def basic_tokenize(text):
    return text.split()

def build_vocab(dataset, tokenizer):
    
    start_index = 2
    vocab = {}

    for doc in dataset:
        for token in tokenizer(doc['text']):
            if token not in vocab:
                vocab[token] = start_index
                start_index += 1

    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

In [47]:
vocab = build_vocab(mdb_dataset['train'], basic_tokenize)


In [48]:
def encode_text(text, tokenizer, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokenizer(text)]

def process_data(example, tokenizer, vocab):
    example['input_ids'] = encode_text(example['text'], vocab, tokenizer)
    return example

In [49]:
imdb_dataset = mdb_dataset.map(lambda x: process_data(x, vocab, basic_tokenize))


In [50]:
print(imdb_dataset['train'][0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [51]:
def collate_fn(batch):
    inputs = [torch.tensor(item['input_ids'], dtype=torch.long) if not isinstance(item['input_ids'], torch.Tensor) else item['input_ids'] for item in batch]
    labels = [item['label'] for item in batch]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)  # Padding to max length in batch
    labels = torch.tensor(labels, dtype=torch.float32)
    return inputs, labels

In [52]:
train_loader = DataLoader(imdb_dataset['train'], batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(imdb_dataset['test'], batch_size=32, shuffle=False, collate_fn=collate_fn)

In [53]:
class SimpleRNNWithEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleRNNWithEmbeddings, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # Embedding layer with padding index
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)  # Shape: (batch_size, seq_length, embedding_dim)
        output, hidden = self.rnn(embedded)  # hidden: (1, batch_size, hidden_dim)
        return self.fc(hidden.squeeze(0))  # Shape: (batch_size, output_dim)

# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 100  # Size of the embedding vectors
hidden_dim = 128  # RNN hidden layer size
output_dim = 1  # Binary classification (positive/negative)

# Instantiate the model
model = SimpleRNNWithEmbeddings(vocab_size, embedding_dim, hidden_dim, output_dim)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary Cross Entropy with Logits for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move model to device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [54]:
class SimpleRNNWithEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleRNNWithEmbeddings, self).__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
                
    def forward(self, text):
        embedded = self.embedding(text)  # Shape: (batch_size, seq_length, embedding_dim)
        output, hidden = self.rnn(embedded)  # hidden: (1, batch_size, hidden_dim)
        return self.fc(hidden.squeeze(0))  # Shape: (batch_size, output_dim)

In [55]:
# Training loop
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()  # Set the model to training mode
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(inputs).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_loader)

# Train the model for a few epochs
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}, Loss: {train_loss:.4f}')


Epoch 1, Loss: 0.6976
Epoch 2, Loss: 0.6959
Epoch 3, Loss: 0.6960
Epoch 4, Loss: 0.6960
Epoch 5, Loss: 0.6956
