In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score
import time

# Define a basic RNN model
class RNNModel(nn.Module):
    """Single-layer RNN classifier over padded batches of token ids.

    Token ids are embedded, fed to an ``nn.RNN`` as a packed sequence, and the
    hidden state at each sequence's last real (non-padded) step is passed
    through dropout and a linear layer to produce one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of logits produced per sequence.
        dropout: Dropout probability applied to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.RNN's own ``dropout`` only acts between stacked layers. With the
        # single layer used here it has no effect and only triggers a
        # UserWarning, so it is not forwarded; dropout is applied in forward().
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        """Return class logits for a padded batch.

        Args:
            x: Integer tensor of token ids, indexed as (batch, time).
            lengths: Number of real tokens in each row of ``x`` (tensor or
                sequence of ints, on any device).

        Returns:
            Tensor with one row of ``output_dim`` logits per input sequence.
        """
        embedded = self.embedding(x)

        # pack_padded_sequence needs lengths as a CPU int64 tensor, each value
        # at least 1 and no larger than the padded width. Clamping keeps empty
        # or truncated sequences from raising.
        seq_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        seq_lengths = seq_lengths.clamp(min=1, max=x.size(1))

        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, seq_lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed)

        # ``hidden`` holds the state after each sequence's own last step (in
        # the original batch order). Indexing the padded output at position -1
        # would instead read zero padding for every shorter sequence.
        last_state = hidden[-1]
        return self.fc(self.dropout(last_state))

# Dataset class for text classification
class TextDataset(Dataset):
    """Map-style dataset yielding fixed-width token-id tensors for classification.

    Each item is ``(token_ids, label, length)`` where ``token_ids`` is a
    ``torch.long`` tensor of exactly ``max_len`` entries (padded with
    ``PAD_ID`` or truncated) and ``length`` is the number of real tokens kept.

    Args:
        texts: Indexable collection of raw inputs passed to ``tokenizer``.
        labels: Indexable collection of labels, aligned with ``texts``.
        max_len: Width every sequence is padded or truncated to.
        vocab: Mapping from token string to integer id.
        tokenizer: Callable turning one text into a sequence of tokens
            (strings, or ids that are already integers).
    """

    PAD_ID = 0  # id used to fill sequences shorter than max_len
    UNK_TOKEN = "<UNK>"  # vocab key used for out-of-vocabulary tokens, if present

    def __init__(self, texts, labels, max_len, vocab, tokenizer):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        token_ids, length = self.tokenize_and_pad(self.texts[idx])
        # Return a tensor rather than a list: the default DataLoader collate
        # transposes per-sample lists into a list of per-position tensors,
        # which is not the (batch, time) tensor the training loop expects.
        return torch.tensor(token_ids, dtype=torch.long), self.labels[idx], length

    def tokenize_and_pad(self, text):
        """Tokenize ``text``, map tokens to ids, and pad/truncate to ``max_len``.

        Returns:
            ``(ids, length)``: a list of ``max_len`` integer ids and the number
            of real tokens in it (never more than ``max_len``; 0 for a text
            that produces no tokens).
        """
        tokens = list(self.tokenizer(text))[:self.max_len]  # truncate if too long

        # Unknown strings fall back to the vocab's <UNK> id when it defines
        # one, otherwise to the padding id.
        unk_id = self.vocab.get(self.UNK_TOKEN, self.PAD_ID)
        ids = [
            self.vocab.get(tok, unk_id) if isinstance(tok, str) else int(tok)
            for tok in tokens
        ]

        # Report the post-truncation length so it never exceeds the padded
        # width, which sequence packing would reject.
        length = len(ids)
        ids.extend([self.PAD_ID] * (self.max_len - length))
        return ids, length

# Training function for each epoch
def train_epoch(dataloader, model, optimizer, loss_fn):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(text_batch, label_batch, lengths)`` batches.
        model: Module called as ``model(text_batch, lengths)`` returning logits.
        optimizer: Optimizer stepping ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)``; assumed to return the
            mean loss of the batch (the default reduction of the torch losses).

    Returns:
        ``(accuracy, loss)`` averaged over all samples seen, as floats.
        ``(0.0, 0.0)`` when the loader yields no samples.
    """
    model.train()

    # Send inputs to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    correct, loss_sum, seen = 0, 0.0, 0
    for text_batch, label_batch, lengths in dataloader:
        text_batch = text_batch.to(model_device)
        label_batch = label_batch.to(model_device)
        # Lengths stay on the CPU: sequence packing rejects CUDA lengths.
        # as_tensor also avoids re-wrapping a tensor the loader already built.
        lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()

        optimizer.zero_grad()
        pred = model(text_batch, lengths)

        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the
        # epoch averages.
        n_samples = label_batch.size(0)
        loss_sum += loss.item() * n_samples
        correct += (pred.argmax(dim=1) == label_batch).sum().item()
        seen += n_samples

    if seen == 0:
        return 0.0, 0.0
    return correct / seen, loss_sum / seen

# Evaluation function for each epoch
def evaluate_epoch(dataloader, model, loss_fn):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Args:
        dataloader: Iterable of ``(text_batch, label_batch, lengths)`` batches.
        model: Module called as ``model(text_batch, lengths)`` returning logits.
        loss_fn: Callable ``loss_fn(logits, labels)``; assumed to return the
            mean loss of the batch (the default reduction of the torch losses).

    Returns:
        ``(accuracy, loss)`` averaged over all samples seen, as floats.
        ``(0.0, 0.0)`` when the loader yields no samples.
    """
    model.eval()

    # Send inputs to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    correct, loss_sum, seen = 0, 0.0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch = text_batch.to(model_device)
            label_batch = label_batch.to(model_device)
            # Lengths stay on the CPU: sequence packing rejects CUDA lengths.
            lengths = torch.as_tensor(lengths, dtype=torch.long).cpu()

            pred = model(text_batch, lengths)
            loss = loss_fn(pred, label_batch)

            # Weight by batch size so a smaller final batch does not skew the
            # epoch averages.
            n_samples = label_batch.size(0)
            loss_sum += loss.item() * n_samples
            correct += (pred.argmax(dim=1) == label_batch).sum().item()
            seen += n_samples

    if seen == 0:
        return 0.0, 0.0
    return correct / seen, loss_sum / seen

# Example tokenizer (replace with your actual tokenizer)
def simple_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of pieces."""
    pieces = text.split()
    return pieces

# Main function to train and evaluate the model
def train_and_evaluate(
    texts,
    labels,
    vocab,
    tokenizer,
    max_len=128,
    batch_size=32,
    num_epochs=10,
    hidden_dim=128,
    learning_rate=0.001,
    val_texts=None,
    val_labels=None,
    embedding_dim=100,
):
    """Train an ``RNNModel`` on ``texts``/``labels`` and track per-epoch metrics.

    Args:
        texts: Training inputs, passed to ``TextDataset``.
        labels: Integer class ids aligned with ``texts``.
        vocab: Token vocabulary; its length sizes the embedding table.
        tokenizer: Callable turning one text into tokens.
        max_len: Width sequences are padded/truncated to.
        batch_size: Batch size for both loaders.
        num_epochs: Number of passes over the training data.
        hidden_dim: RNN hidden-state size.
        learning_rate: Adam learning rate.
        val_texts: Optional held-out inputs. Must be given with ``val_labels``.
        val_labels: Optional held-out labels. Must be given with ``val_texts``.
        embedding_dim: Token embedding size.

    Returns:
        ``(model, train_accuracies, val_accuracies, train_losses, val_losses)``
        with one list entry per epoch. When no validation data is supplied the
        "validation" metrics are computed on the training data, so they only
        show the model in eval mode, not generalisation.

    Raises:
        ValueError: If only one of ``val_texts``/``val_labels`` is provided.
    """
    if (val_texts is None) != (val_labels is None):
        raise ValueError("val_texts and val_labels must be provided together")

    # Create dataset and dataloaders
    train_dataset = TextDataset(texts, labels, max_len, vocab, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    if val_texts is not None:
        val_dataset = TextDataset(val_texts, val_labels, max_len, vocab, tokenizer)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        highest_label = max(max(labels), max(val_labels))
    else:
        # No held-out split supplied: fall back to scoring the training data.
        val_loader = train_loader
        highest_label = max(labels)

    # Size the output layer from the largest class id rather than the number
    # of distinct labels, so non-contiguous ids (e.g. only 0 and 2 present)
    # still index a valid logit.
    num_classes = int(highest_label) + 1

    # Initialize model, loss function, and optimizer.
    # ``device`` is the module-level torch.device defined in this file.
    model = RNNModel(
        vocab_size=len(vocab),
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        output_dim=num_classes,
    )
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (runs all num_epochs; there is no early stopping)
    train_accuracies = []
    val_accuracies = []
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        acc_train, loss_train = train_epoch(train_loader, model, optimizer, loss_fn)
        train_accuracies.append(acc_train)
        train_losses.append(loss_train)

        acc_valid, loss_valid = evaluate_epoch(val_loader, model, loss_fn)
        val_accuracies.append(acc_valid)
        val_losses.append(loss_valid)

        print(f"Train Accuracy: {acc_train:.4f}, Train Loss: {loss_train:.4f}")
        print(f"Validation Accuracy: {acc_valid:.4f}, Validation Loss: {loss_valid:.4f}")

    return model, train_accuracies, val_accuracies, train_losses, val_losses

# Module-level device read by train_and_evaluate; prefer the GPU when present.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Toy corpus (replace with your actual data)
texts = [
    "this is a sentence",
    "another example sentence",
    "deep learning is fun",
]
labels = [0, 1, 0]

# Token -> id table; ids follow list order, so '<PAD>' is 0.
vocab = {
    token: index
    for index, token in enumerate(
        ['<PAD>', 'this', 'is', 'a', 'sentence', 'another', 'example', 'deep', 'learning', 'fun']
    )
}
tokenizer = simple_tokenizer

# Train the model
model, train_accuracies, val_accuracies, train_losses, val_losses = train_and_evaluate(
    texts=texts,
    labels=labels,
    vocab=vocab,
    tokenizer=tokenizer,
)

