<a href="https://colab.research.google.com/github/Benedictakel/Sentiment-Analysis-with-LSTM-IMDB-Dataset-/blob/main/Sentiment_Analysis_with_LSTM_(IMDB_Dataset).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch torchvision torchtext matplotlib scikit-learn


In [None]:
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in "basic_english" tokenizer; reused for vocabulary
# building and for encoding every review later on.
tokenizer = get_tokenizer("basic_english")
# Training split of IMDB; iterated below as (label, text) pairs to build the vocabulary.
train_iter = IMDB(split='train')

def yield_tokens(data_iter):
    """Lazily yield the tokenized text of every item in ``data_iter``.

    Each item is unpacked as a ``(label, text)`` pair; the label is ignored
    and the text is run through the module-level ``tokenizer``.
    """
    yield from (tokenizer(review) for _, review in data_iter)

# Build the vocabulary from the tokenized training reviews, registering
# "<unk>" and "<pad>" as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
# Tokens that are not in the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def preprocess(text):
    """Tokenize ``text`` and map the tokens through the module-level ``vocab``.

    Returns whatever ``vocab(...)`` produces for the token list; the caller
    in this file wraps the result in a ``torch.long`` tensor.
    """
    tokens = tokenizer(text)
    token_ids = vocab(tokens)
    return token_ids

def collate_batch(batch):
    """Collate ``(label, text)`` samples into padded token ids and binary targets.

    Args:
        batch: iterable of ``(label, text)`` pairs as produced by the IMDB
            dataset.

    Returns:
        A tuple ``(padded_texts, labels)`` where ``padded_texts`` is a
        ``torch.long`` tensor of shape ``(batch, longest_sequence)`` padded
        with the ``"<pad>"`` index, and ``labels`` is a tensor holding 1 for
        positive reviews and 0 otherwise.
    """
    labels, texts = [], []
    for label, text in batch:
        # Older torchtext releases yield the strings 'neg'/'pos'; newer ones
        # yield the integers 1 (negative) / 2 (positive). Comparing against
        # 'pos' alone silently turns every label into 0 on newer releases.
        labels.append(1 if label in ('pos', 2) else 0)
        texts.append(torch.tensor(preprocess(text), dtype=torch.long))
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
    return padded_texts, torch.tensor(labels)

# Fresh iterator over the training split (the one created earlier was already
# iterated while building the vocabulary).
train_iter = IMDB(split='train')
# The split is materialised into a list so the loader can shuffle it;
# collate_batch encodes and pads each batch of 16 reviews.
train_dataloader = DataLoader(list(train_iter), batch_size=16, shuffle=True, collate_fn=collate_batch)

# Test split: same batching and collation, but kept in its original order.
test_iter = IMDB(split='test')
test_dataloader = DataLoader(list(test_iter), batch_size=16, shuffle=False, collate_fn=collate_batch)


In [None]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Single-layer, unidirectional LSTM classifier over token-id sequences.

    Token ids are embedded, run through an LSTM, and the final hidden state
    of each sequence is passed through dropout and a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width.
        hidden_dim: LSTM hidden-state width.
        output_dim: number of output logits per sequence.
        pad_idx: index of the padding token. When ``None`` (the default) the
            index is looked up as ``vocab["<pad>"]`` in the module-level
            vocabulary, which is what the class always did before this
            parameter existed.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=None):
        super().__init__()
        self.pad_idx = vocab["<pad>"] if pad_idx is None else pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for padded ids ``x``.

        ``x`` is expected to be a ``(batch, seq_len)`` integer tensor whose
        rows are right-padded with ``pad_idx``.
        """
        embedded = self.embedding(x)

        # True length of each row = 1-based position of its last non-pad
        # token. Rows that are entirely padding are clamped to length 1
        # because packing rejects zero-length sequences.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.pad_idx) * positions).max(dim=1).values.clamp(min=1)

        # Pack so the LSTM stops at each sequence's real end. Without this,
        # the final hidden state is taken after stepping through the trailing
        # padding and varies with the longest review in the batch.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        return self.fc(self.dropout(hidden[-1]))


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Positional arguments: vocab_size=len(vocab), embed_dim=128, hidden_dim=256,
# output_dim=1 (one logit per review).
model = LSTMClassifier(len(vocab), 128, 256, 1).to(device)

# BCEWithLogitsLoss consumes raw logits, so the model output is fed to it
# without a sigmoid.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train(model, dataloader):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``. The
    returned value is the sum of per-batch losses divided by the number of
    batches in ``dataloader``.
    """
    model.train()
    running_loss = 0
    for tokens, targets in dataloader:
        tokens = tokens.to(device)
        # The loss expects float targets; the collate function emits integers.
        targets = targets.to(device).float()

        optimizer.zero_grad()
        logits = model(tokens).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    n_batches = len(dataloader)
    return running_loss / n_batches

# Train for five epochs, reporting the mean batch loss after each one
# (epochs are printed 1-based).
for epoch in range(5):
    loss = train(model, train_dataloader)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def evaluate(model, dataloader):
    """Print accuracy, precision and recall of ``model`` over ``dataloader``.

    Logits are passed through a sigmoid and rounded to obtain 0/1
    predictions, which are then compared with the labels using the
    scikit-learn metric functions. Nothing is returned.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for tokens, targets in dataloader:
            tokens = tokens.to(device)
            targets = targets.to(device).float()
            logits = model(tokens).squeeze(1)
            probabilities = torch.sigmoid(logits)
            batch_preds = torch.round(probabilities)
            predicted.extend(batch_preds.cpu().numpy().tolist())
            actual.extend(targets.cpu().numpy().tolist())
    acc = accuracy_score(actual, predicted)
    prec = precision_score(actual, predicted)
    rec = recall_score(actual, predicted)
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")

# Report metrics of the trained model on the held-out test split.
evaluate(model, test_dataloader)


In [None]:
class AttentionLSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier with additive attention pooling over time.

    Token ids are embedded and run through a bidirectional LSTM. A linear
    layer scores every time step, the scores are softmax-normalised over the
    sequence, and the weighted sum of LSTM outputs is passed through dropout
    and a final linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width.
        hidden_dim: LSTM hidden-state width per direction.
        output_dim: number of output logits per sequence.
        pad_idx: index of the padding token. When ``None`` (the default) the
            index is looked up as ``vocab["<pad>"]`` in the module-level
            vocabulary, which is what the class always did before this
            parameter existed.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=None):
        super().__init__()
        self.pad_idx = vocab["<pad>"] if pad_idx is None else pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for padded ids ``x``.

        ``x`` is expected to be a ``(batch, seq_len)`` integer tensor whose
        rows are right-padded with ``pad_idx``.
        """
        embedded = self.embedding(x)
        seq_len = x.size(1)

        # True length of each row = 1-based position of its last non-pad
        # token; all-padding rows are clamped to 1 so packing accepts them
        # and the softmax below always has at least one finite score.
        positions = torch.arange(1, seq_len + 1, device=x.device)
        lengths = ((x != self.pad_idx) * positions).max(dim=1).values.clamp(min=1)

        # Pack so neither LSTM direction steps through padding (the backward
        # direction would otherwise start from the pad tokens).
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=seq_len
        )

        # Padded time steps get a score of -inf and therefore zero attention
        # weight, so the context vector does not depend on batch padding.
        valid = positions.unsqueeze(0) <= lengths.unsqueeze(1)
        scores = self.attention(lstm_out).masked_fill(~valid.unsqueeze(-1), float("-inf"))
        attn_weights = torch.softmax(scores, dim=1)
        context = torch.sum(attn_weights * lstm_out, dim=1)
        return self.fc(self.dropout(context))
