In [23]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [24]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)
        }

In [25]:
class BertLSTMClassifier(nn.Module):
    def __init__(self, bert_model_name, hidden_dim):
        super(BertLSTMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(input_size=768, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        lstm_output, _ = self.lstm(bert_output.last_hidden_state)
        output = self.fc(lstm_output[:, -1, :])
        return torch.sigmoid(output).squeeze()

In [26]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')  # Replace with your uploaded dataset file name

# Preprocess the dataset
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})  # Map labels to binary

texts = df['review'].tolist()
labels = df['sentiment'].tolist()

df.shape

(50000, 2)

In [27]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [28]:
bert_model_name = 'bert-base-uncased'
max_len = 128
hidden_dim = 128
batch_size = 32
epochs = 10
learning_rate = 0.001

In [29]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)




In [30]:
# Model, Loss, Optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertLSTMClassifier(bert_model_name, hidden_dim).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [31]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader)}")

Epoch 1 Loss: 0.40683618985414505
Epoch 2 Loss: 0.3218914108455181
Epoch 3 Loss: 0.29987481405735017
Epoch 4 Loss: 0.2824332995772362
Epoch 5 Loss: 0.269942486089468
Epoch 6 Loss: 0.25166591953635215
Epoch 7 Loss: 0.23791376789808275
Epoch 8 Loss: 0.22052648710906506
Epoch 9 Loss: 0.20430450218468904
Epoch 10 Loss: 0.1875486241698265


In [32]:
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        preds = (outputs >= 0.5).float()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

In [33]:
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.8673
Precision: 0.8981047937569677
Recall: 0.8223764801959984
F1 Score: 0.8585740168389642
