In [1]:
!pip install transformers torch



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the review dataset; the model input is the pre-cleaned review text
df = pd.read_csv("IMDB Dataset Processed Lemma test.csv")
x = df["cleaned_review"]

# Turn the sentiment column into integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment"])

# Hold out 20% for testing; stratify so both splits keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [3]:
from transformers import BertTokenizer

# Pretrained WordPiece tokenizer matching the uncased base BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode train and test texts with identical settings: sequences longer than
# 128 tokens are truncated, shorter ones are padded. Each split is tokenized
# in its own call, train first, then test.
train_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True, max_length=128)
    for texts in (X_train, X_test)
)


In [4]:
import torch

# Class ids become int64 tensors (one per split)
train_labels, test_labels = (
    torch.tensor(split_labels, dtype=torch.long)
    for split_labels in (y_train, y_test)
)

# Token ids and attention masks from the tokenizer output become tensors
train_inputs, train_masks = (
    torch.tensor(train_encodings[field])
    for field in ('input_ids', 'attention_mask')
)
test_inputs, test_masks = (
    torch.tensor(test_encodings[field])
    for field in ('input_ids', 'attention_mask')
)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Training split: (input_ids, attention_mask, label) triples, reshuffled
# every epoch and served in mini-batches of 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

# Test split: same layout and batch size, but kept in its original order
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=16)


In [6]:
from transformers import BertModel

class BertClassifier(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        model_name: Name or path of the pretrained BERT checkpoint to load.
        num_labels: Number of output classes (width of the logits vector).
        dropout: Dropout probability applied to the pooled encoder output.

    With the defaults this is the original fixed configuration:
    'bert-base-uncased', two classes (positive / negative), dropout 0.3.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=2, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the head's input width from the encoder config instead of
        # hard-coding 768, so other BERT sizes work without edits.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor produced by the tokenizer.
            attention_mask: Mask tensor marking real tokens vs. padding.

        Returns:
            The output of the linear head applied to the (dropped-out) pooled
            encoder output; one row of ``num_labels`` logits per input row.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 1 is the pooled output: the [CLS] token's final hidden state
        # after BERT's pooler layer (not the raw [CLS] embedding).
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output


In [7]:
# Seed torch's global RNG before anything stochastic happens, so the random
# initialisation of the classification head, the dropout masks and the
# DataLoader shuffle order are repeatable across Restart & Run All.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

model = BertClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(3):  # choose the number of epochs
    model.train()  # enable dropout for training
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")


Epoch 1, Loss: 0.3356909539190031
Epoch 2, Loss: 0.21331968078429195
Epoch 3, Loss: 0.1159635128385778


In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Switch to inference behaviour (dropout disabled)
model.eval()
predictions, true_labels = [], []

# No gradients are needed for scoring the test set
with torch.no_grad():
    for input_ids, attention_mask, labels in test_dataloader:
        batch_logits = model(input_ids, attention_mask)

        # Predicted class = index of the largest logit in each row
        batch_predictions = batch_logits.detach().cpu().numpy().argmax(axis=1)
        batch_targets = labels.to('cpu').numpy()

        # Accumulate per-example results across batches
        predictions.extend(batch_predictions)
        true_labels.extend(batch_targets)

# Overall accuracy on the held-out split
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy}")

# Per-class precision / recall / F1
report = classification_report(
    true_labels,
    predictions,
    target_names=["Negative", "Positive"],
)
print("Classification Report:\n", report)


Test Accuracy: 0.8744580014117173
Classification Report:
               precision    recall  f1-score   support

    Negative       0.83      0.94      0.88      4940
    Positive       0.93      0.81      0.87      4977

    accuracy                           0.87      9917
   macro avg       0.88      0.87      0.87      9917
weighted avg       0.88      0.87      0.87      9917

