In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the review dataset from disk
df = pd.read_csv("IMDB Dataset Processed Lemma test.csv")
x = df["cleaned_review"]

# Turn the sentiment column into integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment"])

# 80/20 train-test split, stratified on the encoded labels, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [3]:
# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- Training split: tokenize, tensorize, bundle into a dataset ---
train_encodings = tokenizer(
    list(X_train),
    truncation=True,
    padding=True,
    max_length=128,
)
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train, dtype=torch.long)
train_data = TensorDataset(train_inputs, train_masks, train_labels)

# --- Test split: same tokenizer settings as the training split ---
test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding=True,
    max_length=128,
)
test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(y_test, dtype=torch.long)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Define the BERT classifier
class BertClassifier(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled BERT output before
            the linear head. Defaults to 0.3.
        num_classes: Number of logits produced by the head. Defaults to 2
            (binary classification).
        pretrained_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to 'bert-base-uncased'.

    Calling ``BertClassifier()`` with no arguments builds the same architecture
    as before these parameters existed.
    """

    def __init__(self, dropout=0.3, num_classes=2, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so other BERT checkpoints work without editing this class.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of tokenized sequences.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Mask tensor forwarded to the BERT encoder.

        Returns:
            Tensor of logits whose last dimension has ``num_classes`` entries.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is BERT's pooled sequence representation: the [CLS]
        # hidden state after the model's pooling layer (not the raw embedding).
        pooled_output = outputs.pooler_output
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

In [5]:
# Define the Optuna objective function
def objective(trial):
    """Train a fresh ``BertClassifier`` with trial-sampled hyperparameters.

    Samples learning rate, dropout, batch size and epoch count from ``trial``,
    fine-tunes a new model on part of the module-level ``train_data``, and
    scores it on a validation subset held out of ``train_data``. The test set
    is deliberately not touched here, so it stays an unbiased estimate for the
    final model.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        Accuracy of the trained model on the held-out validation subset.
    """
    # Hyperparameter suggestions. suggest_float replaces the deprecated
    # suggest_loguniform / suggest_uniform; names and ranges are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    epochs = trial.suggest_int('epochs', 2, 5)

    # Carve a validation subset out of the training data. The generator is
    # seeded so every trial is scored on the same split and trials stay comparable.
    val_fraction = 0.1
    n_val = max(1, int(val_fraction * len(train_data)))
    n_fit = len(train_data) - n_val
    fit_subset, val_subset = torch.utils.data.random_split(
        train_data,
        [n_fit, n_val],
        generator=torch.Generator().manual_seed(42),
    )
    train_dataloader = DataLoader(fit_subset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    # Initialize model and optimizer
    model = BertClassifier()
    model.dropout = torch.nn.Dropout(dropout)  # apply the sampled dropout rate
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Move model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Training loop
    model.train()
    for _ in range(epochs):
        for batch in train_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

    # Evaluation loop on the validation subset
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            logits = model(input_ids.to(device), attention_mask.to(device))
            # Predicted class = index of the largest logit in each row
            predictions.extend(logits.argmax(dim=1).cpu().tolist())
            true_labels.extend(labels.tolist())

    # Calculate validation accuracy
    val_accuracy = accuracy_score(true_labels, predictions)
    return val_accuracy

In [None]:
# Launch the hyperparameter search: 20 trials, keeping the highest objective value
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Report the winning hyperparameters and the score they achieved
print("Best Hyperparameters:", study.best_params, sep="\n")
print("Best Trial Accuracy:", study.best_value, sep="\n")

[I 2024-11-27 19:13:10,314] A new study created in memory with name: no-name-54337eed-3a0a-49e6-bbc6-8f9293ecccee
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  dropout = trial.suggest_uniform('dropout', 0.1, 0.5)


In [None]:
# Extract the best parameters
best_params = study.best_params

# Update DataLoader with best batch size
train_dataloader = DataLoader(train_data, batch_size=best_params['batch_size'], shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=best_params['batch_size'], shuffle=False)

# Initialize final model with best dropout
final_model = BertClassifier()
final_model.dropout = torch.nn.Dropout(best_params['dropout'])

# Optimizer with best learning rate
final_optimizer = torch.optim.AdamW(final_model.parameters(), lr=best_params['learning_rate'])

# Build the loss module once; it is stateless, so there is no reason to
# re-instantiate it for every batch inside the training loop.
final_loss_fn = torch.nn.CrossEntropyLoss()

# Move final model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
final_model.to(device)

# Train final model on the full training set for the best number of epochs
final_model.train()
for epoch in range(best_params['epochs']):
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        final_optimizer.zero_grad()
        outputs = final_model(input_ids, attention_mask)
        loss = final_loss_fn(outputs, labels)
        loss.backward()
        final_optimizer.step()

# Evaluate final model on the test set
final_model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_dataloader:
        outputs = final_model(input_ids.to(device), attention_mask.to(device))
        # Predicted class = index of the largest logit in each row
        predictions.extend(outputs.argmax(dim=1).cpu().tolist())
        true_labels.extend(labels.tolist())

# Print final test accuracy and classification report
test_accuracy = accuracy_score(true_labels, predictions)
print(f"Final Test Accuracy: {test_accuracy}")

# NOTE(review): target_names assumes the label encoder mapped the negative
# class to 0 and the positive class to 1 — confirm against label_encoder.classes_.
report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])
print("Final Classification Report:\n", report)