In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Read the semicolon-separated source file into a DataFrame.
df = pd.read_csv('Data/todo.csv', sep=';')

#TODO change this to the correct column names
reviews, labels = df['text'].tolist(), df['label'].tolist()

# Hold out 20% of the rows for testing. The split is seeded and stratified on
# the labels, so both parts keep the label proportions of the full data.
split = train_test_split(reviews, labels, test_size=0.2, random_state=42, stratify=labels)
train_texts, test_texts, train_labels, test_labels = split

# Pre-trained BERT tokenizer (uncased base vocabulary).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with identical settings: truncate to at most 128
# tokens and pad the shorter sequences.
tokenizer_options = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

In [None]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per sample (only ``.items()`` and integer indexing are used).
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under
        the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split, together with its labels, in a ReviewDataset.
train_dataset, test_dataset = (
    ReviewDataset(train_encodings, train_labels),
    ReviewDataset(test_encodings, test_labels),
)

# Batches of 16 for both splits; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
from transformers import BertModel
import torch.nn as nn

class BertLSTM(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a linear classifier.

    The token-level BERT outputs are run through the LSTM, averaged over the
    non-padding positions of each sequence, passed through dropout and
    projected to ``num_classes`` logits.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the pooled LSTM output.
    """

    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=128, num_layers=2, num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_size, num_layers, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size * 2, num_classes)  # Bidirectional LSTM doubles the output size

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of tokenized sequences.

        Args:
            input_ids: Token ids, forwarded to BERT unchanged.
            attention_mask: Mask with 1 for real tokens and 0 for padding
                (assumed shape ``(batch, seq_len)``, matching ``input_ids``).
                It is forwarded to BERT and also used to exclude padding from
                the pooling step. ``None`` falls back to a plain mean over
                all positions.

        Returns:
            Tensor of logits with ``num_classes`` entries per sequence.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_output.last_hidden_state
        lstm_out, _ = self.lstm(last_hidden_state)

        if attention_mask is None:
            pooled = torch.mean(lstm_out, dim=1)
        else:
            # Average only over real tokens. A plain mean would also average
            # the outputs at padded positions, making the pooled vector depend
            # on how much padding the batch happens to contain. Padded
            # positions are still fed through the LSTM; only pooling skips them.
            mask = attention_mask.unsqueeze(-1).to(lstm_out.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (lstm_out * mask).sum(dim=1) / token_counts

        output = self.dropout(pooled)
        output = self.linear(output)
        return output

# Seed PyTorch's global RNG before building the model so the randomly
# initialised LSTM / classifier weights, and later draws from the global
# generator (e.g. dropout), are repeatable when the cells are run in order.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assign the result of .to(device) instead of ending the cell on a bare
# expression, so the notebook does not print the full module repr.
model = BertLSTM(num_classes=2).to(device)

In [None]:
from torch.optim import AdamW
from tqdm import tqdm

# Optimise every parameter of the model (BERT, LSTM and classifier head).
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
# Build the loss once, outside the loops, instead of on every batch.
loss_fn = nn.CrossEntropyLoss()

model.train()
for epoch in range(num_epochs):
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list created
        # during data loading is not overwritten by a per-batch tensor.
        batch_labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        # Show the current batch loss next to the progress bar.
        loop.set_postfix(loss=loss.item())

print("Training finished!")

Evaluate the Model (AUC):

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

# Evaluation mode: disables dropout in the model's modules.
model.eval()
all_probs, all_labels = [], []

# No gradients are needed for scoring the test set.
with torch.no_grad():
    for batch in test_loader:
        logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        # Column 1 of the softmax is the probability of the positive class.
        positive_probs = logits.softmax(dim=1)[:, 1]

        all_probs.extend(positive_probs.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# ROC AUC over all test samples, computed from the positive-class probabilities.
auc = roc_auc_score(all_labels, all_probs)
print(f"AUC on the test set: {auc:.4f}")