In [None]:
!pip install datasets accelerate -U # not necessary with this script

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Folder inside Drive where the trained models are stored
drive_path = drive_mount_point + '/My Drive/urgency_models/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


Load data

In [None]:
# Read the labelled reviews from a comma-separated file.
df = pd.read_csv('./content/combined_reviews.csv', sep=',')

# Pull the review text and the urgency label out as plain Python lists.
texts, labels = df['review'].tolist(), df['urgent'].tolist()

Custom Dataset class:

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text every time an item is read.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of class labels aligned index-by-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)`` that
            returns a mapping containing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both
            flattened to 1-D) and ``'label'`` (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            # flatten() collapses the encoded tensors to 1-D so every item has the same flat shape.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force an integer class index: bool or float labels would otherwise
            # produce a tensor dtype that nn.CrossEntropyLoss does not accept as class targets.
            'label': torch.tensor(label, dtype=torch.long),
        }

If BERT type, use this class:

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced per input sequence.
    """

    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        # Load the checkpoint named by the constructor argument so the class
        # does not depend on any notebook-level variable being defined.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Output of the linear layer applied to the encoder's pooled output.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

If RoBERTa type, use this class:

In [None]:

class RoBERTaClassifier(nn.Module):
    """Sequence classifier: RoBERTa encoder -> dropout -> linear layer.

    Args:
        roberta_model_name: Name or path handed to ``RobertaModel.from_pretrained``.
        num_classes: Number of logits produced per input sequence.
    """

    def __init__(self, roberta_model_name, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.dropout = nn.Dropout(p=0.2)
        # The linear layer maps the encoder's hidden size onto the class logits.
        self.fc = nn.Linear(
            in_features=self.roberta.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

Training loop function

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader`` and update ``model`` in place.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output is used as logits for ``nn.CrossEntropyLoss``.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        float: Mean loss over the batches seen, or ``0.0`` when
        ``data_loader`` yields no batches.
    """
    model.train()
    # The loss module is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    # Guard the division so an empty loader does not raise ZeroDivisionError.
    return total_loss / num_batches if num_batches else 0.0

Validation function

In [2]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy_score(...), classification_report(...))`` computed
        from the collected true labels and predicted class indices.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=token_ids, attention_mask=mask)
            # The predicted class is the index of the largest logit in each row.
            y_pred += logits.argmax(dim=1).cpu().tolist()
            y_true += targets.cpu().tolist()
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

To classify unseen reviews:

In [3]:
def predict_urgency(text, model, tokenizer, device, max_length=256):
    """Classify a single text as ``'urgent'`` or ``'not urgent'``.

    Args:
        text: Raw input string.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Callable that encodes ``text`` into ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length passed to the tokenizer for padding/truncation.

    Returns:
        str: ``'urgent'`` when the predicted class index is 1, otherwise
        ``'not urgent'``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    token_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=token_ids, attention_mask=mask)
        predicted_class = logits.argmax(dim=1).item()

    if predicted_class == 1:
        return 'urgent'
    return 'not urgent'

Config:

In [None]:
# Set up parameters

# Pretrained checkpoint that is fine-tuned
model_name = 'Maltehb/danish-bert-botxo'

# Model / data shape
num_classes = 2    # size of the classifier's output layer
max_length = 128   # length every tokenized review is padded/truncated to

# Optimisation
learning_rate = 2e-5
batch_size = 16
num_epochs = 5

Divide data into training/validation/test sets

In [None]:
# Step 1: keep 80% for training and set 20% aside as a hold-out pool.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Step 2: cut the hold-out pool in half to obtain the validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    val_texts,
    val_labels,
    test_size=0.5,
    random_state=42,
)

Define tokenizer according to model type and format the data into dataloaders

In [None]:
# Pick the tokenizer class that matches the checkpoint family.
tokenizer = (
    RobertaTokenizer if 'roberta' in model_name else BertTokenizer
).from_pretrained(model_name)

# Training set (the only loader that shuffles)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation set
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Test set
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [4]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Define model according to the chosen model type:

In [None]:
# Choose the classifier wrapper that matches the checkpoint family,
# build it, and move it to the selected device.
model = (
    RoBERTaClassifier if 'roberta' in model_name else BERTClassifier
)(model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
# Use PyTorch's own AdamW; the AdamW class exported by transformers is deprecated
# in favour of it. eps and weight_decay are pinned to the defaults of the
# transformers implementation so the optimisation behaves as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=total_steps)



Now train and validate the model:

In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}, Device: {device}")

    # One full pass over the training data ...
    train(model, train_dataloader, optimizer, scheduler, device)

    # ... followed by a check on the validation split.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}", report, sep="\n")

Test on unseen data:

In [None]:
# Store the result under its own name: binding it to ``accuracy_score`` would
# replace the sklearn function that evaluate() calls, so every later
# evaluate() call in this kernel would fail.
test_accuracy, report = evaluate(model, test_dataloader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(report)

Save model:

In [None]:
# Save model to Drive
save_info = input("Please enter the model name under which you want to save the model: ").strip()
if not save_info:
    # An empty name would produce a file called just ".pth".
    raise ValueError("The model name must not be empty.")

# torch.save does not create missing folders, so make sure the target folder exists.
os.makedirs(drive_path, exist_ok=True)

# Only the weights (state_dict) are written; loading them later requires
# building the classifier first and calling load_state_dict on it.
torch.save(model.state_dict(), os.path.join(drive_path, save_info + ".pth"))

# NOTE(review): the tokenizer goes to the local ./models folder, not to Drive — confirm this is intended.
tokenizer.save_pretrained(f"./models/tokenizer_{save_info}")

If the model and tokenizer have not been instantiated, load them from the directory:

In [None]:
# load stored model and tokenizer
# Checkpoint the stored weights were fine-tuned from; it must match the saved file.
checkpoint_name = 'Maltehb/danish-bert-botxo'
weights_path = './project/urgency_models/danish_bert_classifier.pth'

# The file holds a state_dict (weights only), so torch.load returns a plain
# mapping rather than a module: build the classifier first, then load the weights.
model = BERTClassifier(checkpoint_name, 2)
state_dict = torch.load(weights_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

# predict_urgency moves its inputs to `device`, so the model has to live there as well.
model = model.to(device)
model.eval()

tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

Predict unseen:

In [None]:
# Test urgency prediction
dummy_text = "Holder ikke hvad de lover. Bryder aftaler og giver kun en problemer og gæld med hjem. Kan ikke stole på dem."
urgency = predict_urgency(dummy_text, model, tokenizer, device)
print(dummy_text)
# The classifier predicts urgency, not sentiment, so label the output accordingly.
print(f"Predicted urgency: {urgency}")

De kunne godt smile lidt mere, men de var hjælpsomme, og det var super fedt at komme på værkstedet. Vi købte en ny brugt skoda citigo, og den kører som en drøm. Vi kalder den snehvide herhjemme, men det er jo ligegyldigt for jer. I skal bare vide at det var en totalt ok oplevelse, og jeg kommer gerne igen. I onsdags var jeg ude at besøge jeres værksted, fordi jeg skulle have rettet op på en fejl, som I havde lavet, og der er nu en aftale på plads. Som sagt, oplevelsen var god, og I fortjener alle fem stjerner herfra. Rigtig god dag, kh Bente Prebensen.
Predicted sentiment: urgent
