In [1]:
import pandas as pd
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import classification_report

# Path of the CSV file that holds the dataset.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
DATA_CSV_PATH = '/Users/jinlinchen/Documents/Study/HWR Berlin/Semester 2/Analytics Lab/Analytics Project/Database Part/manual_label - consolidated.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(DATA_CSV_PATH)

In [2]:
# Keep only the columns we need, convert the labels to zero-indexed values
# (DistilBERT expects zero-indexed labels; the `- 1` assumes the labels in the
# CSV start at 1) and concatenate entity and sentence into the model input text.
#
# The frame is built in a single chained expression: `.assign` returns a fresh
# DataFrame, so no column is ever assigned on a column-subset of the loaded
# data, which is what triggers pandas' SettingWithCopyWarning.
#
# NOTE: this cell rebinds `df`; re-running it without reloading the CSV shifts
# the labels by one again.
df = (
    df[['entity', 'sentence_original', 'class_ID']]
    .assign(
        class_ID=lambda frame: frame['class_ID'] - 1,
        text=lambda frame: frame['entity'] + ": " + frame['sentence_original'],
    )
)

In [4]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Instantiate the tokenizer from the pretrained 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

In [6]:
# Dataset wrapper: tokenizes one row at a time and returns tensors that can be used for model training
class CustomDataset(Dataset):
    """Map-style dataset that yields the tokenized text and class label of one row.

    Args:
        dataframe: frame exposing a ``text`` column (model input) and a
            ``class_ID`` column (label).
        tokenizer: object providing an ``encode_plus`` method that accepts the
            keyword options used in ``__getitem__`` and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.class_ID
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (rows of the text column)."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the row at positional ``index``.

        Returns:
            dict with the raw ``'text'``, the flattened ``'input_ids'`` and
            ``'attention_mask'`` tensors, and the label as a ``torch.long``
            tensor under ``'targets'``.
        """
        # Positional lookup, so the frame's index labels do not matter
        raw_text = str(self.text.iloc[index])
        label = self.targets.iloc[index]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **encode_options)

        # Flatten the encoder's tensors to 1-D so batching stacks them per sample
        sample = {}
        sample['text'] = raw_text
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

# Parameters for tokenization and batching
MAX_LEN = 128          # maximum tokenized sequence length; longer sequences are cut
TRAIN_BATCH_SIZE = 16  # batch size of the training data loader
VALID_BATCH_SIZE = 8   # batch size of the test data loader

# Create DataLoader
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a DataFrame in a ``CustomDataset`` and return a ``DataLoader`` over it.

    Args:
        df: frame with the ``text`` and ``class_ID`` columns read by ``CustomDataset``.
        tokenizer: tokenizer handed to ``CustomDataset``.
        max_len: sequence length every sample is padded / truncated to.
        batch_size: number of samples per batch.
        shuffle: when True the loader reshuffles the samples at every epoch.
            Defaults to False, which keeps the previous behaviour (fixed order).

    Returns:
        A ``torch.utils.data.DataLoader`` that loads in the main process
        (``num_workers=0``).
    """
    ds = CustomDataset(
        dataframe=df,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=0)


# Reshuffle the training samples every epoch so the batches differ between
# epochs; keep the test loader in a fixed order.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE, shuffle=True)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, VALID_BATCH_SIZE)

In [7]:
# Build the classifier from the pretrained checkpoint, with one output per distinct label value in df
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(df.class_ID.unique()),
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Optimizer that updates the model parameters to minimize the loss
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

# Scheduler that adjusts the learning rate during training, without warm-up steps.
# The factor 3 is the number of epochs; it has to agree with EPOCHS in the training cell.
total_steps = 3 * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Training function
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one training pass over ``data_loader`` and update the model.

    Args:
        model: callable module that accepts ``input_ids``, ``attention_mask``
            and ``labels`` keyword arguments and returns an object with
            ``loss`` and ``logits`` attributes.
        data_loader: iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors) that also exposes
            a ``dataset`` attribute supporting ``len()``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``len(data_loader.dataset)``) and ``mean_loss``
        is the mean of the per-batch losses. Both are NaN when the loader and
        its dataset are empty.
    """
    model = model.train()
    losses = []
    # Tensor accumulator instead of the int 0, so the final `.double()` also
    # works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        # Clear gradients before the forward/backward pass so nothing left over
        # from an earlier backward call leaks into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = outputs.loss
        logits = outputs.logits

        # Predicted class = index of the largest logit in each row
        preds = logits.argmax(dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead
    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return accuracy, mean_loss

# Evaluation function
def eval_model(model, data_loader, device):
    """Evaluate the model on ``data_loader`` without updating any weights.

    Args:
        model: callable module that accepts ``input_ids``, ``attention_mask``
            and ``labels`` keyword arguments and returns an object with
            ``loss`` and ``logits`` attributes.
        data_loader: iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors) that also exposes
            a ``dataset`` attribute supporting ``len()``.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a 0-dim float64 tensor
        (correct predictions / ``len(data_loader.dataset)``) and ``mean_loss``
        is the mean of the per-batch losses. Both are NaN when the loader and
        its dataset are empty.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator instead of the int 0, so the final `.double()` also
    # works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            loss = outputs.loss
            logits = outputs.logits

            # Predicted class = index of the largest logit in each row
            preds = logits.argmax(dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead
    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return accuracy, mean_loss

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Model training
EPOCHS = 3

# The learning-rate scheduler was created for a fixed number of optimizer steps
# (`total_steps`). Fail early if EPOCHS no longer matches that number, instead
# of silently training with a learning-rate schedule of the wrong length.
assert total_steps == len(train_data_loader) * EPOCHS, (
    f'scheduler was configured for {total_steps} steps, '
    f'but EPOCHS={EPOCHS} requires {len(train_data_loader) * EPOCHS}'
)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training data
    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the current weights on the held-out test loader
    val_acc, val_loss = eval_model(model, test_data_loader, device)
    print(f'Validation loss {val_loss} accuracy {val_acc}')
    print()

Epoch 1/3
----------
Train loss 1.0659740704756517 accuracy 0.45544554455445546
Validation loss 1.0170965790748596 accuracy 0.35294117647058826

Epoch 2/3
----------
Train loss 0.9670340969012334 accuracy 0.5495049504950495
Validation loss 0.8889953323772976 accuracy 0.6078431372549019

Epoch 3/3
----------
Train loss 0.7265338851855352 accuracy 0.7178217821782178
Validation loss 0.7588160719190326 accuracy 0.7254901960784313



In [10]:
# Helper that collects the true labels and the predicted labels, used to evaluate the model
def eval_model_and_get_predictions(model, data_loader, device):
    """Run the model over ``data_loader`` and gather labels and predictions.

    Args:
        model: callable module that accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with a
            ``logits`` attribute.
        data_loader: iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors).
        device: device the batch tensors are moved to.

    Returns:
        ``(true_labels, pred_labels)``: two lists, in loader order, holding the
        label and the predicted class index of every sample.
    """
    model = model.eval()
    true_labels, pred_labels = [], []

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            batch_targets = batch["targets"].to(device)

            model_output = model(input_ids=ids, attention_mask=mask)

            # Index of the largest logit in each row is the predicted class
            batch_preds = torch.max(model_output.logits, dim=1).indices

            true_labels.extend(batch_targets.cpu().numpy())
            pred_labels.extend(batch_preds.cpu().numpy())

    return true_labels, pred_labels

# Number of classes, taken from the full DataFrame `df` (the frame that was split into train and test)
num_labels = len(df['class_ID'].unique())

# Get predictions and true labels
true_labels, pred_labels = eval_model_and_get_predictions(model, test_data_loader, device)

# Generate the classification report.
# `labels` is passed explicitly so the report rows always line up with
# `target_names`: without it scikit-learn infers the classes from the test
# labels/predictions and raises a ValueError when a class is missing from them.
print(classification_report(
    true_labels,
    pred_labels,
    labels=list(range(num_labels)),
    target_names=[str(i) for i in range(num_labels)],
))

              precision    recall  f1-score   support

           0       0.71      0.83      0.77        18
           1       0.83      0.56      0.67         9
           2       0.71      0.71      0.71        24

    accuracy                           0.73        51
   macro avg       0.75      0.70      0.71        51
weighted avg       0.73      0.73      0.72        51

