DistilBert model 1

Input: df['text'] = df['entity'] + ": " + df['sentence_original']

In [9]:
import pandas as pd
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import classification_report
import random

# Load data to a DataFrame
df = pd.read_csv('/Users/jinlinchen/Documents/Study/HWR Berlin/Semester 2/Analytics Lab/Analytics Project/Database Part/manual_label - consolidated.csv')

In [10]:
# Set seeds for reproducibility (Chatgpt)
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [11]:
# Keep only the columns we need
df = df[['entity', 'sentence_original', 'class_ID']]

# Convert labels to zero-indexed, because DistilBERT expects zero-indexed labels
df['class_ID'] = df['class_ID'] - 1

# Concatenate entity and sentence
df['text'] = df['entity'] + ": " + df['sentence_original']

In [12]:
# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [14]:
# Defining a class for tokenizing the text and convert it to a tensor that can be used for model training
class cleaned_dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.class_ID
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        target = self.targets.iloc[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# Setting Parameters for the model
# The maximum length of the tokenized sequences. If the sequences are longer than this, they will be cut.
# The batch size for the training and test data loader.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8

# Create DataLoader
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = cleaned_dataset(
        dataframe=df,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=0)

train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, VALID_BATCH_SIZE)

In [15]:
# Determine the model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(df.class_ID.unique()))

# Determine to use CPU or GPU, sometimes GPU is not available or the memory is not enough. 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Using an Optimizer to update the model parameters and minimize the loss
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False) 
# we tried a few learning rates, till now 2e-5 is the best, default learning rate has very different result. It is a lot lower than the current learning rate.



# Training function
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Evaluation function (showing result during training)
def eval_model(model, data_loader, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            loss = outputs.loss
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Model training
EPOCHS = 100

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, test_data_loader, device)
    print(f'Validation loss {val_loss} accuracy {val_acc}')
    print()

Epoch 1/100
----------
Train loss 1.0603708578990056 accuracy 0.43564356435643564
Validation loss 0.9287258046013969 accuracy 0.47058823529411764

Epoch 2/100
----------
Train loss 0.8862763689114497 accuracy 0.5792079207920792
Validation loss 0.7685343197413853 accuracy 0.7450980392156863

Epoch 3/100
----------
Train loss 0.6145729262095231 accuracy 0.7772277227722773
Validation loss 0.8565532309668404 accuracy 0.6078431372549019

Epoch 4/100
----------
Train loss 0.34229246985453826 accuracy 0.8712871287128713
Validation loss 1.193250732762473 accuracy 0.6078431372549019

Epoch 5/100
----------
Train loss 0.3004366629398786 accuracy 0.8861386138613861
Validation loss 1.1237867730004447 accuracy 0.6862745098039216

Epoch 6/100
----------
Train loss 0.2305671566954026 accuracy 0.9108910891089109
Validation loss 1.2301928911890303 accuracy 0.6666666666666666

Epoch 7/100
----------
Train loss 0.17922212842565316 accuracy 0.9356435643564357
Validation loss 1.518262198993138 accuracy 0.6

In [17]:
# Create a function to get the predicted labels and true labels, then evalaute the model
def eval_model_and_get_predictions(model, data_loader, device):
    model = model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            true_labels.extend(targets.cpu().numpy())
            pred_labels.extend(preds.cpu().numpy())

    return true_labels, pred_labels

# Define num_labels based on the training DataFrame
num_labels = len(df['class_ID'].unique())

# Get predictions and true labels
true_labels, pred_labels = eval_model_and_get_predictions(model, test_data_loader, device)

# Generate the classification report
print(classification_report(true_labels, pred_labels, target_names=[str(i) for i in range(num_labels)]))

              precision    recall  f1-score   support

           0       0.64      0.78      0.70        18
           1       0.75      0.67      0.71         9
           2       0.71      0.62      0.67        24

    accuracy                           0.69        51
   macro avg       0.70      0.69      0.69        51
weighted avg       0.69      0.69      0.69        51

