# Fine-tuning Classifier LLM


In [1]:
!pip install optuna
!pip install typing
!pip install evaluate
!pip install torch
!pip install transformers
!pip install accelerate>=0.26.0

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Using cached nvidia_curand_cu12

In [78]:
# setup - load packages
import pandas as pd
from datasets import Dataset, load_dataset
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from typing import Union, Mapping, List, Dict, Any
import evaluate
from tqdm import tqdm
import zipfile
import os
from sklearn.utils.class_weight import compute_class_weight

# Seed shared by every shuffle/split in this notebook. The NumPy and PyTorch global RNGs are
# seeded as well so that randomly initialised weights (e.g. a new classification head) are
# repeatable after Restart & Run All.
seed = 13
np.random.seed(seed)
torch.manual_seed(seed)

# Set up device (use the GPU if available to speed up computations)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## VERSION A

In [None]:
# Load the classifier data for version A from CSV. Later cells read its `speech_text`
# and `label` columns.
classifier_data = pd.read_csv("../data/classifier_data_A.csv")
# converting to huggingface dataset format
data = Dataset.from_pandas(classifier_data)
# splitting into train, test and validation sets

label_names = ['CDU/CSU', 'SPD', 'GRÜNE', 'FDP', 'AfD', 'LINKE']
# Class ids are assigned in sorted order of the party names, so id 0 is the first name
# returned by sorted(label_names).
label2id = dict(zip(sorted(label_names), range(len(label_names))))
id2label = dict(enumerate(sorted(label_names)))


def map_labels(example):
    """Replace the party name stored under ``"label"`` with its integer class id.

    The example dict is modified in place and the same object is returned.
    Raises ``KeyError`` if the party name is not one of ``label_names``.
    """
    party_name = example["label"]
    example["label"] = label2id[party_name]
    return example
# Convert the string party labels to integer class ids.
data = data.map(map_labels)

# Shuffle once with the notebook seed before splitting.
raw_dataset = data.shuffle(seed=seed)

# 70% train, 15% test, 15% validation data:
# first hold out 30%, then cut that hold-out in half.
split = raw_dataset.train_test_split(test_size=0.3, seed=seed)
train_data, text_and_val_data = split["train"], split["test"]
split = text_and_val_data.train_test_split(test_size=0.5, seed=seed)
test_data, val_data = split["train"], split["test"]

# Report the size of every split.
for split_name, split_rows in (
    ("Training", train_data),
    ("Test", test_data),
    ("Validation", val_data),
):
    print(f"{split_name} samples party: {len(split_rows)}")



Map:   0%|          | 0/36117 [00:00<?, ? examples/s]

Training samples party: 25281
Test samples party: 5418
Validation samples party: 5418


In [None]:
# subset of train and val data for auto-tuning
#train_data_for_tune = train_data.shuffle(seed=seed)
#val_data_for_tune = val_data.shuffle(seed=seed)

# subsetting roughly 20-25% of train and val data for tuning
#train_data_for_tune = train_data_for_tune.select(range(60))
#val_data_for_tune = val_data_for_tune.select(range(10))


In [80]:
# Class weights for the weighted loss: sklearn's "balanced" heuristic over the
# integer labels of the training split, one weight per class id found there.
all_train_labels = train_data["label"]
unique_labels = np.unique(all_train_labels)

class_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=unique_labels,
        y=all_train_labels,
    ),
    dtype=torch.float,
)
class_weights

tensor([1.1977, 0.6153, 1.2799, 1.1866, 1.7240, 0.7487])

In [81]:
# Number of tokens per tokenized window; passed to the tokenizer as `max_length`
# (and as `model_max_length` when the tokenizer is loaded).
WINDOW_LENGTH = 512
# Passed to the tokenizer as `stride`: how many tokens consecutive windows of one
# long text overlap by when overflowing tokens are returned.
STRIDE = 256

In [82]:
# Load Tokenizer
# `model_name` is the pretrained checkpoint identifier used for the tokenizer.
model_name = "deepset/gbert-base"
# Number of target classes, derived from the label list so the two cannot drift apart.
num_labels = len(label_names)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=WINDOW_LENGTH
)


In [83]:
def sliding_window_tokenize(batch):
    """Tokenize a batch of speeches into fixed-length, overlapping windows.

    Args:
        batch: Mapping with a ``"speech_text"`` list of texts and a ``"label"`` list
            holding one label per text.

    Returns:
        The tokenizer output for all windows (each padded/truncated to
        ``WINDOW_LENGTH`` with ``STRIDE`` tokens of overlap), extended with a
        ``"labels"`` list that gives every window the label of the text it came from.
    """
    speeches = batch["speech_text"]
    speech_labels = batch["label"]

    windows = tokenizer(
        speeches,
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
    )

    # One text can yield several windows; `overflow_to_sample_mapping` holds, per window,
    # the index of its source text within this batch.
    window_labels = []
    for source_idx in windows["overflow_to_sample_mapping"]:
        window_labels.append(speech_labels[source_idx])
    windows["labels"] = window_labels

    return windows


In [None]:
# Tokenize the training and validation splits into sliding windows. The mapping returns
# one row per window rather than per speech, so each split's original columns are removed
# and only the columns produced by sliding_window_tokenize remain.
tokenized_train_data = train_data.map(
    sliding_window_tokenize,
    batched=True,
    remove_columns=train_data.column_names
)

# Each split drops its own columns, so this call does not depend on train_data.
tokenized_val_data = val_data.map(
    sliding_window_tokenize,
    batched=True,
    remove_columns=val_data.column_names
)

# also tokenizing subsets

# tokenized_train_data_subset = train_data_for_tune.map(
#     sliding_window_tokenize,
#     batched=True,
#     remove_columns=train_data_for_tune.column_names
# )

# tokenized_val_data_subset = val_data_for_tune.map(
#     sliding_window_tokenize,
#     batched=True,
#     remove_columns=val_data_for_tune.column_names
# )

Map:   0%|          | 0/25281 [00:00<?, ? examples/s]

Map:   0%|          | 0/5418 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [85]:
# Load the pretrained checkpoint with a sequence-classification head sized for our labels.
# The checkpoint is taken from `model_name` so the model always matches the tokenizer,
# which is loaded from the same variable.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

# Metric implementations used by compute_metrics during evaluation.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row is the
            argmax of its logits along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.asarray(logits).argmax(axis=1)

    accuracy_result = accuracy_metric.compute(predictions=predicted_ids, references=labels)
    f1_result = f1_metric.compute(predictions=predicted_ids, references=labels, average="macro")

    scores = {}
    scores["accuracy"] = accuracy_result["accuracy"]
    scores["f1"] = f1_result["f1"]
    return scores


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
from transformers import Trainer
import torch.nn.functional as F

class WeightedTrainer(Trainer):
    """Trainer whose training/evaluation loss is a class-weighted cross-entropy.

    Args:
        *args: Positional arguments forwarded unchanged to ``Trainer.__init__``.
        class_weights: Optional tensor passed as ``weight`` to ``F.cross_entropy``
            (one entry per class). When ``None`` the loss is an unweighted cross-entropy.
        **kwargs: Keyword arguments forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # The declared default is None, so only move the tensor when one was actually given.
        self.class_weights = (
            class_weights.to(self.model.device) if class_weights is not None else None
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the (optionally class-weighted) cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is popped and the remaining
                entries are passed to ``model`` as keyword arguments.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Accepted and ignored, so extra arguments supplied by the caller
                do not raise.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple if ``return_outputs`` is set.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = self.class_weights
        if weight is not None:
            # Follow the logits' device; this is a no-op when the weights already live there.
            weight = weight.to(logits.device)

        loss = F.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss


## Hyperparameter Tuning

In [None]:
# # training arguments for hyperparameter tuning
# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained(
#         model_name,
#         num_labels=num_labels
#     )

# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     save_strategy="no",  # don't save checkpoints during tuning
#     logging_dir="./logs",
#     disable_tqdm=True,  # speed up tuning
#     fp16=torch.cuda.is_available(),
#     report_to="none",   # optional: disable W&B or other logging
# )

# def hp_space(trial):
#     return {
#         "learning_rate": trial.suggest_categorical("learning_rate", [1e-5, 2e-5, 3e-5]),
#         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
#         "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 4),
#         "weight_decay": trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.05]),
#     }

# trainer = WeightedTrainer(
#     model_init=model_init,
#     args=training_args,
#     train_dataset=tokenized_train_data_subset,
#     eval_dataset=tokenized_val_data_subset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     class_weights=class_weights
# )

# best_run = trainer.hyperparameter_search(
#     direction="maximize",        # because we want to maximize accuracy
#     hp_space=hp_space,
#     n_trials=4,                 # how many combinations to try
#     compute_objective=lambda metrics: metrics["eval_accuracy"],
#     backend="optuna"
# )

# print("Best hyperparameters:")
# print(best_run.hyperparameters)


  super().__init__(*args, **kwargs)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-08-05 10:00:32,030] A new study created in memory with name: no-name-60846803-ff49-46c0-8a73-cfe7dad72633
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 2.095463991165161, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.052083333333333336, 'eval_runtime': 0.2998, 'eval_samples_per_second': 90.051, 'eval_steps_per_second': 13.341, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.0631144046783447, 'eval_accuracy': 0.1111111111111111, 'eval_f1': 0.053418803418803416, 'eval_runtime': 0.2273, 'eval_samples_per_second': 118.804, 'eval_steps_per_second': 17.601, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.129704475402832, 'eval_accuracy': 0.1111111111111111, 'eval_f1': 0.05555555555555555, 'eval_runtime': 0.4104, 'eval_samples_per_second': 65.788, 'eval_steps_per_second': 9.746, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.144946336746216, 'eval_accuracy': 0.1111111111111111, 'eval_f1': 0.05555555555555555, 'eval_runtime': 0.2324, 'eval_samples_per_second': 116.202, 'eval_steps_per_second': 17.215, 'epoch': 4.0}
{'train_runtime': 21.0653, 'train_samples_per_second': 34.559, 'train_steps_per_second': 2.279, 'train_loss': 1.6013177235921223, 'epoch': 4.0}


[I 2025-08-05 10:00:54,799] Trial 0 finished with value: 0.1111111111111111 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.01}. Best is trial 0 with value: 0.1111111111111111.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 2.0560412406921387, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.052083333333333336, 'eval_runtime': 0.2362, 'eval_samples_per_second': 114.297, 'eval_steps_per_second': 16.933, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.0405139923095703, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.053763440860215055, 'eval_runtime': 0.2501, 'eval_samples_per_second': 107.971, 'eval_steps_per_second': 15.996, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.0447702407836914, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.05952380952380953, 'eval_runtime': 0.234, 'eval_samples_per_second': 115.38, 'eval_steps_per_second': 17.093, 'epoch': 3.0}
{'train_runtime': 13.1514, 'train_samples_per_second': 41.516, 'train_steps_per_second': 1.369, 'train_loss': 1.719774776034885, 'epoch': 3.0}


[I 2025-08-05 10:01:09,691] Trial 1 finished with value: 0.18518518518518517 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.01}. Best is trial 1 with value: 0.18518518518518517.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 2.1987695693969727, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.052083333333333336, 'eval_runtime': 0.2504, 'eval_samples_per_second': 107.843, 'eval_steps_per_second': 15.977, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.145566701889038, 'eval_accuracy': 0.2222222222222222, 'eval_f1': 0.10644257703081232, 'eval_runtime': 0.2378, 'eval_samples_per_second': 113.546, 'eval_steps_per_second': 16.822, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.16226863861084, 'eval_accuracy': 0.14814814814814814, 'eval_f1': 0.07125603864734299, 'eval_runtime': 0.2462, 'eval_samples_per_second': 109.666, 'eval_steps_per_second': 16.247, 'epoch': 3.0}
{'train_runtime': 15.9426, 'train_samples_per_second': 34.248, 'train_steps_per_second': 4.328, 'train_loss': 1.6568122532056726, 'epoch': 3.0}


[I 2025-08-05 10:01:27,275] Trial 2 finished with value: 0.14814814814814814 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.0}. Best is trial 1 with value: 0.18518518518518517.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 2.060236692428589, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.052083333333333336, 'eval_runtime': 0.2365, 'eval_samples_per_second': 114.177, 'eval_steps_per_second': 16.915, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.0383543968200684, 'eval_accuracy': 0.18518518518518517, 'eval_f1': 0.05952380952380953, 'eval_runtime': 0.2439, 'eval_samples_per_second': 110.707, 'eval_steps_per_second': 16.401, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.0511438846588135, 'eval_accuracy': 0.1111111111111111, 'eval_f1': 0.04, 'eval_runtime': 0.9007, 'eval_samples_per_second': 29.978, 'eval_steps_per_second': 4.441, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 2.058006763458252, 'eval_accuracy': 0.14814814814814814, 'eval_f1': 0.07341269841269842, 'eval_runtime': 0.2424, 'eval_samples_per_second': 111.384, 'eval_steps_per_second': 16.501, 'epoch': 4.0}
{'train_runtime': 19.7245, 'train_samples_per_second': 36.908, 'train_steps_per_second': 1.217, 'train_loss': 1.6841667493184407, 'epoch': 4.0}


[I 2025-08-05 10:01:48,781] Trial 3 finished with value: 0.14814814814814814 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.05}. Best is trial 1 with value: 0.18518518518518517.


Best hyperparameters:
{'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.01}


In [None]:
# best_run_A_df = pd.DataFrame(best_run)
# best_run_A_df.to_csv("hyperpara_A.csv",index=False)

## Training with best Tuning Parameters


In [None]:
#best_run.hyperparameters = {'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.01}

# training_args = TrainingArguments(
#     output_dir="./results",
#     learning_rate=best_run.hyperparameters["learning_rate"],
#     per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
#     num_train_epochs=best_run.hyperparameters["num_train_epochs"],
#     weight_decay=best_run.hyperparameters["weight_decay"],
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     per_device_eval_batch_size=16,
#     load_best_model_at_end=True,
#     fp16=torch.cuda.is_available(),
#     report_to="none",
# )

AttributeError: can't set attribute

In [87]:
# Training configuration for the final run with fixed hyperparameters.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs both to line up.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=16,
    # No metric_for_best_model is given, so per the Transformers documentation the "best"
    # checkpoint is chosen by validation loss.
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
    seed=seed,  # reuse the notebook-wide seed instead of the library default
)

# Trainer with class-weighted loss; batches are assembled by DataCollatorWithPadding.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    class_weights=class_weights
)

trainer.train()


  super().__init__(*args, **kwargs)
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7873,0.759904,0.706295,0.708013
2,0.4588,0.747678,0.738231,0.739981
3,0.2402,0.857359,0.750793,0.748127


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=13302, training_loss=0.5846823154759755, metrics={'train_runtime': 6295.0667, 'train_samples_per_second': 33.808, 'train_steps_per_second': 2.113, 'total_flos': 5.599809519986074e+16, 'train_loss': 0.5846823154759755, 'epoch': 3.0})

In [88]:
# Write the fine-tuned model and the tokenizer files into the same directory.
model.save_pretrained("classifier_final_A/")
tokenizer.save_pretrained("classifier_final_A/")

('classifier_final_A/tokenizer_config.json',
 'classifier_final_A/special_tokens_map.json',
 'classifier_final_A/vocab.txt',
 'classifier_final_A/added_tokens.json',
 'classifier_final_A/tokenizer.json')

In [89]:
# model and tokenizer must already be loaded
# Put the model into evaluation mode before running inference.
model.eval()

def tokenize_sliding_windows(example: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize one example's ``"speech_text"`` into overlapping windows as PyTorch tensors.

    Every window is padded/truncated to ``WINDOW_LENGTH`` tokens and consecutive windows
    overlap by ``STRIDE`` tokens. The tokenizer's encoding is returned unchanged.
    """
    window_options = dict(
        truncation=True,
        padding="max_length",
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=False,
        return_tensors="pt",
    )
    return tokenizer(example["speech_text"], **window_options)

def predict_proba_for_dataset(
    dataset: Dataset,
    label_names,
    max_windows_per_batch: int = 32,
) -> List[Dict[str, Any]]:
    """Predict one label per example by averaging class probabilities over its windows.

    Each example is tokenized into sliding windows, the windows are scored by the global
    ``model``, and the softmax probabilities are averaged across windows.

    Args:
        dataset: Iterable of examples with a ``"speech_text"`` text and an integer
            ``"label"`` that indexes into ``label_names``.
        label_names: Sequence of label names where position ``i`` is the name of class id ``i``.
        max_windows_per_batch: Upper bound on how many windows go through the model in one
            forward pass, so very long texts do not have to fit into memory at once.

    Returns:
        One dict per example with ``"probs"`` (averaged class probabilities as a list),
        ``"label"`` (true label name) and ``"prediction_label"`` (predicted label name).

    Raises:
        ValueError: If ``max_windows_per_batch`` is smaller than 1.
    """
    if max_windows_per_batch < 1:
        raise ValueError("max_windows_per_batch must be at least 1")

    # Do not rely on an earlier cell having switched the model to evaluation mode.
    model.eval()

    results = []

    for example in tqdm(dataset):
        tokenized = tokenize_sliding_windows(example)
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]

        chunk_probs = []
        with torch.no_grad():
            # Score the windows in bounded chunks; each window's softmax is independent
            # of the others, so chunking does not change what is averaged below.
            for start in range(0, input_ids.shape[0], max_windows_per_batch):
                stop = start + max_windows_per_batch
                outputs = model(
                    input_ids=input_ids[start:stop].to(model.device),
                    attention_mask=attention_mask[start:stop].to(model.device),
                )
                chunk_probs.append(
                    torch.nn.functional.softmax(outputs.logits, dim=-1).cpu()
                )

        probs = torch.cat(chunk_probs, dim=0).numpy()

        # Average over windows (axis 0), then pick the most probable class.
        avg_probs = probs.mean(axis=0)
        pred_idx = int(np.argmax(avg_probs))
        pred_label = label_names[pred_idx]

        results.append({
            "probs": avg_probs.tolist(),
            "label": label_names[example["label"]],  # true value
            "prediction_label": pred_label,
        })

    return results


### Validation

In [90]:
# Predict on the validation split. sorted(label_names) is the same ordering label2id was
# built from, so position i in the list is the name of class id i.
results_val_A = predict_proba_for_dataset(val_data, sorted(label_names))

  return forward_call(*args, **kwargs)
100%|██████████| 5418/5418 [02:44<00:00, 32.85it/s]


In [91]:
# Collect the per-example result dicts (probs, label, prediction_label) into a DataFrame.
results_val_A_df = pd.DataFrame(results_val_A)

In [92]:
# Save the validation predictions as CSV, without the DataFrame index column.
results_val_A_df.to_csv("classifier_final_A_validation_results.csv",index=False)

### TEST

In [93]:
# Predict on the test split. sorted(label_names) is the same ordering label2id was
# built from, so position i in the list is the name of class id i.
results_test_A = predict_proba_for_dataset(test_data, sorted(label_names))

100%|██████████| 5418/5418 [02:48<00:00, 32.22it/s]


In [94]:
# Collect the per-example result dicts (probs, label, prediction_label) into a DataFrame.
results_test_A_df = pd.DataFrame(results_test_A)

In [95]:
# Save the test predictions as CSV, without the DataFrame index column.
results_test_A_df.to_csv("classifier_final_A_test_results.csv", index=False)

In [None]:

# Name of the zip file you want to create
zip_filename = "diffresults.zip"

# Folder holding the saved model and tokenizer files.
model_dir = "classifier_final_A/"
# Archive names are computed relative to the folder that CONTAINS model_dir, so the files
# stay under "classifier_final_A/" inside the zip. Taking os.path.dirname of a path with a
# trailing slash returns the folder itself, which would place the model files at the
# archive root next to the CSVs; abspath strips the trailing slash first.
archive_root = os.path.dirname(os.path.abspath(model_dir))

# Create a zip file
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add model/tokenizer folder (walks sub-directories too)
    for root, dirs, files in os.walk(model_dir):
        for file in files:
            filepath = os.path.join(root, file)
            arcname = os.path.relpath(filepath, start=archive_root)
            zipf.write(filepath, arcname=arcname)

    # Add any CSVs you want; files that do not exist are skipped silently
    for csv_file in ["hyperpara_A.csv", "classifier_final_A_validation_results.csv", "classifier_final_A_test_results.csv"]:
        if os.path.exists(csv_file):
            zipf.write(csv_file)