In [None]:
import os
import torch
import pandas as pd
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    TrainerCallback,
)
import evaluate
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import numpy as np

# Disable Weights & Biases reporting for this run.
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"

# Load the dataset and split it into train and test sets.
dataset = load_from_disk("transformer_train_dringlichkeit_hgdataset")
#dataset = load_from_disk("transformer_train_NLP_dringlichkeit_hgdataset")
# Plain dataset: baseline approach.
# NLP dataset: extended and hyperparameter-tuned approach.
shuffled_dataset = dataset.shuffle(seed=42)
# The split shuffles again by default; seed it too, otherwise the train/test
# partition (and therefore every reported metric) changes from run to run.
train_test_split = shuffled_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")

# Fallback for tokenizers that ship without a padding token: reuse EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Model selection: binary sequence classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=2,
)
# Keep the model's padding id in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

# Text preprocessing for model training
def preprocess_function(examples):
    """Tokenize a batch of examples and attach the target labels.

    The ``"message"`` column is tokenized with the module-level ``tokenizer``,
    truncated and padded to 512 tokens; the ``"class"`` column is copied into
    the ``"labels"`` field of the returned encoding.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    encoded = tokenizer(examples["message"], **tokenizer_options)
    encoded["labels"] = examples["class"]
    return encoded


# Tokenize both splits in batches.
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Tokenizing Train Dataset",
)

tokenized_test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    desc="Tokenizing Test Dataset",
)
# padding='max_length' without a max_length cannot be resolved by the
# collator: transformers warns "Default to no padding" and silently skips
# padding. Pad to the longest sequence in the batch instead; for the
# fixed-length inputs produced by preprocess_function this yields the same
# batches, without the warning.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="mdeberta_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    #gradient_accumulation_steps=3,
    num_train_epochs=10,
    weight_decay=0.1,
    logging_dir="logs",
    logging_steps=10,
    learning_rate=1e-4,
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Compute the evaluation metrics
def compute_metrics(eval_pred):
    """Compute binary classification metrics from logits and labels.

    All metrics are derived from the four confusion-matrix counts, with
    class ``1`` treated as the positive class. Nothing is loaded per call,
    and the function also works when only one class occurs in the data.

    Args:
        eval_pred: ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``fnr``
        (false-negative rate) and ``tnr`` (true-negative rate) as floats.
        A ratio whose denominator is zero is reported as ``0.0``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    tp = int(np.sum(predicted_pos & actual_pos))
    fp = int(np.sum(predicted_pos & ~actual_pos))
    fn = int(np.sum(~predicted_pos & actual_pos))
    tn = int(np.sum(~predicted_pos & ~actual_pos))

    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return numerator / denominator if denominator > 0 else 0.0

    return {
        "accuracy": _ratio(tp + tn, tp + tn + fp + fn),
        "precision": _ratio(tp, tp + fp),
        "recall": _ratio(tp, tp + fn),
        "f1": _ratio(2 * tp, 2 * tp + fp + fn),
        "fnr": _ratio(fn, fn + tp),
        "tnr": _ratio(tn, tn + fp),
    }

# Custom callback that stores the evaluation metrics after every evaluation
class SaveMetricsCallback(TrainerCallback):
    """Collect evaluation metrics and write them to an Excel file.

    After each evaluation the metrics, together with the current epoch, are
    appended to ``self.metrics`` and the complete history is rewritten to
    ``output_file``.

    Args:
        output_file: Path of the Excel file to write. Its parent directory
            is created if it does not exist yet.
    """

    def __init__(self, output_file="mdeberta_results/metrics.xlsx"):
        self.metrics = []
        self.output_file = output_file
        # Ensure the output directory exists. For a bare file name the
        # dirname is "" and os.makedirs("") would raise, so skip it then.
        output_dir = os.path.dirname(self.output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Record the metrics of the finished evaluation and save the history."""
        # Work on a copy so the dict owned by the caller is not modified.
        record = dict(metrics)
        record["epoch"] = state.epoch
        self.metrics.append(record)
        # Rewrite the whole history so the file is complete after every call.
        pd.DataFrame(self.metrics).to_excel(self.output_file, index=False)

# Define the trainer; the callback writes the per-evaluation metrics to Excel.
metrics_callback = SaveMetricsCallback(output_file="mdeberta_results/metrics.xlsx")
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[metrics_callback],
)

# Start training
trainer.train()


2024-11-30 11:56:44.537440: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-30 11:56:44.552661: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-30 11:56:44.557473: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 11:56:44.569685: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading cached shuffled indices for dataset at /tmp/t

Tokenizing Train Dataset:   0%|          | 0/32640 [00:00<?, ? examples/s]

Tokenizing Test Dataset:   0%|          | 0/8160 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Fnr,Tnr
1,0.6079,0.632186,0.661152,0.857551,0.37335,0.520215,0.62665,0.939928
2,0.619,0.630026,0.672059,0.757797,0.490162,0.595281,0.509838,0.848251
3,0.6163,0.594276,0.677083,0.91717,0.377833,0.535191,0.622167,0.966948


Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Oct 21 15:03:07 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Oct 21 15:03:07 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Oct 21 15:03:07 2024) since it couldn't be found locally at evaluate-metric--accuracy,