In [None]:
from datasets import load_from_disk
from transformers import AutoTokenizer

# Load the dataset from disk and show its schema.
dataset = load_from_disk("transformer_train_NLP_dringlichkeit_hgdataset")
print(dataset.column_names)

# Inspect a small sample. The batch is taken from `dataset`, which this cell
# loads itself, so the cell also works on a fresh kernel (it previously read
# `train_dataset`, which is only created by the training cell further down).
# NOTE(review): the printed rows are raw messages; consider clearing this
# cell's output before sharing the notebook.
batch = dataset[:5]
print("Manually tokenizing this batch:")
print(batch)

# Tokenize the sample with the same checkpoint and settings the training cell uses.
tokenizer = AutoTokenizer.from_pretrained("facebookAI/xlm-roberta-base")
tokenized_batch = tokenizer(batch["message"], padding="max_length", truncation=True)
print("Tokenized batch:", tokenized_batch)

['id', 'message', 'class']
{'id': [1, 2, 3, 4, 5], 'message': ['Hallo sprechen dringlich wichtig Mr.', 'wünschen Rückruf dringend gehen 20. wegen Sperrung Hn Kn', 'Jan dringend bitte nehmen', 'Gott segne Tag bitte Wichtig besprechen bitte antwort Sime Bright bitte antworen privat E-Mail', 'geehrt Dame Herr hiermit bitte Gasabrechnung Jahr umgehend zukommen zulassen brauchen dringend Jobcenter Vertragskonto freundlich grüß Sent from my iPhone'], 'class': [1, 1, 1, 1, 1]}
Manually tokenizing this batch:
{'id': [34523, 26478, 13221, 16255, 11736], 'message': ['geehrt Dame Herr Haus Eltern übernehmen Stromvertrag umschreiben lassen sein Umzug berechtigen Kündigung sofern Kondition woanders einfach gut handeln Konto Verbrauchsstelle -Bernstein .55b freundlich Grüße -River sollen Vertragskondi neu AOL-App iOS senden', 'geehrt Dame Herr umgezog Bitte übernehmen Vertrag neu Adresse senden Bestätigung Erhalt E-Mail Bestätigung Umzug alt Lieferstelle Gayerstras 42 Auszugsdatum neu Lieferstelle 1

In [None]:
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    XLMRobertaForSequenceClassification,
)
import evaluate
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import pandas as pd
import torch
from torch.nn import CrossEntropyLoss

# Load the dataset from disk.
# The plain dataset is used for the baseline run; the NLP-preprocessed dataset
# is used for the extended and hyperparameter-tuned runs.
dataset = load_from_disk("transformer_train_NLP_dringlichkeit_hgdataset")
print("Dataset columns:", dataset.column_names)

# Shuffle and split 80/20. `train_test_split` shuffles again by default, so it
# needs its own seed; without it the split differs on every run even though
# the preceding shuffle is seeded.
shuffled_dataset = dataset.shuffle(seed=42)
train_test_split = shuffled_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


def _has_message(example):
    """Return True when the example's ``message`` is neither None nor blank."""
    return example["message"] is not None and example["message"].strip() != ""


# Drop invalid records (missing or whitespace-only messages) from both splits.
train_dataset = train_dataset.filter(_has_message)
test_dataset = test_dataset.filter(_has_message)

# Load the tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebookAI/xlm-roberta-base")

def preprocess_function(examples):
    """Tokenize the ``message`` field of a batch of examples.

    Args:
        examples: A batch mapping that contains a ``"message"`` entry.

    Returns:
        The tokenizer output for the messages, truncated and padded with
        ``padding="max_length"``.
    """
    messages = examples["message"]
    encoded = tokenizer(messages, truncation=True, padding="max_length")
    return encoded

# Tokenize both splits.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# The source data stores the target in "class"; it is renamed to "labels"
# below. Check both splits before renaming, since both are renamed.
for tokenized_split in (tokenized_train_dataset, tokenized_test_dataset):
    if "class" not in tokenized_split.column_names:
        raise ValueError("Column 'class' is missing in the dataset.")
tokenized_train_dataset = tokenized_train_dataset.rename_column("class", "labels")
tokenized_test_dataset = tokenized_test_dataset.rename_column("class", "labels")

# Class weights: inverse class frequency on the training split, with an extra
# multiplier on the positive class (a tunable hyperparameter).
labels = tokenized_train_dataset["labels"]
class_counts = torch.tensor([sum(1 for label in labels if label == i) for i in range(2)])
if (class_counts == 0).any():
    # An empty class would produce an infinite weight and a meaningless loss.
    raise ValueError(
        f"Each class needs at least one training example; got counts {class_counts.tolist()}."
    )
positive_weight_multiplier = 2.0
class_weights = torch.tensor([
    1.0 / class_counts[0].float(),
    (1.0 / class_counts[1].float()) * positive_weight_multiplier
])

class WeightedModel(XLMRobertaForSequenceClassification):
    """XLM-RoBERTa sequence classifier trained with class-weighted cross-entropy.

    The class derives from the concrete model class rather than from
    ``AutoModelForSequenceClassification``. The Auto class is only a factory:
    its ``from_pretrained`` returns an instance of the concrete architecture,
    so a subclass of the Auto class is never instantiated and its ``forward``
    override (and therefore the class weights) would silently be ignored.

    The loss weights are read from the module-level ``class_weights`` tensor.
    """

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the classifier and compute the weighted loss when labels are given.

        The parameters are spelled out (instead of ``**inputs``) because the
        Trainer inspects this signature to decide which dataset columns to
        keep and which of them are labels.

        Args:
            input_ids: Token ids produced by the tokenizer.
            attention_mask: Attention mask produced by the tokenizer.
            labels: Optional class indices; when omitted no loss is computed.

        Returns:
            dict with ``"loss"`` (weighted cross-entropy, or None without
            labels) and ``"logits"``.
        """
        # Labels are deliberately not forwarded so the parent does not compute
        # its own unweighted loss.
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = None
        if labels is not None:
            # Build the criterion on the logits' device; `class_weights` is
            # created on the CPU. The loss is computed in float32 so the very
            # small inverse-frequency weights keep their precision under fp16.
            loss_fn = CrossEntropyLoss(weight=class_weights.to(logits.device))
            loss = loss_fn(logits.float(), labels)
        return {"loss": loss, "logits": logits}

# Load the pretrained checkpoint with a 2-class classification head.
model = WeightedModel.from_pretrained("facebookAI/xlm-roberta-base", num_labels=2)

# Training arguments, grouped by concern.
training_args = TrainingArguments(
    # Where results and logs go.
    output_dir="RoBERTa_NLP+Hyperparameter/results",
    logging_dir="RoBERTa_NLP+Hyperparameter/logs",
    report_to="none",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Batching: 8 samples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    # Optimisation.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.1,
    fp16=True,
)
accuracy_metric = evaluate.load("accuracy")

# Decision threshold on the positive-class probability; set below 0.5 so that
# more examples are classified as positive.
threshold = 0.4

# Metric computation
def compute_metrics(eval_pred):
    """Compute thresholded binary-classification metrics for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)`` as passed by the Trainer.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``fnr``
        (false negative rate) and ``tnr`` (true negative rate). A rate whose
        denominator is zero is reported as 0.0.
    """
    logits, labels = eval_pred
    # Positive-class probability, then apply the module-level `threshold`
    # instead of a plain argmax.
    probabilities = torch.softmax(torch.tensor(logits), dim=-1)[:, 1]
    predictions = (probabilities > threshold).int().numpy()

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )

    # Pin the label set so the matrix is always 2x2; without it a split in
    # which only one class occurs yields a 1x1 matrix and the unpack fails.
    tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()
    negatives = tn + fp
    positives = fn + tp
    tnr = float(tn / negatives) if negatives else 0.0  # True Negative Rate
    fnr = float(fn / positives) if positives else 0.0  # False Negative Rate

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "fnr": fnr,
        "tnr": tnr
    }

# Assemble the Trainer from the model, arguments, tokenized splits and metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test_dataset,
    train_dataset=tokenized_train_dataset,
)

# Run fine-tuning.
trainer.train()

# Evaluate on the test split (the Trainer's eval_dataset).
eval_results = trainer.evaluate()

# Store the metrics as a single-row Excel sheet.
metrics_df = pd.DataFrame([eval_results])
metrics_df.to_excel(
    "RoBERTa_NLP+Hyperparameter/evaluation_metrics.xlsx",
    index=False,
)

print("Metrics saved to evaluation_metrics.xlsx")


2024-11-28 09:16:12.804243: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-28 09:16:12.819310: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-28 09:16:12.824057: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 09:16:12.836932: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dataset columns: ['id', 'message', 'class']


Filter:   0%|          | 0/32640 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8160 [00:00<?, ? examples/s]

Map:   0%|          | 0/32633 [00:00<?, ? examples/s]

Map:   0%|          | 0/8157 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at facebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the latest cached version of the module from /root/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Mon Oct 21 15:03:07 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Fnr,Tnr
1,0.5048,0.503628,0.73348,0.98908,0.469051,0.636333,0.530949,0.994881
2,0.4686,0.484589,0.728577,0.834363,0.566461,0.674794,0.433539,0.888835
3,0.4593,0.492937,0.737894,0.912969,0.522565,0.66468,0.477435,0.950756
4,0.4719,0.489736,0.700503,0.72577,0.638964,0.679607,0.361036,0.761336
5,0.4597,0.496438,0.712394,0.771012,0.599507,0.674528,0.400493,0.823988
6,0.4835,0.505013,0.704671,0.744795,0.617509,0.675206,0.382491,0.790834
7,0.469,0.510173,0.67157,0.669375,0.67053,0.669952,0.32947,0.672599
8,0.4328,0.522707,0.632953,0.608865,0.731689,0.664651,0.268311,0.535349
9,0.4451,0.535448,0.629643,0.607753,0.719112,0.65876,0.280888,0.541199
10,0.4644,0.553025,0.630011,0.608179,0.718866,0.658906,0.281134,0.542175




Metrics saved to evaluation_metrics.xlsx
