In [None]:
boolHP = True # True: Hyperparameter-Suche, False: direktes Training mit festen Werten

import time
start_zeit = time.time()


import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer,  TrainingArguments, AutoConfig, AutoModelForSequenceClassification,DataCollatorWithPadding, TrainerCallback,TrainerState, TrainerControl
import torch
import os
from datetime import datetime
from bs4 import BeautifulSoup
import re

# Initialisiere pandarallel für parallele Verarbeitung
pandarallel.initialize(progress_bar=True)




# Definieren Sie das Cache-Verzeichnis
cache_dir = '/media/ubuntu/5d2d9f9d-a02d-45ab-865f-3d789a0c70f0/download/'
os.environ['TRANSFORMERS_CACHE'] = cache_dir




# Dataset Klasse definieren
class PublicationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

class HPSearchResultLoggerCallback(TrainerCallback):
    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: dict, **kwargs):
        """
        Wird nach jeder Evaluation aufgerufen, auch während der HP-Suche für jeden Trial.
        """
        if state.is_hyper_param_search:
            # state.trial_params enthält die Hyperparameter des aktuellen Trials,
            # wenn der HP-Such-Backend (z.B. Optuna) sie bereitstellt.
            # Dies ist normalerweise der Fall.
            current_hyperparameters = state.trial_params if state.trial_params is not None else {}

            log_entry = {}
            # Füge alle Hyperparameter hinzu
            log_entry.update(current_hyperparameters)

            # Füge die gewünschten Metriken hinzu
            log_entry['eval_dataset_type'] = 'train/HP'
            log_entry['eval_loss'] = metrics.get("eval_loss")
            log_entry['eval_accuracy'] = metrics.get("eval_accuracy")
            log_entry['eval_f1'] = metrics.get("eval_f1")
            log_entry['eval_precision'] = metrics.get("precision")
            log_entry['eval_recall'] = metrics.get("eval_recall")
            # Du kannst hier weitere Metriken hinzufügen, die von deiner compute_metrics Funktion zurückgegeben werden
            # log_entry['eval_precision'] = metrics.get("eval_precision")

            # Überprüfe, ob bereits ein Eintrag mit exakt denselben Hyperparametern und Metriken vorhanden ist,
            # um Duplikate zu vermeiden, falls on_evaluate mehrfach pro Trial aufgerufen wird (unwahrscheinlich, aber sicher ist sicher)
            # In der Praxis wird on_evaluate normalerweise einmal pro Trial-Evaluation aufgerufen.
            hp_search_results_list.append(log_entry)
            print("test:\n ")
            print(current_hyperparameters)
            print("test end\n ")
            #print(f"HP Search Trial Logged: {log_entry}") # Optional: zum Debuggen

def clean_text(text):
    # HTML-Tags entfernen
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[\",\']","", text)  #  Anführungszeichen entfernen

    # 1. Mehrfache Anführungszeichen durch ein normales ' ersetzen
    text = re.sub(r"'{2,}", "'", text)

    # 2. HTML-Tags entfernen [1, 2, 3]
    # Sucht nach Mustern wie <tag>Inhalt</tag> und ersetzt sie durch einen leeren String.
    text = re.sub(r'<.*?>', '', text)

    # 3. URLs entfernen [1, 2, 3]
    # Sucht nach gängigen URL-Mustern (http/https, www.) und ersetzt sie durch einen leeren String.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # 4. E-Mail-IDs entfernen [3]
    # Sucht nach E-Mail-Mustern (Zeichenfolge@Zeichenfolge.Domain) und ersetzt sie durch einen leeren String.
    text = re.sub(r'\S*@\S*\s?', '', text)

    # 5. Zusätzliche Leerzeichen normalisieren [1, 4]
    # Teilt den Text nach Leerzeichen auf und fügt ihn mit einem einzigen Leerzeichen wieder zusammen.
    text = " ".join(text.split())

    text = re.sub(r"[\[,\]]","", text)  # Mehrfache Leerzeichen zu einem reduzieren
    

    return text

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    
    # Berechnung des gewichteten F1-Scores
    f1 = f1_score(labels, preds, average='weighted')
    
    # Optional: Berechnung weiterer Metriken
    precision, recall, _, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0) # zero_division=0, um Warnungen zu vermeiden
    acc = accuracy_score(labels, preds)
    
    return {
        'f1': f1,
        'accuracy': acc,
        'precision': precision,
        'recall': recall
    }

def model_init(trial):
    # Laden Sie die Konfiguration zuerst, um sie an LoRaBertForSequenceClassification zu übergeben
    # num_labels muss global oder als Argument verfügbar sein
    model_name = 'distilbert/distilbert-base-multilingual-cased'
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, cache_dir=cache_dir)
    

    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        cache_dir=cache_dir
    )

def time_now():
    # Zeit funktion für den Dateinamen
    current_dateTime = datetime.now()
    time = str(current_dateTime.hour+2)+"-"+str(current_dateTime.minute)+"_"+str(current_dateTime.day) +"-"+ str(current_dateTime.month)+"-"+str(current_dateTime.year)
    return str(time)

def hp_space_optuna(trial):
    # Hyperparameter-Suchraum für Optuna
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs",  [5, 12, 16]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        
        "weight_decay": trial.suggest_categorical("weight_decay",  [0.0, 0.01]),
    }
        #"warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.2),
        
        # Fügen Sie hier bei Bedarf weitere Hyperparameter hinzu
        # "lora_rank": trial.suggest_categorical("lora_rank", [2, 4, 8, 16]), # Wenn LoRa-Rank auch gesucht werden soll 

def prepare_val(df):
    # Kombiniere Titel und Abstract
    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()
    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    df = df.sample(frac=1)
    X_val = df["text"]
    Y_val = df["label_encoded"]  

    print("Tokenizing validation data...")
    val_encodings = tokenizer(
        list(X_val), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    val_dataset = PublicationsDataset(val_encodings, Y_val.reset_index(drop=True))
    print("End...")
    return val_dataset,df

def prepare_test_train(df):
    # Kombiniere Titel und Abstract
    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()

    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    # train_test_split für das Training/Validation-Set (aus dfBert)
    # Beachten Sie, dass X_test, y_test hier nur für das Training verwendet werden.
    # dfBert_eval wird als separates Validierungsset für die Valedierung genutzt.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label_encoded'], test_size=0.2, random_state=42, stratify=df['label_encoded'])

    print("Tokenizing training data...")
    train_encodings = tokenizer(
        list(X_train), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(
        list(X_test), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    train_dataset = PublicationsDataset(train_encodings, y_train.reset_index(drop=True))
    test_dataset = PublicationsDataset(test_encodings, y_test.reset_index(drop=True))
    print("End...")
    return train_dataset,test_dataset,df





















# --- 1. Initialisierung ---

hp_search_results_list = []
final_run_results_list = []
model_name = 'distilbert/distilbert-base-multilingual-cased'

# # Definieren Sie den Pfad zu den Daten
#path_train='../01_Daten/pkl/df_all_15k-1.pkl'
path_train='../01_Daten/pkl/df_all_15k-2.pkl'
#path_train='../01_Daten/pkl/df_all_15k-3.pkl'
path_val='../01_Daten/pkl/df_val_5k-2.pkl'
#path_val='../01_Daten/pkl/df_val_5k-3.pkl'


#speicher Pfad für Logs und Modelle
time_log_save = time_now()
model_base_path = f"../01_Daten/logs/{time_log_save}/{model_name} /"
model_log_path = model_base_path+"logs/"
model_output_path = model_base_path+"results/"
model_final_path = model_base_path+"final_model/"

# LabelEncoder, tokenizer und  Data collator initialisieren
le = LabelEncoder()
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 2. Erstellen HP Trainer und args---
train_dataset, test_dataset, dfBert_train = prepare_test_train(pd.read_pickle(path_train))
val_dataset, dfBert_val = prepare_val(pd.read_pickle(path_val)) # Das ist jetzt dein dediziertes Validierungsset

if boolHP:

    # num_labels auslesen für die Model-Initialisierung
    num_labels = dfBert_train['label_encoded'].nunique()

    # Trainingsparameter für die Hyperparameter-Suche, diese Werte dienen als Standardwerte oder Fallbacks.
    # Die Werte aus Optuna (über hp_space_optuna) werden während der Trials verwendet.
    training_args = TrainingArguments(
        output_dir=f'{model_output_path}results_hp_search', 
        
        learning_rate=1e-5,
        num_train_epochs=1, 
        per_device_train_batch_size= 16,        
        
        # Feste Werte für die Suche:
        logging_dir=f'{model_log_path}logs_hp_search',
        logging_steps=10,
        report_to="tensorboard",
        eval_strategy="epoch", 
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        greater_is_better=True,
    )
    # Trainer initialisieren (ohne ein festes Modell - model_init wird verwendet)
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # --- 3. Starten der Hyperparameter-Suche ---
    print("Starte Hyperparameter-Suche...")
    # Starten der Hyperparameter-Suche
    best_trial = trainer.hyperparameter_search(
        direction="maximize", # Maximiere den F1-Score
        backend="optuna",
        n_trials=10, # Anzahl der Trials, die Optuna durchführen soll, je mehr Trials, desto länger dauert es, aber potenziell bessere Ergebnisse.
        hp_space=hp_space_optuna,
    )
    print("\n--- Hyperparameter-Suche abgeschlossen ---")

    print("Beste Trial:")
    print(f"  Wert (F1): {best_trial}") 
    print("\n------------------------------------------")
    print("learning_rate: "+str(best_trial.hyperparameters["learning_rate"]))
    print("num_train_epochs: "+str(best_trial.hyperparameters["num_train_epochs"]))
    print("per_device_train_batch_size : "+str(best_trial.hyperparameters["per_device_train_batch_size"]))
    #print("warmup_ratio: "+str(best_trial.hyperparameters["warmup_ratio"]))
    print("weight_decay: "+str(best_trial.hyperparameters["weight_decay"]))
    print("\n------------------END---------------------\n\n\n")

    # --- 4. Train with Best Hyperparameters ---
    #update der TrainingArguments mit den besten Hyperparametern
    best_hp = best_trial.hyperparameters


    final_training_args = TrainingArguments(
        output_dir=f"{model_output_path}best_run", 
        logging_dir=f"{model_log_path}best_run",
        report_to="tensorboard",
        learning_rate=best_hp["learning_rate"],
        num_train_epochs=best_hp["num_train_epochs"],
        per_device_train_batch_size=best_hp["per_device_train_batch_size"],
        per_device_eval_batch_size=training_args.per_device_eval_batch_size, 
        weight_decay=best_hp.get("weight_decay", training_args.weight_decay),
        #warmup_ratio=best_hp.get("warmup_ratio", training_args.warmup_ratio),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42
    )
else:
    final_training_args = TrainingArguments(
    output_dir=f"{model_output_path}best_run", 
    logging_dir=f"{model_log_path}best_run",

    learning_rate=2.7166361333742085e-05,
    num_train_epochs= 5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay = 0.0,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42
    )

# Initialisiere den Trainer mit den statischen Hyperparametern
final_trainer = Trainer(
    model_init=model_init,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\nTraining the final model with the best hyperparameters...")
final_trainer.train()

# --- 5. Evaluate the Final Model ---
print("\nEvaluating the final model on the test set...")
test_results = final_trainer.evaluate(test_dataset)
print("\nTest Set Evaluation Results:")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")

print("\nEvaluating the final model on the validation set...")
val_results = final_trainer.evaluate(val_dataset)
print("\nVal Set Evaluation Results:")
for key, value in val_results.items():
    print(f"  {key}: {value:.4f}")




if boolHP:
    # # --- 6. Save the Final Model & Tokenizer ---
    end_zeit = time.time()
    laufzeit = end_zeit - start_zeit
    print("\n Saving the fine-tuned model and tokenizer...")
    final_trainer.save_model(f"{model_final_path}model")
    tokenizer.save_pretrained(f"{model_final_path}tokenizer/")



# # --- 7. Predict and to DF ---
# --- Erstellen eines DataFrames mit Vorhersagen für das Validierungsset ---
print("\ncreate predictions, to save in a df...")

predictions_output_val = final_trainer.predict(val_dataset)
predicted_scores_val = predictions_output_val.predictions
predicted_labels_encoded_val = np.argmax(predicted_scores_val, axis=1)
predicted_labels_named_val = le.inverse_transform(predicted_labels_encoded_val)

dfResults_pred = pd.DataFrame()

dfResults_pred['id_im_aktuellen_df'] = dfBert_val.index.values

# Ursprüngliche Klasse (Text-Label) aus dfBert_val
dfResults_pred['class_original'] = dfBert_val['class'].values

# Ursprüngliche Klasse (numerisch kodiertes Label - Ground Truth) aus dfBert_val
dfResults_pred['label_encoded_original'] = dfBert_val['label_encoded'].values

# Vorhergesagte Klasse (numerisch kodiertes Label)
dfResults_pred['prediction_encoded'] = predicted_labels_encoded_val

# Vorhergesagte Klasse (Text-Label)
dfResults_pred['prediction_named'] = predicted_labels_named_val

# Optional: Fügen Sie den Text hinzu, der für die Vorhersage verwendet wurde
dfResults_pred['text_input'] = dfBert_val['text'].values


print("--- 1. DF rdy ---")
print("Erstelle einen DataFrame nur mit den Vorhersagen, die vom Original abweichen (Validierungsset)...")
# Filtere dfResults_pred, um nur Zeilen zu erhalten, bei denen das Original-Label und das vorhergesagte Label unterschiedlich sind.
dfResults_pred_diff = dfResults_pred[dfResults_pred['label_encoded_original'] != dfResults_pred['prediction_encoded']]
print("\n---  DFs rdy !!! ---")
if dfResults_pred_diff.empty:
    print("Keine Unterschiede zwischen Original- und Vorhersage-Labels im Validierungsset gefunden. Perfekte Vorhersage!")
else:
    print("--------------------------------------")
    print(f"\nAnzahl der unterschiedlichen Vorhersagen im Validierungsset: {len(dfResults_pred_diff)}")
print("\n-------------------------------------------")

# Wahre Labels und vorhergesagte Labels aus dem DataFrame extrahieren
y_true_val = dfResults_pred['label_encoded_original']
y_pred_val = dfResults_pred['prediction_encoded']

print("\n validation-set metrics (calculated from dfResults_pred):")

#Gewichteter F1-Score
f1_val_weighted = f1_score(y_true_val, y_pred_val, average='weighted', zero_division=0)
print(f"  Gewichteter F1-Score: {f1_val_weighted:.4f}")
print("\n------------------------------------------")


dfResults_pred.to_pickle(f"{model_base_path}dfResults_pred.pkl")
dfResults_pred_diff.to_pickle(f"{model_base_path}dfResults_pred_diff({len(dfResults_pred_diff)}).pkl")



print(f"runtimet: {laufzeit/60} min")
print("\n Script finished successfully!")

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

Tokenizing training data...
End...
creating dataset...
End...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=417), Label(value='0 / 417'))), HB…

Tokenizing validation data...
End...
creating dataset...
End...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-06-09 17:24:07,934] A new study created in memory with name: no-name-43ebf83a-9e62-43cb-8bf4-701b83dc1a0b


Starte Hyperparameter-Suche...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,1.0424,1.013925,0.633483,0.638,0.638485,0.638
2,0.8805,0.905621,0.662723,0.665333,0.674799,0.665333
3,0.7912,0.850163,0.68381,0.686667,0.687112,0.686667
4,0.7976,0.821861,0.698346,0.7,0.698103,0.7
5,0.7462,0.842817,0.682465,0.686333,0.692876,0.686333
6,0.7225,0.8108,0.69185,0.694,0.692766,0.694
7,0.6863,0.81043,0.695813,0.698333,0.697976,0.698333
8,0.6311,0.817208,0.692705,0.696,0.697199,0.696
9,0.5915,0.807862,0.69684,0.7,0.698164,0.7
10,0.6203,0.806497,0.699655,0.702333,0.700872,0.702333


[I 2025-06-09 18:03:26,684] Trial 0 finished with value: 2.797495603089045 and parameters: {'learning_rate': 3.2888095206990444e-06, 'num_train_epochs': 12, 'per_device_train_batch_size': 32, 'weight_decay': 0.01}. Best is trial 0 with value: 2.797495603089045.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.781,0.824273,0.682225,0.683667,0.697022,0.683667
2,0.6118,0.771781,0.718011,0.720333,0.724284,0.720333
3,0.3632,0.91835,0.71639,0.716667,0.722319,0.716667
4,0.1873,1.091389,0.722485,0.722667,0.72295,0.722667
5,0.053,1.343204,0.722573,0.721667,0.723883,0.721667


[I 2025-06-09 18:19:49,986] Trial 1 finished with value: 2.8897898528963553 and parameters: {'learning_rate': 9.16873665647161e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.01}. Best is trial 1 with value: 2.8897898528963553.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7435,0.797811,0.703805,0.706667,0.711415,0.706667
2,0.5463,0.839115,0.686878,0.690667,0.701639,0.690667
3,0.3683,1.081276,0.676632,0.68,0.693364,0.68
4,0.365,1.336544,0.696326,0.695,0.708548,0.695
5,0.0882,1.678437,0.692031,0.692333,0.69687,0.692333
6,0.1199,1.910957,0.698715,0.696,0.707833,0.696
7,0.1372,2.124757,0.694544,0.693667,0.69922,0.693667
8,0.0297,2.415919,0.708212,0.706,0.713627,0.706
9,0.0241,2.46397,0.702251,0.701,0.704115,0.701
10,0.0001,2.534516,0.701981,0.702667,0.701674,0.702667


[I 2025-06-09 19:01:33,944] Trial 2 finished with value: 2.801489046064531 and parameters: {'learning_rate': 8.598143793309295e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 16, 'weight_decay': 0.0}. Best is trial 1 with value: 2.8897898528963553.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7885,0.832422,0.684586,0.686333,0.69678,0.686333
2,0.6113,0.78694,0.711927,0.716667,0.722232,0.716667
3,0.3835,0.955733,0.707646,0.711,0.710041,0.711
4,0.2675,1.051269,0.70943,0.706,0.719543,0.706
5,0.1187,1.353183,0.698503,0.697,0.705315,0.697
6,0.1313,1.567194,0.70532,0.703333,0.708969,0.703333
7,0.0351,1.931715,0.692823,0.694667,0.698245,0.694667
8,0.0244,2.10797,0.7024,0.703333,0.704906,0.703333
9,0.0275,2.187114,0.709642,0.708667,0.712206,0.708667
10,0.0003,2.328472,0.711709,0.710667,0.713476,0.710667


[I 2025-06-09 19:40:49,999] Trial 3 finished with value: 2.8270856237578794 and parameters: {'learning_rate': 8.88119249739111e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 32, 'weight_decay': 0.01}. Best is trial 1 with value: 2.8897898528963553.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7423,0.845425,0.682803,0.683667,0.698442,0.683667
2,0.6192,0.796475,0.710523,0.714333,0.719615,0.714333
3,0.5144,0.837782,0.702798,0.703,0.714667,0.703
4,0.5039,0.882426,0.713708,0.713,0.718596,0.713
5,0.2183,1.05096,0.701801,0.705333,0.705106,0.705333
6,0.2139,1.187419,0.713595,0.713,0.714479,0.713
7,0.1138,1.450071,0.702067,0.705,0.701807,0.705
8,0.0534,1.664846,0.705569,0.705667,0.707251,0.705667
9,0.0944,1.885956,0.698433,0.702333,0.701184,0.702333
10,0.0417,1.98388,0.704501,0.704,0.706822,0.704


[I 2025-06-09 20:36:38,002] Trial 4 finished with value: 2.8435140166070676 and parameters: {'learning_rate': 1.5225886096750911e-05, 'num_train_epochs': 16, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}. Best is trial 1 with value: 2.8897898528963553.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.8298,0.828356,0.682681,0.683,0.698302,0.683
2,0.6473,0.767328,0.717631,0.721,0.724252,0.721
3,0.4303,0.850942,0.71486,0.714667,0.719345,0.714667
4,0.349,1.018722,0.709454,0.708,0.712054,0.708
5,0.1038,1.309208,0.706096,0.707667,0.708548,0.707667
6,0.1088,1.578663,0.708554,0.709,0.712445,0.709
7,0.0926,1.800342,0.708159,0.71,0.715637,0.71
8,0.026,1.981465,0.714495,0.714,0.716517,0.714
9,0.0092,1.99012,0.720728,0.722333,0.720674,0.722333
10,0.0012,2.101735,0.719911,0.72,0.720186,0.72


[I 2025-06-09 21:15:49,995] Trial 5 finished with value: 2.8762420095743537 and parameters: {'learning_rate': 6.189713308884082e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 32, 'weight_decay': 0.0}. Best is trial 1 with value: 2.8897898528963553.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.8593,0.869185,0.675013,0.676,0.694378,0.676


[I 2025-06-09 21:19:03,445] Trial 6 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.8764,0.881128,0.67576,0.676667,0.694389,0.676667


[I 2025-06-09 21:22:17,251] Trial 7 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,1.0393,1.089916,0.611973,0.620333,0.62217,0.620333


[I 2025-06-09 21:25:43,795] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7611,0.860003,0.683309,0.685,0.694872,0.685


[I 2025-06-09 21:29:10,310] Trial 9 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Hyperparameter-Suche abgeschlossen ---
Beste Trial:
  Wert (F1): BestRun(run_id='1', objective=2.8897898528963553, hyperparameters={'learning_rate': 9.16873665647161e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.01}, run_summary=None)

------------------------------------------
learning_rate: 9.16873665647161e-05
num_train_epochs: 5
per_device_train_batch_size : 32
weight_decay: 0.01

------------------END---------------------





Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training the final model with the best hyperparameters...


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,No log,0.816474,0.693687,0.6944,0.704766,0.6944
2,0.878500,0.787844,0.712314,0.7138,0.717538,0.7138
3,0.503600,0.897657,0.711471,0.7122,0.713717,0.7122
4,0.236300,1.130081,0.70822,0.7102,0.708413,0.7102
5,0.236300,1.372384,0.708856,0.708,0.710061,0.708



Evaluating the final model on the test set...



Test Set Evaluation Results:
  eval_loss: 0.7832
  eval_f1: 0.7199
  eval_accuracy: 0.7220
  eval_precision: 0.7227
  eval_recall: 0.7220
  eval_runtime: 15.3072
  eval_samples_per_second: 195.9870
  eval_steps_per_second: 24.4980
  epoch: 5.0000

Evaluating the final model on the validation set...

Val Set Evaluation Results:
  eval_loss: 0.7878
  eval_f1: 0.7123
  eval_accuracy: 0.7138
  eval_precision: 0.7175
  eval_recall: 0.7138
  eval_runtime: 26.4359
  eval_samples_per_second: 189.1370
  eval_steps_per_second: 23.6420
  epoch: 5.0000

 Saving the fine-tuned model and tokenizer...

create predictions, to save in a df...
--- 1. DF rdy ---
Erstelle einen DataFrame nur mit den Vorhersagen, die vom Original abweichen (Validierungsset)...

---  DFs rdy !!! ---
--------------------------------------

Anzahl der unterschiedlichen Vorhersagen im Validierungsset: 1431

-------------------------------------------

 validation-set metrics (calculated from dfResults_pred):
  Gewichteter F1-

In [3]:
best_trial

BestRun(run_id='1', objective=2.8897898528963553, hyperparameters={'learning_rate': 9.16873665647161e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.01}, run_summary=None)

In [4]:
print("------------------------------------------")
print("Beste Trial:\n")

print("learning_rate: "+str(best_trial.hyperparameters["learning_rate"]))
print("num_train_epochs: "+str(best_trial.hyperparameters["num_train_epochs"]))
print("per_device_train_batch_size : "+str(best_trial.hyperparameters["per_device_train_batch_size"]))
print("weight_decay: "+str(best_trial.hyperparameters["weight_decay"]))
#print("test: "+str(best_trial.hyperparameters["warmup_ratio"]))

print("\n------------------END---------------------")

------------------------------------------
Beste Trial:

learning_rate: 9.16873665647161e-05
num_train_epochs: 5
per_device_train_batch_size : 32
weight_decay: 0.01

------------------END---------------------
