In [None]:
boolHP = True # True: Hyperparameter-Suche, False: direktes Training mit festen Werten

import time
start_zeit = time.time()


import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer,  TrainingArguments, AutoConfig, AutoModelForSequenceClassification,DataCollatorWithPadding, TrainerCallback,TrainerState, TrainerControl
import torch
import os
from datetime import datetime
from bs4 import BeautifulSoup
import re

# Initialisiere pandarallel für parallele Verarbeitung
pandarallel.initialize(progress_bar=True)




# Definieren Sie das Cache-Verzeichnis
cache_dir = '/media/ubuntu/5d2d9f9d-a02d-45ab-865f-3d789a0c70f0/download/'
os.environ['TRANSFORMERS_CACHE'] = cache_dir




# Dataset Klasse definieren
class PublicationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

class HPSearchResultLoggerCallback(TrainerCallback):
    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: dict, **kwargs):
        """
        Wird nach jeder Evaluation aufgerufen, auch während der HP-Suche für jeden Trial.
        """
        if state.is_hyper_param_search:
            # state.trial_params enthält die Hyperparameter des aktuellen Trials,
            # wenn der HP-Such-Backend (z.B. Optuna) sie bereitstellt.
            # Dies ist normalerweise der Fall.
            current_hyperparameters = state.trial_params if state.trial_params is not None else {}

            log_entry = {}
            # Füge alle Hyperparameter hinzu
            log_entry.update(current_hyperparameters)

            # Füge die gewünschten Metriken hinzu
            log_entry['eval_dataset_type'] = 'train/HP'
            log_entry['eval_loss'] = metrics.get("eval_loss")
            log_entry['eval_accuracy'] = metrics.get("eval_accuracy")
            log_entry['eval_f1'] = metrics.get("eval_f1")
            log_entry['eval_precision'] = metrics.get("precision")
            log_entry['eval_recall'] = metrics.get("eval_recall")
            # Du kannst hier weitere Metriken hinzufügen, die von deiner compute_metrics Funktion zurückgegeben werden
            # log_entry['eval_precision'] = metrics.get("eval_precision")

            # Überprüfe, ob bereits ein Eintrag mit exakt denselben Hyperparametern und Metriken vorhanden ist,
            # um Duplikate zu vermeiden, falls on_evaluate mehrfach pro Trial aufgerufen wird (unwahrscheinlich, aber sicher ist sicher)
            # In der Praxis wird on_evaluate normalerweise einmal pro Trial-Evaluation aufgerufen.
            hp_search_results_list.append(log_entry)
            print("test:\n ")
            print(current_hyperparameters)
            print("test end\n ")
            #print(f"HP Search Trial Logged: {log_entry}") # Optional: zum Debuggen

def clean_text(text):
    # HTML-Tags entfernen
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[\",\']","", text)  #  Anführungszeichen entfernen

    # 1. Mehrfache Anführungszeichen durch ein normales ' ersetzen
    text = re.sub(r"'{2,}", "'", text)

    # 2. HTML-Tags entfernen [1, 2, 3]
    # Sucht nach Mustern wie <tag>Inhalt</tag> und ersetzt sie durch einen leeren String.
    text = re.sub(r'<.*?>', '', text)

    # 3. URLs entfernen [1, 2, 3]
    # Sucht nach gängigen URL-Mustern (http/https, www.) und ersetzt sie durch einen leeren String.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # 4. E-Mail-IDs entfernen [3]
    # Sucht nach E-Mail-Mustern (Zeichenfolge@Zeichenfolge.Domain) und ersetzt sie durch einen leeren String.
    text = re.sub(r'\S*@\S*\s?', '', text)

    # 5. Zusätzliche Leerzeichen normalisieren [1, 4]
    # Teilt den Text nach Leerzeichen auf und fügt ihn mit einem einzigen Leerzeichen wieder zusammen.
    text = " ".join(text.split())

    text = re.sub(r"[\[,\]]","", text)  # Mehrfache Leerzeichen zu einem reduzieren
    

    return text

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    
    # Berechnung des gewichteten F1-Scores
    f1 = f1_score(labels, preds, average='weighted')
    
    # Optional: Berechnung weiterer Metriken
    precision, recall, _, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0) # zero_division=0, um Warnungen zu vermeiden
    acc = accuracy_score(labels, preds)
    
    return {
        'f1': f1,
        'accuracy': acc,
        'precision': precision,
        'recall': recall
    }

def model_init(trial):
    # Laden Sie die Konfiguration zuerst, um sie an LoRaBertForSequenceClassification zu übergeben
    # num_labels muss global oder als Argument verfügbar sein
    model_name = 'bert-base-uncased'
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, cache_dir=cache_dir)
    

    return BertForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        cache_dir=cache_dir
    )

def time_now():
    # Zeit funktion für den Dateinamen
    current_dateTime = datetime.now()
    time = str(current_dateTime.hour+2)+"-"+str(current_dateTime.minute)+"_"+str(current_dateTime.day) +"-"+ str(current_dateTime.month)+"-"+str(current_dateTime.year)
    return str(time)

def hp_space_optuna(trial):
    # Hyperparameter-Suchraum für Optuna
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs",  [5, 12, 16]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "weight_decay": trial.suggest_categorical("weight_decay",  [0.0, 0.01]),
    }
        

def prepare_val(df):
    # Kombiniere Titel und Abstract
    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()
    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    df = df.sample(frac=1)
    X_val = df["text"]
    Y_val = df["label_encoded"]  

    print("Tokenizing validation data...")
    val_encodings = tokenizer(
        list(X_val), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    val_dataset = PublicationsDataset(val_encodings, Y_val.reset_index(drop=True))
    print("End...")
    return val_dataset,df

def prepare_test_train(df):
    # Kombiniere Titel und Abstract
    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()

    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    # train_test_split für das Training/Validation-Set (aus dfBert)
    # Beachten Sie, dass X_test, y_test hier nur für das Training verwendet werden.
    # dfBert_eval wird als separates Validierungsset für die Valedierung genutzt.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label_encoded'], test_size=0.2, random_state=42, stratify=df['label_encoded'])

    print("Tokenizing training data...")
    train_encodings = tokenizer(
        list(X_train), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(
        list(X_test), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    train_dataset = PublicationsDataset(train_encodings, y_train.reset_index(drop=True))
    test_dataset = PublicationsDataset(test_encodings, y_test.reset_index(drop=True))
    print("End...")
    return train_dataset,test_dataset,df





















# --- 1. Initialisierung ---

hp_search_results_list = []
final_run_results_list = []
model_name = 'bert-base-uncased'

# # Definieren Sie den Pfad zu den Daten
#path_train='../01_Daten/pkl/df_all_15k-1.pkl'
path_train='../01_Daten/pkl/df_all_15k-2.pkl'
#path_train='../01_Daten/pkl/df_all_15k-3.pkl'
path_val='../01_Daten/pkl/df_val_5k-2.pkl'
#path_val='../01_Daten/pkl/df_val_5k-3.pkl'


#speicher Pfad für Logs und Modelle
time_log_save = time_now()
model_base_path = f"../01_Daten/logs/{time_log_save}/bert_multiclass_FT-15k/"
model_log_path = model_base_path+"logs/"
model_output_path = model_base_path+"results/"
model_final_path = model_base_path+"final_model/"

# LabelEncoder, tokenizer und  Data collator initialisieren
le = LabelEncoder()
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 2. Erstellen HP Trainer und args---
train_dataset, test_dataset, dfBert_train = prepare_test_train(pd.read_pickle(path_train))
val_dataset, dfBert_val = prepare_val(pd.read_pickle(path_val)) # Das ist jetzt dein dediziertes Validierungsset

# num_labels auslesen für die Model-Initialisierung
num_labels = dfBert_train['label_encoded'].nunique()

if boolHP:



    # Trainingsparameter für die Hyperparameter-Suche, diese Werte dienen als Standardwerte oder Fallbacks.
    # Die Werte aus Optuna (über hp_space_optuna) werden während der Trials verwendet.
    training_args = TrainingArguments(
        output_dir=f'{model_output_path}results_hp_search', 
        
        learning_rate=1e-5,
        num_train_epochs=1, 
        per_device_train_batch_size= 16,        
        
        # Feste Werte für die Suche:
        logging_dir=f'{model_log_path}logs_hp_search',
        logging_steps=10,
        report_to="tensorboard",
        eval_strategy="epoch", 
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        greater_is_better=True,
    )
    # Trainer initialisieren (ohne ein festes Modell - model_init wird verwendet)
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # --- 3. Starten der Hyperparameter-Suche ---
    print("Starte Hyperparameter-Suche...")
    # Starten der Hyperparameter-Suche
    best_trial = trainer.hyperparameter_search(
        direction="maximize", # Maximiere den F1-Score
        backend="optuna",
        n_trials=10, # Anzahl der Trials, die Optuna durchführen soll, je mehr Trials, desto länger dauert es, aber potenziell bessere Ergebnisse.
        hp_space=hp_space_optuna,
    )
    print("\n--- Hyperparameter-Suche abgeschlossen ---")

    print("Beste Trial:")
    print(f"  Wert (F1): {best_trial}") 
    print("\n------------------------------------------")
    print("learning_rate: "+str(best_trial.hyperparameters["learning_rate"]))
    print("num_train_epochs: "+str(best_trial.hyperparameters["num_train_epochs"]))
    print("per_device_train_batch_size : "+str(best_trial.hyperparameters["per_device_train_batch_size"]))
    #print("warmup_ratio: "+str(best_trial.hyperparameters["warmup_ratio"]))
    print("weight_decay: "+str(best_trial.hyperparameters["weight_decay"]))
    print("\n------------------END---------------------\n\n\n")

    # --- 4. Train with Best Hyperparameters ---
    #update der TrainingArguments mit den besten Hyperparametern
    best_hp = best_trial.hyperparameters


    final_training_args = TrainingArguments(
        output_dir=f"{model_output_path}best_run", 
        logging_dir=f"{model_log_path}best_run",
        report_to="tensorboard",
        learning_rate=best_hp["learning_rate"],
        num_train_epochs=best_hp["num_train_epochs"],
        per_device_train_batch_size=best_hp["per_device_train_batch_size"],
        per_device_eval_batch_size=training_args.per_device_eval_batch_size, 
        weight_decay=best_hp.get("weight_decay", training_args.weight_decay),
        #warmup_ratio=best_hp.get("warmup_ratio", training_args.warmup_ratio),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42
    )
else:
    final_training_args = TrainingArguments(
    output_dir=f"{model_output_path}best_run", 
    logging_dir=f"{model_log_path}best_run",

    learning_rate=2.7166361333742085e-05,
    num_train_epochs= 5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay = 0.0,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42
    )

# Initialisiere den Trainer mit den statischen Hyperparametern
final_trainer = Trainer(
    model_init=model_init,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\nTraining the final model with the best hyperparameters...")
final_trainer.train()

# --- 5. Evaluate the Final Model ---
print("\nEvaluating the final model on the test set...")
test_results = final_trainer.evaluate(test_dataset)
print("\nTest Set Evaluation Results:")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")

print("\nEvaluating the final model on the validation set...")
val_results = final_trainer.evaluate(val_dataset)
print("\nVal Set Evaluation Results:")
for key, value in val_results.items():
    print(f"  {key}: {value:.4f}")




if boolHP:
    # # --- 6. Save the Final Model & Tokenizer ---
    end_zeit = time.time()
    laufzeit = end_zeit - start_zeit
    print("\n Saving the fine-tuned model and tokenizer...")
    final_trainer.save_model(f"{model_final_path}model")
    tokenizer.save_pretrained(f"{model_final_path}tokenizer/")



# # --- 7. Predict and to DF ---
# --- Erstellen eines DataFrames mit Vorhersagen für das Validierungsset ---
print("\ncreate predictions, to save in a df...")

predictions_output_val = final_trainer.predict(val_dataset)
predicted_scores_val = predictions_output_val.predictions
predicted_labels_encoded_val = np.argmax(predicted_scores_val, axis=1)
predicted_labels_named_val = le.inverse_transform(predicted_labels_encoded_val)

dfResults_pred = pd.DataFrame()

dfResults_pred['id_im_aktuellen_df'] = dfBert_val.index.values

# Ursprüngliche Klasse (Text-Label) aus dfBert_val
dfResults_pred['class_original'] = dfBert_val['class'].values

# Ursprüngliche Klasse (numerisch kodiertes Label - Ground Truth) aus dfBert_val
dfResults_pred['label_encoded_original'] = dfBert_val['label_encoded'].values

# Vorhergesagte Klasse (numerisch kodiertes Label)
dfResults_pred['prediction_encoded'] = predicted_labels_encoded_val

# Vorhergesagte Klasse (Text-Label)
dfResults_pred['prediction_named'] = predicted_labels_named_val

# Optional: Fügen Sie den Text hinzu, der für die Vorhersage verwendet wurde
dfResults_pred['text_input'] = dfBert_val['text'].values


print("--- 1. DF rdy ---")
print("Erstelle einen DataFrame nur mit den Vorhersagen, die vom Original abweichen (Validierungsset)...")
# Filtere dfResults_pred, um nur Zeilen zu erhalten, bei denen das Original-Label und das vorhergesagte Label unterschiedlich sind.
dfResults_pred_diff = dfResults_pred[dfResults_pred['label_encoded_original'] != dfResults_pred['prediction_encoded']]
print("\n---  DFs rdy !!! ---")
if dfResults_pred_diff.empty:
    print("Keine Unterschiede zwischen Original- und Vorhersage-Labels im Validierungsset gefunden. Perfekte Vorhersage!")
else:
    print("--------------------------------------")
    print(f"\nAnzahl der unterschiedlichen Vorhersagen im Validierungsset: {len(dfResults_pred_diff)}")
print("\n-------------------------------------------")

# Wahre Labels und vorhergesagte Labels aus dem DataFrame extrahieren
y_true_val = dfResults_pred['label_encoded_original']
y_pred_val = dfResults_pred['prediction_encoded']

print("\n validation-set metrics (calculated from dfResults_pred):")

#Gewichteter F1-Score
f1_val_weighted = f1_score(y_true_val, y_pred_val, average='weighted', zero_division=0)
print(f"  Gewichteter F1-Score: {f1_val_weighted:.4f}")
print("\n------------------------------------------")


dfResults_pred.to_pickle(f"{model_base_path}dfResults_pred.pkl")
dfResults_pred_diff.to_pickle(f"{model_base_path}dfResults_pred_diff({len(dfResults_pred_diff)}).pkl")



# Wandle die gesammelten Ergebnisse in einen DataFrame um
df_hp_results = pd.DataFrame(hp_search_results_list)
print("\nGesammelte Ergebnisse der Hyperparameter-Suche:")
print(df_hp_results)

print(f"runtimet: {laufzeit/60} min")
print("\n Script finished successfully!")

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

Tokenizing training data...
End...
creating dataset...
End...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=417), Label(value='0 / 417'))), HB…

Tokenizing validation data...
End...
creating dataset...
End...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-06-06 17:00:46,938] A new study created in memory with name: no-name-24906a9a-bd80-4863-a0b3-c63df342f47d


Starte Hyperparameter-Suche...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.6948,0.7997,0.707259,0.708333,0.715138,0.708333
2,0.5591,0.76357,0.720241,0.722333,0.721989,0.722333
3,0.4827,0.78244,0.729301,0.729333,0.732521,0.729333
4,0.4362,0.838012,0.724378,0.725667,0.723749,0.725667
5,0.2519,0.875079,0.722807,0.725333,0.723395,0.725333


[I 2025-06-06 17:33:37,612] Trial 0 finished with value: 2.8968689041637288 and parameters: {'learning_rate': 1.3887966021119244e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}. Best is trial 0 with value: 2.8968689041637288.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7059,0.841747,0.693634,0.695667,0.704523,0.695667
2,0.6474,0.772593,0.71681,0.718333,0.716999,0.718333
3,0.6015,0.769596,0.71951,0.720667,0.722377,0.720667
4,0.6063,0.777698,0.721183,0.722333,0.722077,0.722333
5,0.4167,0.857599,0.714554,0.718333,0.724932,0.718333
6,0.4623,0.832496,0.724601,0.724667,0.727559,0.724667
7,0.2843,0.88791,0.718438,0.722333,0.721833,0.722333
8,0.2151,0.960523,0.71783,0.721667,0.720806,0.721667
9,0.3409,1.020036,0.709991,0.714,0.717264,0.714
10,0.2527,1.034502,0.709405,0.714667,0.713417,0.714667


[I 2025-06-06 18:52:15,352] Trial 1 finished with value: 2.8552989071578216 and parameters: {'learning_rate': 5.945723536491367e-06, 'num_train_epochs': 12, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}. Best is trial 0 with value: 2.8968689041637288.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.6806,0.797576,0.714734,0.716,0.72303,0.716
2,0.5023,0.786794,0.718664,0.721333,0.730383,0.721333
3,0.3464,0.924183,0.709927,0.712333,0.717887,0.712333
4,0.4295,1.284651,0.711572,0.715333,0.714408,0.715333
5,0.152,1.706107,0.694096,0.696667,0.697318,0.696667
6,0.0984,1.848857,0.712916,0.712333,0.714222,0.712333
7,0.0413,2.081402,0.707747,0.707333,0.716202,0.707333
8,0.0427,2.122175,0.711876,0.713667,0.711563,0.713667
9,0.0004,2.215179,0.712673,0.713,0.712524,0.713
10,0.0007,2.335338,0.714159,0.714,0.715356,0.714


[I 2025-06-06 20:10:46,467] Trial 2 finished with value: 2.8655125060462963 and parameters: {'learning_rate': 4.274882008055365e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 16, 'weight_decay': 0.0}. Best is trial 0 with value: 2.8968689041637288.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.6897,0.815242,0.704007,0.704667,0.714006,0.704667
2,0.5245,0.781247,0.71709,0.721,0.723548,0.721
3,0.4081,0.858584,0.717323,0.717667,0.727578,0.717667
4,0.4087,1.014542,0.715662,0.717667,0.719931,0.717667
5,0.2118,1.387007,0.717138,0.718333,0.717061,0.718333
6,0.278,1.5639,0.712334,0.711,0.715146,0.711
7,0.114,1.912544,0.706031,0.706667,0.712598,0.706667
8,0.0229,1.919653,0.715287,0.716667,0.716187,0.716667
9,0.102,2.094717,0.714188,0.714333,0.717062,0.714333
10,0.0384,2.249606,0.709098,0.711,0.709844,0.711


[I 2025-06-06 21:55:44,192] Trial 3 finished with value: 2.8472149963091913 and parameters: {'learning_rate': 2.4703090985463156e-05, 'num_train_epochs': 16, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}. Best is trial 0 with value: 2.8968689041637288.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.6782,0.788499,0.712358,0.714667,0.71798,0.714667
2,0.4748,0.838535,0.713056,0.716333,0.726391,0.716333
3,0.3712,1.005361,0.699659,0.703,0.711402,0.703
4,0.3051,1.306467,0.716749,0.718667,0.71663,0.718667
5,0.1927,1.603266,0.707245,0.709,0.713082,0.709
6,0.1137,1.91927,0.703081,0.704333,0.705258,0.704333
7,0.0864,1.994917,0.721919,0.722,0.722779,0.722
8,0.0156,2.289403,0.713798,0.714333,0.717612,0.714333
9,0.0512,2.441628,0.704965,0.707,0.713765,0.707
10,0.0369,2.546951,0.716056,0.713,0.726626,0.713


[I 2025-06-06 23:40:34,611] Trial 4 finished with value: 2.8559430439455022 and parameters: {'learning_rate': 6.323079170820639e-05, 'num_train_epochs': 16, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}. Best is trial 0 with value: 2.8968689041637288.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7595,0.869399,0.680665,0.683333,0.68749,0.683333


[I 2025-06-06 23:47:06,449] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.9144,0.971885,0.65517,0.659667,0.662449,0.659667


[I 2025-06-06 23:53:37,864] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7009,0.834177,0.697052,0.698667,0.707881,0.698667


[I 2025-06-07 00:00:09,443] Trial 7 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7801,0.837083,0.694973,0.697333,0.706152,0.697333
2,0.7102,0.764775,0.724975,0.724,0.728033,0.724
3,0.6474,0.761649,0.722097,0.723333,0.724235,0.723333
4,0.5321,0.780747,0.72212,0.723667,0.721573,0.723667
5,0.3775,0.842413,0.712563,0.715667,0.717186,0.715667
6,0.4137,0.867122,0.719196,0.716667,0.727474,0.716667
7,0.27,0.940965,0.717154,0.719333,0.717163,0.719333
8,0.1857,1.047325,0.697967,0.700667,0.703211,0.700667
9,0.1361,1.138389,0.700706,0.705667,0.704945,0.705667
10,0.0777,1.19307,0.697927,0.702,0.700109,0.702


[I 2025-06-07 01:40:27,480] Trial 8 finished with value: 2.8274937808228913 and parameters: {'learning_rate': 1.000075293533579e-05, 'num_train_epochs': 16, 'per_device_train_batch_size': 32, 'weight_decay': 0.0}. Best is trial 0 with value: 2.8968689041637288.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.8725,0.946692,0.660015,0.664333,0.66786,0.664333


[I 2025-06-07 01:46:59,024] Trial 9 pruned. 



--- Hyperparameter-Suche abgeschlossen ---
Beste Trial:
  Wert (F1): BestRun(run_id='0', objective=2.8968689041637288, hyperparameters={'learning_rate': 1.3887966021119244e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}, run_summary=None)

------------------------------------------
learning_rate: 1.3887966021119244e-05
num_train_epochs: 5
per_device_train_batch_size : 16
weight_decay: 0.01

------------------END---------------------





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training the final model with the best hyperparameters...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,1.0099,0.78927,0.703541,0.7046,0.70998,0.7046
2,0.672,0.766667,0.720619,0.7226,0.726562,0.7226
3,0.5274,0.788171,0.726256,0.7264,0.729091,0.7264
4,0.4004,0.850398,0.719721,0.7214,0.720317,0.7214
5,0.3337,0.884342,0.719478,0.7222,0.722397,0.7222



Evaluating the final model on the test set...



Test Set Evaluation Results:
  eval_loss: 0.7820
  eval_f1: 0.7307
  eval_accuracy: 0.7307
  eval_precision: 0.7337
  eval_recall: 0.7307
  eval_runtime: 29.5144
  eval_samples_per_second: 101.6450
  eval_steps_per_second: 12.7060
  epoch: 5.0000

Evaluating the final model on the validation set...

Val Set Evaluation Results:
  eval_loss: 0.7882
  eval_f1: 0.7263
  eval_accuracy: 0.7264
  eval_precision: 0.7291
  eval_recall: 0.7264
  eval_runtime: 50.1399
  eval_samples_per_second: 99.7210
  eval_steps_per_second: 12.4650
  epoch: 5.0000

 Saving the fine-tuned model and tokenizer...

create predictions, to save in a df...
--- 1. DF rdy ---
Erstelle einen DataFrame nur mit den Vorhersagen, die vom Original abweichen (Validierungsset)...

---  DFs rdy !!! ---
--------------------------------------

Anzahl der unterschiedlichen Vorhersagen im Validierungsset: 1368

-------------------------------------------

 validation-set metrics (calculated from dfResults_pred):
  Gewichteter F1-S

In [3]:
best_trial

BestRun(run_id='0', objective=2.8968689041637288, hyperparameters={'learning_rate': 1.3887966021119244e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}, run_summary=None)

In [4]:
print("------------------------------------------")
print("Beste Trial:\n")

print("learning_rate: "+str(best_trial.hyperparameters["learning_rate"]))
print("num_train_epochs: "+str(best_trial.hyperparameters["num_train_epochs"]))
print("per_device_train_batch_size : "+str(best_trial.hyperparameters["per_device_train_batch_size"]))
print("weight_decay: "+str(best_trial.hyperparameters["weight_decay"]))
#print("test: "+str(best_trial.hyperparameters["warmup_ratio"]))

print("\n------------------END---------------------")

------------------------------------------
Beste Trial:

learning_rate: 1.3887966021119244e-05
num_train_epochs: 5
per_device_train_batch_size : 16
weight_decay: 0.01

------------------END---------------------


In [1]:
from transformers import pipeline

In [None]:
model_path="../01_Daten/logs/final/bert_multiclass_FT-15k/final_model/"
path_val="../01_Daten/pkl/df_val_5k-3.pkl"

In [None]:
def prepare_val(df):
    # Kombiniere Titel und Abstract
    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()
    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    df = df.sample(frac=1)
    X_val = df["text"]
    Y_val = df["label_encoded"]  

    print("Tokenizing validation data...")
    val_encodings = tokenizer(
        list(X_val), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    val_dataset = PublicationsDataset(val_encodings, Y_val.reset_index(drop=True))
    print("End...")
    return val_dataset,df

In [None]:
loaded_model =model_path+"model"
loaded_tokenizer=model_path+"tokenizer"
classifier = pipeline("text-classification", model=loaded_model, tokenizer=loaded_tokenizer, device=0 if torch.cuda.is_available() else -1)
le = LabelEncoder()

# Load Validation DF
dfBert_val = prepare_val(pd.read_pickle(path_val))

# Klassifikation auf jede Zeile anwenden
dfBert_val['predictions_encoded'] = dfBert_val['text'].apply(lambda x:  classifier(x, truncation=True, max_length=512)[0]['label'])
dfBert_val['predictions_encoded'] = dfBert_val['predictions_encoded'].apply(lambda x: int(x[6:]))
dfBert_val['predicted_labels_named'] = dfBert_val['predictions_encoded'].apply(lambda x: le.inverse_transform([x])[0])

# Wahre Labels und vorhergesagte Labels aus dem DataFrame extrahieren
y_true_val = dfBert_val['label_encoded']
y_pred_val = dfBert_val['predictions_encoded']



print("------------------------------------------")
print("\nvalidation-set metrics (calculated from dfResults_pred):")
#F1-Score
f1_val_weighted = f1_score(y_true_val, y_pred_val, average='weighted', zero_division=0)
print(f"F1-Score: {f1_val_weighted:.4f}")
print("\n------------------------------------------")


dfResults_pred_diff = dfBert_val[dfBert_val['label_encoded'] != dfBert_val['predictions_encoded']]
if dfResults_pred_diff.empty:
    print("\nKeine Unterschiede zwischen Original- und Vorhersage-Labels im Validierungsset gefunden. Perfekte Vorhersage!")
else:
    print(f"\nAnzahl der unterschiedlichen Vorhersagen im Validierungsset: {len(dfResults_pred_diff)}")
print("\n-------------------------------------------")

------------------------------------------

validation-set metrics (calculated from dfResults_pred):
F1-Score: 0.7059

------------------------------------------

Anzahl der unterschiedlichen Vorhersagen im Validierungsset: 1462

-------------------------------------------


In [None]:
#path checker
import os
                
model_path = "../01_Daten/logs/FULL_bert_multiclass_FT-15k/bert_multiclass_FT-15k/final_model/model"
if os.path.exists(model_path):
    print(f"Model directory exists at: {model_path}")
else:
    print(f"Model directory not found at: {model_path}")