In [None]:
boolHP = True # True: Hyperparameter-Suche, False: direktes Training mit festen Werten

import time
start_zeit = time.time()


import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer,  TrainingArguments, AutoConfig, AutoModelForSequenceClassification,DataCollatorWithPadding, TrainerCallback,TrainerState, TrainerControl
import torch
import os
from datetime import datetime
from bs4 import BeautifulSoup
import re

# Initialisiere pandarallel für parallele Verarbeitung
# Hier optional das Backend ändern, z.B. 'dask' oder 'ray' für größere Datensätze
pandarallel.initialize(progress_bar=True)




# Definieren Sie das Cache-Verzeichnis
cache_dir = '/media/ubuntu/5d2d9f9d-a02d-45ab-865f-3d789a0c70f0/download/'
os.environ['TRANSFORMERS_CACHE'] = cache_dir




# Dataset Klasse definieren
class PublicationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item



def clean_text(text):
    # HTML-Tags entfernen
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[\",\']","", text)  #  Anführungszeichen entfernen

    # 1. Mehrfache Anführungszeichen durch ein normales ' ersetzen
    text = re.sub(r"'{2,}", "'", text)

    # 2. HTML-Tags entfernen [1, 2, 3]
    # Sucht nach Mustern wie <tag>Inhalt</tag> und ersetzt sie durch einen leeren String.
    text = re.sub(r'<.*?>', '', text)

    # 3. URLs entfernen [1, 2, 3]
    # Sucht nach gängigen URL-Mustern (http/https, www.) und ersetzt sie durch einen leeren String.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # 4. E-Mail-IDs entfernen [3]
    # Sucht nach E-Mail-Mustern (Zeichenfolge@Zeichenfolge.Domain) und ersetzt sie durch einen leeren String.
    text = re.sub(r'\S*@\S*\s?', '', text)

    # 5. Zusätzliche Leerzeichen normalisieren [1, 4]
    # Teilt den Text nach Leerzeichen auf und fügt ihn mit einem einzigen Leerzeichen wieder zusammen.
    text = " ".join(text.split())

    text = re.sub(r"[\[,\]]","", text)  # Mehrfache Leerzeichen zu einem reduzieren
    

    return text

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    
    # Berechnung des gewichteten F1-Scores
    f1 = f1_score(labels, preds, average='weighted')
    
    # Optional: Berechnung weiterer Metriken
    precision, recall, _, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0) # zero_division=0, um Warnungen zu vermeiden
    acc = accuracy_score(labels, preds)
    
    return {
        'f1': f1,
        'accuracy': acc,
        'precision': precision,
        'recall': recall
    }

def model_init(trial):
    # Laden Sie die Konfiguration zuerst, um sie an LoRaBertForSequenceClassification zu übergeben
    # num_labels muss global oder als Argument verfügbar sein
    
    
    #model_name = 'bert-base-uncased'
    model_name = 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext'
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, cache_dir=cache_dir)

    
    return BertForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        cache_dir=cache_dir
    )

def time_now():
    # Zeit funktion für den Dateinamen
    current_dateTime = datetime.now()
    time = str(current_dateTime.hour+2)+"-"+str(current_dateTime.minute)+"_"+str(current_dateTime.day) +"-"+ str(current_dateTime.month)+"-"+str(current_dateTime.year)
    return str(time)

def hp_space_optuna(trial):
    # Hyperparameter-Suchraum für Optuna
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs",  [5, 12, 16]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "weight_decay": trial.suggest_categorical("weight_decay",  [0.0, 0.01]),
    }


def prepare_val(df):
   
    # Kombiniere Titel und Abstract
    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()
    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    df = df.sample(frac=1)
    X_val = df["text"]
    Y_val = df["label_encoded"]  

    print("Tokenizing validation data...")
    val_encodings = tokenizer(
        list(X_val), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    val_dataset = PublicationsDataset(val_encodings, Y_val.reset_index(drop=True))
    print("End...")
    return val_dataset,df

def prepare_test_train(df):

    df['text'] = df['title'].astype(str) + " - " + df['abstract'].astype(str)

    # Bereinigen Sie den Text
    df["text"] = df["text"].parallel_apply(clean_text).str.lower()

    # encode the labels
    df['label_encoded'] = le.fit_transform(df['class']).astype(int)

    # train_test_split für das Training/Validation-Set (aus dfBert)
    # Beachten Sie, dass X_test, y_test hier nur für das Training verwendet werden.
    # dfBert_eval wird als separates Validierungsset für die Valedierung genutzt.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label_encoded'], test_size=0.2, random_state=42, stratify=df['label_encoded'])

    print("Tokenizing training data...")
    train_encodings = tokenizer(
        list(X_train), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(
        list(X_test), truncation=True, padding=True, max_length=512)
    print("End...")
    print("creating dataset...")
    ## Dataset erstellen
    train_dataset = PublicationsDataset(train_encodings, y_train.reset_index(drop=True))
    test_dataset = PublicationsDataset(test_encodings, y_test.reset_index(drop=True))
    print("End...")
    return train_dataset,test_dataset,df








# --- 1. Initialisierung ---

hp_search_results_list = []
final_run_results_list = []

# # Definieren Sie den Pfad zu den Daten
#path_train='../01_Daten/pkl/df_all_15k-1.pkl'
path_train='../01_Daten/pkl/df_all_15k-2.pkl'
#path_train='../01_Daten/pkl/df_all_15k-3.pkl'
path_val='../01_Daten/pkl/df_val_5k-2.pkl'
#path_val='../01_Daten/pkl/df_val_5k-3.pkl'


#speicher Pfad für Logs und Modelle
time_log_save = time_now()
model_base_path = f"../01_Daten/logs/{time_log_save}/PubMedBert_multiclass_FT-1k/"
model_log_path = model_base_path+"logs/"
model_output_path = model_base_path+"results/"
model_final_path = model_base_path+"final_model/"

# LabelEncoder, tokenizer und  Data collator initialisieren
le = LabelEncoder()
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir=cache_dir)
tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext', cache_dir=cache_dir)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 2. Erstellen HP Trainer und args---
train_dataset, test_dataset, dfBert_train = prepare_test_train(pd.read_pickle(path_train))
val_dataset, dfBert_val = prepare_val(pd.read_pickle(path_val)) # Das ist jetzt dein dediziertes Validierungsset

if boolHP:

    # num_labels auslesen für die Model-Initialisierung
    num_labels = dfBert_train['label_encoded'].nunique()

    # Trainingsparameter für die Hyperparameter-Suche, diese Werte dienen als Standardwerte oder Fallbacks.
    # Die Werte aus Optuna (über hp_space_optuna) werden während der Trials verwendet.
    training_args = TrainingArguments(
        output_dir=f'{model_output_path}results_hp_search', 
        
        learning_rate=1e-5,
        num_train_epochs=1, 
        per_device_train_batch_size= 16,        
        
        # Feste Werte für die Suche:
        logging_dir=f'{model_log_path}logs_hp_search',
        logging_steps=10,
        report_to="tensorboard",
        eval_strategy="epoch", 
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        greater_is_better=True,
    )
    # Trainer initialisieren (ohne ein festes Modell - model_init wird verwendet)
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        #callbacks=[HPSearchResultLoggerCallback()], 
    )

    # --- 3. Starten der Hyperparameter-Suche ---
    print("Starte Hyperparameter-Suche...")
    # Starten der Hyperparameter-Suche
    best_trial = trainer.hyperparameter_search(
        direction="maximize", # Maximiere den F1-Score
        backend="optuna",
        n_trials=10, # Anzahl der Trials, die Optuna durchführen soll, je mehr Trials, desto länger dauert es, aber potenziell bessere Ergebnisse.
        hp_space=hp_space_optuna,
    )
    print("\n--- Hyperparameter-Suche abgeschlossen ---")

    print("Beste Trial:")
    print(f"  Wert (F1): {best_trial}") 
    print("\n------------------------------------------")
    print("learning_rate: "+str(best_trial.hyperparameters["learning_rate"]))
    print("num_train_epochs: "+str(best_trial.hyperparameters["num_train_epochs"]))
    print("per_device_train_batch_size : "+str(best_trial.hyperparameters["per_device_train_batch_size"]))
    #print("warmup_ratio: "+str(best_trial.hyperparameters["warmup_ratio"]))
    print("weight_decay: "+str(best_trial.hyperparameters["weight_decay"]))
    print("\n------------------END---------------------\n\n\n")

    # --- 4. Train with Best Hyperparameters ---
    #update der TrainingArguments mit den besten Hyperparametern
    best_hp = best_trial.hyperparameters


    final_training_args = TrainingArguments(
        output_dir=f"{model_output_path}best_run", 
        logging_dir=f"{model_log_path}best_run",
        report_to="tensorboard",
        learning_rate=best_hp["learning_rate"],
        num_train_epochs=best_hp["num_train_epochs"],
        per_device_train_batch_size=best_hp["per_device_train_batch_size"],
        per_device_eval_batch_size=training_args.per_device_eval_batch_size, 
        weight_decay=best_hp.get("weight_decay", training_args.weight_decay),
        #warmup_ratio=best_hp.get("warmup_ratio", training_args.warmup_ratio),
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1, 
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42
    )
else:
    final_training_args = TrainingArguments(
    output_dir=f"{model_output_path}best_run", 
    logging_dir=f"{model_log_path}best_run",

    learning_rate=1.3630463847311672e-05,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64, 
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42
    )

# Initialisiere den Trainer mit den statischen Hyperparametern
final_trainer = Trainer(
    model_init=model_init,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\nTraining the final model with the best hyperparameters...")
final_trainer.train()

# --- 5. Evaluate the Final Model ---
print("\nEvaluating the final model on the test set...")
test_results = final_trainer.evaluate(test_dataset)
print("\nTest Set Evaluation Results:")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")

print("\nEvaluating the final model on the validation set...")
val_results = final_trainer.evaluate(val_dataset)
print("\nVal Set Evaluation Results:")
for key, value in val_results.items():
    print(f"  {key}: {value:.4f}")





if boolHP:
    # # --- 6. Save the Final Model & Tokenizer ---
    end_zeit = time.time()
    laufzeit = end_zeit - start_zeit
    print("\n Saving the fine-tuned model and tokenizer...")
    final_trainer.save_model(f"{model_final_path}model")
    tokenizer.save_pretrained(f"{model_final_path}tokenizer/")



# # --- 7. Predict and to DF ---
# --- Erstellen eines DataFrames mit Vorhersagen für das Validierungsset ---
print("\ncreate predictions, to save in a df...")

predictions_output_val = final_trainer.predict(val_dataset)
predicted_scores_val = predictions_output_val.predictions
predicted_labels_encoded_val = np.argmax(predicted_scores_val, axis=1)
predicted_labels_named_val = le.inverse_transform(predicted_labels_encoded_val)

dfResults_pred = pd.DataFrame()

dfResults_pred['id_im_aktuellen_df'] = dfBert_val.index.values

# Ursprüngliche Klasse (Text-Label) aus dfBert_val
dfResults_pred['class_original'] = dfBert_val['class'].values

# Ursprüngliche Klasse (numerisch kodiertes Label - Ground Truth) aus dfBert_val
dfResults_pred['label_encoded_original'] = dfBert_val['label_encoded'].values

# Vorhergesagte Klasse (numerisch kodiertes Label)
dfResults_pred['prediction_encoded'] = predicted_labels_encoded_val

# Vorhergesagte Klasse (Text-Label)
dfResults_pred['prediction_named'] = predicted_labels_named_val

# Optional: Fügen Sie den Text hinzu, der für die Vorhersage verwendet wurde
dfResults_pred['text_input'] = dfBert_val['text'].values


print("--- 1. DF rdy ---")
print("Erstelle einen DataFrame nur mit den Vorhersagen, die vom Original abweichen (Validierungsset)...")
# Filtere dfResults_pred, um nur Zeilen zu erhalten, bei denen das Original-Label und das vorhergesagte Label unterschiedlich sind.
dfResults_pred_diff = dfResults_pred[dfResults_pred['label_encoded_original'] != dfResults_pred['prediction_encoded']]
print("\n---  DFs rdy !!! ---")
if dfResults_pred_diff.empty:
    print("Keine Unterschiede zwischen Original- und Vorhersage-Labels im Validierungsset gefunden. Perfekte Vorhersage!")
else:
    print("--------------------------------------")
    print(f"\nAnzahl der unterschiedlichen Vorhersagen im Validierungsset: {len(dfResults_pred_diff)}")
print("\n-------------------------------------------")

# Wahre Labels und vorhergesagte Labels aus dem DataFrame extrahieren
y_true_val = dfResults_pred['label_encoded_original']
y_pred_val = dfResults_pred['prediction_encoded']

print("\n validation-set metrics (calculated from dfResults_pred):")

#Gewichteter F1-Score
f1_val_weighted = f1_score(y_true_val, y_pred_val, average='weighted', zero_division=0)
print(f"  Gewichteter F1-Score: {f1_val_weighted:.4f}")
print("\n------------------------------------------")


dfResults_pred.to_pickle(f"{model_base_path}dfResults_pred.pkl")
dfResults_pred_diff.to_pickle(f"{model_base_path}dfResults_pred_diff({len(dfResults_pred_diff)}).pkl")


print(f"runtimet: {laufzeit/60} min")
print("\n Script finished successfully!")


INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

Tokenizing training data...
End...
creating dataset...
End...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=417), Label(value='0 / 417'))), HB…

Tokenizing validation data...
End...
creating dataset...
End...


  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-06-02 16:05:54,812] A new study created in memory with name: no-name-b13a94b3-e707-4b1e-a7ee-8285a6f2f7f8


Starte Hyperparameter-Suche...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7926,0.825707,0.706056,0.706667,0.707592,0.706667
2,0.6874,0.786757,0.70834,0.708667,0.708837,0.708667
3,0.6275,0.801096,0.719225,0.721,0.724871,0.721
4,0.5453,0.801447,0.722051,0.723667,0.72423,0.723667
5,0.4284,0.815464,0.720134,0.722333,0.723226,0.722333


[I 2025-06-02 16:37:15,368] Trial 0 finished with value: 2.8880271159143724 and parameters: {'learning_rate': 1.3630463847311672e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.0}. Best is trial 0 with value: 2.8880271159143724.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.6648,0.812175,0.701731,0.702333,0.710369,0.702333
2,0.6024,0.783353,0.716705,0.719333,0.718818,0.719333
3,0.487,0.838251,0.708107,0.709333,0.713508,0.709333
4,0.3062,0.949674,0.716921,0.718667,0.716768,0.718667
5,0.2305,1.176032,0.71435,0.717,0.715378,0.717
6,0.1205,1.422783,0.709016,0.709667,0.713284,0.709667
7,0.0613,1.633317,0.708847,0.711,0.710289,0.711
8,0.0722,1.898704,0.704817,0.706333,0.708368,0.706333
9,0.0723,2.034156,0.713392,0.714333,0.712989,0.714333
10,0.0324,2.120025,0.709765,0.710667,0.710216,0.710667


[I 2025-06-02 17:55:51,283] Trial 1 finished with value: 2.8581302059047884 and parameters: {'learning_rate': 1.7661512995208496e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 16, 'weight_decay': 0.01}. Best is trial 0 with value: 2.8880271159143724.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7659,0.823753,0.701472,0.702667,0.707855,0.702667
2,0.6516,0.78833,0.716503,0.717,0.71938,0.717
3,0.5268,0.814014,0.718211,0.718333,0.722438,0.718333
4,0.32,0.938873,0.711785,0.712333,0.714687,0.712333
5,0.1811,1.124742,0.711,0.710667,0.71556,0.710667
6,0.1524,1.344617,0.706179,0.706333,0.712351,0.706333
7,0.0882,1.559387,0.702861,0.703667,0.705743,0.703667
8,0.0525,1.790332,0.709161,0.71,0.710434,0.71
9,0.0233,1.95678,0.702232,0.704667,0.70401,0.704667
10,0.0382,2.064751,0.704156,0.705,0.706834,0.705


[I 2025-06-02 19:36:07,388] Trial 2 finished with value: 2.8438892510453604 and parameters: {'learning_rate': 2.6513260375521086e-05, 'num_train_epochs': 16, 'per_device_train_batch_size': 32, 'weight_decay': 0.0}. Best is trial 0 with value: 2.8880271159143724.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7611,0.824544,0.694704,0.695,0.700367,0.695
2,0.6515,0.780682,0.71483,0.714667,0.717338,0.714667
3,0.5609,0.825522,0.713032,0.713667,0.719793,0.713667
4,0.3977,0.890996,0.716396,0.717333,0.718145,0.717333
5,0.2286,1.033077,0.717087,0.718667,0.71829,0.718667
6,0.1263,1.205544,0.708557,0.709,0.709178,0.709
7,0.1036,1.350276,0.711611,0.712667,0.711766,0.712667
8,0.0252,1.528785,0.711408,0.713,0.712312,0.713
9,0.0194,1.666014,0.708231,0.710333,0.708954,0.710333
10,0.011,1.774798,0.708945,0.713,0.710951,0.713


[I 2025-06-02 20:51:20,442] Trial 3 finished with value: 2.8477608060780133 and parameters: {'learning_rate': 2.2768068181709884e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 32, 'weight_decay': 0.0}. Best is trial 0 with value: 2.8880271159143724.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.6837,0.827207,0.700404,0.701333,0.704176,0.701333
2,0.6977,0.804993,0.705985,0.708,0.707785,0.708
3,0.6114,0.793334,0.714558,0.716333,0.719386,0.716333
4,0.5467,0.813843,0.712485,0.715,0.717672,0.715
5,0.4458,0.841573,0.724291,0.727333,0.726477,0.727333
6,0.4309,0.87989,0.722026,0.722667,0.723809,0.722667
7,0.3785,0.94082,0.713698,0.716667,0.716554,0.716667
8,0.2146,1.028816,0.709235,0.712333,0.712578,0.712333
9,0.2621,1.085823,0.711754,0.716,0.713032,0.716
10,0.2725,1.159829,0.71358,0.714667,0.71311,0.714667


[I 2025-06-02 22:35:59,018] Trial 4 finished with value: 2.8250390048728593 and parameters: {'learning_rate': 6.589528693399998e-06, 'num_train_epochs': 16, 'per_device_train_batch_size': 16, 'weight_decay': 0.0}. Best is trial 0 with value: 2.8880271159143724.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.8912,0.968751,0.656946,0.659333,0.660087,0.659333


[I 2025-06-02 22:42:30,795] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.9329,0.928604,0.664927,0.666667,0.66801,0.666667


[I 2025-06-02 22:48:46,461] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.687,0.806756,0.706949,0.707667,0.713612,0.707667
2,0.6359,0.789152,0.714553,0.717667,0.717924,0.717667


[I 2025-06-02 23:01:52,417] Trial 7 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7506,0.823668,0.692736,0.693333,0.698858,0.693333


[I 2025-06-02 23:08:07,394] Trial 8 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.7345,0.846921,0.690411,0.691,0.692388,0.691


[I 2025-06-02 23:14:39,445] Trial 9 pruned. 



--- Hyperparameter-Suche abgeschlossen ---
Beste Trial:

------------------------------------------
learning_rate: 1.3630463847311672e-05
num_train_epochs: 5
per_device_train_batch_size : 32
weight_decay: 0.0

------------------END---------------------





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training the final model with the best hyperparameters...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,No log,0.80566,0.694106,0.6942,0.697607,0.6942
2,0.913500,0.763778,0.718022,0.7184,0.719416,0.7184
3,0.660100,0.777008,0.717285,0.7188,0.726233,0.7188
4,0.546700,0.781493,0.718497,0.72,0.723845,0.72
5,0.546700,0.793662,0.71735,0.7196,0.724653,0.7196



Evaluating the final model on the test set...



Test Set Evaluation Results:
  eval_loss: 0.8004
  eval_f1: 0.7209
  eval_accuracy: 0.7223
  eval_precision: 0.7235
  eval_recall: 0.7223
  eval_runtime: 29.3764
  eval_samples_per_second: 102.1230
  eval_steps_per_second: 12.7650
  epoch: 5.0000

Evaluating the final model on the validation set...

Val Set Evaluation Results:
  eval_loss: 0.7815
  eval_f1: 0.7185
  eval_accuracy: 0.7200
  eval_precision: 0.7238
  eval_recall: 0.7200
  eval_runtime: 49.8475
  eval_samples_per_second: 100.3060
  eval_steps_per_second: 12.5380
  epoch: 5.0000

 Saving the fine-tuned model and tokenizer...

create predictions, to save in a df...
--- 1. DF rdy ---
Erstelle einen DataFrame nur mit den Vorhersagen, die vom Original abweichen (Validierungsset)...

---  DFs rdy !!! ---
--------------------------------------

Anzahl der unterschiedlichen Vorhersagen im Validierungsset: 1400

-------------------------------------------

 validation-set metrics (calculated from dfResults_pred):
  Gewichteter F1-

In [None]:
best_trial = trainer.hyperparameter_search(
        direction="maximize", # Maximiere den F1-Score
        backend="optuna",
        n_trials=10, # Anzahl der Trials, die Optuna durchführen soll, je mehr Trials, desto länger dauert es, aber potenziell bessere Ergebnisse.
        hp_space=hp_space_optuna,
    )

In [23]:
final_training_args2 = TrainingArguments(
    output_dir=f"{model_output_path}best_run2", 
    logging_dir=f"{model_log_path}best_run2",
    report_to="tensorboard",
    learning_rate=best_hp["learning_rate"],
    num_train_epochs=best_hp["num_train_epochs"],
    per_device_train_batch_size=best_hp["per_device_train_batch_size"],
    per_device_eval_batch_size=training_args.per_device_eval_batch_size, 
    weight_decay=best_hp.get("weight_decay", training_args.weight_decay),
    #warmup_ratio=best_hp.get("warmup_ratio", training_args.warmup_ratio),
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1, 
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42
)

# Initialisiere den Trainer mit den statischen Hyperparametern
final_trainer2 = Trainer(
    model_init=model_init,
    args=final_training_args2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print("------------------------------------------")
print("Beste Trial\n:")

print("learning_rate: "+str(best_trial.hyperparameters["learning_rate"]))
print("num_train_epochs: "+str(best_trial.hyperparameters["num_train_epochs"]))
print("per_device_train_batch_size : "+str(best_trial.hyperparameters["per_device_train_batch_size"]))
print("weight_decay: "+str(best_trial.hyperparameters["weight_decay"]))
#print("test: "+str(best_trial.hyperparameters["warmup_ratio"]))

print("\n------------------END---------------------")

In [24]:


print("\nTraining the final model with the best hyperparameters...")
final_trainer.train()

# --- 5. Evaluate the Final Model ---
print("\nEvaluating the final model on the test set...")
test_results = final_trainer.evaluate(test_dataset)
print("\nTest Set Evaluation Results:")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")

print("\nEvaluating the final model on the validation set...")
val_results = final_trainer.evaluate(val_dataset)
print("\nVal Set Evaluation Results:")
for key, value in val_results.items():
    print(f"  {key}: {value:.4f}")


Training the final model with the best hyperparameters...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,No log,0.805916,0.697725,0.6978,0.701484,0.6978
2,0.911500,0.764255,0.717997,0.7182,0.718914,0.7182
3,0.661100,0.779274,0.716233,0.7176,0.724006,0.7176
4,0.546000,0.78109,0.717589,0.7192,0.722237,0.7192
5,0.546000,0.792449,0.718652,0.7208,0.724259,0.7208



Evaluating the final model on the test set...



Test Set Evaluation Results:
  eval_loss: 0.8094
  eval_f1: 0.7260
  eval_accuracy: 0.7280
  eval_precision: 0.7282
  eval_recall: 0.7280
  eval_runtime: 29.5913
  eval_samples_per_second: 101.3810
  eval_steps_per_second: 12.6730
  epoch: 5.0000

Evaluating the final model on the validation set...

Val Set Evaluation Results:
  eval_loss: 0.7924
  eval_f1: 0.7187
  eval_accuracy: 0.7208
  eval_precision: 0.7243
  eval_recall: 0.7208
  eval_runtime: 50.2634
  eval_samples_per_second: 99.4760
  eval_steps_per_second: 12.4340
  epoch: 5.0000
