# imports

In [5]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import evaluate

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = '/Users/christophhau/Desktop/HA_ML/data/aggregated/german_manifestos_combined.csv'

df= pd.read_csv(path)
print(len(df))
print(len(df['cmp_code'].unique())) #labels und text_en
print(df['cmp_code'].unique()) #labels und text_en
print(df['text_en'].isna().sum())
print(df['text_en'].isna().sum())
print(df['cmp_code'].isna().sum())


df.columns

17529
75
['H' '403' '201.1' '411' '402' '410' '704' '505' '701' '504' '305.1' '110'
 '601.1' '401' '506' '605.1' '703.1' '303' '407' '603' '606.1' '202.1'
 '106' '503' '414' '412' '601.2' '109' '406' '304' '000' '608.2' '501'
 '201.2' '203' '301' '107' '204' '416.2' '502' '413' '103.1' '105' '104'
 '705' '108' '602.2' '608.1' '202.4' '202.2' '604' '706' '302' '507'
 '602.1' '409' '605.2' '415' '101' '607.2' '405' '404' '416.1' '408'
 '607.1' '703.2' '606.2' '305.6' '102' '305.2' '305.3' '202.3' '702'
 '607.3' '103.2']
0
0
0


Index(['text', 'text_en', 'cmp_code', 'eu_code', 'party', 'domain'], dtype='object')

In [14]:
## clean up memeory
import torch
import gc
model = model.to('cpu')
del model
del tokenizer
gc.collect()

# If you were using MPS backend
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# simple quasi sentence finetuning 

In [2]:
path = '/Users/christophhau/Desktop/HA_ML/data/aggregated/german_manifestos_combined.csv'
df= pd.read_csv(path)


df_small = df[["text_en", "cmp_code"]].rename(columns={"text_en": "text", "cmp_code": "label"})
df_small = df_small.groupby("label").filter(lambda x: len(x) > 1)
print(len(df_small))
print(len(df_small['label'].unique())) #labels und text_en

le = LabelEncoder()
df_small["label"] = le.fit_transform(df_small["label"])
print(len(df_small['label'].unique())) #labels und text_en

train_df, test_df = train_test_split(df_small, test_size=0.30, random_state=42, stratify=df_small["label"])
test_df = test_df.groupby("label").filter(lambda x: len(x) > 1)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df["label"])

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "val": val_dataset
})

print(dataset)



17525
71
71
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12267
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
})


In [3]:
# MPS Device Check und Setup
import torch
def get_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("✅ MPS (Apple Silicon GPU) ist verfügbar und wird verwendet")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("✅ CUDA GPU wird verwendet")
    else:
        device = torch.device("cpu")
        print("⚠️ Nur CPU verfügbar - Training wird langsamer sein")
    return device

device = get_device()

✅ MPS (Apple Silicon GPU) ist verfügbar und wird verwendet


In [4]:
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=71)
model.to(device)

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
dataset = dataset.map(tokenize, batched=True)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Berechne Precision, Recall, F1 und Support für den gewichteten Durchschnitt
    # 'weighted' berücksichtigt die Anzahl der Instanzen pro Klasse (gut bei unbalancierten Datensätzen)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    
    # Berechne die einfache Genauigkeit
    acc = accuracy_score(labels, predictions)
    
    # --- Für die detaillierte Aufschlüsselung nach Label ---
    # Erstelle den Classification Report. 
    # output_dict=True macht die Ausgabe zu einem Dictionary, das du weiterverarbeiten kannst.
    # zero_division=0 verhindert eine Warnung, falls eine Klasse nie vorhergesagt wird.
    report = classification_report(labels, predictions, output_dict=True, zero_division=0)
    
    # Du kannst dir den Report auch einfach ausgeben lassen, um ihn während des Trainings zu sehen:
    print("\nClassification Report:\n", classification_report(labels, predictions, zero_division=0))
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir="manifesto_classifier",
    
    # MPS-optimierte Einstellungen
    per_device_train_batch_size=4,  # Kleinere Batch Size für MPS
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Kompensiert kleinere Batch Size
    
    # Training Parameter
    learning_rate=2e-5,
    num_train_epochs=3,
    
    # Evaluation und Saving
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    
    # Logging
    logging_steps=50,
    logging_dir="./logs",
    
    # MPS-spezifische Einstellungen
    dataloader_pin_memory=False,  # Wichtig für MPS
    fp16=False,  # MPS unterstützt noch kein FP16
    push_to_hub=False,
    
    # Reproducibility
    seed=42,
    
    # Warmup für bessere Konvergenz
    warmup_steps=100,
    weight_decay=0.01,)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 12267/12267 [00:01<00:00, 10726.34 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 11120.18 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 7903.28 examples/s]


In [5]:
trainer.train()

print("Saving model and tokenizer...")
trainer.save_model("./manifesto_classifier_sentence2")
tokenizer.save_pretrained("./manifesto_classifier_sentence2")



trainer.evaluate(dataset["val"])

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.2912,2.073513,0.488576,0.432746,0.432868,0.488576
2,1.7306,1.719622,0.566641,0.525339,0.502961,0.566641
3,1.5737,1.657611,0.579208,0.541355,0.533863,0.579208



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        11
           4       0.32      0.65      0.43        54
           5       0.00      0.00      0.00        30
           6       0.00      0.00      0.00        29
           7       0.41      0.59      0.48        56
           8       0.36      0.39      0.37        41
           9       0.00      0.00      0.00        11
          10       0.00      0.00      0.00        22
          11       0.82      0.14      0.24        63
          12       0.48      0.23      0.31        44
          13       0.45      0.62      0.52       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        11
           4       0.54      0.69      0.61        54
           5       0.77      0.33      0.47        30
           6       0.46      0.76      0.57        29
           7       0.54      0.52      0.53        56
           8       0.39      0.49      0.43        41
           9       0.00      0.00      0.00        11
          10       0.10      0.05      0.06        22
          11       0.49      0.35      0.41        63
          12       0.50      0.41      0.45        44
          13       0.49      0.64      0.55       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        11
           4       0.48      0.74      0.58        54
           5       0.75      0.20      0.32        30
           6       0.47      0.69      0.56        29
           7       0.55      0.66      0.60        56
           8       0.40      0.54      0.46        41
           9       0.00      0.00      0.00        11
          10       0.15      0.09      0.11        22
          11       0.55      0.37      0.44        63
          12       0.46      0.39      0.42        44
          13       0.52      0.68      0.59       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving model and tokenizer...



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         7
           3       0.00      0.00      0.00        11
           4       0.54      0.74      0.62        54
           5       0.88      0.24      0.38        29
           6       0.44      0.76      0.56        29
           7       0.45      0.51      0.48        57
           8       0.40      0.41      0.40        41
           9       0.00      0.00      0.00        12
          10       0.37      0.32      0.34        22
          11       0.53      0.42      0.47        62
          12       0.59      0.38      0.46        45
          13       0.55      0.75      0.63       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6853973865509033,
 'eval_accuracy': 0.5784463061690784,
 'eval_f1': 0.5407385305628692,
 'eval_precision': 0.5280519006221395,
 'eval_recall': 0.5784463061690784,
 'eval_runtime': 92.4497,
 'eval_samples_per_second': 28.405,
 'eval_steps_per_second': 3.559,
 'epoch': 3.0}

# context classifier 

In [1]:
import pandas as pd

# Erstellen eines Beispiel-DataFrames zur Demonstration
# Ersetze diesen Teil durch das Laden deiner eigenen CSV-Datei
df = pd.read_csv('data/processed/german_manifestos_punctuation_context.csv')




# Zeilenweise durch den DataFrame iterieren
for index, row in df.iterrows():
    print(f"--- Zeile {index} ---")
    
    # 1. Den Inhalt der Spalte 'context' für die aktuelle Zeile ausgeben
    print("\n\033[1mContext:\033[0m") # Fettgedruckte Überschrift
    print(row['context'])
    print("-" * 20) # Trennlinie

    # 2. Den Inhalt der Spalte 'text' mit den umliegenden Zeilen anzeigen
    print("\n\033[1mText (mit umliegenden Zeilen):\033[0m") # Fettgedruckte Überschrift
    
    # Start- und End-Index für den Ausschnitt festlegen
    # max() und min() verhindern Fehler am Anfang und Ende des DataFrames
    start = max(0, index - 3)
    end = min(len(df), index + 4)
    
    # Den relevanten Ausschnitt des DataFrames ausgeben
    # Wir heben die aktuelle Zeile hervor
    for i in range(start, end):
        if i == index:
            # Aktuelle Zeile mit > markieren und fett drucken
            print(f"> \033[1m{df.loc[i, 'text_en']}\033[0m")
        else:
            print(f"  {df.loc[i, 'text_en']}")

    # 3. Auf eine Benutzereingabe warten, um zur nächsten Zeile zu gelangen
    # Das Skript pausiert hier, bis du "Enter" drückst.
    try:
        inp =input("\nDrücke Enter, um zur nächsten Zeile zu gelangen (oder Strg+C zum Abbrechen)...")
        if inp == 'exit':
            break# IPython.display.clear_output() # Diese Zeile einkommentieren, um die Ausgabe nach jeder Iteration zu löschen
    except KeyboardInterrupt:
        print("\nSchleife wurde vom Benutzer abgebrochen.")
        break # Die Schleife beenden

print("\nAlle Zeilen wurden untersucht.")

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/german_manifestos_punctuation_context.csv'

In [6]:
import os
import pandas as pd
# Verzeichnis mit den CSV-Dateien
csv_dir = 'data/contextaware_data_20250927_112401'

# Alle CSV-Dateipfade im Verzeichnis sammeln
df_paths = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')]

# DataFrames einlesen und kombinieren
dfs = [pd.read_csv(path) for path in df_paths]
combined_df = pd.concat(dfs, ignore_index=True)

# Kombinierten DataFrame als neue CSV speichern
combined_df.to_csv('/Users/christophhau/Desktop/HA_ML/data/contextaware_data_20250927_112401/combined_output.csv', index=False)

print(combined_df.shape)

(107304, 18)


In [7]:
path = 'data/contextaware_data_20250927_112401/combined_output_context.csv'
def create_formatted_input(text, context, tokenizer, max_tokens=200):
    """Create sentence-pair input: <s> text </s> </s> context </s>"""
    # Truncate context to max_tokens
    context_tokens = tokenizer.tokenize(str(context))[:max_tokens]
    truncated_context = tokenizer.convert_tokens_to_string(context_tokens)
    
    return f"<s> {str(text).strip()} </s> </s> {truncated_context} </s>"

df = pd.read_csv(path)

# Load tokenizer for precise token counting
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
special_tokens = ['<s>', '</s>']
new_tokens = [token for token in special_tokens if token not in tokenizer.vocab]

if new_tokens:
    tokenizer.add_tokens(new_tokens)
    print(f"Added {len(new_tokens)} new tokens to tokenizer: {new_tokens}")
else:
    print("Special tokens already in tokenizer vocabulary")

# Create formatted inputs
df['text'] = df.apply(lambda row: create_formatted_input(
    row['text_en'], row['context'], tokenizer), axis=1)

# Prepare dataset (same as your original code)
df_small = df[["text", "cmp_code"]].rename(columns={"cmp_code": "label"})
df_small = df_small.groupby("label").filter(lambda x: len(x) > 1)

le = LabelEncoder()
df_small["label"] = le.fit_transform(df_small["label"])

# Split data
train_df, test_df = train_test_split(df_small, test_size=0.30, random_state=42, stratify=df_small["label"])
test_df = test_df.groupby("label").filter(lambda x: len(x) > 1)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df["label"])

# Create datasets
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True)),
    "val": Dataset.from_pandas(val_df.reset_index(drop=True))
})

print(f"Dataset created: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test")
print(f"Example input: {train_df.iloc[0]['text'][:150]}...")
print(dataset)


Special tokens already in tokenizer vocabulary
Dataset created: 12267 train, 2626 val, 2626 test
Example input: <s> not make private motorized transport unaffordable. </s> </s> We want to create acceptance through transparency and dialog at eye level, e.g. for o...
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12267
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
})


In [8]:
# MPS Device Check und Setup
import torch
def get_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("✅ MPS (Apple Silicon GPU) ist verfügbar und wird verwendet")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("✅ CUDA GPU wird verwendet")
    else:
        device = torch.device("cpu")
        print("⚠️ Nur CPU verfügbar - Training wird langsamer sein")
    return device

device = get_device()

✅ MPS (Apple Silicon GPU) ist verfügbar und wird verwendet


In [9]:
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=71)
model.to(device)

def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
dataset = dataset.map(tokenize, batched=True)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Berechne Precision, Recall, F1 und Support für den gewichteten Durchschnitt
    # 'weighted' berücksichtigt die Anzahl der Instanzen pro Klasse (gut bei unbalancierten Datensätzen)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    
    # Berechne die einfache Genauigkeit
    acc = accuracy_score(labels, predictions)
    
    # --- Für die detaillierte Aufschlüsselung nach Label ---
    # Erstelle den Classification Report. 
    # output_dict=True macht die Ausgabe zu einem Dictionary, das du weiterverarbeiten kannst.
    # zero_division=0 verhindert eine Warnung, falls eine Klasse nie vorhergesagt wird.
    report = classification_report(labels, predictions, output_dict=True, zero_division=0)
    
    # Du kannst dir den Report auch einfach ausgeben lassen, um ihn während des Trainings zu sehen:
    print("\nClassification Report:\n", classification_report(labels, predictions, zero_division=0))
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir="manifesto_classifier",
    
    # MPS-optimierte Einstellungen
    per_device_train_batch_size=4,  # Kleinere Batch Size für MPS
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Kompensiert kleinere Batch Size
    
    # Training Parameter
    learning_rate=2e-5,
    num_train_epochs=3,
    
    # Evaluation und Saving
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    
    # Logging
    logging_steps=50,
    logging_dir="./logs",
    
    # MPS-spezifische Einstellungen
    dataloader_pin_memory=False,  # Wichtig für MPS
    fp16=False,  # MPS unterstützt noch kein FP16
    push_to_hub=False,
    
    # Reproducibility
    seed=42,
    
    # Warmup für bessere Konvergenz
    warmup_steps=100,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 12267/12267 [00:01<00:00, 7596.03 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 8466.68 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 8204.36 examples/s]


In [10]:
trainer.train()

print("Saving model and tokenizer...")
trainer.save_model("./manifesto_classifier_context2")
tokenizer.save_pretrained("./manifesto_classifier_contex2t")


print('saved')
trainer.evaluate(dataset["val"])

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.0881,1.966988,0.500762,0.434499,0.424012,0.500762
2,1.4915,1.559848,0.601295,0.566632,0.55771,0.601295
3,1.2562,1.4806,0.63214,0.600609,0.589097,0.63214



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        11
           4       0.51      0.74      0.60        54
           5       0.48      0.33      0.39        30
           6       0.47      0.55      0.51        29
           7       0.44      0.48      0.46        56
           8       0.32      0.56      0.40        41
           9       0.00      0.00      0.00        11
          10       0.00      0.00      0.00        22
          11       0.59      0.16      0.25        63
          12       0.22      0.05      0.08        44
          13       0.46      0.55      0.50       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        11
           4       0.69      0.69      0.69        54
           5       0.74      0.47      0.57        30
           6       0.39      0.97      0.55        29
           7       0.71      0.57      0.63        56
           8       0.42      0.54      0.47        41
           9       0.00      0.00      0.00        11
          10       0.25      0.05      0.08        22
          11       0.51      0.44      0.47        63
          12       0.47      0.48      0.47        44
          13       0.40      0.69      0.51       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         8
           3       0.00      0.00      0.00        11
           4       0.68      0.70      0.69        54
           5       0.68      0.57      0.62        30
           6       0.46      0.97      0.62        29
           7       0.68      0.71      0.70        56
           8       0.47      0.59      0.52        41
           9       0.00      0.00      0.00        11
          10       0.29      0.09      0.14        22
          11       0.55      0.49      0.52        63
          12       0.53      0.57      0.55        44
          13       0.52      0.65      0.58       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saving model and tokenizer...
saved



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         7
           3       0.00      0.00      0.00        11
           4       0.61      0.69      0.64        54
           5       0.65      0.45      0.53        29
           6       0.42      0.76      0.54        29
           7       0.55      0.63      0.59        57
           8       0.53      0.66      0.59        41
           9       0.00      0.00      0.00        12
          10       0.44      0.18      0.26        22
          11       0.53      0.53      0.53        62
          12       0.59      0.58      0.58        45
          13       0.69      0.77      0.72       103
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         5
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.411609411239624,
 'eval_accuracy': 0.6344249809596344,
 'eval_f1': 0.6005346434802009,
 'eval_precision': 0.5916351059052101,
 'eval_recall': 0.6344249809596344,
 'eval_runtime': 95.6456,
 'eval_samples_per_second': 27.456,
 'eval_steps_per_second': 3.44,
 'epoch': 3.0}