# Read and prepare Data from csv

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# 1. CSV laden
df = pd.read_csv("data/full_dataset.csv")
df["overall_classification"] = df["overall_classification"] - 1

# 2. Trainings-/Validierungsdaten aufteilen
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# 3. In HuggingFace Dataset umwandeln
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load Tokenizer and pretrained Model

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "xlm-roberta-base"

# Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Modell mit 4 Klassen
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepare Tokenising and Label

In [16]:
# Tokenisierung der Textpaare
def preprocess(examples):
    texts1 = [str(x) if x is not None else "" for x in examples["text1"]]
    texts2 = [str(x) if x is not None else "" for x in examples["text2"]]
    return tokenizer(
        texts1,
        texts2,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Texte tokenisieren
tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_val = val_dataset.map(preprocess, batched=True)

# Label-Spalte festlegen
tokenized_train = tokenized_train.rename_column("overall_classification", "label")
tokenized_val = tokenized_val.rename_column("overall_classification", "label")

# Unnötige Spalten entfernen
drop_cols = [
    "pair_id", "title1", "text1", "lang1", "title2", "text2", "lang2",
    "overall", "geography", "entities", "time", "narrative", "style", "tone"
]

tokenized_train = tokenized_train.remove_columns(drop_cols)
tokenized_val = tokenized_val.remove_columns(drop_cols)

# Label als int casten
from datasets import Value

tokenized_train = tokenized_train.cast_column("label", Value("int64"))
tokenized_val = tokenized_val.cast_column("label", Value("int64"))


Map:   0%|          | 0/2462 [00:00<?, ? examples/s]

Map:   0%|          | 0/274 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2462 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/274 [00:00<?, ? examples/s]

# Prepare Training

In [17]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Metriken definieren
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

# Trainingsparameter
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
)


# Train and save Model

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Training starten
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,1.334139,0.40146,0.143229
2,1.324700,1.124356,0.540146,0.334131
3,1.324700,0.946073,0.576642,0.499899
4,1.086000,0.942085,0.638686,0.56756




TrainOutput(global_step=1232, training_loss=1.1557869229997908, metrics={'train_runtime': 165177.1066, 'train_samples_per_second': 0.06, 'train_steps_per_second': 0.007, 'total_flos': 2591164202385408.0, 'train_loss': 1.1557869229997908, 'epoch': 4.0})

In [19]:
trainer.save_model("trained_models/xlm-roberta-news-similarity")
tokenizer.save_pretrained("trained_models/xlm-roberta-news-similarity")


('trained_models/xlm-roberta-news-similarity\\tokenizer_config.json',
 'trained_models/xlm-roberta-news-similarity\\special_tokens_map.json',
 'trained_models/xlm-roberta-news-similarity\\tokenizer.json')