# Sentence Embeddings - XLM-RoBERTa

## Classification

### Imports

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_cosine_schedule_with_warmup
from torch.optim import AdamW
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
from transformers import logging
logging.set_verbosity_error()

### Config

In [None]:
def set_seed(seed):
    """Seed the NumPy and PyTorch (CPU and CUDA) random number generators.

    Args:
        seed: Integer seed handed to every generator.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

class Config:
    """Attribute-style view of the hyperparameter dictionary."""

    def __init__(self, p):
        """Copy every required setting from ``p`` onto the instance.

        Args:
            p: Mapping that contains all keys listed in ``required_keys``.
               Additional keys are ignored.

        Raises:
            KeyError: If one of the required keys is missing from ``p``.
        """
        required_keys = (
            'learning_rate',
            'epoch',
            'batch_size',
            'max_len',
            'model_save_path',
            'warmup_rate',
            'weight_decay',
            'model_pretrain_dir',
            'training_set_path',
            'testing_set_path',
            'seed',
        )
        for key in required_keys:
            setattr(self, key, p[key])

# Hyperparameters and file locations for the classification run.
params = dict(
    # optimisation
    learning_rate=2e-5,
    epoch=5,
    batch_size=8,
    # tokenizer sequence length
    max_len=512,
    # checkpoint written whenever the dev accuracy improves
    model_save_path="best_classifier.pth",
    warmup_rate=0.1,
    weight_decay=0.01,
    # pretrained encoder and data files
    model_pretrain_dir="xlm-roberta-base",
    training_set_path="training_set.csv",
    testing_set_path="testing_set.csv",
    seed=42,
)

### Model

In [None]:
class MMClassifier(nn.Module):
    """XLM-RoBERTa encoder followed by a dropout + linear classification head.

    Args:
        model_path: Name or path handed to ``XLMRobertaModel.from_pretrained``.
        num_classes: Number of logits produced by the head. Defaults to 4,
            the value that was previously hard-coded.
        dropout: Dropout probability applied to the pooled vector before the
            linear layer. Defaults to 0.3, the previously hard-coded value.
    """

    def __init__(self, model_path, num_classes=4, dropout=0.3):
        super().__init__()
        self.backbone = XLMRobertaModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.

        Returns:
            Tensor with one row of ``num_classes`` logits per batch element.
        """
        output = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence-pair vector.
        cls_output = output.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_output))

### Training + Evaluation

In [None]:
class Trainer:
    """Fine-tune and evaluate ``MMClassifier`` on text pairs read from CSV files.

    The CSV files must provide the columns ``text1``, ``text2`` and
    ``Overall``; the class index of a row is ``round(Overall) - 1``.
    NOTE(review): with the default 4-logit ``MMClassifier`` the ``Overall``
    values are presumably expected to round to 1..4 - confirm against the data.
    """

    def __init__(self, config):
        """Keep the config, select the device, load the tokenizer, seed the RNGs.

        Args:
            config: Object exposing ``model_pretrain_dir``, ``seed``,
                ``max_len``, ``batch_size``, ``learning_rate``,
                ``weight_decay``, ``warmup_rate``, ``epoch``,
                ``model_save_path``, ``training_set_path`` and
                ``testing_set_path``.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(config.model_pretrain_dir)
        set_seed(config.seed)

    def dataset(self, path):
        """Read a CSV file and tokenize its text pairs.

        Rows with a missing ``text1``, ``text2`` or ``Overall`` value are
        dropped; a missing score cannot be converted into a class index.

        Args:
            path: Location of the CSV file.

        Returns:
            Tuple ``(input_ids, attention_masks, labels)``. The first two are
            the tokenizer outputs padded/truncated to ``config.max_len``;
            ``labels`` holds the integer class index of every kept row.

        Raises:
            ValueError: If no row is left after dropping incomplete ones.
        """
        print(f"\n📂 Lade Daten aus: {path}")
        df = pd.read_csv(path, usecols=["text1", "text2", "Overall"])
        df = df.dropna(subset=["text1", "text2", "Overall"])
        print(f"✅ Nach Entfernen von NaNs: {len(df)} Zeilen")
        if df.empty:
            raise ValueError(f"No usable rows found in {path!r}")

        # One batched tokenizer call replaces the per-row loop.
        enc = self.tokenizer(df["text1"].astype(str).tolist(),
                             df["text2"].astype(str).tolist(),
                             padding='max_length', truncation=True,
                             max_length=self.config.max_len, return_tensors='pt')
        labels = df["Overall"].round().astype("int64") - 1
        return enc["input_ids"], enc["attention_mask"], torch.tensor(labels.to_numpy())

    def data_loader(self, ids, masks, labels, shuffle=True):
        """Wrap the tensors in a ``DataLoader`` batched by ``config.batch_size``."""
        return DataLoader(TensorDataset(ids, masks, labels), batch_size=self.config.batch_size, shuffle=shuffle)

    def evaluate(self, model, loader):
        """Run ``model`` over ``loader`` without gradients.

        Returns:
            Tuple ``(y_true, y_pred)`` of plain Python lists with the
            reference labels and the arg-max predictions.
        """
        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for ids, att, y in loader:
                logits = model(ids.to(self.device), att.to(self.device))
                y_true.extend(y.tolist())
                y_pred.extend(torch.argmax(logits, dim=1).tolist())
        return y_true, y_pred

    def _run_epoch(self, model, loader, optimizer, scheduler, criterion):
        """Train for one pass over ``loader``; return (mean batch loss, accuracy)."""
        model.train()
        running_loss, correct, total = 0.0, 0, 0
        num_batches = len(loader)

        for batch_idx, (ids, att, y) in enumerate(loader):
            ids, att, y = ids.to(self.device), att.to(self.device), y.to(self.device)
            logits = model(ids, att)
            loss = criterion(logits, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The schedule length is counted in optimizer steps, so advance per batch.
            scheduler.step()

            running_loss += loss.item()
            correct += (torch.argmax(logits, dim=1) == y).sum().item()
            total += y.size(0)

            if batch_idx % 10 == 0:
                print(f"  🔁 Batch {batch_idx + 1}/{num_batches}")

        return running_loss / num_batches, correct / total

    @staticmethod
    def _plot_history(train_losses, train_accuracies, dev_accuracies):
        """Plot the per-epoch training loss and the train/dev accuracies."""
        plt.plot(train_losses, label="Train Loss")
        plt.plot(train_accuracies, label="Train Accuracy")
        plt.plot(dev_accuracies, label="Dev Accuracy")
        plt.xlabel("Epoch")
        plt.ylabel("Wert")
        plt.title("Training Verlauf")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def train(self):
        """Fine-tune the classifier, keep the best checkpoint and report on it.

        After every epoch the model is scored on the dev/test file; the state
        dict is written to ``config.model_save_path`` whenever the dev
        accuracy improves. At the end the learning curves are plotted and the
        best checkpoint is reloaded and passed to ``evaluate_classification``.
        """
        print("📦 Lade Trainingsdaten...")
        ids, masks, labels = self.dataset(self.config.training_set_path)
        train_loader = self.data_loader(ids, masks, labels)
        print(f"✅ Trainingsloader bereit: {len(train_loader)} Batches")

        print("📦 Lade Dev/Testdaten...")
        dev_ids, dev_masks, dev_labels = self.dataset(self.config.testing_set_path)
        dev_loader = self.data_loader(dev_ids, dev_masks, dev_labels, shuffle=False)
        print(f"✅ Dev/Testloader bereit: {len(dev_loader)} Batches")

        model = MMClassifier(self.config.model_pretrain_dir).to(self.device)
        optimizer = AdamW(model.parameters(), lr=self.config.learning_rate, weight_decay=self.config.weight_decay)

        total_steps = len(train_loader) * self.config.epoch
        scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(self.config.warmup_rate * total_steps),
                                                    num_training_steps=total_steps)

        criterion = nn.CrossEntropyLoss()
        # Start below every reachable accuracy so the first epoch always
        # writes a checkpoint; otherwise a run stuck at 0.0 dev accuracy would
        # leave nothing to reload after the loop.
        best_acc = float("-inf")
        train_losses, train_accuracies, dev_accuracies = [], [], []

        for epoch in range(self.config.epoch):
            print(f"\n🚀 Starte Epoche {epoch + 1}/{self.config.epoch}")
            train_loss, train_acc = self._run_epoch(model, train_loader, optimizer, scheduler, criterion)
            y_true_dev, y_pred_dev = self.evaluate(model, dev_loader)
            dev_acc = accuracy_score(y_true_dev, y_pred_dev)

            train_losses.append(train_loss)
            train_accuracies.append(train_acc)
            dev_accuracies.append(dev_acc)

            print(f"Epoch {epoch+1} | Train Acc: {train_acc:.4f} | Dev Acc: {dev_acc:.4f} | Loss: {train_loss:.4f}")

            if dev_acc > best_acc:
                best_acc = dev_acc
                torch.save(model.state_dict(), self.config.model_save_path)

        print(f"\n✅ Training finished. Best Dev Acc: {best_acc:.4f}")

        # Learning curves
        self._plot_history(train_losses, train_accuracies, dev_accuracies)

        # Reload the best checkpoint and evaluate it; map_location keeps the
        # load working when the checkpoint was written on another device.
        model.load_state_dict(torch.load(self.config.model_save_path, map_location=self.device))
        y_true, y_pred = self.evaluate(model, dev_loader)
        evaluate_classification(y_true, y_pred, description="Final Dev Set Evaluation")

def evaluate_classification(y_true, y_pred, description="Model"):
    """Print classification metrics and draw the confusion matrix.

    Precision, recall and F1 are support-weighted averages over the classes.

    Args:
        y_true: Reference class ids.
        y_pred: Predicted class ids, aligned with ``y_true``.
        description: Label used in the printed header and the plot title.

    Returns:
        Dict with the keys ``Model``, ``Accuracy``, ``Precision``, ``Recall``
        and ``F1``.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"\n📊 Evaluation – {description}")
    print(f"Accuracy: {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall: {rec:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print("\n" + classification_report(y_true, y_pred, digits=3, zero_division=0))

    # Fix the label order explicitly so the heatmap ticks show the real class
    # ids instead of positional indices when a class is absent from the data.
    class_ids = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_ids, yticklabels=class_ids)
    # confusion_matrix puts the true classes on the rows and the predicted
    # classes on the columns.
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"{description} - Confusion Matrix")
    plt.tight_layout()
    plt.show()

    return {
        "Model": description,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
    }


In [None]:
# Build the configuration from `params`, create the trainer and start the
# classification run (training, learning-curve plot, final evaluation).
config = Config(params)
trainer = Trainer(config)
trainer.train()

## Regression

### Imports

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLMRobertaTokenizer, XLMRobertaModel, get_cosine_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from scipy.stats import pearsonr

### Config

In [None]:
def set_seed(seed):
    """Apply ``seed`` to the PyTorch (CPU and CUDA) and NumPy generators.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed)
    for apply_seed in seeders:
        apply_seed(seed)

class Config:
    """Expose the entries of the hyperparameter dictionary as attributes."""

    def __init__(self, p):
        """Read all required settings from ``p``.

        Args:
            p: Mapping providing every key named in ``field_names``; other
               keys are ignored.

        Raises:
            KeyError: If ``p`` lacks one of the required keys.
        """
        field_names = [
            'learning_rate', 'epoch', 'batch_size', 'max_len',
            'model_save_path', 'warmup_rate', 'weight_decay',
            'model_pretrain_dir', 'training_set_path', 'testing_set_path',
            'seed',
        ]
        vars(self).update({name: p[name] for name in field_names})

# Hyperparameters and file locations for the regression run.
params = dict(
    # optimisation
    learning_rate=2e-5,
    epoch=10,
    batch_size=8,
    # tokenizer sequence length
    max_len=512,
    # checkpoint written whenever the dev MSE improves
    model_save_path="/kaggle/working/best_regressor.pth",
    warmup_rate=0.1,
    weight_decay=0.01,
    # pretrained encoder and data files
    model_pretrain_dir="xlm-roberta-base",
    training_set_path="training_set.csv",
    testing_set_path="testing_set.csv",
    seed=42,
)

### Model

In [None]:
class MMRegressor(nn.Module):
    """XLM-RoBERTa encoder followed by a dropout + single-output linear head.

    Args:
        model_path: Name or path handed to ``XLMRobertaModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled vector before the
            linear layer. Defaults to 0.1, the previously hard-coded value.
    """

    def __init__(self, model_path, dropout=0.1):
        super().__init__()
        self.backbone = XLMRobertaModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(dropout)
        self.regressor = nn.Linear(self.backbone.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one predicted score per batch element.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.

        Returns:
            Tensor of predictions with the trailing size-1 dimension of the
            linear head removed.
        """
        output = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence-pair vector.
        cls_output = output.last_hidden_state[:, 0, :]
        return self.regressor(self.dropout(cls_output)).squeeze(-1)


### Training + Evaluation

In [None]:
class Trainer:
    """Fine-tune and evaluate ``MMRegressor`` on text pairs read from CSV files.

    The CSV files must provide the columns ``text1``, ``text2`` and
    ``Overall``; ``Overall`` is used as the regression target.
    """

    def __init__(self, config):
        """Keep the config, select the device, load the tokenizer, seed the RNGs.

        Args:
            config: Object exposing ``model_pretrain_dir``, ``seed``,
                ``max_len``, ``batch_size``, ``learning_rate``,
                ``weight_decay``, ``warmup_rate``, ``epoch``,
                ``model_save_path``, ``training_set_path`` and
                ``testing_set_path``.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(config.model_pretrain_dir)
        set_seed(config.seed)

    def dataset(self, path):
        """Read a CSV file and tokenize its text pairs.

        Rows with a missing ``text1``, ``text2`` or ``Overall`` value are
        dropped.

        Args:
            path: Location of the CSV file.

        Returns:
            Tuple ``(input_ids, attention_masks, labels)``. The first two are
            the tokenizer outputs padded/truncated to ``config.max_len``;
            ``labels`` holds the ``Overall`` value of every kept row.

        Raises:
            ValueError: If no row is left after dropping incomplete ones.
        """
        df = pd.read_csv(path)
        df = df.dropna(subset=["text1", "text2", "Overall"])
        if df.empty:
            raise ValueError(f"No usable rows found in {path!r}")

        # One batched tokenizer call replaces the per-row loop.
        enc = self.tokenizer(df["text1"].astype(str).tolist(),
                             df["text2"].astype(str).tolist(),
                             padding='max_length', truncation=True,
                             max_length=self.config.max_len, return_tensors='pt')
        # Built from Python floats so the tensor gets torch's default float dtype.
        labels = torch.tensor(df["Overall"].astype(float).tolist())
        return enc['input_ids'], enc['attention_mask'], labels

    def data_loader(self, ids, masks, labels, shuffle=True):
        """Wrap the tensors in a ``DataLoader`` batched by ``config.batch_size``."""
        return DataLoader(TensorDataset(ids, masks, labels),
                          batch_size=self.config.batch_size, shuffle=shuffle)

    def evaluate(self, model, loader):
        """Run ``model`` over ``loader`` without gradients.

        Returns:
            Tuple ``(targets, predictions)`` of NumPy arrays.
        """
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for ids, att, y in loader:
                outputs = model(ids.to(self.device), att.to(self.device))
                preds.extend(outputs.cpu().numpy())
                targets.extend(y.cpu().numpy())
        return np.array(targets), np.array(preds)

    def _run_epoch(self, model, loader, optimizer, scheduler, criterion):
        """Train for one pass over ``loader`` and return the mean batch loss."""
        model.train()
        total_loss = 0.0
        num_batches = len(loader)

        for batch_idx, (ids, att, y) in enumerate(loader):
            ids, att, y = ids.to(self.device), att.to(self.device), y.to(self.device)
            loss = criterion(model(ids, att), y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The schedule length is counted in optimizer steps, so advance per batch.
            scheduler.step()

            total_loss += loss.item()
            if batch_idx % 10 == 0:
                print(f"  🔁 Batch {batch_idx + 1}/{num_batches}")

        return total_loss / num_batches

    def train(self):
        """Fine-tune the regressor, keep the best checkpoint and report on it.

        After every epoch the model is scored on the dev/test file; the state
        dict is written to ``config.model_save_path`` whenever the dev MSE
        improves. At the end the best checkpoint is reloaded and passed to
        ``evaluate_regression``.
        """
        # Create the checkpoint directory up front so a missing directory
        # surfaces immediately instead of after a full training epoch.
        save_dir = os.path.dirname(self.config.model_save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        print("📦 Lade Trainingsdaten...")
        ids, masks, labels = self.dataset(self.config.training_set_path)
        train_loader = self.data_loader(ids, masks, labels)
        print(f"✅ Trainingsloader bereit: {len(train_loader)} Batches")

        print("📦 Lade Dev/Testdaten...")
        dev_ids, dev_masks, dev_labels = self.dataset(self.config.testing_set_path)
        dev_loader = self.data_loader(dev_ids, dev_masks, dev_labels, shuffle=False)
        print(f"✅ Dev/Testloader bereit: {len(dev_loader)} Batches")

        model = MMRegressor(self.config.model_pretrain_dir).to(self.device)
        optimizer = AdamW(model.parameters(), lr=self.config.learning_rate, weight_decay=self.config.weight_decay)
        total_steps = len(train_loader) * self.config.epoch
        scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(self.config.warmup_rate * total_steps),
                                                    num_training_steps=total_steps)
        criterion = nn.MSELoss()
        best_mse = float('inf')

        for epoch in range(self.config.epoch):
            print(f"\n🚀 Starte Epoche {epoch + 1}/{self.config.epoch}")
            # Mean (not summed) batch loss, so the value is comparable across
            # dataset sizes and with the dev MSE printed next to it.
            train_loss = self._run_epoch(model, train_loader, optimizer, scheduler, criterion)

            y_true_dev, y_pred_dev = self.evaluate(model, dev_loader)
            mse = mean_squared_error(y_true_dev, y_pred_dev)

            print(f"Epoch {epoch+1} | Dev MSE: {mse:.4f} | Train Loss: {train_loss:.4f}")
            if mse < best_mse:
                best_mse = mse
                torch.save(model.state_dict(), self.config.model_save_path)
                print("💾 Bestes Modell gespeichert (niedrigstes MSE).")

        print(f"\n✅ Training abgeschlossen. Bestes Dev MSE: {best_mse:.4f}")

        # map_location keeps the reload working when the checkpoint was
        # written on another device.
        model.load_state_dict(torch.load(self.config.model_save_path, map_location=self.device))
        y_true, y_pred = self.evaluate(model, dev_loader)
        evaluate_regression(y_true, y_pred, description="Final Dev Regression Evaluation", save_json_file_name="/kaggle/working/final_dev_regression_metrics.json")

def evaluate_regression(y_true, y_pred, description="Model", save_json_file_name=None):
    """Print regression metrics, draw a prediction scatter plot, optionally save.

    Args:
        y_true: Reference values.
        y_pred: Predicted values, aligned with ``y_true``.
        description: Label used in the printed header, the plot title and the
            ``Model`` entry of the result.
        save_json_file_name: If given, the metrics are written as JSON to
            ``os.path.join("results", save_json_file_name)``. An absolute
            name is therefore used as-is; a relative name ends up below
            ``results/``.

    Returns:
        Dict with the keys ``Model``, ``MSE``, ``MAE``, ``R2`` and ``Pearson``
        (``Pearson`` is NaN when the correlation cannot be computed).
    """
    # Cast to built-in floats: the metric functions may hand back NumPy
    # scalars (e.g. float32 for float32 inputs), which json.dump rejects.
    mse = float(mean_squared_error(y_true, y_pred))
    mae = float(mean_absolute_error(y_true, y_pred))
    r2 = float(r2_score(y_true, y_pred))

    try:
        pearson_corr = float(pearsonr(y_true, y_pred)[0])
    except ValueError:
        # pearsonr rejects inputs it cannot correlate (e.g. fewer than two
        # samples); report NaN instead of aborting the evaluation.
        pearson_corr = float('nan')

    print(f"\n📊 Regression Evaluation – {description}")
    print(f"Mean Squared Error (MSE): {mse:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"R2 Score: {r2:.3f}")
    print(f"Pearson Correlation: {pearson_corr:.3f}")

    plt.figure(figsize=(6, 6))
    sns.scatterplot(x=y_true, y=y_pred, alpha=0.6)
    # Diagonal reference line: points on it are perfect predictions.
    lo, hi = min(y_true), max(y_true)
    plt.plot([lo, hi], [lo, hi], '--', color='red')
    plt.xlabel("True Values")
    plt.ylabel("Predictions")
    plt.title(f"{description} - Regression Prediction Plot")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    results = {
        "Model": description,
        "MSE": mse,
        "MAE": mae,
        "R2": r2,
        "Pearson": pearson_corr
    }

    if save_json_file_name:
        path = os.path.join("results", save_json_file_name)
        # Create the directory the file really goes to; for an absolute file
        # name that is not "results", so no stray folder is left behind.
        target_dir = os.path.dirname(path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        # Report the location that was actually written.
        print(f"📁 Ergebnisse gespeichert unter: {path}")

    return results


In [None]:
# Start the regression run. `params`, `Config` and `Trainer` are the
# definitions from the regression section above, which rebind the names
# used earlier by the classification section.
config = Config(params)
trainer = Trainer(config)
trainer.train()