# imports

In [7]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import evaluate


In [8]:
# Load the aggregated manifesto corpus and take a first look at its size,
# its label set and missing values in the columns used for modelling.
path = '/Users/christophhau/Desktop/HA_ML/data/aggregated/german_manifestos_combined.csv'

df = pd.read_csv(path)
print(len(df))

# `cmp_code` is the label column; compute its distinct values once and reuse them.
cmp_codes = df['cmp_code'].unique()
print(len(cmp_codes))
print(cmp_codes)

# Missing-value count for the model input (`text_en`) and the label (`cmp_code`).
# Every column is reported exactly once and tagged with its name, so the
# numbers cannot be confused with each other.
for column in ('text_en', 'cmp_code'):
    print(f"{column}: {df[column].isna().sum()}")

# Last expression of the cell: show the available columns.
df.columns

17529
75
['H' '403' '201.1' '411' '402' '410' '704' '505' '701' '504' '305.1' '110'
 '601.1' '401' '506' '605.1' '703.1' '303' '407' '603' '606.1' '202.1'
 '106' '503' '414' '412' '601.2' '109' '406' '304' '000' '608.2' '501'
 '201.2' '203' '301' '107' '204' '416.2' '502' '413' '103.1' '105' '104'
 '705' '108' '602.2' '608.1' '202.4' '202.2' '604' '706' '302' '507'
 '602.1' '409' '605.2' '415' '101' '607.2' '405' '404' '416.1' '408'
 '607.1' '703.2' '606.2' '305.6' '102' '305.2' '305.3' '202.3' '702'
 '607.3' '103.2']
0
0
0


Index(['text', 'text_en', 'cmp_code', 'eu_code', 'party', 'domain'], dtype='object')

# formatting 

In [None]:
# Build the train / test / val splits for the sentence-level classifier.
path = '/Users/christophhau/Desktop/HA_ML/data/aggregated/german_manifestos_combined.csv'
df = pd.read_csv(path)


def _has_multiple_rows(group):
    """Return True for a label group that contains more than one row."""
    return len(group) > 1


# Keep the English text and its code under the column names `text` / `label`,
# then drop every label that occurs only once.
df_small = (
    df[["text_en", "cmp_code"]]
    .rename(columns={"text_en": "text", "cmp_code": "label"})
    .groupby("label")
    .filter(_has_multiple_rows)
)
print(len(df_small))
print(len(df_small['label'].unique()))  # number of labels before encoding

# Replace the code values by integer ids; `le` keeps the mapping.
le = LabelEncoder()
df_small["label"] = le.fit_transform(df_small["label"])
print(len(df_small['label'].unique()))  # number of labels after encoding

# First split: 30 % of the rows are held back, stratified by label.
train_df, test_df = train_test_split(
    df_small,
    test_size=0.30,
    random_state=42,
    stratify=df_small["label"],
)
# Labels left with a single row in the held-back part are removed before that
# part is halved (again stratified) into test and val.
test_df = test_df.groupby("label").filter(_has_multiple_rows)
test_df, val_df = train_test_split(
    test_df,
    test_size=0.5,
    random_state=42,
    stratify=test_df["label"],
)

# Convert each split to a Hugging Face Dataset with a fresh 0..n-1 index.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, test_df, val_df)
)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset, "val": val_dataset})

print(dataset)



17525
71
71
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12267
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
})


In [46]:
# Sentence-level classifier: fine-tune BERT on the single-sentence inputs.

# The label space comes from the LabelEncoder fitted on the label column, so
# the classification head always matches the data instead of relying on a
# hard-coded count. Storing id2label / label2id in the model config keeps the
# original code strings together with the saved model.
num_labels = len(le.classes_)
id2label = {idx: str(code) for idx, code in enumerate(le.classes_)}
label2id = {code: idx for idx, code in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize(examples):
    """Tokenize a batch of examples, padding/truncating to the model's maximum length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)
dataset = dataset.map(tokenize, batched=True)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair handed over by the Trainer."""
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(
    output_dir="manifesto_classifier",
    eval_strategy="epoch",
    push_to_hub=False,
)

# NOTE: the "test" split is the one evaluated after every epoch; the "val"
# split is not passed to the Trainer here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 12267/12267 [00:01<00:00, 7532.54 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 9071.00 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 9048.96 examples/s]


In [None]:
trainer.train()

# Model weights and tokenizer files go into ONE directory, so the result can be
# reloaded with from_pretrained(save_dir). This is the sentence-level
# classifier, hence the "_sentence" directory for both.
save_dir = "./manifesto_classifier_sentence"

print("Saving model and tokenizer...")
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)





Epoch,Training Loss,Validation Loss,Accuracy
1,1.9446,1.795486,0.541127
2,1.3082,1.580742,0.600533
3,0.7746,1.584514,0.619573




TrainOutput(global_step=4602, training_loss=1.4874885273932996, metrics={'train_runtime': 6063.1904, 'train_samples_per_second': 6.07, 'train_steps_per_second': 0.759, 'total_flos': 9688748636132352.0, 'train_loss': 1.4874885273932996, 'epoch': 3.0})

In [2]:
# Score the trained model on the "val" split (the Trainer itself was configured
# with the "test" split as its eval_dataset).
trainer.evaluate(eval_dataset=dataset["val"])

NameError: name 'trainer' is not defined

# context classifier 

In [1]:
import pandas as pd

# Interactive, row-by-row inspection of the context file: for every row the
# `context` column is printed, followed by the `text_en` values of the
# neighbouring rows, then the loop waits for user input.
df = pd.read_csv('data/processed/german_manifestos_punctuation_context.csv')

# Iterate over the DataFrame row by row.
# NOTE: `index` is used both as a label (df.loc) and as a position (range),
# which relies on the default 0..n-1 index of the freshly read CSV.
for index, row in df.iterrows():
    print(f"--- Zeile {index} ---")

    # 1. Print the `context` column of the current row
    print("\n\033[1mContext:\033[0m")  # bold heading
    print(row['context'])
    print("-" * 20)  # separator line

    # 2. Print the `text_en` column together with the surrounding rows
    print("\n\033[1mText (mit umliegenden Zeilen):\033[0m")  # bold heading

    # Window of up to three rows before and after the current one;
    # max() / min() clamp it at the start and end of the DataFrame.
    start = max(0, index - 3)
    end = min(len(df), index + 4)

    for i in range(start, end):
        if i == index:
            # Current row: marked with ">" and printed in bold
            print(f"> \033[1m{df.loc[i, 'text_en']}\033[0m")
        else:
            print(f"  {df.loc[i, 'text_en']}")

    # 3. Pause until the user presses Enter; typing "exit" or pressing Ctrl+C
    #    leaves the loop early.
    #    (IPython.display.clear_output() could be called here to clear the
    #    output after every iteration.)
    try:
        inp = input("\nDrücke Enter, um zur nächsten Zeile zu gelangen (oder Strg+C zum Abbrechen)...")
    except KeyboardInterrupt:
        print("\nSchleife wurde vom Benutzer abgebrochen.")
        break
    if inp == 'exit':
        break
else:
    # Runs only when the loop was NOT left through `break`, i.e. when every
    # row really has been shown.
    print("\nAlle Zeilen wurden untersucht.")

--- Zeile 0 ---

[1mContext:[0m
TIME FOR PROSPERITY SOCIAL MARKET ECONOMY AND HEALTH SOCIAL MARKET ECONOMY We are firmly committed to the principles of the social market economy, which have ensured prosperity and social peace in our country for decades.
--------------------

[1mText (mit umliegenden Zeilen):[0m
> [1mTIME FOR PROSPERITY SOCIAL MARKET ECONOMY AND HEALTH[0m
  SOCIAL MARKET ECONOMY
  We are firmly committed to the principles of the social market economy, which have ensured prosperity and social peace in our country for decades.
  In a time of global challenges, we want to preserve the fundamental values of our economic order
--- Zeile 1 ---

[1mContext:[0m
TIME FOR PROSPERITY SOCIAL MARKET ECONOMY AND HEALTH SOCIAL MARKET ECONOMY We are firmly committed to the principles of the social market economy, which have ensured prosperity and social peace in our country for decades.
--------------------

[1mText (mit umliegenden Zeilen):[0m
  TIME FOR PROSPERITY SOCIAL MA

In [2]:
import os
import pandas as pd

# Directory containing the per-file CSV exports
csv_dir = 'data/contextaware_data_20250927_112401'
# The combined file is written into that same directory; building the path from
# csv_dir keeps the read and write locations tied to one base path.
output_path = os.path.join(csv_dir, 'combined_output.csv')

# Collect the input CSVs.
# - Files named combined_output*.csv are skipped: they are results of this
#   step (or derived from it), and reading them back in on a re-run would
#   duplicate every row.
# - The listing is sorted because os.listdir returns names in arbitrary order;
#   sorting makes the row order of the combined file reproducible.
df_paths = [
    os.path.join(csv_dir, f)
    for f in sorted(os.listdir(csv_dir))
    if f.endswith('.csv') and not f.startswith('combined_output')
]
if not df_paths:
    raise FileNotFoundError(f"No input CSV files found in {csv_dir!r}")

# Read all inputs and stack them into one DataFrame with a fresh index
dfs = [pd.read_csv(path) for path in df_paths]
combined_df = pd.concat(dfs, ignore_index=True)

# Store the combined DataFrame as a new CSV
combined_df.to_csv(output_path, index=False)

print(combined_df.shape)

(17884, 18)


In [9]:
path = 'data/contextaware_data_20250927_112401/combined_output_context.csv'
def create_formatted_input(text, context, tokenizer, max_tokens=200):
    """Create sentence-pair input: <s> text </s> </s> context </s>

    Args:
        text: The sentence to classify. Converted with str() and stripped of
            surrounding whitespace.
        context: Surrounding context for the sentence. Converted with str().
        tokenizer: Object providing ``tokenize(str) -> list`` and
            ``convert_tokens_to_string(list) -> str``.
        max_tokens: Maximum number of context tokens that are kept.

    Returns:
        The formatted string. A missing ``text`` or ``context`` (None / NaN)
        is treated as an empty string, so the literal word "nan" never ends up
        in the model input.
    """
    text_str = "" if pd.isna(text) else str(text)
    context_str = "" if pd.isna(context) else str(context)

    # Truncate context to max_tokens
    context_tokens = tokenizer.tokenize(context_str)[:max_tokens]
    truncated_context = tokenizer.convert_tokens_to_string(context_tokens)

    return f"<s> {text_str.strip()} </s> </s> {truncated_context} </s>"

df = pd.read_csv(path)

# The tokenizer is loaded before the inputs are built because
# create_formatted_input uses it to cut the context to a token budget.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
special_tokens = ['<s>', '</s>']
new_tokens = [token for token in special_tokens if token not in tokenizer.vocab]

if not new_tokens:
    print("Special tokens already in tokenizer vocabulary")
else:
    tokenizer.add_tokens(new_tokens)
    print(f"Added {len(new_tokens)} new tokens to tokenizer: {new_tokens}")


def _format_row(row):
    """Build the combined text/context input string for one DataFrame row."""
    return create_formatted_input(row['text_en'], row['context'], tokenizer)


# Overwrite `text` with the formatted sentence/context string of each row
df['text'] = df.apply(_format_row, axis=1)

# Keep the formatted text plus its code (renamed to `label`) and drop labels
# that occur only once.
df_small = df[["text", "cmp_code"]].rename(columns={"cmp_code": "label"})
df_small = df_small.groupby("label").filter(lambda group: len(group) >= 2)

# Integer-encode the labels; `le` keeps the mapping
le = LabelEncoder()
df_small["label"] = le.fit_transform(df_small["label"])

# Hold back 30 % (stratified by label), remove labels left with a single row
# in that part, then halve it (stratified again) into test and val.
train_df, test_df = train_test_split(
    df_small,
    test_size=0.30,
    random_state=42,
    stratify=df_small["label"],
)
test_df = test_df.groupby("label").filter(lambda group: len(group) >= 2)
test_df, val_df = train_test_split(
    test_df,
    test_size=0.5,
    random_state=42,
    stratify=test_df["label"],
)

# One Hugging Face Dataset per split, each with a fresh 0..n-1 index
split_frames = {"train": train_df, "test": test_df, "val": val_df}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in split_frames.items()
})

print(f"Dataset created: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test")
print(f"Example input: {train_df.iloc[0]['text'][:150]}...")
print(dataset)


Special tokens already in tokenizer vocabulary
Dataset created: 12267 train, 2626 val, 2626 test
Example input: <s> not make private motorized transport unaffordable. </s> </s> We want to create acceptance through transparency and dialog at eye level, e.g. for o...
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12267
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2626
    })
})


In [12]:
# Device check and setup: MPS first, then CUDA, otherwise CPU
import torch
def get_device():
    """Pick the torch device to use and print which one was chosen.

    Returns:
        torch.device("mps") if the MPS backend reports itself available,
        else torch.device("cuda") if CUDA is available, else torch.device("cpu").
    """
    if torch.backends.mps.is_available():
        print("✅ MPS (Apple Silicon GPU) ist verfügbar und wird verwendet")
        return torch.device("mps")

    if torch.cuda.is_available():
        print("✅ CUDA GPU wird verwendet")
        return torch.device("cuda")

    print("⚠️ Nur CPU verfügbar - Training wird langsamer sein")
    return torch.device("cpu")

device = get_device()

✅ MPS (Apple Silicon GPU) ist verfügbar und wird verwendet


In [None]:
# Context-aware classifier: fine-tune XLM-RoBERTa on the formatted
# "<s> text </s> </s> context </s>" inputs.

# The label space comes from the LabelEncoder fitted on the label column, so
# the classification head always matches the data instead of relying on a
# hard-coded count. Storing id2label / label2id in the model config keeps the
# original code strings together with the saved model.
num_labels = len(le.classes_)
id2label = {idx: str(code) for idx, code in enumerate(le.classes_)}
label2id = {code: idx for idx, code in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

def tokenize(examples):
    """Tokenize a batch of pre-formatted inputs, padded/truncated to 512 tokens.

    The `text` column already spells out <s> / </s> explicitly (see
    create_formatted_input), so the tokenizer must not wrap every sequence in
    a second pair of special tokens: add_special_tokens=False.
    NOTE(review): an input longer than 512 tokens would now lose its closing
    </s> through truncation; the context is capped at 200 tokens upstream, so
    this is expected to be rare — confirm on the data.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )
dataset = dataset.map(tokenize, batched=True)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair handed over by the Trainer."""
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# NOTE(review): output_dir is the same "manifesto_classifier" directory the
# sentence-level run uses; with save_total_limit=2 the checkpoints of both runs
# share one folder — consider a dedicated directory.
training_args = TrainingArguments(
    output_dir="manifesto_classifier",

    # Settings chosen for the MPS run
    per_device_train_batch_size=4,  # smaller batch size for MPS
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # compensates for the smaller batch size

    # Training parameters
    learning_rate=2e-5,
    num_train_epochs=3,

    # Evaluation and saving
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",

    # Logging
    logging_steps=50,
    logging_dir="./logs",

    # MPS-specific settings
    dataloader_pin_memory=False,  # author's note: important for MPS
    fp16=False,  # author's note: MPS does not support FP16 yet
    push_to_hub=False,

    # Reproducibility
    seed=42,

    # Warmup for better convergence
    warmup_steps=100,
    weight_decay=0.01,
)

# NOTE: the "test" split is the one evaluated after every epoch (and therefore
# the one that selects the best checkpoint); the "val" split is not passed to
# the Trainer here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 12267/12267 [00:01<00:00, 7764.14 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 8167.59 examples/s]
Map: 100%|██████████| 2626/2626 [00:00<00:00, 8079.83 examples/s]


In [None]:
trainer.train()

# Model weights and tokenizer files go into ONE directory, so the result can be
# reloaded with from_pretrained(save_dir). This is the context-aware
# classifier: it gets its own "_context" directory and does not overwrite the
# sentence-level model stored in "./manifesto_classifier_sentence".
save_dir = "./manifesto_classifier_context"

print("Saving model and tokenizer...")
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


print('saved')
# Score the trained model on the "val" split (the Trainer itself was configured
# with the "test" split as its eval_dataset).
trainer.evaluate(dataset["val"])