In [1]:
# ----------------------------------------
# Komplettes Notebook-Skript für den LLM-Wettbewerb
# Mit korrektem Submission-Format (inkl. id-Spalte)
# ----------------------------------------

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import log_loss
import gc

# 0) Pfad zu Deinen CSV-Dateien anpassen:
BASE_DIR = '/kaggle/input/llm-classification-finetuning'  # ← Hier den Ordner einstellen, in dem train.csv, test.csv, sample_submission.csv liegen

# 1) Daten laden
train_df = pd.read_csv(f'{BASE_DIR}/train.csv')
test_df  = pd.read_csv(f'{BASE_DIR}/test.csv')
sub_df   = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')

# 2) Ziel kodieren: winner_model_a → 0, winner_model_b → 1, tie → 2
def encode_label(row):
    if row['winner_model_a'] == 1: return 0
    if row['winner_model_b'] == 1: return 1
    return 2

train_df['label'] = train_df.apply(encode_label, axis=1)
y = train_df['label'].values

# 3) TF-IDF-Vektoren für Prompt und Antworten (max. 2000 Features, Unigram+Bigram)
tf_prompt = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
tf_resp   = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

Xp_tr = tf_prompt.fit_transform(train_df['prompt'])
Xp_te = tf_prompt.transform(test_df['prompt'])

Xa_tr = tf_resp.fit_transform(train_df['response_a'])
Xa_te = tf_resp.transform(test_df['response_a'])

Xb_tr = tf_resp.transform(train_df['response_b'])
Xb_te = tf_resp.transform(test_df['response_b'])

# 4) Feature-Matrix: Prompt | RespA | RespB | (RespA − RespB)
X_sparse      = hstack([Xp_tr, Xa_tr, Xb_tr, Xa_tr - Xb_tr]).tocsr()
X_test_sparse = hstack([Xp_te, Xa_te, Xb_te, Xa_te - Xb_te]).tocsr()

# 5) Für HistGradientBoosting dichten Datensatz benötigen
X      = X_sparse.toarray()
X_test = X_test_sparse.toarray()
del X_sparse, X_test_sparse
gc.collect()

# 6) Split in Training/Validation (20 % für Validierung)
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# 7) Modell: HistGradientBoostingClassifier mit Early Stopping
model = HistGradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    max_iter=500,
    random_state=42,
    validation_fraction=0.2,
    n_iter_no_change=30
)
model.fit(X_tr, y_tr)

# 8) Evaluation auf Validierungsset
val_probs = model.predict_proba(X_val)
print("Validation multi_logloss:", log_loss(y_val, val_probs))

# 9) Vorhersage auf Testdaten
test_probs = model.predict_proba(X_test)

# 10) Submission-Datei erstellen mit korrektem Format
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a':   test_probs[:, 0],
    'winner_model_b':   test_probs[:, 1],
    'winner_model_tie': test_probs[:, 2],
})
submission.to_csv('submission.csv', index=False)

print("✔️ submission.csv wurde erstellt. Zeilen:", len(submission))


Validation multi_logloss: 1.0427783995732065
✔️ submission.csv wurde erstellt. Zeilen: 3
