In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import spacy
from spacy.lang.en import English
from tqdm import tqdm

# Read the competition's training and test CSVs (in that order) into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/llm-classification-finetuning/train.csv",
        "/kaggle/input/llm-classification-finetuning/test.csv",
    )
)

def encode_label(row):
    """Map a row's winner indicator columns to a single class id.

    Returns 0 when ``row["winner_model_a"]`` equals 1, otherwise 1 when
    ``row["winner_model_b"]`` equals 1, otherwise 2.  Model A is checked
    first, so it takes precedence if both indicators are set.
    """
    for class_id, indicator in enumerate(("winner_model_a", "winner_model_b")):
        if row[indicator] == 1:
            return class_id
    # Neither indicator is set: fall through to the remaining class.
    return 2

def extract_text_features(df, prefix, col):
    """Add simple length statistics for one text column to ``df``.

    For every value in ``df[col]`` four statistics are computed and stored in
    new columns of ``df``:

    * ``{prefix}_char_len``     - number of characters,
    * ``{prefix}_word_count``   - number of whitespace-separated words,
    * ``{prefix}_token_count``  - number of tokens produced by the
      module-level ``tokenizer``,
    * ``{prefix}_avg_word_len`` - mean word length in characters
      (0 when the text contains no words).

    Missing values are treated as empty text, so they yield zeros for all
    four statistics.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        prefix: String prepended to the names of the new columns.
        col: Name of the text column to analyse.

    Returns:
        The same ``df`` object, with the four columns added.

    Note:
        Relies on a global ``tokenizer`` callable being defined before the
        function is called.
    """
    # Fill missing values *before* casting to str; otherwise NaN would be
    # stringified to "nan" and counted as a 3-character, 1-word response.
    texts = df[col].fillna("").astype(str).tolist()

    char_lens = []
    word_counts = []
    token_counts = []
    avg_word_lens = []

    for text in tqdm(texts, desc=f"Extracting {prefix} features"):
        words = text.split()

        char_lens.append(len(text))
        word_counts.append(len(words))
        # The tokenized document knows its own length, so there is no need
        # to materialise a list of token strings just to count them.
        token_counts.append(len(tokenizer(text)))
        # Guard against np.mean([]) (NaN plus a RuntimeWarning) on empty text.
        avg_word_lens.append(np.mean([len(w) for w in words]) if words else 0)

    df[f"{prefix}_char_len"] = char_lens
    df[f"{prefix}_word_count"] = word_counts
    df[f"{prefix}_token_count"] = token_counts
    df[f"{prefix}_avg_word_len"] = avg_word_lens

    return df

# Build spaCy's English language object; only its tokenizer is used, and
# `extract_text_features` looks it up through the global name `tokenizer`.
nlp = English()
tokenizer = nlp.tokenizer

# Collapse the winner indicator columns into one class id per training row.
train_data["win_label"] = train_data.apply(encode_label, axis=1)

# Length statistics for each response, computed the same way for train and test.
for text_col, col_prefix in (("response_a", "a"), ("response_b", "b")):
    train_data = extract_text_features(train_data, col_prefix, text_col)
    test_data = extract_text_features(test_data, col_prefix, text_col)

# The model inputs are the (response A - response B) differences of each statistic.
base_feats = ("char_len", "word_count", "token_count", "avg_word_len")
diff_feats = [f"diff_{feat}" for feat in base_feats]
for feat, diff_col in zip(base_feats, diff_feats):
    for frame in (train_data, test_data):
        frame[diff_col] = frame[f"a_{feat}"] - frame[f"b_{feat}"]

X = train_data[diff_feats]
y = train_data["win_label"]

# Split BEFORE scaling: fitting the scaler on all rows would let validation
# statistics leak into preprocessing and bias the validation score.
# stratify=y keeps the class proportions equal in both partitions.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn mean/variance from the training partition only, then apply the same
# transformation to the validation partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and lbfgs already fits a multinomial model when the
# target has more than two classes.
clf = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    random_state=42
)
clf.fit(X_train, y_train)

val_preds = clf.predict_proba(X_val)
# Pass the label set explicitly so the probability columns are always matched
# to the classes the model was trained on.
val_loss = log_loss(y_val, val_preds, labels=clf.classes_)
print(f"Validation Log Loss: {val_loss:.4f}")

# Apply the already-fitted scaler to the test features, then score them.
X_test_scaled = scaler.transform(test_data[diff_feats])
test_preds = clf.predict_proba(X_test_scaled)

# Probability columns 0, 1 and 2 correspond to the class ids produced by
# `encode_label` (model A wins, model B wins, neither).
submission = test_data[["id"]].assign(
    winner_model_a=test_preds[:, 0],
    winner_model_b=test_preds[:, 1],
    winner_tie=test_preds[:, 2],
)
submission.to_csv("submission.csv", index=False)