In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Read the training and test tables from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/llm-classification-finetuning/train.csv",
        "/kaggle/input/llm-classification-finetuning/test.csv",
    )
)

def encode_label(row):
    """Collapse a row's winner flags into a single integer class label.

    Args:
        row: Mapping-like object indexable by ``"winner_model_a"`` and
            ``"winner_model_b"`` (e.g. a DataFrame row).

    Returns:
        0 when ``row["winner_model_a"]`` equals 1, otherwise 1 when
        ``row["winner_model_b"]`` equals 1, and 2 in every remaining case.
    """
    # Flags are checked in order, so the first one equal to 1 decides the label
    # and later flags are never read.
    for label, flag in enumerate(("winner_model_a", "winner_model_b")):
        if row[flag] == 1:
            return label
    return 2

def get_embeddings(df, col, encoder=None, batch_size=64):
    """Embed each row's prompt joined with one response column.

    For every row of ``df`` the text ``prompt + " " + df[col]`` is built and
    handed to the encoder's ``encode`` method with normalized embeddings
    requested and the progress bar enabled.

    Args:
        df: DataFrame containing a ``"prompt"`` column and a ``col`` column.
        col: Name of the response column to pair with the prompt.
        encoder: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=..., normalize_embeddings=...)``. When omitted,
            the notebook-level ``model`` is used (looked up at call time).
        batch_size: Number of texts passed to the encoder per batch.

    Returns:
        The encoder's output as a ``np.ndarray``, one entry per row of ``df``.
    """
    if encoder is None:
        # Fall back to the notebook-global encoder so two-argument calls keep working.
        encoder = model

    # Missing cells become empty strings; a bare astype(str) would embed the
    # literal text "nan" / "None" instead.
    prompts = df["prompt"].fillna("").astype(str)
    responses = df[col].fillna("").astype(str)
    texts = (prompts + " " + responses).tolist()

    embeddings = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    # asarray avoids a copy when the encoder already returned an ndarray.
    return np.asarray(embeddings)

# Load the sentence-embedding model from its local path under /kaggle/input.
model = SentenceTransformer("/kaggle/input/sentencetransformersallminilml6v2")

# Row-wise label encoding: adds an integer "win_label" column to train_data.
train_data["win_label"] = train_data.apply(encode_label, axis=1)

# Embed prompt+response text for each response column: train split first, then test.
# The generators are consumed in order, so the calls run as a-then-b for each split.
response_columns = ("response_a", "response_b")
emb_a, emb_b = (get_embeddings(train_data, column) for column in response_columns)
emb_a_test, emb_b_test = (get_embeddings(test_data, column) for column in response_columns)

def _pair_features(first, second):
    """Join two embedding matrices and their element-wise absolute difference along axis 1."""
    return np.concatenate([first, second, np.abs(first - second)], axis=1)


# Training features and labels.
X_train = _pair_features(emb_a, emb_b)
y_train = train_data["win_label"]

# Test features are built the same way as the training features.
X_test = _pair_features(emb_a_test, emb_b_test)

# Fit the scaler on the training features only, then apply it unchanged to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression over the win_label classes produced by encode_label (0, 1, 2).
# `multi_class` is deliberately not passed: it is deprecated since scikit-learn 1.5
# (FutureWarning, scheduled for removal), and with solver="lbfgs" on a problem with
# more than two classes the default already fits a multinomial model.
clf = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    random_state=42,
)
clf.fit(X_train_scaled, y_train)

# Class probabilities for the test rows; columns follow the order of clf.classes_.
# NOTE(review): the column indexing below assumes labels 0, 1 and 2 all occurred
# in y_train, so that column i corresponds to label i. Confirm for new data.
preds = clf.predict_proba(X_test_scaled)

# One row per test id, with one probability column per outcome.
submission = pd.DataFrame({"id": test_data["id"]}).assign(
    winner_model_a=preds[:, 0],
    winner_model_b=preds[:, 1],
    winner_tie=preds[:, 2],
)
submission.to_csv("/kaggle/working/submission.csv", index=False)