In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Load Data

In [2]:
train_path = "/kaggle/input/llm-classification-finetuning/train.csv"
test_path = "/kaggle/input/llm-classification-finetuning/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(train.head(2))

Train shape: (57477, 9)
Test shape: (3, 4)
      id             model_a     model_b  \
0  30192  gpt-4-1106-preview  gpt-4-0613   
1  53567           koala-13b  gpt-4-0613   

                                              prompt  \
0  ["Is it morally right to try to have a certain...   
1  ["What is the difference between marriage lice...   

                                          response_a  \
0  ["The question of whether it is morally right ...   
1  ["A marriage license is a legal document that ...   

                                          response_b  winner_model_a  \
0  ["As an AI, I don't have personal beliefs or o...               1   
1  ["A marriage license and a marriage certificat...               0   

   winner_model_b  winner_tie  
0               0           0  
1               1           0  


# 2. Build target

In [3]:
target_cols = ["winner_model_a", "winner_model_b", "winner_tie"]

for col in target_cols:
    assert col in train.columns, f"Missing column in TRAIN → {col}"

print("Columns verified")

y = train[target_cols].idxmax(axis=1).map({
    "winner_model_a": 0,
    "winner_model_b": 1,
    "winner_tie": 2,
}).values

print("Unique labels in y:", set(y))


Columns verified
Unique labels in y: {0, 1, 2}


# 3. Build text feature

In [4]:
import re

def clean_text(x):
    if pd.isna(x): 
        return ""
    x = str(x).strip()
    
    x = re.sub(r'^\s*\[\s*"', '', x)  
    x = re.sub(r'"\s*\]\s*$', '', x)   

    x = x.replace('", "', ' ')  

    return x

def combine_text(row):
    p = clean_text(row["prompt"])
    a = clean_text(row["response_a"])
    b = clean_text(row["response_b"])
    return f"{p} [SEP_A] {a} [SEP_B] {b}"

train["text_all"] = train.apply(combine_text, axis=1)
test["text_all"] = test.apply(combine_text, axis=1)

print(train["text_all"].iloc[0][:500])


Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer. [SEP_A] The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\n\nHere are some arguments in favor of and against such policies:\n\n**Arguments in favor:**\n\n1. **Correcti


# 4. TF-IDF VECTORIZATION

In [5]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=150_000,  
    ngram_range=(1, 2), 
)

X = vectorizer.fit_transform(train["text_all"])
X_test = vectorizer.transform(test["text_all"])

print(f"Train TF-IDF shape: {X.shape}")
print(f"Test TF-IDF shape: {X_test.shape}")


Train TF-IDF shape: (57477, 150000)
Test TF-IDF shape: (3, 150000)


# 5. TRAIN / VALIDATION SPLIT

In [6]:

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=y
)

print(f"Train split: {X_tr.shape}, Val split: {X_val.shape}")


Train split: (51729, 150000), Val split: (5748, 150000)


# 6. LOGISTIC REGRESSION MODEL

In [7]:
clf = LogisticRegression(
    max_iter=2000,
    n_jobs=-1,
    multi_class="multinomial"
)

clf.fit(X_tr, y_tr)

# 7. VALIDATION LOG LOSS

In [8]:
val_proba = clf.predict_proba(X_val)
val_logloss = log_loss(y_val, val_proba)

print(f"Validation Log Loss: {val_logloss:.5f}")

Validation Log Loss: 1.10313


# 8. TRAIN ON FULL DATA

In [9]:
clf_full = LogisticRegression(
    max_iter=2000,
    n_jobs=-1,
    multi_class="multinomial"
)

clf_full.fit(X, y)


# 9. INFERENCE ON TEST SET

In [10]:
test_proba = clf_full.predict_proba(X_test)

print(f"Test Predictions Probability Shape: {test_proba.shape}")

Test Predictions Probability Shape: (3, 3)


# 10. BUILD SUBMISSION FILE

In [11]:
sub = pd.DataFrame({
    "id": test["id"],
    "winner_model_a": test_proba[:, 0],
    "winner_model_b": test_proba[:, 1],
    "winner_tie": test_proba[:, 2],
})

sub.to_csv("submission.csv", index=False)
print("\n submission.csv saved successfully!")
sub.head()


 submission.csv saved successfully!


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.244513,0.409154,0.346332
1,211333,0.448236,0.255508,0.296256
2,1233961,0.362315,0.509203,0.128482
