In [None]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Progress marker: reaching this line means every import above succeeded.
print("Libraries imported.")

In [None]:
# Locate the competition files: use the Kaggle input mount when it exists,
# otherwise fall back to a relative "data" directory.
KAGGLE_DIR = "/kaggle/input/llm-classification-finetuning"
if os.path.exists(KAGGLE_DIR):
    DATA_DIR = KAGGLE_DIR
else:
    DATA_DIR = "data"

train_path, test_path = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Read both splits into DataFrames.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Quick sanity check: dimensions of each split plus the first two training rows.
print("train shape:", train.shape)
print("test  shape:", test.shape)
display(train.head(2))

In [None]:
# Build integer class labels from the three winner indicator columns.
label_cols = ['winner_model_a','winner_model_b','winner_tie']

# The position of the largest value in each row becomes the class id, so
# 0 = model A wins, 1 = model B wins, 2 = tie (following label_cols order).
# NOTE(review): assumes exactly one indicator is set per row — not verified here.
y = np.argmax(train[label_cols].to_numpy(), axis=1)

print("Using label columns:", label_cols)
print("Class distribution:", np.bincount(y))

In [None]:
# Make Text Inputs (Prompt + A + B)
def build_text(df):
    """Join prompt and both responses into one string per row.

    Parameters
    ----------
    df : DataFrame with 'prompt', 'response_a' and 'response_b' columns.

    Returns
    -------
    Series of strings shaped like
    "Prompt: <prompt> [SEP] A: <response_a> [SEP] B: <response_b>".
    Missing values are replaced by empty strings and every value is cast
    to str before concatenation.
    """
    def _as_text(column):
        # Blank out missing values first so they do not turn into the text "nan".
        return df[column].fillna('').astype(str)

    prompt_part = "Prompt: " + _as_text('prompt')
    answer_a_part = " [SEP] A: " + _as_text('response_a')
    answer_b_part = " [SEP] B: " + _as_text('response_b')
    return prompt_part + answer_a_part + answer_b_part

# Build the combined text column for both splits with the same formatter.
train_text, test_text = map(build_text, (train, test))

# Show the first 200 characters of the first training example as a spot check.
print("Example:\n", train_text.iat[0][:200] + "...")

In [None]:
# Vectorize (TF-IDF Word + Char)
# NOTE(review): TfidfVectorizer is already imported in the first cell; this
# re-import is redundant but kept so the cell's namespace is unchanged.
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): csr_matrix is not referenced in any cell visible here — confirm
# before removing.
from scipy.sparse import csr_matrix

# Word-level view: lowercased unigrams and bigrams with English stop words removed.
word_vec = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1,2),
    min_df=3,
    sublinear_tf=True,
    max_features=200_000,
)
# Character-level view: 3- to 5-character n-grams.
char_vec = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3,5),
    min_df=3,
    sublinear_tf=True,
    max_features=120_000,
)

# Vocabulary and IDF weights are learned from the training text only;
# the test text is projected with the already-fitted vectorizers.
print("Fitting word-level TF-IDF...")
Xw_tr = word_vec.fit_transform(train_text)
Xw_te = word_vec.transform(test_text)

print("Fitting char-level TF-IDF...")
Xc_tr = char_vec.fit_transform(train_text)
Xc_te = char_vec.transform(test_text)

# Place word and char features side by side, stored as CSR.
X_tr = hstack((Xw_tr, Xc_tr), format="csr")
X_te = hstack((Xw_te, Xc_te), format="csr")

print("Shapes -> X_tr:", X_tr.shape, "| X_te:", X_te.shape)

In [None]:
# Train Logistic Regression Model
# Fitted on the entire training matrix; no validation split is held out in this cell.
model = LogisticRegression(
    solver='lbfgs',
    C=2.0,
    class_weight='balanced',
    max_iter=2000,
    # NOTE(review): n_jobs may have no effect for this solver/multiclass
    # combination, depending on the scikit-learn version — confirm.
    n_jobs=-1,
)

print("Fitting LogisticRegression on full train...")
model.fit(X_tr, y)
print("Training complete")

In [None]:
# Prediction
proba = model.predict_proba(X_te)

# Submission columns, listed in class-id order (0 = model A, 1 = model B, 2 = tie).
sub_cols = ['winner_model_a','winner_model_b','winner_tie']

# predict_proba emits one column per entry of model.classes_, in that order.
# Label each column from the class id it belongs to instead of assuming the
# columns are exactly [0, 1, 2]: if a class never occurred in the training
# labels, positional indexing would either raise IndexError or write the
# probabilities under the wrong header. Any class the model has not seen is
# filled with probability 0.0.
submission = pd.DataFrame(
    proba,
    index=test.index,
    columns=[sub_cols[int(class_id)] for class_id in model.classes_],
).reindex(columns=sub_cols, fill_value=0.0)

# Put the row identifier first, matching the expected submission layout.
submission.insert(0, 'id', test['id'])

out_path = "submission.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)
submission.head()