# Setup

In [None]:
# Dataset setting: load the competition train/test CSV files into DataFrames.
import pandas as pd

# Kaggle mounts competition data read-only under /kaggle/input/<competition-slug>/.
train_path = '/kaggle/input/llm-classification-finetuning/train.csv'
test_path = '/kaggle/input/llm-classification-finetuning/test.csv'

# NOTE: a former `pd.set_option('display.unicode.escape', False)` call was
# removed here. 'display.unicode.escape' is not a registered pandas option
# (the display.unicode.* group only has `ambiguous_as_wide` and
# `east_asian_width`), so `set_option` raises OptionError and the cell aborts
# before any data is loaded. pandas already shows non-ASCII text unescaped.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick sanity check of what was loaded: (rows, columns) per split.
print("Train dataset size:", train_df.shape)
print("Test dataset size:", test_df.shape)

In [None]:
# imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Single seed shared by every stochastic component of the notebook, so changing
# it in one place re-seeds the whole experiment.
SEED = 42

from sklearn.model_selection import StratifiedKFold

# 5-fold splitter that keeps the class proportions of the target in every fold.
# Uses SEED (rather than a second hard-coded literal) so the constant above is
# actually the one source of truth for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Step 1. Baseline Model

In [None]:
# Work on step-specific copies so the originally loaded frames stay untouched
step1_train_df, step1_test_df = train_df.copy(), test_df.copy()

# Build the target variable 'y': collapse the three winner indicator columns
# into one class index per row (the position of the largest value, in the
# order given by target_cols)
target_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']
y_one_hot = step1_train_df[target_cols].to_numpy()
y = y_one_hot.argmax(axis=1)

In [None]:
# extract lexical/length features
def extract_length_features(df):
    """Build length features for the prompt and the two responses.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'prompt', 'response_a' and 'response_b'.
        Each cell is measured with ``len``. Missing cells (None, float NaN,
        pd.NA) count as length 0 instead of raising ``TypeError``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with the columns, in this order:
        'prompt_len', 'response_a_len', 'response_b_len',
        'response_diff_len' (a - b) and 'response_ratio_len' (a / (b + 1e-6)).

    Raises
    ------
    KeyError
        If one of the three required columns is absent.
    """
    def _length_or_zero(value):
        # A missing CSV cell is read as float NaN, on which len() would raise.
        if value is None or value is pd.NA:
            return 0
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return 0
        return len(value)

    def _column_lengths(column):
        # Explicit int64 keeps the dtype stable even for an empty frame.
        return df[column].map(_length_or_zero).astype('int64')

    # Start from the input index so every feature row lines up with its source row.
    features = pd.DataFrame(index=df.index)

    # 1. prompt length
    features['prompt_len'] = _column_lengths('prompt')

    # 2. response_a/b length
    features['response_a_len'] = _column_lengths('response_a')
    features['response_b_len'] = _column_lengths('response_b')

    # 3. difference and ratio between response_a/b
    features['response_diff_len'] = features['response_a_len'] - features['response_b_len']
    # The small epsilon prevents division by zero when response_b is empty.
    features['response_ratio_len'] = features['response_a_len'] / (features['response_b_len'] + 1e-6)

    return features

In [None]:
# Extract features for training and testing datasets
X_train_features = extract_length_features(step1_train_df)
X_test_features = extract_length_features(step1_test_df)

# Convert features to numpy arrays so the positional fold indices returned by
# the splitter can be used directly
X = X_train_features.values
X_test = X_test_features.values
y_array = y

n_classes = len(target_cols)
class_labels = list(range(n_classes))

# Out-of-fold class probabilities for the training rows: each row is written
# once, by the fold in which it was held out
oof_preds_train = np.zeros((X.shape[0], n_classes))
# Running sum of the per-fold class probabilities for the test rows; it is
# divided by the number of folds after the loop to give the final prediction
oof_predictions_test = np.zeros((X_test.shape[0], n_classes))

# K-Fold loop
for fold, (train_index, val_index) in enumerate(skf.split(X, y_array)):

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y_array[train_index], y_array[val_index]

    # The raw length features differ by many orders of magnitude (the ratio
    # reaches ~1e6 * length whenever response_b is empty), which leaves the
    # lbfgs solver badly conditioned. Standardising inside a pipeline fits the
    # scaler on the training fold only, so nothing leaks from validation/test.
    model = make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=SEED, max_iter=1000),
    )

    # Train the model
    model.fit(X_train, y_train)

    # Save the held-out predictions for this fold.
    # NOTE(review): probability columns follow model.classes_; this assumes
    # every training fold contains all classes (expected with stratified folds).
    val_predictions = model.predict_proba(X_val)
    oof_preds_train[val_index] = val_predictions

    # Per-fold score makes unstable folds visible instead of hiding them in the mean
    fold_logloss = log_loss(y_val, val_predictions, labels=class_labels)
    print(f"Fold {fold + 1} Log Loss: {fold_logloss:.6f}")

    # Accumulate this fold's test predictions
    test_fold_predictions = model.predict_proba(X_test)
    oof_predictions_test += test_fold_predictions

overall_logloss = log_loss(y_array, oof_preds_train, labels=class_labels)

print(f"Overall OOF Log Loss: {overall_logloss:.6f}")

# Final test prediction: average of the per-fold test probabilities
predictions = oof_predictions_test / skf.n_splits

In [None]:
# Assemble the submission table: the test ids followed by one probability
# column per outcome, taken from columns 0, 1 and 2 of `predictions`
# (model_a, model_b, tie)
submission_columns = {'id': step1_test_df['id']}
for col_idx, col_name in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']):
    submission_columns[col_name] = predictions[:, col_idx]
sub_step1 = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index column
sub_step1.to_csv('submission.csv', index=False)

print("'submission.csv' file created successfully.")
print(sub_step1.head())