In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.multioutput import MultiOutputClassifier
pd.set_option('display.max_colwidth', None)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#Load the data
train_df = pd.read_csv('../input/llm-classification-finetuning/train.csv')

#Basic Exploration
print("Dataset Shape:",train_df.shape)
print("\nColumns:",train_df.columns.tolist())
print("\nSample of data:")
print(train_df.head(2))

#Check for missing values
print("\nMissing values:")
print(train_df.isnull().sum())

#Check distribution of winners
print("\nDistribution of winners:")
print("Model A wins:",train_df['winner_model_a'].value_counts(normalize=True))
print("Model B wins:", train_df['winner_model_b'].value_counts(normalize=True))
print("Ties:", train_df['winner_tie'].value_counts(normalize=True))

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv
Dataset Shape: (57477, 9)

Columns: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']

Sample of data:
      id             model_a     model_b  \
0  30192  gpt-4-1106-preview  gpt-4-0613   
1  53567           koala-13b  gpt-4-0613   

                                                                                                                                                                                                     prompt  \
0                                     ["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]   
1  ["What is the difference between marriage license and marriage certificate?","How can I get both of them as

In [2]:
def preprocess_text(text):
    """Basic Text Processing"""
    #convert to lowercase
    text = str(text).lower()

    #Remove extra whitespace
    text = ' '.join(text.split())
    return text

#Combine prompt and responses for analysis
train_df['response_a_full'] = train_df['prompt'] + " [SEP] " + train_df['response_a']
train_df['response_b_full'] = train_df['prompt'] + " [SEP] " + train_df['response_b']

#Applying preprocessing
train_df['response_a_processed'] = train_df['response_a_full'].apply(preprocess_text)
train_df['response_b_processed'] = train_df['response_b_full'].apply(preprocess_text)


#Look at some processed examples
print("Sample processed responses:")
print(train_df[['response_a_processed', 'response_b_processed']].head(1))

Sample processed responses:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [3]:
#Convert text to numerical features using TF-IDF
tfidf = TfidfVectorizer(max_features=5000) #start with limited features

#Creates features for both responses
features_a = tfidf.fit_transform(train_df['response_a_processed'])
features_b = tfidf.fit_transform(train_df['response_b_processed'])

#Combine features from both responses
X = np.hstack([features_a.toarray(), features_b.toarray()])

#Create traget variable (one-hot encoded)
y = pd.get_dummies(train_df[['winner_model_a', 'winner_model_b','winner_tie']])

#Split the data
x_train, x_val, y_train, y_val = train_test_split(X,y, test_size=0.2, random_state = 42)


#Create and Train the model
#We wrap LogisticRegression with MultiOutputClassifier to handle multiple targets
base_model = LogisticRegression(max_iter = 1000)
model = MultiOutputClassifier(base_model)
model.fit(x_train, y_train)

#Make prediction on validation set
val_pred = model.predict_proba(x_val)

#Calculate Log loss
#Since we have multiple output, we  need to calculate log loss for each output and average 
scores = []
for i, target in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie']):
    score = log_loss(y_val[target], val_pred[i])
    scores.append(score)
    print(f'Log loss for {target}: {score}')
print(f'Average Log Loss: {sum(scores)/len(scores)}')

Log loss for winner_model_a: 0.6351862218031198
Log loss for winner_model_b: 0.6319725774375268
Log loss for winner_tie: 0.6282944951158368
Average Log Loss: 0.6318177647854945


In [4]:
# Load test data
test_df = pd.read_csv('../input/llm-classification-finetuning/test.csv')

# Preprocess test data
test_df['response_a_full'] = test_df['prompt'] + " [SEP] " + test_df['response_a']
test_df['response_b_full'] = test_df['prompt'] + " [SEP] " + test_df['response_b']

test_df['response_a_processed'] = test_df['response_a_full'].apply(preprocess_text)
test_df['response_b_processed'] = test_df['response_b_full'].apply(preprocess_text)

# Create features for test data
test_features_a = tfidf.transform(test_df['response_a_processed'])
test_features_b = tfidf.transform(test_df['response_b_processed'])

# Combine test features
X_test = np.hstack([test_features_a.toarray(), test_features_b.toarray()])

# Make predictions on test data
test_predictions = model.predict_proba(X_test)

# Create submission DataFrame - Fixed version
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': test_predictions[0][:, 1],  # Get probabilities for positive class
    'winner_model_b': test_predictions[1][:, 1],
    'winner_tie': test_predictions[2][:, 1]
})

# Normalize probabilities to ensure they sum to 1
prob_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
submission[prob_columns] = submission[prob_columns].div(submission[prob_columns].sum(axis=1), axis=0)

# Save submission file
submission.to_csv('submission.csv', index=False)

# Verify submission format
print("\nSubmission file preview:")
print(submission.head())

# Verify probabilities sum to 1
sum_probs = submission[prob_columns].sum(axis=1)
print("\nSum of probabilities (should be exactly 1):")
print(sum_probs.describe())


Submission file preview:
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.256772        0.284560    0.458667
1   211333        0.311365        0.517635    0.171000
2  1233961        0.306163        0.522098    0.171738

Sum of probabilities (should be exactly 1):
count    3.0
mean     1.0
std      0.0
min      1.0
25%      1.0
50%      1.0
75%      1.0
max      1.0
dtype: float64


In [5]:
# Verification checks
def verify_submission(submission_df):
    print("Submission Verification:")
    
    # Check column names
    expected_columns = ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
    missing_cols = set(expected_columns) - set(submission_df.columns)
    if missing_cols:
        print(f"❌ Missing columns: {missing_cols}")
    else:
        print("✓ All required columns present")
    
    # Check for null values
    if submission_df.isnull().any().any():
        print("❌ Contains null values!")
    else:
        print("✓ No null values")
    
    # Check probability range
    probs_df = submission_df[['winner_model_a', 'winner_model_b', 'winner_tie']]
    if (probs_df < 0).any().any() or (probs_df > 1).any().any():
        print("❌ Probabilities outside [0,1] range!")
    else:
        print("✓ Probabilities in valid range")
    
    # Check probability sums
    prob_sums = probs_df.sum(axis=1)
    if not np.allclose(prob_sums, 1, atol=1e-5):
        print("❌ Probabilities don't sum to 1!")
    else:
        print("✓ Probabilities sum to 1")

# Run verification
verify_submission(submission)

Submission Verification:
✓ All required columns present
✓ No null values
✓ Probabilities in valid range
✓ Probabilities sum to 1
