In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [2]:
# Load the data
train_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

In [3]:
# Basic info
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# See columns
print(f"Columns: {list(train_df.columns)}")

# Look at first few rows
print(train_df.head())

Train shape: (57477, 9)
Test shape: (3, 4)
Columns: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']
       id             model_a              model_b  \
0   30192  gpt-4-1106-preview           gpt-4-0613   
1   53567           koala-13b           gpt-4-0613   
2   65089  gpt-3.5-turbo-0613       mistral-medium   
3   96401    llama-2-13b-chat  mistral-7b-instruct   
4  198779           koala-13b   gpt-3.5-turbo-0314   

                                              prompt  \
0  ["Is it morally right to try to have a certain...   
1  ["What is the difference between marriage lice...   
2  ["explain function calling. how would you call...   
3  ["How can I create a test set for a very rare ...   
4  ["What is the best way to travel from Tel-Aviv...   

                                          response_a  \
0  ["The question of whether it is morally right ...   
1  ["A marriage license is a legal document that ...   
2

In [4]:
# Check target distribution
# As percentages
total = len(train_df)
print(f"\nAs percentages:")
print(f"Model A wins: {train_df['winner_model_a'].sum() / total * 100:.1f}%")
print(f"Model B wins: {train_df['winner_model_b'].sum() / total * 100:.1f}%")
print(f"Ties: {train_df['winner_tie'].sum() / total * 100:.1f}%")


As percentages:
Model A wins: 34.9%
Model B wins: 34.2%
Ties: 30.9%


In [5]:
print(train_df.isnull().sum())

id                0
model_a           0
model_b           0
prompt            0
response_a        0
response_b        0
winner_model_a    0
winner_model_b    0
winner_tie        0
dtype: int64


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import xgboost as xgb
import warnings

In [7]:
def create_advanced_features(df):
    print("Creating features...")
    
    # Basic length features
    df['prompt_length'] = df['prompt'].str.len()
    df['response_a_length'] = df['response_a'].str.len()
    df['response_b_length'] = df['response_b'].str.len()
    df['length_diff'] = df['response_a_length'] - df['response_b_length']
    df['length_ratio'] = df['response_a_length'] / (df['response_b_length'] + 1)
    
    # Word counts
    df['response_a_words'] = df['response_a'].str.split().str.len()
    df['response_b_words'] = df['response_b'].str.split().str.len()
    df['word_diff'] = df['response_a_words'] - df['response_b_words']
    
    # Sentence counts
    df['response_a_sentences'] = df['response_a'].str.count(r'[.!?]+')
    df['response_b_sentences'] = df['response_b'].str.count(r'[.!?]+')
    df['sentence_diff'] = df['response_a_sentences'] - df['response_b_sentences']
    
    # Question marks (helpfulness)
    df['response_a_questions'] = df['response_a'].str.count(r'\?')
    df['response_b_questions'] = df['response_b'].str.count(r'\?')
    df['question_diff'] = df['response_a_questions'] - df['response_b_questions']
    
    # Exclamation marks (enthusiasm)
    df['response_a_exclamations'] = df['response_a'].str.count(r'!')
    df['response_b_exclamations'] = df['response_b'].str.count(r'!')
    df['exclamation_diff'] = df['response_a_exclamations'] - df['response_b_exclamations']
    
    # Politeness indicators
    df['response_a_polite'] = df['response_a'].str.lower().str.contains(r'please|thank|sorry').astype(int)
    df['response_b_polite'] = df['response_b'].str.lower().str.contains(r'please|thank|sorry').astype(int)
    df['polite_diff'] = df['response_a_polite'] - df['response_b_polite']
    
    # Code indicators
    df['response_a_code'] = df['response_a'].str.contains(r'```|def |import |class |function').astype(int)
    df['response_b_code'] = df['response_b'].str.contains(r'```|def |import |class |function').astype(int)
    df['code_diff'] = df['response_a_code'] - df['response_b_code']
    
    # Average word length (complexity indicator)
    df['response_a_avg_word_len'] = df['response_a_length'] / (df['response_a_words'] + 1)
    df['response_b_avg_word_len'] = df['response_b_length'] / (df['response_b_words'] + 1)
    df['avg_word_len_diff'] = df['response_a_avg_word_len'] - df['response_b_avg_word_len']
    
    return df

In [8]:
# Create features
train_df = create_advanced_features(train_df)
test_df = create_advanced_features(test_df)

Creating features...
Creating features...


In [9]:
# PREPARE TARGET
train_df['target'] = 0  # model_a wins
train_df.loc[train_df['winner_model_b'] == 1, 'target'] = 1  # model_b wins
train_df.loc[train_df['winner_tie'] == 1, 'target'] = 2  # tie

In [10]:
feature_cols = [
    'prompt_length', 'response_a_length', 'response_b_length', 'length_diff', 'length_ratio',
    'response_a_words', 'response_b_words', 'word_diff',
    'response_a_sentences', 'response_b_sentences', 'sentence_diff',
    'response_a_questions', 'response_b_questions', 'question_diff',
    'response_a_exclamations', 'response_b_exclamations', 'exclamation_diff',
    'response_a_polite', 'response_b_polite', 'polite_diff',
    'response_a_code', 'response_b_code', 'code_diff',
    'response_a_avg_word_len', 'response_b_avg_word_len', 'avg_word_len_diff'
]
print(len(feature_cols))

26


In [11]:
X = train_df[feature_cols]
y = train_df['target']

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")

Training set: (45981, 26)
Validation set: (11496, 26)


In [13]:
#TRAIN MULTIPLE MODELS

models = {}

# Model 1: Random Forest
models['rf'] = RandomForestClassifier(
    n_estimators=200, 
    max_depth=15, 
    min_samples_split=5,
    random_state=42
)

# Model 2: XGBoost
models['xgb'] = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

# Model 3: Gradient Boosting
models['gb'] = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

# Model 4: Logistic Regression
models['lr'] = LogisticRegression(
    random_state=42,
    max_iter=1000,
    C=1.0
)

In [14]:
# Train all models and collect predictions
individual_scores = {}
val_predictions = {}
test_predictions = {}

for name, model in models.items():
    print(f"Training {name}...")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Validation predictions
    val_pred_proba = model.predict_proba(X_val)
    val_predictions[name] = val_pred_proba
    
    # Test predictions
    test_pred_proba = model.predict_proba(test_df[feature_cols])
    test_predictions[name] = test_pred_proba
    
    # Score
    score = log_loss(y_val, val_pred_proba)
    individual_scores[name] = score
    
    print(f"  {name} validation log loss: {score:.4f}")

Training rf...
  rf validation log loss: 1.0414
Training xgb...
  xgb validation log loss: 1.0457
Training gb...
  gb validation log loss: 1.0483
Training lr...
  lr validation log loss: 1.0707


In [15]:
val_ensemble_avg = np.mean(list(val_predictions.values()), axis=0)
test_ensemble_avg = np.mean(list(test_predictions.values()), axis=0)
ensemble_avg_score = log_loss(y_val, val_ensemble_avg)

In [16]:
weights = {}
total_weight = 0
for name, score in individual_scores.items():
    # Lower log loss = better = higher weight
    weight = 1 / (score + 0.001)  # Add small value to avoid division by zero
    weights[name] = weight
    total_weight += weight
# Create weighted ensemble
val_ensemble_weighted = np.zeros_like(val_ensemble_avg)
test_ensemble_weighted = np.zeros_like(test_ensemble_avg)

for name, weight in weights.items():
    val_ensemble_weighted += weight * val_predictions[name]
    test_ensemble_weighted += weight * test_predictions[name]

ensemble_weighted_score = log_loss(y_val, val_ensemble_weighted)
print(f"Weighted ensemble score: {ensemble_weighted_score:.4f}")

Weighted ensemble score: 1.0796


In [17]:
# SELECT BEST APPROACH
best_score = min(min(individual_scores.values()), ensemble_avg_score, ensemble_weighted_score)

if best_score == ensemble_weighted_score:
    print(f"\nBest approach: Weighted Ensemble (score: {best_score:.4f})")
    final_predictions = test_ensemble_weighted
elif best_score == ensemble_avg_score:
    print(f"\nBest approach: Average Ensemble (score: {best_score:.4f})")
    final_predictions = test_ensemble_avg
else:
    best_model = min(individual_scores, key=individual_scores.get)
    print(f"\nBest approach: {best_model} (score: {best_score:.4f})")
    final_predictions = test_predictions[best_model]


Best approach: rf (score: 1.0414)


In [18]:
# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': final_predictions[:, 0],
    'winner_model_b': final_predictions[:, 1],
    'winner_tie': final_predictions[:, 2]
})

In [19]:
print(submission)

        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.272180        0.279686    0.448134
1   211333        0.533395        0.224764    0.241841
2  1233961        0.319080        0.407441    0.273479


In [20]:
# Verify probabilities sum to ~1.0
prob_sums = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].sum(axis=1)
print(f"Probability sums (should be close to 1.0): {prob_sums.head()}")

# STEP 2: Save submission file
submission.to_csv('submission.csv', index=False)

Probability sums (should be close to 1.0): 0    1.0
1    1.0
2    1.0
dtype: float64
