In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/distilbertbaseuncased/rust_model.ot
/kaggle/input/distilbertbaseuncased/config.json
/kaggle/input/distilbertbaseuncased/README.md
/kaggle/input/distilbertbaseuncased/tokenizer.json
/kaggle/input/distilbertbaseuncased/tf_model.h5
/kaggle/input/distilbertbaseuncased/tokenizer_config.json
/kaggle/input/distilbertbaseuncased/pytorch_model.bin
/kaggle/input/distilbertbaseuncased/.gitattributes
/kaggle/input/distilbertbaseuncased/vocab.txt
/kaggle/input/distilbertbaseuncased/flax_model.msgpack
/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


### Initial setup

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
# Load data
train_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (57477, 9)
Test shape: (3, 4)


In [5]:
# Create target variable
train_df['target'] = 0  # model_a wins
train_df.loc[train_df['winner_model_b'] == 1, 'target'] = 1  # model_b wins
train_df.loc[train_df['winner_tie'] == 1, 'target'] = 2  # tie

print("Target distribution:")
print(train_df['target'].value_counts())

Target distribution:
target
0    20064
1    19652
2    17761
Name: count, dtype: int64


In [6]:
from transformers import DistilBertTokenizer, DistilBertModel

# Use the dataset path instead of downloading
tokenizer = DistilBertTokenizer.from_pretrained('/kaggle/input/distilbertbaseuncased')
model = DistilBertModel.from_pretrained('/kaggle/input/distilbertbaseuncased')

2025-08-21 05:06:15.981484: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755752776.180342      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755752776.235984      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Preprocess for transformer

In [7]:
def prepare_text_for_transformer(df, tokenizer, max_length=512):
    """
    Prepare text data for transformer models
    We'll combine prompt + response_a + response_b into single input
    """
    print("Preparing text for transformer...")
    
    # Create combined text for each conversation
    # Format: [CLS] prompt [SEP] response_a [SEP] response_b [SEP]
    combined_texts = []
    
    for idx, row in df.iterrows():
        # Clean and truncate text to avoid memory issues
        prompt = str(row['prompt'])[:200]  # Limit prompt length
        response_a = str(row['response_a'])[:300]  # Limit response length
        response_b = str(row['response_b'])[:300]
        
        # Combine with special tokens
        combined_text = f"{prompt} [SEP] {response_a} [SEP] {response_b}"
        combined_texts.append(combined_text)
    
    # Tokenize all texts
    print("Tokenizing texts...")
    tokenized = tokenizer(
        combined_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    print(f"Tokenized shape: {tokenized['input_ids'].shape}")
    return tokenized

In [8]:
sample_size = 5000  # Reduce for faster training
train_sample = train_df.head(sample_size).copy()

print(f"Using sample of {len(train_sample)} conversations for demo")

# Prepare training data
train_tokenized = prepare_text_for_transformer(train_sample, tokenizer, max_length=512)

# Prepare test data (all of it since it's small)
test_tokenized = prepare_text_for_transformer(test_df, tokenizer, max_length=512)

print("Section 2 complete - Text preprocessing done!")

Using sample of 5000 conversations for demo
Preparing text for transformer...
Tokenizing texts...
Tokenized shape: torch.Size([5000, 512])
Preparing text for transformer...
Tokenizing texts...
Tokenized shape: torch.Size([3, 193])
Section 2 complete - Text preprocessing done!


### Create Transformer Model

In [9]:
import torch.nn as nn
from transformers import AutoModel

class ChatbotPreferenceModel(nn.Module):
    """
    Extremely conservative model to prevent overfitting
    """
    def __init__(self, model_name='distilbert-base-uncased', num_classes=3):
        super(ChatbotPreferenceModel, self).__init__()
        
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size
        
        # EXTREME regularization
        self.classifier = nn.Sequential(
            nn.Dropout(0.8),                    # VERY HIGH dropout
            nn.Linear(hidden_size, num_classes) # Direct to output - NO hidden layers
        )
        
        # FREEZE ALMOST EVERYTHING
        for param in self.transformer.parameters():
            param.requires_grad = False
            
        # Only train the VERY LAST attention layer
        for param in self.transformer.transformer.layer[-1].attention.parameters():
            param.requires_grad = True
            
        print("🔒 ULTRA-CONSERVATIVE: Froze entire transformer except final attention layer")
        
        # Count trainable parameters
        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        total = sum(p.numel() for p in self.parameters())
        print(f"📊 Trainable: {trainable:,} / {total:,} ({100*trainable/total:.1f}%)")
    
    def forward(self, input_ids, attention_mask):
        # Get transformer outputs
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use [CLS] token representation (first token)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        
        # Pass through classifier
        logits = self.classifier(pooled_output)
        
        return logits

In [10]:

# Initialize model
print("Creating transformer model...")
model = ChatbotPreferenceModel(model_name = '/kaggle/input/distilbertbaseuncased', num_classes=3)
model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

print("Section 3 complete - Model created!")

Creating transformer model...
🔒 ULTRA-CONSERVATIVE: Froze entire transformer except final attention layer
📊 Trainable: 2,364,675 / 66,365,187 (3.6%)
Total parameters: 66,365,187
Trainable parameters: 2,364,675
Section 3 complete - Model created!


### Training Setup

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

class ChatbotDataset(Dataset):
    """
    Custom dataset for chatbot preference data
    """
    def __init__(self, tokenized_data, targets=None):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.targets = targets
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }
        
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        
        return item

In [12]:
# Split training data
X_train, X_val, y_train, y_val = train_test_split(
    range(len(train_sample)), 
    train_sample['target'].values,
    test_size=0.2, 
    random_state=42, 
    stratify=train_sample['target'].values
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")

Train size: 4000
Validation size: 1000


In [13]:
# Create subset of tokenized data for train/val split
train_input_ids = train_tokenized['input_ids'][X_train]
train_attention_mask = train_tokenized['attention_mask'][X_train]
train_tokenized_subset = {'input_ids': train_input_ids, 'attention_mask': train_attention_mask}

val_input_ids = train_tokenized['input_ids'][X_val]
val_attention_mask = train_tokenized['attention_mask'][X_val]
val_tokenized_subset = {'input_ids': val_input_ids, 'attention_mask': val_attention_mask}


In [14]:
# Create datasets
train_dataset = ChatbotDataset(train_tokenized_subset, y_train)
val_dataset = ChatbotDataset(val_tokenized_subset, y_val)
test_dataset = ChatbotDataset(test_tokenized)

In [15]:
# Create data loaders
batch_size = 32  # Adjust based on GPU memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 125
Val batches: 32
Test batches: 1


In [16]:
# Setup optimizer and scheduler
learning_rate = 0.01 # Lower learning rate for pre-trained models
num_epochs = 25  

optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.3)

# Calculate total training steps
total_steps = len(train_loader) * num_epochs

# Setup learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1 * total_steps,  # 10% warmup
    num_training_steps=total_steps
)

# Loss function
criterion = nn.CrossEntropyLoss()

### Training

In [17]:
import torch.nn.functional as F
from tqdm import tqdm

def train_epoch(model, train_loader, optimizer, scheduler, criterion, device):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    progress_bar = tqdm(train_loader, desc="Training")
    
    for batch in progress_bar:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, targets)
        
        # Backward pass
        loss.backward()
        
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        # Update weights
        optimizer.step()
        scheduler.step()
        
        # Calculate accuracy
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == targets).sum().item()
        total_predictions += targets.size(0)
        
        total_loss += loss.item()
        
        # Update progress bar
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{correct_predictions/total_predictions:.4f}'
        })
    
    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions
    
    return avg_loss, accuracy

In [18]:
def evaluate(model, val_loader, criterion, device):
    """Evaluate model"""
    model.eval()
    total_loss = 0
    all_predictions = []
    all_probabilities = []
    all_targets = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            # Forward pass
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, targets)
            
            # Get probabilities
            probabilities = F.softmax(logits, dim=-1)
            predictions = torch.argmax(logits, dim=-1)
            
            # Collect results
            all_predictions.extend(predictions.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            total_loss += loss.item()
    
    avg_loss = total_loss / len(val_loader)
    accuracy = accuracy_score(all_targets, all_predictions)
    
    # Calculate log loss
    log_loss_score = log_loss(all_targets, all_probabilities)
    
    return avg_loss, accuracy, log_loss_score, all_probabilities, all_predictions

In [19]:
# Training loop
print("Starting training...")
best_log_loss = float('inf')
best_model_state = None

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print("-" * 30)
    
    # Train
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)
    
    # Evaluate
    val_loss, val_acc, val_log_loss, _, _ = evaluate(model, val_loader, criterion, device)
    
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val Log Loss: {val_log_loss:.4f}")
    
    # Save best model
    if val_log_loss < best_log_loss:
        best_log_loss = val_log_loss
        best_model_state = model.state_dict().copy()
        print(f"New best model! Log Loss: {best_log_loss:.4f}")


Starting training...

Epoch 1/25
------------------------------


Training: 100%|██████████| 125/125 [00:44<00:00,  2.79it/s, loss=1.1022, acc=0.3417]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.13it/s]


Train Loss: 1.1637 | Train Acc: 0.3417
Val Loss: 1.1118 | Val Acc: 0.3400 | Val Log Loss: 1.1103
New best model! Log Loss: 1.1103

Epoch 2/25
------------------------------


Training: 100%|██████████| 125/125 [00:44<00:00,  2.78it/s, loss=1.1268, acc=0.3563]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.2173 | Train Acc: 0.3563
Val Loss: 1.1072 | Val Acc: 0.3340 | Val Log Loss: 1.1051
New best model! Log Loss: 1.1051

Epoch 3/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.3672, acc=0.3360]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.2600 | Train Acc: 0.3360
Val Loss: 1.1309 | Val Acc: 0.3230 | Val Log Loss: 1.1275

Epoch 4/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.2885, acc=0.3422]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.2518 | Train Acc: 0.3422
Val Loss: 1.1444 | Val Acc: 0.3070 | Val Log Loss: 1.1471

Epoch 5/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1850, acc=0.3385]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.09it/s]


Train Loss: 1.1978 | Train Acc: 0.3385
Val Loss: 1.1163 | Val Acc: 0.3070 | Val Log Loss: 1.1182

Epoch 6/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1907, acc=0.3443]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.2048 | Train Acc: 0.3443
Val Loss: 1.0987 | Val Acc: 0.3860 | Val Log Loss: 1.0975
New best model! Log Loss: 1.0975

Epoch 7/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1896, acc=0.3513]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1873 | Train Acc: 0.3513
Val Loss: 1.1098 | Val Acc: 0.3540 | Val Log Loss: 1.1078

Epoch 8/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0540, acc=0.3377]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1983 | Train Acc: 0.3377
Val Loss: 1.1341 | Val Acc: 0.3400 | Val Log Loss: 1.1312

Epoch 9/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.2697, acc=0.3533]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1575 | Train Acc: 0.3533
Val Loss: 1.1391 | Val Acc: 0.3530 | Val Log Loss: 1.1374

Epoch 10/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1269, acc=0.3327]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.07it/s]


Train Loss: 1.1713 | Train Acc: 0.3327
Val Loss: 1.1158 | Val Acc: 0.3400 | Val Log Loss: 1.1134

Epoch 11/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1611, acc=0.3495]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1591 | Train Acc: 0.3495
Val Loss: 1.1798 | Val Acc: 0.3070 | Val Log Loss: 1.1836

Epoch 12/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0365, acc=0.3397]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1635 | Train Acc: 0.3397
Val Loss: 1.1173 | Val Acc: 0.3150 | Val Log Loss: 1.1179

Epoch 13/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1908, acc=0.3347]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.07it/s]


Train Loss: 1.1438 | Train Acc: 0.3347
Val Loss: 1.0963 | Val Acc: 0.3580 | Val Log Loss: 1.0977

Epoch 14/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0646, acc=0.3503]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1381 | Train Acc: 0.3503
Val Loss: 1.1036 | Val Acc: 0.3440 | Val Log Loss: 1.1021

Epoch 15/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1116, acc=0.3668]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1272 | Train Acc: 0.3668
Val Loss: 1.1022 | Val Acc: 0.3500 | Val Log Loss: 1.1014

Epoch 16/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0718, acc=0.3495]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.07it/s]


Train Loss: 1.1312 | Train Acc: 0.3495
Val Loss: 1.0966 | Val Acc: 0.3190 | Val Log Loss: 1.0975

Epoch 17/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1489, acc=0.3750]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.07it/s]


Train Loss: 1.1150 | Train Acc: 0.3750
Val Loss: 1.1126 | Val Acc: 0.3510 | Val Log Loss: 1.1128

Epoch 18/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0847, acc=0.3673]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.1097 | Train Acc: 0.3673
Val Loss: 1.1150 | Val Acc: 0.3480 | Val Log Loss: 1.1126

Epoch 19/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1213, acc=0.3972]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.0991 | Train Acc: 0.3972
Val Loss: 1.1137 | Val Acc: 0.3670 | Val Log Loss: 1.1154

Epoch 20/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1575, acc=0.3895]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.0916 | Train Acc: 0.3895
Val Loss: 1.1187 | Val Acc: 0.3640 | Val Log Loss: 1.1193

Epoch 21/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0079, acc=0.3917]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.0824 | Train Acc: 0.3917
Val Loss: 1.1036 | Val Acc: 0.3630 | Val Log Loss: 1.1047

Epoch 22/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1591, acc=0.4253]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.0601 | Train Acc: 0.4253
Val Loss: 1.1167 | Val Acc: 0.3720 | Val Log Loss: 1.1165

Epoch 23/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0036, acc=0.4422]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.08it/s]


Train Loss: 1.0567 | Train Acc: 0.4422
Val Loss: 1.1291 | Val Acc: 0.3650 | Val Log Loss: 1.1314

Epoch 24/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.0890, acc=0.4600]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.09it/s]


Train Loss: 1.0396 | Train Acc: 0.4600
Val Loss: 1.1259 | Val Acc: 0.3780 | Val Log Loss: 1.1312

Epoch 25/25
------------------------------


Training: 100%|██████████| 125/125 [00:45<00:00,  2.77it/s, loss=1.1060, acc=0.4748]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.09it/s]

Train Loss: 1.0225 | Train Acc: 0.4748
Val Loss: 1.1322 | Val Acc: 0.3830 | Val Log Loss: 1.1379





In [20]:
# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"\nLoaded best model with log loss: {best_log_loss:.4f}")


Loaded best model with log loss: 1.0975


### Make predictions

In [21]:
import torch.nn.functional as F

def make_predictions(model, test_loader, device):
    """Make predictions on test set"""
    model.eval()
    all_probabilities = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Making predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Forward pass
            logits = model(input_ids, attention_mask)
            
            # Get probabilities
            probabilities = F.softmax(logits, dim=-1)
            all_probabilities.extend(probabilities.cpu().numpy())
    
    return np.array(all_probabilities)

# Make predictions on test set
print("Making predictions on test set...")
test_predictions = make_predictions(model, test_loader, device)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"First few predictions:")
print(test_predictions[:3])


Making predictions on test set...


Making predictions: 100%|██████████| 1/1 [00:00<00:00, 67.14it/s]

Test predictions shape: (3, 3)
First few predictions:
[[0.25520962 0.2073963  0.53739405]
 [0.5597395  0.32862967 0.11163091]
 [0.294327   0.4047011  0.30097187]]





In [22]:
# Create submission dataframe
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': test_predictions[:, 0],    # Probability model A wins
    'winner_model_b': test_predictions[:, 1],    # Probability model B wins  
    'winner_tie': test_predictions[:, 2]         # Probability of tie
})

In [23]:
# Verify probabilities sum to 1 (they should due to softmax)
prob_sums = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].sum(axis=1)
print(f"Probability sums (should be ~1.0): {prob_sums.describe()}")


Probability sums (should be ~1.0): count    3.0
mean     1.0
std      0.0
min      1.0
25%      1.0
50%      1.0
75%      1.0
max      1.0
dtype: float64


In [24]:
# Display submission
print("\nSubmission preview:")
print(submission)


Submission preview:
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.255210        0.207396    0.537394
1   211333        0.559739        0.328630    0.111631
2  1233961        0.294327        0.404701    0.300972


In [25]:
# Save submission
submission.to_csv('submission.csv', index=False)
print("\nSubmission saved as 'submission.csv'")


Submission saved as 'submission.csv'


In [26]:
# Additional analysis
print("\nPrediction statistics:")
print("Average probabilities by class:")
print(f"Model A wins: {submission['winner_model_a'].mean():.3f}")
print(f"Model B wins: {submission['winner_model_b'].mean():.3f}")
print(f"Tie: {submission['winner_tie'].mean():.3f}")

print("\nMost confident predictions:")
max_probs = submission[['winner_model_a', 'winner_model_b', 'winner_tie']].max(axis=1)
confident_idx = max_probs.idxmax()
print(f"Most confident prediction (ID {submission.loc[confident_idx, 'id']}):")
print(f"  Model A: {submission.loc[confident_idx, 'winner_model_a']:.3f}")
print(f"  Model B: {submission.loc[confident_idx, 'winner_model_b']:.3f}")
print(f"  Tie: {submission.loc[confident_idx, 'winner_tie']:.3f}")


Prediction statistics:
Average probabilities by class:
Model A wins: 0.370
Model B wins: 0.314
Tie: 0.317

Most confident predictions:
Most confident prediction (ID 211333):
  Model A: 0.560
  Model B: 0.329
  Tie: 0.112
