# In [None]:
import os
import gc
import sys
import math
import time
import random
import glob
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tqdm.auto import tqdm
import html

# ------------------- Configuration -------------------
class CFG:
    """Static settings read by the inference script below."""

    # Directory that is searched for the fine-tuned checkpoint files.
    model_dir = "/kaggle/input/deberta-finetuned/pytorch/arch1/11"

    # Location passed to the tokenizer loader and to the model constructor.
    base_model = "/kaggle/input/deberta-tokenizer/deberta-v3-base-tokenizer"

    # Head layout selector; has to be the value used when training.
    pooling_strategy = "arch1_6groups"

    # Tokenized sequence length and data-loading parameters.
    max_len = 512
    batch_size = 16
    num_workers = 2
    seed = 42

    # Run on the GPU when one is visible, otherwise on the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The 30 target column names. GROUP_ORDER_INDICES below indexes into this list,
# so it must be a plain list: a trailing comma after the closing bracket would
# silently turn it into a one-element tuple and break every lookup.
TARGET_COLS = [
    'question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
    'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
    'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
    'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
    'question_type_compare', 'question_type_consequence', 'question_type_definition',
    'question_type_entity', 'question_type_instructions', 'question_type_procedure',
    'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
    'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
    'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
    'answer_type_reason_explanation', 'answer_well_written'
]

# Reordered to align with 6-head grouping.
# Each entry is a position in TARGET_COLS; together the six groups form a
# permutation of 0..29 (group sizes 5, 5, 2, 9, 2, 7).
GROUP_ORDER_INDICES = [
    3, 4, 5, 16, 17,          # G1
    0, 1, 6, 7, 20,           # G2
    2, 10,                    # G3
    8, 9, 11, 12, 13, 14, 15, 18, 19, # G4
    26, 27,                   # G5
    21, 22, 23, 24, 25, 28, 29 # G6
]

# Target names in head-group order (G1 columns first, G6 columns last).
SORTED_TARGET_COLS = [TARGET_COLS[i] for i in GROUP_ORDER_INDICES]

class OptimizedRounder:
    """Clip a column of predictions and make sure it is not constant.

    Despite the name, nothing is optimised here: ``coef_`` is a fixed
    ``[low, high]`` pair that is used as clip bounds.
    """

    def __init__(self):
        # Looser clip bounds for binary ordinal outputs that are already calibrated
        self.coef_ = [0.025, 0.975]

    def predict(self, X, coef):
        """Return a clipped copy of ``X`` that contains at least two distinct values.

        Args:
            X: 1-D array-like of predictions for one column. NaNs are
                replaced by 0.5 before clipping.
            coef: Sequence whose first two items are the lower and upper
                clip bounds.

        Returns:
            A new array; ``X`` itself is not modified.
        """
        # 1. Handle NaN (returns a new array, the caller's data is untouched)
        X = np.nan_to_num(X, nan=0.5)
        low, high = coef[0], coef[1]

        # 2. Clip (np.clip without ``out`` allocates the result array)
        X_p = np.clip(X, low, high)

        # 3. Avoid constant outputs (tiny perturbation) so that a rank
        #    correlation such as Spearman does not hit a zero denominator.
        if np.unique(X_p).size == 1:
            eps = 1e-6
            # Compare as a Python float with a tolerance instead of testing
            # ``== low`` / ``== high``: exact equality can miss for float32
            # input, and a column that is constant strictly between the bounds
            # previously matched neither branch and was returned constant.
            if float(X_p[0]) + eps <= high:
                # Room above: lift the element that was largest before clipping.
                X_p[np.argmax(X)] += eps
            else:
                # Sitting at the upper bound: lower the originally smallest one.
                X_p[np.argmin(X)] -= eps

        return X_p

# ------------------- Core Module: Binary Target Encoder -------------------
class BinaryTargetEncoder:
    """Define output dimensions and restore scores.

    Every target column is represented by a run of binary outputs, one per
    threshold found in the fitting data. ``fit`` records where each column's
    run lives inside the flat output vector; ``inverse_transform`` collapses
    each run back into a single score per column.
    """

    def __init__(self, target_cols=None):
        """Create an unfitted encoder.

        Args:
            target_cols: Column names in output order. ``None`` (the default)
                means the module-level ``SORTED_TARGET_COLS``.
        """
        # Resolve the default at call time and keep a private copy, so the
        # encoder does not share (or get changed through) the module-level list.
        if target_cols is None:
            target_cols = SORTED_TARGET_COLS
        self.target_cols = list(target_cols)
        self.unique_values = {}   # column -> sorted unique values seen in fit()
        self.thresholds = {}      # column -> threshold values (one per binary output)
        self.output_slices = {}   # column -> slice into the flat binary vector
        self.total_output_dim = 0

    def fit(self, df):
        """
        Critical: read train.csv to determine bit counts per column so the model head matches.

        Args:
            df: DataFrame containing every column in ``self.target_cols``.
        """
        print(f"Fitting Binary Encoder on {len(df)} samples...")
        current_idx = 0
        for col in self.target_cols:
            uniques = sorted(df[col].unique())
            self.unique_values[col] = uniques

            # k distinct values -> k-1 thresholds (all values but the largest);
            # a column with a single value still gets one output.
            if len(uniques) > 1:
                thresh = uniques[:-1]
            else:
                thresh = [uniques[0]]

            self.thresholds[col] = thresh
            n_dims = len(thresh)
            self.output_slices[col] = slice(current_idx, current_idx + n_dims)
            current_idx += n_dims

        self.total_output_dim = current_idx
        print(f"Total Binary Output Dimension: {self.total_output_dim}")

    def inverse_transform(self, binary_preds):
        """
        Input: (Batch, Total_Binary_Dim) - probabilities
        Output: (Batch, len(target_cols)) - restored scores, float32

        Raises:
            RuntimeError: if ``fit`` has not been called for every column.
            ValueError: if ``binary_preds`` is not 2-D or its width differs
                from ``total_output_dim``.
        """
        binary_preds = np.asarray(binary_preds)

        unfitted = [col for col in self.target_cols if col not in self.output_slices]
        if unfitted:
            raise RuntimeError(
                f"BinaryTargetEncoder.fit() must be called before inverse_transform(); "
                f"no slice recorded for {unfitted[0]!r}."
            )
        # A width mismatch would otherwise be sliced without complaint and
        # yield scores built from the wrong outputs.
        if binary_preds.ndim != 2 or binary_preds.shape[1] != self.total_output_dim:
            raise ValueError(
                f"Expected predictions of shape (batch, {self.total_output_dim}), "
                f"got {binary_preds.shape}."
            )

        batch_size = binary_preds.shape[0]
        # Output order follows self.target_cols
        output = np.zeros((batch_size, len(self.target_cols)), dtype=np.float32)

        for i, col in enumerate(self.target_cols):
            # Restore via mean (expected value approximation)
            output[:, i] = binary_preds[:, self.output_slices[col]].mean(axis=1)

        return output

# ------------------- Data Processing -------------------
def modern_preprocess(text):
    """Normalise one raw text field.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Everything else is converted to ``str``, has its HTML entities
    unescaped, and has every run of whitespace collapsed to a single space
    with leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""
    unescaped = html.unescape(str(text))
    tokens = unescaped.split()
    return " ".join(tokens)

class QuestDataset(Dataset):
    """Dataset that yields tokenized (question, answer) pairs.

    The question text is the cleaned title and cleaned body joined by one
    space; the answer text is the cleaned answer. Cleaning is done once in
    the constructor, tokenization happens lazily per item.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

        titles = df['question_title'].values
        bodies = df['question_body'].values
        self.questions = []
        for title, body in zip(titles, bodies):
            self.questions.append(modern_preprocess(title) + " " + modern_preprocess(body))

        self.answers = []
        for raw_answer in df['answer'].values:
            self.answers.append(modern_preprocess(raw_answer))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return a dict of ``torch.long`` tensors."""
        # Encode as a sentence pair, padded/truncated to exactly max_len.
        encoded = self.tokenizer(
            self.questions[idx],
            self.answers[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors=None
        )

        item = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }

        # Segment ids are optional: only forwarded when the tokenizer emits them.
        if 'token_type_ids' in encoded:
            item['token_type_ids'] = torch.tensor(encoded['token_type_ids'], dtype=torch.long)

        return item

# ------------------- Model Definition (dynamic output dims) -------------------
class QuestModel(nn.Module):
    """Transformer backbone followed by six grouped output heads.

    The backbone is built from its config only (no pretrained weights are
    loaded here). Each head emits the binary outputs of one
    group of target columns; the head widths are derived from the fitted
    ``target_encoder`` so they match the encoder's flat output vector.

    Submodule names (``backbone``, ``head_g1`` ... ``head_g6``) are the
    state-dict key prefixes, so they must not be renamed if existing
    checkpoints are to be loaded with ``load_state_dict``.
    """

    # Ranges of ``target_encoder.target_cols`` served by each head, in order.
    _GROUP_SLICES = {
        'g1': slice(0, 5),
        'g2': slice(5, 10),
        'g3': slice(10, 12),
        'g4': slice(12, 21),
        'g5': slice(21, 23),
        'g6': slice(23, 30),
    }
    _SUPPORTED_POOLING = ('arch1_6groups',)

    def __init__(self, model_name, target_encoder, pooling_strategy='arch1_6groups', dropout_rate=0.1):
        """Build the backbone and the six heads.

        Args:
            model_name: Path or name passed to ``AutoConfig.from_pretrained``.
            target_encoder: Fitted encoder providing ``target_cols`` and
                ``output_slices``.
            pooling_strategy: Head layout; only ``'arch1_6groups'`` is supported.
            dropout_rate: Dropout probability inside each head.

        Raises:
            ValueError: if ``pooling_strategy`` is not supported.
        """
        super().__init__()
        # Fail early: an unknown strategy used to produce a head-less model
        # whose forward() silently returned None.
        if pooling_strategy not in self._SUPPORTED_POOLING:
            raise ValueError(
                f"Unsupported pooling_strategy {pooling_strategy!r}; "
                f"expected one of {self._SUPPORTED_POOLING}."
            )
        self.pooling_strategy = pooling_strategy
        self.config = AutoConfig.from_pretrained(model_name)

        # Initialize from config (offline-friendly for Kaggle)
        self.backbone = AutoModel.from_config(self.config)
        hidden_size = self.config.hidden_size

        # 1. Compute output dims per group: the summed slice widths of the
        #    group's columns, taken in SORTED_TARGET_COLS order.
        all_cols = target_encoder.target_cols
        self.group_dims = {}
        for g_name, col_range in self._GROUP_SLICES.items():
            dim = 0
            for c in all_cols[col_range]:
                slc = target_encoder.output_slices[c]
                dim += (slc.stop - slc.start)
            self.group_dims[g_name] = dim

        # 2. Define heads. Every head sees three concatenated pooled vectors,
        #    hence hidden_size * 3 inputs. Registered as head_g1 ... head_g6.
        for g_name, out_dim in self.group_dims.items():
            setattr(self, f"head_{g_name}", self._make_head(hidden_size * 3, out_dim, dropout_rate))

    def _make_head(self, input_dim, output_dim, dropout_rate):
        """Return a Linear -> Tanh -> Dropout -> Linear head."""
        head = nn.Sequential(
            nn.Linear(input_dim, self.config.hidden_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(self.config.hidden_size, output_dim)
        )
        return head

    def _masked_mean_pooling(self, hidden_state, attention_mask):
        """Average ``hidden_state`` over dim 1, counting only positions where the mask is non-zero."""
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1)
        # Clamp so an all-zero mask yields zeros instead of a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _get_pooling_features(self, last_hidden_state, attention_mask, token_type_ids):
        """Return ``(cls_token, global_avg, q_avg, a_avg)`` pooled vectors.

        ``q_avg`` averages attended tokens with ``token_type_ids == 0`` and
        ``a_avg`` those with ``token_type_ids == 1``. Without segment ids both
        fall back to the global masked mean.
        """
        cls_token = last_hidden_state[:, 0, :]
        global_avg = self._masked_mean_pooling(last_hidden_state, attention_mask)

        if token_type_ids is None:
            q_avg = global_avg
            a_avg = global_avg
        else:
            q_mask = attention_mask * (1 - token_type_ids)
            q_avg = self._masked_mean_pooling(last_hidden_state, q_mask)
            a_mask = attention_mask * token_type_ids
            a_avg = self._masked_mean_pooling(last_hidden_state, a_mask)

        return cls_token, global_avg, q_avg, a_avg

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the concatenated raw logits of all six heads (g1 ... g6).

        Raises:
            ValueError: if ``self.pooling_strategy`` was changed to an
                unsupported value after construction.
        """
        if self.pooling_strategy not in self._SUPPORTED_POOLING:
            raise ValueError(f"Unsupported pooling_strategy {self.pooling_strategy!r}.")

        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs.last_hidden_state

        # Local names deliberately avoid ``glob``, which is an imported module.
        cls_vec, global_avg, q_avg, a_avg = self._get_pooling_features(
            last_hidden_state, attention_mask, token_type_ids
        )

        feat_pure_q = torch.cat([cls_vec, global_avg, q_avg], dim=1)
        feat_pure_a = torch.cat([cls_vec, global_avg, a_avg], dim=1)

        # Heads g1-g4 read the question-side features, g5-g6 the answer-side.
        out_g1 = self.head_g1(feat_pure_q)
        out_g2 = self.head_g2(feat_pure_q)
        out_g3 = self.head_g3(feat_pure_q)
        out_g4 = self.head_g4(feat_pure_q)
        out_g5 = self.head_g5(feat_pure_a)
        out_g6 = self.head_g6(feat_pure_a)

        # Directly concatenate to form the long binary vector
        return torch.cat([out_g1, out_g2, out_g3, out_g4, out_g5, out_g6], dim=1)

# ------------------- Inference Logic -------------------
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return sigmoid probabilities.

    The model is put in eval mode and gradients are disabled. Each batch is a
    mapping with ``input_ids`` and ``attention_mask`` tensors and, optionally,
    ``token_type_ids``. Per-batch outputs are moved to the CPU and stacked
    along the first axis into one NumPy array.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # Segment ids may be absent; pass None through in that case.
            segments = batch.get('token_type_ids')
            segments = None if segments is None else segments.to(device)

            # Binary logits -> Sigmoid -> Probability
            logits = model(ids, mask, segments)
            collected.append(logits.sigmoid().cpu().numpy())

    return np.concatenate(collected)

# ------------------- Main -------------------
if __name__ == '__main__':
    # End-to-end inference: fit the encoder on train.csv (to size the model
    # heads), predict test.csv with every checkpoint found, average, clip,
    # and write submission.csv.

    # 1. Load data (train initializes encoder; test for inference)
    TRAIN_PATH = '/kaggle/input/google-quest-challenge/train.csv'
    TEST_PATH = '/kaggle/input/google-quest-challenge/test.csv'
    SAMPLE_SUB_PATH = '/kaggle/input/google-quest-challenge/sample_submission.csv'

    # Local fallback: switch all three files together so the submission
    # template is not still read from the Kaggle path when running locally.
    if not os.path.exists(TEST_PATH):
        TRAIN_PATH = 'train.csv'
        TEST_PATH = 'test.csv'
        SAMPLE_SUB_PATH = 'sample_submission.csv'

    print("Loading data...")
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    print(f"Train Shape: {train_df.shape}, Test Shape: {test_df.shape}")

    # 2. Initialize and fit encoder (sets model output shape)
    target_encoder = BinaryTargetEncoder(target_cols=SORTED_TARGET_COLS)
    target_encoder.fit(train_df)

    # 3. Prepare tokenizer & dataloader
    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model)
    test_dataset = QuestDataset(test_df, tokenizer, max_len=CFG.max_len)
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,  # keep row order so predictions line up with test_df
        num_workers=CFG.num_workers,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=(CFG.device.type == 'cuda')
    )

    # 4. Find model weights: per-fold checkpoints first, then the single-run one.
    candidate_names = [f"deberta_v3_fold{fold}_best.pth" for fold in range(5)]
    candidate_names.append("deberta_v3_single_run_best.pth")
    weight_paths = []
    for name in candidate_names:
        path = os.path.join(CFG.model_dir, name)
        if os.path.exists(path):
            weight_paths.append(path)

    if not weight_paths:
        print("No weights found!")
        sys.exit(1)

    print(f"Found {len(weight_paths)} models.")

    # 5. Inference and reconstruction
    final_preds_accum = []

    for weight_path in weight_paths:
        print(f"Predicting with {os.path.basename(weight_path)}...")

        # Build model with encoder-defined dimensions
        model = QuestModel(
            CFG.base_model,
            target_encoder=target_encoder,
            pooling_strategy=CFG.pooling_strategy
        )

        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source (consider weights_only=True where available).
        state_dict = torch.load(weight_path, map_location=CFG.device)
        model.load_state_dict(state_dict)
        model.to(CFG.device)

        # Binary probability predictions (Batch, Total_Dim)
        binary_preds = inference_fn(test_loader, model, CFG.device)

        # Restore continuous scores (Batch, 30) in SORTED_TARGET_COLS order
        decoded_preds = target_encoder.inverse_transform(binary_preds)
        final_preds_accum.append(decoded_preds)

        # Drop the references first, then collect, then release cached GPU
        # blocks, so the freed tensors are actually returned.
        del model, state_dict
        gc.collect()
        torch.cuda.empty_cache()

    # 6. Average and write submission
    if final_preds_accum:
        # Raw restored scores before rounding
        avg_preds = np.mean(final_preds_accum, axis=0)

        # Apply OptimizedRounder
        print("Applying OptimizedRounder...")
        final_preds = np.zeros_like(avg_preds)
        opt = OptimizedRounder()

        # Clip each column. Iterate over the prediction matrix's own width so
        # no column can be left at its zero initial value.
        for i in range(avg_preds.shape[1]):
            final_preds[:, i] = opt.predict(avg_preds[:, i], opt.coef_)

        # Build submission DataFrame
        submission = pd.read_csv(SAMPLE_SUB_PATH)

        # Map sorted columns back to original TARGET_COLS order
        pred_df = pd.DataFrame(final_preds, columns=SORTED_TARGET_COLS)

        # Assigning Series would align on the index and fill NaN on a
        # mismatch; check the length and assign positionally instead.
        # NOTE(review): assumes sample_submission rows are in the same order
        # as test.csv -- confirm.
        if len(pred_df) != len(submission):
            raise ValueError(
                f"Prediction rows ({len(pred_df)}) do not match "
                f"sample submission rows ({len(submission)})."
            )

        for col in TARGET_COLS:
            submission[col] = pred_df[col].values

        submission.to_csv('submission.csv', index=False)
        print("submission.csv saved successfully! (With OptimizedRounder)")
    else:
        print("Error during inference.")