# In [None]:
# ====================================================
# Kaggle Inference Notebook (With Optimization)
# ====================================================

import os
import gc
import sys
import math
import time
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tqdm.auto import tqdm
import html

# ------------------- 配置 (Configuration) -------------------
class CFG:
    """Static settings for the inference run, read as class attributes."""

    # --- Filesystem locations ---
    # Directory that holds the fine-tuned per-fold weight files.
    model_dir = "/kaggle/input/deberta-finetuned/pytorch/arch1/3"
    # Tokenizer directory; the same path is passed to QuestModel as its
    # model name.
    base_model = "/kaggle/input/deberta-tokenizer/deberta-v3-base-tokenizer"

    # --- Model head ---
    pooling_strategy = "arch1"

    # --- Tokenisation and batching ---
    max_len = 512
    batch_size = 16
    num_workers = 2

    # --- Run bookkeeping ---
    seed = 42
    n_fold = 5

    # Use the GPU whenever torch reports one as available, else the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Names of the 30 prediction targets, in the column order used by the model
# head and the submission file: question-level targets first, then
# answer-level targets.
TARGET_COLS = [
    # Question-level targets
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    # Answer-level targets
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# ------------------- 資料處理 -------------------
def modern_preprocess(text):
    """Normalise one raw text field into a single-spaced plain string.

    Values that ``pd.isna`` reports as missing become ``""``. Anything else
    is converted with ``str``, has HTML character references decoded, and
    has every run of whitespace collapsed to one space (leading and trailing
    whitespace is dropped).
    """
    if pd.isna(text):
        return ""
    decoded = html.unescape(str(text))
    # split() with no argument discards all whitespace runs, so re-joining
    # with a single space both trims and collapses in one step.
    words = decoded.split()
    return " ".join(words)

class QuestDataset(Dataset):
    """Dataset of tokenised (question, answer) text pairs.

    The question text is the cleaned ``question_title`` and ``question_body``
    columns joined by a single space; the answer text is the cleaned
    ``answer`` column. All cleaning happens once in the constructor, so
    ``__getitem__`` only has to tokenise.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns.
        tokenizer: Callable invoked as ``tokenizer(question, answer, ...)``
            that returns a mapping containing ``input_ids`` and
            ``attention_mask`` (and optionally ``token_type_ids``).
        max_len: ``max_length`` passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

        titles = df['question_title'].values
        bodies = df['question_body'].values
        self.questions = []
        for title, body in zip(titles, bodies):
            self.questions.append(
                modern_preprocess(title) + " " + modern_preprocess(body)
            )
        self.answers = list(map(modern_preprocess, df['answer'].values))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict of ``torch.long`` tensors for the pair at ``idx``."""
        encoded = self.tokenizer(
            self.questions[idx],
            self.answers[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors=None,
        )

        # token_type_ids is optional: it is forwarded only when the tokenizer
        # actually produced it.
        wanted = ['input_ids', 'attention_mask']
        if 'token_type_ids' in encoded:
            wanted.append('token_type_ids')

        return {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in wanted
        }

# ------------------- 模型定義 -------------------
class QuestModel(nn.Module):
    """Transformer backbone followed by a selectable pooling head.

    The backbone is built with ``AutoModel.from_config`` from the
    configuration found at ``model_name``; this constructor does not load
    any trained weights, so callers are expected to load a state dict
    afterwards.

    Supported ``pooling_strategy`` values:

    * ``'mean'``  - masked mean of the last hidden state -> ``Linear``.
    * ``'arch1'`` - concatenation of five vectors (first token, last
      attended token, global masked mean, question-segment mean,
      answer-segment mean) -> ``Linear + Tanh + Dropout`` -> ``Linear``.
    * ``'arch2'`` - concatenation of the first-token vectors of the last
      four hidden-state layers -> ``Dropout + Linear``.

    Args:
        model_name: Path or identifier passed to ``AutoConfig.from_pretrained``.
        num_targets: Number of output values per sample.
        pooling_strategy: One of ``'mean'``, ``'arch1'``, ``'arch2'``.

    Raises:
        ValueError: If ``pooling_strategy`` is not a supported value.
    """

    # Pooling heads this class knows how to build.
    _POOLING_STRATEGIES = ('mean', 'arch1', 'arch2')

    def __init__(self, model_name, num_targets, pooling_strategy='arch1'):
        super().__init__()

        # Fail fast: an unknown strategy would otherwise produce a model with
        # no head that only breaks later, inside forward().
        if pooling_strategy not in self._POOLING_STRATEGIES:
            raise ValueError(
                f"Unknown pooling_strategy {pooling_strategy!r}; "
                f"expected one of {self._POOLING_STRATEGIES}."
            )
        self.pooling_strategy = pooling_strategy

        self.config = AutoConfig.from_pretrained(model_name)
        if pooling_strategy == 'arch2':
            # arch2 pools across layers, so the backbone must return all of them.
            self.config.update({'output_hidden_states': True})

        # Build the architecture from the config only.
        self.backbone = AutoModel.from_config(self.config)

        hidden_size = self.config.hidden_size

        # NOTE: attribute names and Sequential layouts below define the
        # state-dict keys and must stay as they are for checkpoints to load.
        if pooling_strategy == 'mean':
            self.fc = nn.Linear(hidden_size, num_targets)
        elif pooling_strategy == 'arch1':
            self.intermediate_layer = nn.Sequential(
                nn.Linear(hidden_size * 5, hidden_size),
                nn.Tanh(),
                nn.Dropout(0.1),
            )
            self.fc = nn.Linear(hidden_size, num_targets)
        else:  # 'arch2'
            self.fc = nn.Sequential(
                nn.Dropout(0.1),
                nn.Linear(hidden_size * 4, num_targets),
            )

    def _masked_mean_pooling(self, hidden_state, attention_mask):
        """Average ``hidden_state`` over the positions where the mask is 1.

        ``hidden_state`` is indexed as (batch, seq, hidden) and
        ``attention_mask`` as (batch, seq). The denominator is clamped to
        ``1e-9`` so an all-zero mask yields zeros instead of dividing by zero.
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _pool_arch1(self, last_hidden_state, attention_mask, token_type_ids):
        """Build the five-part ``arch1`` feature, shape (batch, 5 * hidden).

        Parts, in order: first-token vector, vector at the last attended
        position, global masked mean, mean over segment 0 (question), mean
        over segment 1 (answer). When ``token_type_ids`` is ``None`` the two
        segment means fall back to the global mean.
        """
        batch_size = last_hidden_state.size(0)
        cls_token = last_hidden_state[:, 0, :]

        # Index of the last attended position per row. This is the last real
        # token only when padding sits on the right - TODO confirm tokenizer
        # padding side.
        last_token_indices = attention_mask.sum(dim=1) - 1
        # Create the row index on the same device as the tensor it indexes.
        row_indices = torch.arange(batch_size, device=last_hidden_state.device)
        last_token = last_hidden_state[row_indices, last_token_indices, :]

        global_avg = self._masked_mean_pooling(last_hidden_state, attention_mask)

        if token_type_ids is None:
            q_avg = global_avg
            a_avg = global_avg
        else:
            q_mask = attention_mask * (1 - token_type_ids)
            q_avg = self._masked_mean_pooling(last_hidden_state, q_mask)
            a_mask = attention_mask * token_type_ids
            a_avg = self._masked_mean_pooling(last_hidden_state, a_mask)

        return torch.cat([cls_token, last_token, global_avg, q_avg, a_avg], dim=1)

    def _pool_arch2(self, all_hidden_states):
        """Concatenate the first-token vectors of the last four layers."""
        last_4_layers = all_hidden_states[-4:]
        cls_embeddings = [layer[:, 0, :] for layer in last_4_layers]
        return torch.cat(cls_embeddings, dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the backbone and the configured head; returns raw outputs
        (no activation is applied here)."""
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        if self.pooling_strategy == 'mean':
            feature = self._masked_mean_pooling(outputs.last_hidden_state, attention_mask)
            return self.fc(feature)
        if self.pooling_strategy == 'arch1':
            feature = self._pool_arch1(outputs.last_hidden_state, attention_mask, token_type_ids)
            return self.fc(self.intermediate_layer(feature))
        if self.pooling_strategy == 'arch2':
            feature = self._pool_arch2(outputs.hidden_states)
            return self.fc(feature)

        # Only reachable if pooling_strategy was changed after construction.
        raise ValueError(f"Unknown pooling_strategy {self.pooling_strategy!r}.")

# ------------------- 優化器 (OptimizedRounder) -------------------
class OptimizedRounder:
    """Clip predictions into ``[low, high]`` while keeping them non-constant.

    A rank correlation such as Spearman is undefined for a constant vector,
    so when clipping collapses every value to the same number, one element
    is nudged upward by a small epsilon.
    """

    def __init__(self):
        # Default [low, high] clipping bounds.
        self.coef_ = [0.025, 0.975]

    def predict(self, X, coef=None):
        """Return a clipped copy of ``X``; the input is never modified.

        Args:
            X: Array-like of predictions, of any shape.
            coef: Sequence whose first two items are the lower and upper
                clipping bounds. Defaults to ``self.coef_``.

        Returns:
            ``numpy.ndarray`` with the same shape as ``X``: NaNs replaced by
            0.5, values clipped to ``[low, high]``, and - only if every
            clipped value is identical - the element at the position of the
            largest pre-clip value increased by ``1e-3``.
        """
        if coef is None:
            coef = self.coef_
        low, high = coef[0], coef[1]

        # 1. NaN -> 0.5. nan_to_num returns a new array, so no extra copy of
        #    the caller's data is needed.
        X = np.nan_to_num(X, nan=0.5)

        # 2. Clip into [low, high].
        X_p = np.asarray(np.clip(X, low, high))

        # 3. If every value is now identical, break the tie. min == max is an
        #    O(n) test for "constant"; the size guard keeps empty input safe.
        if X_p.size and X_p.min() == X_p.max():
            eps = 1e-3
            # Bump the element that was largest before clipping, so the one
            # value that ends up higher is one that was already the highest.
            # argmax returns a flat index, so it is applied through .flat to
            # hit the right element for any array shape.
            max_idx = np.argmax(X)
            X_p.flat[max_idx] += eps

        return X_p

# ------------------- 推論迴圈 -------------------
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Args:
        test_loader: Iterable of dict batches with ``input_ids`` and
            ``attention_mask`` tensors, and optionally ``token_type_ids``.
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)``; it is switched to eval mode first.
        device: Device the batch tensors are moved to before the call.

    Returns:
        ``numpy.ndarray`` of sigmoid-activated model outputs, concatenated
        along the first axis in loader order.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # Segment ids are optional; pass None through untouched.
            segments = batch.get('token_type_ids')
            segments = None if segments is None else segments.to(device)

            logits = model(ids, mask, segments)
            probabilities = torch.sigmoid(logits)
            collected.append(probabilities.cpu().numpy())

    return np.concatenate(collected)

# ------------------- 主程式 -------------------
if __name__ == '__main__':
    # Prefer a test.csv in the working directory; otherwise fall back to the
    # Kaggle competition input directory.
    TEST_PATH = 'test.csv'
    if not os.path.exists(TEST_PATH):
        TEST_PATH = '/kaggle/input/google-quest-challenge/test.csv'

    test = pd.read_csv(TEST_PATH)
    print(f"Test Data Shape: {test.shape}")

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model)

    test_dataset = QuestDataset(test, tokenizer, max_len=CFG.max_len)
    # shuffle=False keeps predictions in the row order of the test file; the
    # positional assignment into the submission frame below relies on that.
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
    )

    # One prediction array per fold whose weights were found.
    fold_preds = []

    for fold in range(CFG.n_fold):
        weight_path = os.path.join(CFG.model_dir, f"deberta_v3_fold{fold}_best.pth")

        # Best effort: a missing fold is skipped and the ensemble is averaged
        # over whatever folds are available.
        if not os.path.exists(weight_path):
            print(f"Warning: Weights for fold {fold} not found at {weight_path}. Skipping.")
            continue

        print(f"Loading Fold {fold} Model...")

        model = QuestModel(
            CFG.base_model,
            num_targets=len(TARGET_COLS),
            pooling_strategy=CFG.pooling_strategy,
        )

        # Deserialise onto the CPU: the model is still on the CPU at this
        # point, so mapping the checkpoint to the GPU would only place a
        # second, temporary copy of the weights in GPU memory.
        # NOTE(security): torch.load unpickles the file - only load
        # checkpoints from a trusted source.
        state_dict = torch.load(weight_path, map_location='cpu')
        model.load_state_dict(state_dict)
        model.to(CFG.device)

        preds = inference_fn(test_loader, model, CFG.device)
        fold_preds.append(preds)

        # Drop the references and collect garbage first, so the memory they
        # held is actually free by the time the CUDA cache is emptied.
        del model, state_dict
        gc.collect()
        torch.cuda.empty_cache()

    if fold_preds:
        # 1. Average the raw predictions across folds.
        avg_preds = np.mean(fold_preds, axis=0)

        # 2. Post-process with OptimizedRounder.
        print("Applying OptimizedRounder...")
        final_preds = np.zeros_like(avg_preds)
        opt = OptimizedRounder()

        # Clip one target column at a time: the "is this constant?" check in
        # OptimizedRounder.predict is meant to be applied per target, across
        # all samples.
        for col_idx in range(avg_preds.shape[1]):
            final_preds[:, col_idx] = opt.predict(avg_preds[:, col_idx], opt.coef_)

        # Assumes sample_submission.csv lists rows in the same order as the
        # test file - TODO confirm.
        submission = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')
        submission[TARGET_COLS] = final_preds
        submission.to_csv('submission.csv', index=False)
        print("submission.csv saved successfully! (With Optimization)")
    else:
        print("Error: No predictions generated.")