## Models 1-2: Qwen3-8B & DeepSeek-Math-7B in parallel
Run DeepSeek-Math on cuda:0 and Qwen3-8B on cuda:1. The first cell below writes a separate, standalone Qwen3-4B inference script.

In [None]:
%%writefile qwen3_4b_inference.py

import os
import torch
import json
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from scipy.special import softmax
from tqdm import tqdm

# Disable tokenizer parallelism and make GPUs 0 and 1 visible to this process.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "CUDA_VISIBLE_DEVICES": "0,1",
})

print("Starting Qwen3-4B inference...")

# ============================================
# Load Dataset
# ============================================
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

# ============================================
# Feature Engineering - Correct Answer Lookup
# ============================================
# Select rows whose Category text before the first underscore is "True".
idx = train['Category'].map(lambda category: category.split('_')[0]) == 'True'
correct = train.loc[idx].copy()

# Count how often each (question, answer) pair occurs among those rows.
answer_groups = correct.groupby(['QuestionId', 'MC_Answer'])
correct['c'] = answer_groups['MC_Answer'].transform('count')

# Sort by count (descending) and keep the first row per question.
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

# Left-join the flag onto both frames; pairs missing from the lookup get 0.
merge_keys = ['QuestionId', 'MC_Answer']

train = train.merge(correct, on=merge_keys, how='left')
train['is_correct'] = train['is_correct'].fillna(0)

test = test.merge(correct, on=merge_keys, how='left')
test['is_correct'] = test['is_correct'].fillna(0)

def format_input(row):
    """Build the classifier prompt for one row.

    Args:
        row: Mapping-like object providing 'is_correct', 'QuestionText',
            'MC_Answer' and 'StudentExplanation'.

    Returns:
        A four-line string: question, answer, a correctness sentence chosen
        by the truthiness of ``row['is_correct']``, and the explanation.
    """
    if row['is_correct']:
        verdict = "This answer is correct."
    else:
        verdict = "This answer is incorrect."

    pieces = (
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"{verdict}\n",
        f"Student Explanation: {row['StudentExplanation']}",
    )
    return "".join(pieces)

# Build the prompt text for every test row.
test['text'] = test.apply(format_input, axis=1)

# Create dataset
ds_test = Dataset.from_pandas(test[['row_id', 'text']])

# ============================================
# Load Model & Tokenizer
# ============================================
model_name = "/kaggle/input/qwen3-4b-full-map-competition"

# Load label encoder mapping (the class list is stored next to the model).
label_encoder_path = os.path.join(model_name, "label_encoder.json")
with open(label_encoder_path) as label_file:
    label_data = json.load(label_file)
target_classes = label_data["classes"]

# Index <-> label lookups, both derived from the position in target_classes.
idx2label = dict(enumerate(target_classes))
label2idx = {lbl: i for i, lbl in idx2label.items()}

n_classes = len(target_classes)
print(f"Number of classes: {n_classes}")

# Load tokenizer; reuse EOS as the padding token when none is defined.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("Loading Qwen3-4B model...")
model_kwargs = {
    "num_labels": n_classes,
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_kwargs)
# The model config must carry the same pad id the tokenizer uses.
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()

# Tokenize dataset
def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    return encoded

ds_test = ds_test.map(tokenize, batched=True)

# ============================================
# Inference
# ============================================
print("Running inference...")

# Prediction-only Trainer configuration (no training, no experiment logging).
test_args = TrainingArguments(
    output_dir="./qwen3_results",
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=16,
    fp16=True,
    bf16=False,
    report_to='none',
)

collator = DataCollatorWithPadding(tokenizer)
trainer = Trainer(
    model=model,
    args=test_args,
    processing_class=tokenizer,
    data_collator=collator,
)

predictions = trainer.predict(ds_test)
# Turn the per-row logits into probabilities.
probs = softmax(predictions.predictions, axis=1)

# ============================================
# Generate Unified Output for Ensemble
# ============================================
print("Generating unified interface output...")

# Rank all classes per row by descending probability; the first 25 are exported.
TOP_K = 25
top_indices = np.argsort(-probs, axis=1)

# Decode class indices to class names, keeping the (rows, classes) layout.
decode = np.vectorize(idx2label.get)
top_labels = decode(top_indices.flatten()).reshape(top_indices.shape)

# Create standard submission (top-3 labels, space separated)
joined_preds = [" ".join(best_three) for best_three in top_labels[:, :3]]

sub = pd.DataFrame({
    "row_id": test.row_id.values,
    "Category:Misconception": joined_preds,
})
sub.to_csv("submission_qwen3_4b.csv", index=False)

# Create probability file for ensemble: one record per test row holding the
# row id, the top-25 class names and their probabilities prob_0..prob_24.
row_ids = test.row_id.values
prob_data = []
for i in range(len(test)):
    record = {
        'row_id': row_ids[i],
        'top_classes': " ".join(top_labels[i, :TOP_K]),
    }
    record.update({f'prob_{j}': probs[i, top_indices[i, j]] for j in range(TOP_K)})
    prob_data.append(record)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_qwen3_4b_probabilities.csv", index=False)

for message in (
    "✅ Qwen3-4B inference completed!",
    "Generated files:",
    "  - submission_qwen3_4b.csv (standard submission)",
    "  - submission_qwen3_4b_probabilities.csv (for ensemble)",
):
    print(message)

# Clean up: drop the references, then release cached GPU memory.
del model, tokenizer
torch.cuda.empty_cache()

In [None]:
%%writefile qwen3_deepseek_inference.py

# we do parallel inference, for deepseek and qwen3
import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import threading
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from scipy.special import softmax
from tqdm import tqdm
import time

# Disable tokenizer parallelism for this script.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})


train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

# Checkpoint directories; index 0 is run as "deepseek" and index 1 as "qwen3"
# by the GPU assignment list further down.
model_paths = [
    "/kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL",
    "/kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL",
]

def format_input(row):
    """Build the classifier prompt for one row.

    Args:
        row: Mapping-like object providing 'is_correct', 'QuestionText',
            'MC_Answer' and 'StudentExplanation'.

    Returns:
        A four-line string: question, answer, a correctness sentence chosen
        by the truthiness of ``row['is_correct']``, and the explanation.
    """
    # NOTE(review): "This is answer is incorrect." looks like a typo. It is
    # kept verbatim because the checkpoints were presumably fine-tuned on this
    # exact prompt -- confirm before changing it.
    x = "This is answer is incorrect." if not row['is_correct'] else "This answer is correct."
    pieces = (
        f"Question: {row['QuestionText']}\n",
        f"Answer: {row['MC_Answer']}\n",
        f"{x}\n",
        f"Student Explanation: {row['StudentExplanation']}",
    )
    return "".join(pieces)


# Encode "Category:Misconception" targets; a missing misconception becomes 'NA'.
le = LabelEncoder()
train['Misconception'] = train['Misconception'].fillna('NA')
train['target'] = train['Category'] + ':' + train['Misconception']
train['label'] = le.fit_transform(train['target'])

n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")

# Correct-answer lookup: among rows whose Category starts with "True", keep
# per question the (question, answer) pair with the highest count.
idx = train['Category'].map(lambda category: category.split('_')[0]) == 'True'
correct = train.loc[idx].copy()
answer_groups = correct.groupby(['QuestionId', 'MC_Answer'])
correct['c'] = answer_groups['MC_Answer'].transform('count')
correct = correct.sort_values('c', ascending=False).drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

# Flag test rows (0 where the pair is not in the lookup) and build prompts.
test = test.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
test['is_correct'] = test['is_correct'].fillna(0)
test['text'] = test.apply(format_input, axis=1)
ds_test = Dataset.from_pandas(test)


def run_inference_on_gpu(model_path, gpu_id, test_data, output_name):
    """Run inference for one model on one GPU and write its result files.

    Writes ``submission_{output_name}.csv`` (top-3 labels per row) and
    ``submission_{output_name}_probabilities.csv`` (top classes and their
    probabilities per row). Class indices are decoded with the module-level
    LabelEncoder ``le``.

    Args:
        model_path: Directory of the sequence-classification checkpoint.
        gpu_id: CUDA device index the model is loaded on.
        test_data: DataFrame with at least 'text' and 'row_id' columns.
        output_name: Short model name used in the output file names.
    """
    device = f"cuda:{gpu_id}"
    print(f"Loading {output_name} on {device}...")

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        device_map=device,
        torch_dtype=torch.float16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Batches are padded dynamically below, which needs a pad token; fall back
    # to EOS when the tokenizer has none (same fallback the other inference
    # scripts in this notebook use).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()

    # Tokenize without padding; the collator pads each batch to its longest row.
    def tokenize(batch):
        return tokenizer(batch["text"],
                        truncation=True,
                        max_length=256)

    ds_test = Dataset.from_pandas(test_data[['text']])
    ds_test = ds_test.map(tokenize, batched=True, remove_columns=['text'])

    # Data collator
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding=True,
        return_tensors="pt"
    )

    # DataLoader (shuffle=False keeps predictions aligned with test_data rows)
    dataloader = DataLoader(
        ds_test,
        batch_size=4,
        shuffle=False,
        collate_fn=data_collator,
        pin_memory=True,
        num_workers=0
    )

    # Inference: collect float32 logits on the CPU, batch by batch.
    all_logits = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"{output_name}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            all_logits.append(outputs.logits.float().cpu().numpy())

    predictions = np.concatenate(all_logits, axis=0)

    # Process results: probabilities and per-row class ranking (best first).
    probs = softmax(predictions, axis=1)
    top_indices = np.argsort(-probs, axis=1)
    # Export at most 25 classes; clamp so a model with fewer classes cannot
    # index past the end of a row.
    top_k = min(25, probs.shape[1])

    # Decode labels
    flat_indices = top_indices.flatten()
    decoded_labels = le.inverse_transform(flat_indices)
    top_labels = decoded_labels.reshape(top_indices.shape)

    # Save top-3 submission
    row_ids = test_data.row_id.values
    joined_preds = [" ".join(row[:3]) for row in top_labels]
    sub = pd.DataFrame({
        "row_id": row_ids,
        "Category:Misconception": joined_preds
    })
    sub.to_csv(f"submission_{output_name}.csv", index=False)

    # Save probabilities for ensemble: prob_0..prob_{k-1}, row_id, top_classes.
    prob_data = []
    for i in range(len(predictions)):
        prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(top_k)}
        prob_dict['row_id'] = row_ids[i]
        prob_dict['top_classes'] = " ".join(top_labels[i, :top_k])
        prob_data.append(prob_dict)

    prob_df = pd.DataFrame(prob_data)
    prob_df.to_csv(f"submission_{output_name}_probabilities.csv", index=False)

    print(f" {output_name} completed - saved submission and probabilities")

    # Clean up GPU memory
    del model, tokenizer
    torch.cuda.empty_cache()

print(" Starting multi-GPU inference...")
start_time = time.time()

threads = []
# (checkpoint path, CUDA device index, output name) for each model.
gpu_assignments = [
    (model_paths[0], 0, "deepseek"),
    (model_paths[1], 1, "qwen3"),
]

# Start one worker thread per model whose GPU exists.
available_gpus = torch.cuda.device_count()
for model_path, gpu_id, name in gpu_assignments:
    if gpu_id >= available_gpus:
        # Say so explicitly: a skipped model writes no output files, which
        # would otherwise only surface later as a missing-file error.
        print(
            f" Skipping {name}: cuda:{gpu_id} is not available "
            f"({available_gpus} GPU(s) visible); no output files will be written for it"
        )
        continue
    thread = threading.Thread(
        target=run_inference_on_gpu,
        args=(model_path, gpu_id, test, name)
    )
    threads.append(thread)
    thread.start()
    time.sleep(10)  # Stagger starts to avoid memory issues

# Wait for completion
for thread in threads:
    thread.join()

end_time = time.time()
print(f" completed in {end_time - start_time:.2f} seconds!")

## Model 3: ettin encoder 1b 5fold

In [None]:
%%writefile ettin_inference.py

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import gc
import os

# Disable wandb and tokenizer parallelism
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
    "TOKENIZERS_PARALLELISM": "false",
})

# For multi-GPU if needed (adjust based on your needs)
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# ============================================
# Configuration
# ============================================

# One checkpoint directory per cross-validation fold.
MODEL_PATHS = [
    "/kaggle/input/ettin-encoder1b-5fold/content/drive/MyDrive/models/ver_1_5fold/fold_0/final_model",
    "/kaggle/input/ettin-encoder1b-5fold/content/drive/MyDrive/models/ver_1_5fold/fold_1/final_model",
    "/kaggle/input/ettin-encoder1b-5fold/content/drive/MyDrive/models/ver_1_5fold/fold_2/final_model",
    "/kaggle/input/ettin-encoder1b-5fold/content/drive/MyDrive/models/ver_1_5fold/fold_3/final_model",
    "/kaggle/input/ettin-encoder1b-5fold/content/drive/MyDrive/models/ver_1_5fold/fold_4/final_model",
]

# ============================================
# Setup
# ============================================

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# ============================================
# Data Preparation
# ============================================

print("Loading data...")
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')

# Prepare label encoder (same as other models): targets are
# "Category:Misconception", with a missing misconception written as 'NA'.
train['Misconception'] = train['Misconception'].fillna('NA')
train['target'] = train['Category'] + ":" + train['Misconception']
le = LabelEncoder().fit(train['target'])
num_classes = len(le.classes_)
print(f"Number of classes: {num_classes}")

# Add is_correct feature: among rows whose Category starts with "True", keep
# per question the (question, answer) pair with the highest count.
idx = train['Category'].map(lambda category: category.split('_')[0]) == 'True'
correct = train.loc[idx].copy()
answer_groups = correct.groupby(['QuestionId', 'MC_Answer'])
correct['c'] = answer_groups['MC_Answer'].transform('count')
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

# Left-join the flag; test pairs missing from the lookup get 0.
test = test.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
test['is_correct'] = test['is_correct'].fillna(0)

# Format input text
def format_input(row):
    """Build the classifier prompt for one row.

    Args:
        row: Mapping-like object providing 'is_correct', 'QuestionText',
            'MC_Answer' and 'StudentExplanation'.

    Returns:
        A four-line string whose third line is "Correct? Yes" when
        ``row['is_correct']`` is truthy and "Correct? No" otherwise.
    """
    if row['is_correct']:
        x = "Yes"
    else:
        x = "No"
    return f"Question: {row['QuestionText']}\nAnswer: {row['MC_Answer']}\nCorrect? {x}\nStudent Explanation: {row['StudentExplanation']}"

# Build the prompt for every test row and store it in the 'text' column.
test['text'] = test.apply(format_input, axis=1)

# ============================================
# Batch Prediction Function
# ============================================

def batch_predict(model, tokenizer, texts, batch_size=8, max_length=256):
    """Return class probabilities for ``texts``, computed batch by batch.

    Args:
        model: Callable model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        texts: Sliceable sequence of input strings.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A NumPy array made by vertically stacking the per-batch softmax
        outputs, in input order. Tensors are moved to the module-level
        ``device`` before the forward pass.
    """
    prob_chunks = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Inference"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )

        with torch.no_grad():
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            logits = model(**encoded).logits

            # Softmax in float32, then hand the result over to NumPy on the CPU.
            chunk = torch.nn.functional.softmax(logits.float(), dim=-1)
            prob_chunks.append(chunk.cpu().numpy())

            del logits, encoded
            # Release cached GPU memory whenever the sample offset is a
            # multiple of 100.
            if start % 100 == 0:
                torch.cuda.empty_cache()

    return np.vstack(prob_chunks)

# ============================================
# Process Each Fold
# ============================================

print("Starting 5-fold inference...")
all_fold_probs = []

for fold_idx, model_path in enumerate(MODEL_PATHS):
    print(f"\nProcessing Fold {fold_idx + 1}/{len(MODEL_PATHS)}")

    # Pre-bind so the cleanup in `finally` works even when loading fails.
    model = None
    tokenizer = None
    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load model
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=num_classes,
            torch_dtype=torch.float16,
            device_map=None,
        ).to(device)

        model.eval()

        # Predict
        fold_probs = batch_predict(
            model,
            tokenizer,
            test['text'].tolist(),
            batch_size=8,
            max_length=256
        )

        all_fold_probs.append(fold_probs)
        print(f"✅ Fold {fold_idx + 1} completed")

    except Exception as e:
        # Best effort: a failed fold is reported and left out of the average.
        print(f"❌ Error in fold {fold_idx + 1}: {str(e)}")

    finally:
        # Release the fold's model on success AND failure, so a failed fold
        # does not keep GPU memory occupied while the next one loads.
        del model
        del tokenizer
        gc.collect()
        torch.cuda.empty_cache()

# Averaging an empty list would only fail later with an unrelated-looking
# axis error, so stop here with a clear message instead.
if not all_fold_probs:
    raise RuntimeError("Ettin inference failed for every fold; no predictions to ensemble")
if len(all_fold_probs) < len(MODEL_PATHS):
    print(f"⚠️ Only {len(all_fold_probs)}/{len(MODEL_PATHS)} folds succeeded; averaging those")

# ============================================
# Ensemble and Generate Output
# ============================================

print("\nEnsembling predictions...")
# Element-wise mean of the per-fold probability matrices.
ensemble_probs = np.mean(all_fold_probs, axis=0)

# Rank all classes per row by descending probability; the first 25 are exported.
top_indices = np.argsort(-ensemble_probs, axis=1)

# Decode to class names
flat_indices = top_indices.flatten()
decoded_labels = le.inverse_transform(flat_indices)
top_labels = decoded_labels.reshape(top_indices.shape)

# Create standard submission (top 3)
joined_preds = [" ".join(row[:3]) for row in top_labels]
sub = pd.DataFrame({
    "row_id": test.row_id.values,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission_ettin.csv", index=False)

# Create probability file for ensemble: prob_0..prob_24, row_id, top_classes.
prob_data = []
for i in range(len(test)):
    prob_dict = {f"prob_{j}": ensemble_probs[i, top_indices[i, j]] for j in range(25)}
    prob_dict['row_id'] = test.row_id.values[i]
    prob_dict['top_classes'] = " ".join(top_labels[i, :25])
    prob_data.append(prob_dict)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_ettin_probabilities.csv", index=False)

print("✅ Ettin inference completed!")
print(f"Generated: submission_ettin.csv and submission_ettin_probabilities.csv")

## Model 4: Hunyuan_7b

In [None]:
%%writefile hunyuan_inference_safe.py

import os
import sys

# Do not upgrade transformers inside this script; use the installed version.
print("Using current transformers version...")

# Set environment variables (done before torch is imported further down).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

import pandas as pd
import torch
import numpy as np
from argparse import Namespace
from sklearn.preprocessing import LabelEncoder
from scipy.special import softmax

# DataProcessor: label encoding, correct-answer lookup and prompt building.
class DataProcessor:
    """Prepare train/test frames for the sequence-classification model.

    ``preprocess()`` must run first: it loads both CSVs, fits the label
    encoder on "Category:Misconception" targets and builds the correct-answer
    lookup. ``inference_processor()`` then applies that lookup to the test
    frame and builds the prompt text.
    """

    def __init__(self, args):
        # args must expose .train_path and .test_path (read by load_data).
        self.args = args
        self.le = None
        self.isPreprocess = False
        self.correct_lookup = None

    def load_data(self):
        """Read the train and test CSV files named in ``self.args``."""
        self.train_df = pd.read_csv(self.args.train_path)
        self.test_df = pd.read_csv(self.args.test_path)

    def get_num_classes(self):
        """Return the number of distinct encoded labels in the train frame.

        Raises:
            ValueError: If ``preprocess()`` has not been called yet.
        """
        if not self.isPreprocess:
            # Raise instead of returning a message string: the result is used
            # as a class count, where a string would fail far from the cause.
            raise ValueError("please preprocess first")
        return self.train_df['label'].nunique()

    def get_label_encoder(self):
        """Return the fitted LabelEncoder.

        Raises:
            ValueError: If the encoder has not been created yet.
        """
        if self.le is None:
            raise ValueError("LabelEncoder not initialized.")
        return self.le

    @staticmethod
    def format_input(row):
        """Build the prompt for one row; 'IsCorrect' selects "Yes" or "No"."""
        correct_text = "Yes" if row['IsCorrect'] else "No"
        return (
            f"Question: {row['QuestionText']}\n"
            f"Answer: {row['MC_Answer']}\n"
            f"Correct? {correct_text}\n"
            f"Student Explanation: {row['StudentExplanation']}\n"
        )

    def preprocess(self):
        """Load the data, fit the label encoder and build the lookup.

        Returns:
            The train frame with 'target', 'IsCorrect', 'label' and 'text'
            columns added.
        """
        self.load_data()

        # Normalised target: "Category:Misconception", missing value -> 'NA'.
        self.train_df['Misconception'] = self.train_df['Misconception'].fillna('NA')
        self.train_df['target'] = self.train_df['Category'] + ':' + self.train_df['Misconception']

        # Correct answers: among rows whose Category starts with "True", keep
        # per question the (question, answer) pair with the highest count.
        correct_samples = self.train_df[self.train_df['Category'].str.startswith('True', na=False)].copy()
        correct_samples['count'] = correct_samples.groupby(['QuestionId', 'MC_Answer'])['MC_Answer'].transform('count')
        most_popular_correct = correct_samples.sort_values('count', ascending=False).drop_duplicates(['QuestionId'])
        self.correct_lookup = most_popular_correct[['QuestionId', 'MC_Answer']].copy()
        self.correct_lookup['IsCorrect_flag'] = True

        # A row is "correct" exactly when its pair matched the lookup.
        self.train_df = self.train_df.merge(self.correct_lookup, on=['QuestionId', 'MC_Answer'], how='left')
        self.train_df['IsCorrect'] = self.train_df['IsCorrect_flag'].notna()
        self.train_df = self.train_df.drop(columns=['IsCorrect_flag'])

        self.le = LabelEncoder()
        self.train_df['label'] = self.le.fit_transform(self.train_df['target'])
        self.train_df['text'] = self.train_df.apply(self.format_input, axis=1)

        self.isPreprocess = True
        return self.train_df

    def inference_processor(self):
        """Add 'IsCorrect' and 'text' to the test frame and return it.

        Raises:
            ValueError: If ``preprocess()`` has not been called yet.
        """
        if not self.isPreprocess:
            # Same reasoning as get_num_classes: fail loudly, not with a string.
            raise ValueError("Please preprocess first")
        self.test_df = self.test_df.merge(self.correct_lookup, on=['QuestionId', 'MC_Answer'], how='left')
        self.test_df['IsCorrect'] = self.test_df['IsCorrect_flag'].notna()
        self.test_df = self.test_df.drop(columns=['IsCorrect_flag'])
        self.test_df['text'] = self.test_df.apply(self.format_input, axis=1)
        return self.test_df

# Configuration: data files, LoRA adapter directory and base model directory.
args = Namespace(
    train_path='/kaggle/input/map-charting-student-math-misunderstandings/train.csv',
    test_path='/kaggle/input/map-charting-student-math-misunderstandings/test.csv',
    model_dir="/kaggle/input/hunyuan-7b-instruct-map",
    inference_model_dir="/kaggle/input/hunyuan-7b-instruct-map",
    model_name="/kaggle/input/hunyuan-7b-instruct-bf16",
)

print("Preprocessing data...")
DP = DataProcessor(args)
# Only the side effects are needed here (fitted encoder, lookup table); the
# returned train frame is not used.
DP.preprocess()
num_classes = DP.get_num_classes()
print(f"Number of classes: {num_classes}")

try:
    # Import the heavy dependencies inside the try so an import failure is
    # handled by the same fallback as a model failure.
    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
    from peft import PeftModel

    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(args.inference_model_dir, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load the base model, then attach the LoRA adapter weights.
    print("Loading base model...")
    base_model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        num_labels=num_classes,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True
    )

    print("Loading LoRA weights...")
    model = PeftModel.from_pretrained(base_model, args.inference_model_dir)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()

    # Tokenize function: every sequence is truncated/padded to MAX_LEN tokens.
    MAX_LEN = 256
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

    # Prepare the test data
    print("Preparing test data...")
    test_df = DP.inference_processor()
    ds_test = Dataset.from_pandas(test_df[['text']])
    ds_test = ds_test.map(tokenize_function, batched=True)

    # Inference settings
    inference_args = TrainingArguments(
        do_train=False,
        do_eval=True,
        output_dir="./results_hunyuan",
        per_device_eval_batch_size=16,
        fp16=True,
        bf16=False,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=inference_args,
        processing_class=tokenizer,
    )

    # Run inference and convert logits to probabilities.
    print("Running inference...")
    pred = trainer.predict(ds_test)
    probs = torch.nn.functional.softmax(torch.tensor(pred.predictions), dim=1).numpy()

    # ============================================
    # Generate the unified interface output
    # ============================================

    print("Generating unified interface output...")

    # Rank all classes per row by descending probability; export the first 25.
    TOP_K = 25
    top_indices = np.argsort(-probs, axis=1)

    # Decode class indices to class names
    flat_indices = top_indices.flatten()
    le = DP.get_label_encoder()
    decoded_labels = le.inverse_transform(flat_indices)
    top_labels = decoded_labels.reshape(top_indices.shape)

    # Create the standard submission file (top-3)
    top3_labels = top_labels[:, :3]
    joined_preds = [" ".join(row) for row in top3_labels]

    sub = pd.DataFrame({
        "row_id": test_df.row_id.values,
        "Category:Misconception": joined_preds
    })
    sub.to_csv("submission_hunyuan.csv", index=False)

    # Create the probability file: row_id, top_classes, prob_0..prob_24.
    prob_data = []
    for i in range(len(test_df)):
        prob_dict = {
            'row_id': test_df.row_id.values[i],
            'top_classes': " ".join(top_labels[i, :TOP_K])
        }
        for j in range(TOP_K):
            prob_dict[f'prob_{j}'] = probs[i, top_indices[i, j]]

        prob_data.append(prob_dict)

    prob_df = pd.DataFrame(prob_data)
    prob_df.to_csv("submission_hunyuan_probabilities.csv", index=False)

    print("✅ Hunyuan inference completed!")
    print(f"Generated files:")
    print(f"  - submission_hunyuan.csv")
    print(f"  - submission_hunyuan_probabilities.csv")

except Exception as e:
    # Top-level best-effort boundary: the failure is reported and the script
    # ends without raising. No output files are written on this path, so the
    # messages no longer announce placeholder files, and the full traceback
    # is printed so the cause can be diagnosed from the notebook log.
    import traceback

    print(f"❌ Error occurred: {e}")
    traceback.print_exc()
    print("⚠️ Hunyuan model failed, proceeding without it")

## Run inference

In [None]:
# Run the inference scripts one after another, pausing 10 s between them.
import time 
!python /kaggle/working/qwen3_4b_inference.py 
time.sleep(10)
!python /kaggle/working/qwen3_deepseek_inference.py
time.sleep(10)
!python /kaggle/working/ettin_inference.py
time.sleep(10)
# Upgrade transformers from the offline wheel directory right before the
# Hunyuan script runs.
# NOTE(review): presumably Hunyuan needs the newer transformers release -- confirm.
!pip install --upgrade --no-index --find-links=/kaggle/input/transformers-4-56-1-and-deps transformers -qq
!python /kaggle/working/hunyuan_inference_safe.py

## Ensemble 

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.special import softmax



def extract_class_probabilities(row, model_suffix='', top_k=25):
    """Map one model's top class names to their probabilities for one row.

    Args:
        row: Mapping-like row holding ``top_classes{suffix}`` (space-separated
            class names, best first) and ``prob_{i}{suffix}`` values.
        model_suffix: Column suffix identifying the model ('' for the first).
        top_k: Maximum number of classes to read.

    Returns:
        Dict of class name -> probability; empty when the row has no
        ``top_classes{suffix}`` entry. Ranks without a matching probability
        column are skipped.
    """
    classes_key = f'top_classes{model_suffix}'
    if classes_key not in row:
        return {}

    names = row[classes_key].split(' ')[:top_k]
    limit = min(top_k, len(names))

    result = {}
    for rank, name in enumerate(names[:max(limit, 0)]):
        prob_key = f'prob_{rank}{model_suffix}'
        if prob_key in row:
            result[name] = row[prob_key]
    return result


def ensemble_with_disagreement_handling(prob_files, model_weights=None, top_k=3):
    """Blend several models' top-class probability files into final labels.

    For every row each candidate class is scored as
    ``0.6 * weighted probability sum + 0.3 * share of models listing it
    + 0.1 * largest single weighted probability``, and the ``top_k``
    best-scoring class names are joined with spaces.

    Args:
        prob_files: Paths of per-model CSVs with 'row_id', 'top_classes' and
            'prob_0'... columns. Files are inner-joined on 'row_id', so the
            output follows the row order of the first file.
        model_weights: One weight per file, in the same order; extra trailing
            weights are ignored. ``None`` means equal weights of 1.0.
        top_k: Number of class names to keep per row.

    Returns:
        List with one space-joined prediction string per merged row.

    Raises:
        ValueError: If ``prob_files`` is empty or fewer weights than files
            are given.
    """
    n_models = len(prob_files)
    if n_models == 0:
        raise ValueError("prob_files must contain at least one file")
    if model_weights is None:
        # The documented default used to crash on model_weights[i].
        model_weights = [1.0] * n_models
    elif len(model_weights) < n_models:
        raise ValueError(
            f"got {len(model_weights)} model_weights for {n_models} prob_files"
        )

    prob_dfs = [pd.read_csv(file_path) for file_path in prob_files]

    # Merge on row_id. Columns of the first file keep their names; the file at
    # position i (1-based position i+1) gets the suffix '_model{i+1}'.
    merged_df = prob_dfs[0]
    for i, df in enumerate(prob_dfs[1:], 1):
        merged_df = pd.merge(merged_df, df, on='row_id', suffixes=('', f'_model{i+1}'))
    suffixes = [''] + [f'_model{i+1}' for i in range(1, n_models)]

    final_predictions = []
    for _, row in merged_df.iterrows():
        # Per-class statistics across models. The dicts are filled in model
        # order, then rank order, which later fixes the tie-break order.
        class_votes = defaultdict(int)
        class_total_prob = defaultdict(float)
        class_max_prob = defaultdict(float)

        for suffix, weight in zip(suffixes, model_weights):
            class_probs = extract_class_probabilities(row, suffix, top_k=25)
            for class_name, prob in class_probs.items():
                weighted = prob * weight
                class_votes[class_name] += 1
                class_total_prob[class_name] += weighted
                class_max_prob[class_name] = max(class_max_prob[class_name], weighted)

        final_scores = {}
        # Iterate in first-seen order rather than over a set, so equal scores
        # are ordered the same way on every run.
        for class_name in class_votes:
            # Base score: weighted SUM of probabilities (not normalised)
            base_score = class_total_prob[class_name]

            # Agreement: fraction of models that listed this class
            agreement_bonus = class_votes[class_name] / n_models

            # Confidence: largest single weighted probability
            confidence_bonus = class_max_prob[class_name]

            # Combined score
            final_scores[class_name] = (
                base_score * 0.6 +           # 60% base probs
                agreement_bonus * 0.3 +      # 30% agreement
                confidence_bonus * 0.1       # 10% confidence
            )

        # Sort (stable) by descending score and keep the top-k names
        sorted_classes = sorted(final_scores.items(), key=lambda x: -x[1])
        top_classes = [class_name for class_name, _ in sorted_classes[:top_k]]

        final_predictions.append(' '.join(top_classes))

    return final_predictions

# single models scores
# deepseek math 7b - 0.944
# qwen3 8b - 0.943
# ettin encoder 1b - 0.944
# hunyuan_inference lb - 0.945
w0 = 1.0
w1 = 1.0  
w2 = 1.0  
w3 = 1.0  
w4 = 1.1  

import os  # local import: this cell's import block does not bring in os

# Each probability file paired with its ensemble weight.
candidate_models = [
    ('/kaggle/working/submission_qwen3_4b_probabilities.csv', w0),
    ('/kaggle/working/submission_deepseek_probabilities.csv', w1),
    ('/kaggle/working/submission_qwen3_probabilities.csv', w2),
    ('/kaggle/working/submission_ettin_probabilities.csv', w3),
    ('/kaggle/working/submission_hunyuan_probabilities.csv', w4),
]

# An inference script that failed leaves no probability file behind. Drop
# such models together with their weights instead of crashing in read_csv.
prob_files = []
model_weights = []
for path, weight in candidate_models:
    if os.path.exists(path):
        prob_files.append(path)
        model_weights.append(weight)
    else:
        print(f"⚠️ Missing probability file, excluded from ensemble: {path}")

if not prob_files:
    raise FileNotFoundError("No model probability files found; nothing to ensemble")


predictions = ensemble_with_disagreement_handling(
        prob_files, 
        model_weights=model_weights,  
        top_k=3
    )
    
test_df = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

# One prediction string per test row, written in the competition format.
submission = pd.DataFrame({
    'row_id': test_df.row_id.values,
    'Category:Misconception': predictions
})

submission.to_csv('submission.csv', index=False)
submission