In [1]:
# Offline installs: --no-index restricts pip to the wheels found under the
# attached /kaggle/input/pymupdf dataset; all output is discarded.
!pip install --no-index --find-links /kaggle/input/pymupdf/pymupdf PyMuPDF > /dev/null 2>&1
!pip install --no-index --find-links /kaggle/input/pymupdf/vllm/transformers-4.53.3-py3-none-any.whl > /dev/null 2>&1
!pip install --no-index --find-links /kaggle/input/pymupdf/vllm vllm > /dev/null 2>&1
!pip install --no-index --find-links /kaggle/input/pymupdf/logits_processor_zoo logits-processor-zoo==0.1.10 > /dev/null 2>&1
!pip install --no-index --find-links /kaggle/input/pymupdf/triton triton==3.2.0 > /dev/null 2>&1
# NOTE(review): unlike the installs above this one is unpinned and has no
# --no-index; the recorded output shows it downloading jsonlines 4.0.0, so it
# needs network access - confirm that is intended.
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [2]:

# Force the 'spawn' start method before anything else is imported.
# NOTE(review): presumably needed so vLLM worker processes can initialise CUDA
# in a fresh interpreter - confirm.
import multiprocessing as mp
mp.set_start_method('spawn', force=True)

import os
import pandas as pd
import numpy as np
import torch
import re
import warnings
import json
import jsonlines
from typing import List, Dict, Any, Optional, Tuple
from tqdm import tqdm
from dataclasses import dataclass
from enum import Enum
import time
from datetime import datetime

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Must be set before vLLM is imported below; the recorded run log reports
# "Initializing a V0 LLM engine".
os.environ["VLLM_USE_V1"] = "0"

# Optional dependency: vLLM provides LLM/SamplingParams for batched generation.
# HAS_VLLM records whether the import succeeded so later code can branch on it.
try:
    from vllm import LLM, SamplingParams
    HAS_VLLM = True
    print(" vLLM available - Qwen3 batch inference enabled")
except ImportError:
    HAS_VLLM = False
    print("vLLM not available - Using sequential processing")

# Optional dependency: Hugging Face transformers, used for one-prompt-at-a-time
# generation when the sequential backend is selected.
try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    HAS_TRANSFORMERS = True
    print(" Transformers available for fallback")
except ImportError:
    HAS_TRANSFORMERS = False
    print(" Transformers not available")

# Informational only: no code visible in this notebook reads `device` again.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'  Device: {device}')

class Qwen3ModelType(Enum):
    """Inference backend selector read by Qwen3MedicalProcessor."""
    # Routed to the vLLM setup and batched generate path.
    BATCH = "qwen3_batch"      
    # Routed to the transformers setup and per-prompt generate path.
    SEQUENTIAL = "qwen3_sequential"  

@dataclass
class Qwen3Config:
    """Settings bundle handed to Qwen3MedicalProcessor.

    `name`, `type` and `model_path` are required; every other field has a
    default.
    """
    # Label used in log messages and stored in each result's 'model' field.
    name: str
    # Which backend the processor should initialise.
    type: Qwen3ModelType
    # Filesystem path of the model weights/tokenizer.
    model_path: str
    
    # Processing parameters
    # NOTE(review): batch_size is stored here but the processor class never
    # reads it; batching is done by the caller.
    batch_size: int = 8
    # Generation length cap (vLLM max_tokens / transformers max_new_tokens).
    max_tokens: int = 300
    # Sampling temperature passed to both backends.
    temperature: float = 0.05
    
    # vLLM specific parameters
    # None (or 0) means "derive from the visible GPU count, capped at 4".
    tensor_parallel_size: Optional[int] = None
    gpu_memory_utilization: float = 0.85
    max_model_len: int = 2048
    enable_prefix_caching: bool = True
    
    # Medical safety parameters
    # NOTE(review): neither threshold is read by any code visible in this
    # notebook - confirm whether they are still needed.
    safety_threshold: float = 0.8
    confidence_threshold: float = 0.6

class MedicalPromptEngine:
    """Source of the system prompts and few-shot examples used for Qwen3."""

    @staticmethod
    def create_system_prompt(domain: str = "general") -> str:
        """Return the base system prompt, extended for a recognised domain.

        Args:
            domain: One of "pediatric", "geriatric", "pregnancy" or
                "pharmacogenomics"; any other value yields the base prompt.

        Returns:
            The system prompt text.
        """
        system_prompt = "\n".join((
            "You are an expert medical AI assistant specializing in drug decision-making and precision therapeutics. ",
            "You have deep knowledge of pharmacology, drug interactions, contraindications, and treatment guidelines.",
            "Always prioritize patient safety and provide evidence-based medical recommendations.",
        ))

        expertise_by_domain = {
            "pediatric": "You are a pediatric medicine expert. Consider age-appropriate dosing, Reye's syndrome risks, and developmental pharmacokinetics.",
            "geriatric": "You are a geriatric medicine expert. Consider age-related pharmacokinetic changes, polypharmacy risks, and organ function decline.",
            "pregnancy": "You are a maternal-fetal medicine expert. Consider pregnancy categories, teratogenic risks, and maternal-fetal drug transfer.",
            "pharmacogenomics": "You are a pharmacogenomics expert. Consider CYP enzyme variants, genetic polymorphisms, and personalized dosing.",
        }

        if domain in expertise_by_domain:
            system_prompt += "\n\nSPECIALIZATION: " + expertise_by_domain[domain]
        return system_prompt

    @staticmethod
    def get_medical_examples() -> List[Dict[str, str]]:
        """Return three worked multiple-choice examples.

        Each entry is a dict with the keys 'question', 'options', 'reasoning'
        and 'answer', in that order. Fresh objects are built on every call.
        """
        field_names = ("question", "options", "reasoning", "answer")
        rows = (
            (
                "A 8-year-old child with fever (102°F) needs pain relief. Which medication is safest?",
                ["A: Aspirin 325mg", "B: Ibuprofen 100mg", "C: Acetaminophen 160mg", "D: No medication"],
                "For pediatric fever management, aspirin is absolutely contraindicated due to Reye's syndrome risk. Acetaminophen is the safest first-line antipyretic for children, with appropriate weight-based dosing.",
                "C",
            ),
            (
                "A patient is a poor CYP2D6 metabolizer taking antidepressants. Which requires immediate dose reduction?",
                ["A: Sertraline", "B: Citalopram", "C: Venlafaxine", "D: Paroxetine"],
                "Poor CYP2D6 metabolizers cannot effectively metabolize paroxetine, leading to significantly increased plasma levels and toxicity risk. Dose reduction by 50% is typically required.",
                "D",
            ),
            (
                "A pregnant woman in her first trimester has severe nausea. Which antiemetic is safest?",
                ["A: Ondansetron", "B: Metoclopramide", "C: Pyridoxine (Vitamin B6)", "D: Promethazine"],
                "During first trimester, teratogenic risk is highest. Pyridoxine (Vitamin B6) is pregnancy category A and is the safest first-line treatment for pregnancy-related nausea.",
                "C",
            ),
        )
        return [dict(zip(field_names, row)) for row in rows]

class Qwen3MedicalProcessor:
    """Qwen3-based medical question processor with batch optimization"""
    
    def __init__(self, config: Qwen3Config):
        """Store the configuration, load the model and prepare sampling settings.

        Args:
            config: Backend selection, model path and generation parameters.

        After construction `model_available` tells whether a model was loaded.
        `sampling_params` is a vLLM `SamplingParams` object when vLLM could be
        imported and `None` otherwise.
        """
        self.config = config
        self.model = None
        self.tokenizer = None
        self.model_available = False
        self.prompt_engine = MedicalPromptEngine()
        
        # Initialize model
        self._initialize_model()
        
        # SamplingParams is only bound when the optional vLLM import succeeded.
        # Building it unconditionally raised NameError and broke the
        # transformers fallback on machines without vLLM.
        self.sampling_params = None
        if HAS_VLLM:
            self.sampling_params = SamplingParams(
                seed=42,
                temperature=config.temperature,
                max_tokens=config.max_tokens,
                skip_special_tokens=False,
                top_p=0.95,
                repetition_penalty=1.05,
                frequency_penalty=0.1
            )
    
    def _initialize_model(self):
        """Load the backend requested by the config, if its library is present.

        Any exception raised while loading is reported with print and leaves
        `model_available` False; nothing is re-raised.
        """
        try:
            requested = self.config.type
            if requested == Qwen3ModelType.BATCH and HAS_VLLM:
                loader = self._setup_vllm_batch_processing
            elif requested == Qwen3ModelType.SEQUENTIAL and HAS_TRANSFORMERS:
                loader = self._setup_transformers_sequential
            else:
                loader = None

            if loader is not None:
                loader()
            else:
                # Requested backend's library is missing: stay model-less.
                print(f"  {requested} not available, model will use fallback logic")
                self.model_available = False

        except Exception as err:
            print(f" Failed to initialize {self.config.name}: {err}")
            self.model_available = False
    
    def _setup_vllm_batch_processing(self):
        """Setup vLLM for high-performance batch processing"""
        if not os.path.exists(self.config.model_path):
            raise FileNotFoundError(f"Qwen3 model not found: {self.config.model_path}")
        
        print(f" Initializing {self.config.name} with vLLM batch processing...")
        
        # Optimize tensor parallelism
        tensor_parallel_size = self.config.tensor_parallel_size or min(torch.cuda.device_count(), 4)
        
        self.model = LLM(
            self.config.model_path,
            tensor_parallel_size=tensor_parallel_size,
            gpu_memory_utilization=self.config.gpu_memory_utilization,
            trust_remote_code=True,
            dtype="half",
            enforce_eager=True,
            max_model_len=self.config.max_model_len,
            disable_log_stats=True,
            enable_prefix_caching=self.config.enable_prefix_caching,
            swap_space=4    
        )
        
        self.tokenizer = self.model.get_tokenizer()
        self.model_available = True
        print(f" {self.config.name} ready for batch processing (TP={tensor_parallel_size})")
    
    def _setup_transformers_sequential(self):
        """Load tokenizer and causal LM through transformers for per-prompt generation."""
        cfg = self.config
        print(f" Initializing {cfg.name} with transformers (sequential)...")

        self.tokenizer = AutoTokenizer.from_pretrained(
            cfg.model_path, trust_remote_code=True, use_fast=True
        )

        # fp16 weights, placement delegated to device_map="auto".
        load_options = {
            "trust_remote_code": True,
            "torch_dtype": torch.float16,
            "device_map": "auto",
            "low_cpu_mem_usage": True,
        }
        self.model = AutoModelForCausalLM.from_pretrained(cfg.model_path, **load_options)

        self.model_available = True
        print(f" {cfg.name} ready for sequential processing")
    
    def detect_medical_domain(self, question: str) -> str:
        """Intelligent medical domain detection"""
        question_lower = question.lower()
        
        # Domain detection with priority order
        domain_patterns = {
            "pediatric": ['child', 'pediatric', 'year-old', 'infant', 'juvenile', 'adolescent'],
            "geriatric": ['elderly', 'geriatric', 'old', 'senior', 'aged'],
            "pregnancy": ['pregnant', 'pregnancy', 'lactating', 'breastfeeding', 'maternal'],
            "pharmacogenomics": ['cyp', 'metabolizer', 'genetic', 'polymorphism', 'variant']
        }
        
        for domain, keywords in domain_patterns.items():
            if any(keyword in question_lower for keyword in keywords):
                return domain
        
        return "general"
    
    def create_medical_prompt(self, question: str, options: List[str], question_type: str) -> str:
        """Create optimized medical prompt with domain expertise"""
        
        # Detect medical domain for specialized prompting
        domain = self.detect_medical_domain(question)
        
        # Create domain-specific system prompt
        system_content = self.prompt_engine.create_system_prompt(domain)
        
        # Add relevant medical examples
        examples = self.prompt_engine.get_medical_examples()
        examples_text = ""
        
        # Select most relevant examples (limit to 2 for efficiency)
        relevant_examples = examples[:2]
        
        for i, example in enumerate(relevant_examples, 1):
            examples_text += f"\n--- Example {i} ---\n"
            examples_text += f"Question: {example['question']}\n"
            examples_text += f"Options: {' | '.join(example['options'])}\n"
            examples_text += f"Medical Reasoning: {example['reasoning']}\n"
            examples_text += f"Answer: {example['answer']}\n"
        
        # Create main question with structured format
        user_content = f"""{examples_text}

--- Current Medical Question ---

Question: {question}

Options:
{chr(10).join([f"{chr(65+i)}. {option}" for i, option in enumerate(options)])}

INSTRUCTIONS:
Analyze this medical question considering:
1. PATIENT SAFETY (highest priority)
2. Drug interactions and contraindications  
3. Age-specific pharmacokinetics
4. Genetic factors and metabolism
5. Evidence-based guidelines

Provide your response in this exact format:
MEDICAL_REASONING: [Your detailed step-by-step medical analysis]
FINAL_ANSWER: [Single letter: A, B, C, or D]

Response:"""
        
        # Apply Qwen3 chat template
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content}
        ]
        
        if self.tokenizer:
            try:
                prompt = self.tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=False
                )
                return prompt
            except Exception as e:
                return f"<|im_start|>system\n{system_content}<|im_end|>\n<|im_start|>user\n{user_content}<|im_end|>\n<|im_start|>assistant\n"
        else:
            return f"System: {system_content}\n\nUser: {user_content}\n\nAssistant:"
    
    def extract_medical_response(self, response_text: str, options: List[str]) -> Tuple[str, str, float]:
        """Advanced medical response extraction with confidence scoring"""
        try:
            response_text = response_text.strip()
            
            # Initialize defaults
            reasoning = ""
            answer = "A"
            confidence = 0.5
            
            # Primary extraction: structured format
            if "MEDICAL_REASONING:" in response_text and "FINAL_ANSWER:" in response_text:
                parts = response_text.split("FINAL_ANSWER:")
                reasoning = parts[0].replace("MEDICAL_REASONING:", "").strip()
                answer_part = parts[1].strip()
                
                # Extract letter answer with multiple patterns
                answer_patterns = [
                    r'^[(\[]?([A-D])[)\].]?',  # (A), [B], C., D
                    r'\b([A-D])\b',            # Standalone letter
                    r'([A-D])(?:\s|$)'         # Letter followed by space or end
                ]
                
                for pattern in answer_patterns:
                    match = re.search(pattern, answer_part.upper())
                    if match:
                        answer = match.group(1)
                        break
            
            # Fallback extraction methods
            elif "REASONING:" in response_text and "ANSWER:" in response_text:
                parts = response_text.split("ANSWER:")
                reasoning = parts[0].replace("REASONING:", "").strip()
                answer_part = parts[1].strip()
                
                answer_match = re.search(r'^[(\[]?([A-D])[)\].]?', answer_part.upper())
                if answer_match:
                    answer = answer_match.group(1)
            
            else:
                reasoning = response_text
                for i, option in enumerate(options):
                    letter = chr(65 + i)
                    if letter in response_text.upper():
                        answer = letter
                        break
            
            confidence = self._calculate_medical_confidence(response_text, reasoning)
            
            return reasoning, answer, confidence
            
        except Exception as e:
            return "Error in response parsing - using safety fallback", "A", 0.1
    
    def _calculate_medical_confidence(self, response_text: str, reasoning: str) -> float:
        """Calculate confidence based on medical reasoning quality"""
        confidence = 0.5  # Base confidence
        
        # Medical expertise indicators (increase confidence)
        medical_quality_indicators = [
            'contraindication', 'interaction', 'pharmacokinetics', 'metabolism',
            'safety', 'efficacy', 'adverse', 'therapeutic', 'dosing',
            'renal', 'hepatic', 'cardiac', 'pregnancy category'
        ]
        
        for indicator in medical_quality_indicators:
            if indicator in response_text.lower():
                confidence += 0.04
        
        # Structured reasoning (increase confidence)
        reasoning_sentences = len([s for s in reasoning.split('.') if len(s.strip()) > 10])
        if reasoning_sentences >= 3:
            confidence += 0.15
        elif reasoning_sentences >= 2:
            confidence += 0.10
        
        # Safety considerations (increase confidence)
        safety_indicators = ['patient safety', 'risk', 'benefit', 'contraindicated']
        for indicator in safety_indicators:
            if indicator in response_text.lower():
                confidence += 0.05
        
        # Uncertainty indicators (decrease confidence)
        uncertainty_indicators = ['maybe', 'possibly', 'unclear', 'uncertain', 'might be']
        for indicator in uncertainty_indicators:
            if indicator in response_text.lower():
                confidence -= 0.08
        
        # Ensure confidence is within valid range
        return min(max(confidence, 0.1), 0.95)
    
    def process_batch(self, questions_data: List[Dict]) -> List[Dict]:
        if not self.model_available:
            return [self._medical_safety_fallback(data) for data in questions_data]
        
        try:
            # Create optimized prompts for entire batch
            prompts = []
            for data in questions_data:
                question = data['question']
                options = [data['options'][key] for key in sorted(data['options'].keys())]
                question_type = data.get('question_type', 'multi_choice')
                
                prompt = self.create_medical_prompt(question, options, question_type)
                prompts.append(prompt)
            
            # Process based on model type
            if self.config.type == Qwen3ModelType.BATCH:
                return self._process_vllm_batch(prompts, questions_data)
            else:
                return self._process_sequential_batch(prompts, questions_data)
                
        except Exception as e:
            return [self._medical_safety_fallback(data) for data in questions_data]
    
    def _process_vllm_batch(self, prompts: List[str], questions_data: List[Dict]) -> List[Dict]:
        """Process batch using vLLM for maximum performance"""
        try:
            outputs = self.model.generate(prompts, self.sampling_params)
            
            results = []
            for i, (output, data) in enumerate(zip(outputs, questions_data)):
                try:
                    response = output.outputs[0].text.strip()
                    options = [data['options'][key] for key in sorted(data['options'].keys())]
                    reasoning, answer, confidence = self.extract_medical_response(response, options)
                    
                    results.append({
                        'id': data['id'],
                        'reasoning': reasoning,
                        'answer': answer,
                        'confidence': confidence,
                        'model': self.config.name,
                        'domain': self.detect_medical_domain(data['question']),
                        'raw_response': response
                    })
                except Exception as e:
                    results.append(self._medical_safety_fallback(data))
            
            return results
            
        except Exception as e:
            return [self._medical_safety_fallback(data) for data in questions_data]
    
    def _process_sequential_batch(self, prompts: List[str], questions_data: List[Dict]) -> List[Dict]:
        results = []
        
        for prompt, data in zip(prompts, questions_data):
            try:
                inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
                inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
                
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=self.config.max_tokens,
                        temperature=self.config.temperature,
                        do_sample=True,
                        top_p=0.95,
                        pad_token_id=self.tokenizer.eos_token_id
                    )
                
                response = self.tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
                options = [data['options'][key] for key in sorted(data['options'].keys())]
                reasoning, answer, confidence = self.extract_medical_response(response, options)
                
                results.append({
                    'id': data['id'],
                    'reasoning': reasoning,
                    'answer': answer,
                    'confidence': confidence,
                    'model': self.config.name,
                    'domain': self.detect_medical_domain(data['question']),
                    'raw_response': response
                })
                
            except Exception as e:
                results.append(self._medical_safety_fallback(data))
        
        return results
    
    def _medical_safety_fallback(self, data: Dict) -> Dict:
        question = data['question'].lower()
        options = data['options']
        
        safety_score = 0.6
        selected_answer = "A"
        
        # Pediatric safety protocols
        if any(keyword in question for keyword in ['child', 'pediatric', 'year-old', 'infant']):
            reasoning += "PEDIATRIC PATIENT - Enhanced safety protocols. "
            
            for key, option in options.items():
                option_lower = option.lower()
                # Avoid aspirin in children (Reye's syndrome)
                if 'aspirin' in option_lower:
                    reasoning += "Avoiding aspirin (Reye's syndrome risk). "
                    continue
                # Prefer acetaminophen for children
                elif 'acetaminophen' in option_lower or 'tylenol' in option_lower:
                    selected_answer = key
                    safety_score = 0.8
                    reasoning += f"Selected pediatric-safe option {key} (acetaminophen). "
                    break
                # Ibuprofen as second choice
                elif 'ibuprofen' in option_lower:
                    selected_answer = key
                    safety_score = 0.7
                    reasoning += f"Selected option {key} (ibuprofen - acceptable for children >6mo). "
        
        # Poor metabolizer safety protocols
        elif 'poor metabolizer' in question or 'cyp' in question:
            reasoning += "GENETIC VARIANT DETECTED - Dose adjustment required. "
            
            for key, option in options.items():
                option_lower = option.lower()
                if any(word in option_lower for word in ['low', 'reduced', 'half', '50%', 'alternative']):
                    selected_answer = key
                    safety_score = 0.85
                    reasoning += f"Selected dose-adjusted option {key} for poor metabolizer. "
                    break
        
        # Pregnancy safety protocols
        elif any(keyword in question for keyword in ['pregnant', 'pregnancy', 'lactating']):
            reasoning += "PREGNANCY DETECTED - Teratogenic risk assessment. "
            
            for key, option in options.items():
                option_lower = option.lower()
                if any(safe_word in option_lower for safe_word in ['category a', 'category b', 'vitamin', 'folic']):
                    selected_answer = key
                    safety_score = 0.8
                    reasoning += f"Selected pregnancy-safe option {key}. "
                    break
        
        # Elderly patient protocols
        elif any(keyword in question for keyword in ['elderly', 'geriatric', 'old']):
            reasoning += "GERIATRIC PATIENT - Reduced clearance considerations. "
            
            for key, option in options.items():
                option_lower = option.lower()
                if any(word in option_lower for word in ['low', 'reduced', 'geriatric']):
                    selected_answer = key
                    safety_score = 0.75
                    reasoning += f"Selected age-appropriate option {key}. "
                    break
        
        return {
            'id': data['id'],
            'reasoning': reasoning,
            'answer': selected_answer,
            'confidence': safety_score,
            'model': 'medical_safety_fallback',
            'domain': self.detect_medical_domain(data['question']),
            'raw_response': reasoning
        }


def process_cure_bench_qwen3(
    input_file: str,
    output_file: str,
    batch_size: int = 28,
    model_path: str = "/kaggle/input/qwen-3/transformers/32b-awq/1"
) -> pd.DataFrame:
    """Run the Qwen3 pipeline over a CURE-Bench JSONL file and write a submission.

    Writes three files next to `output_file`: the CSV itself,
    `<stem>_meta_data.json` and `<stem>_submission.zip`, where `<stem>` is
    `output_file` without its extension.

    Changes over the previous version:
    * a failure after results were already collected no longer appends
      fallback rows for the same batch (duplicate ids);
    * questions are indexed by id once instead of scanned per result;
    * side-file names come from `os.path.splitext`, so an `output_file`
      without a lowercase ".csv" no longer has its CSV overwritten by JSON.

    Args:
        input_file: Path of the JSONL question file.
        output_file: Path of the CSV to write.
        batch_size: Number of questions sent to the processor at once.
        model_path: Filesystem path of the Qwen3 model.

    Returns:
        The submission DataFrame with columns id, prediction, choice, reasoning.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    import zipfile

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print(" Loading CURE-Bench medical dataset...")
    
    # Load questions
    with jsonlines.open(input_file) as reader:
        questions_data = list(reader)
    
    print(f"📊 Loaded {len(questions_data)} medical questions")
    
    # Configure Qwen3 processor
    qwen3_config = Qwen3Config(
        name="qwen3_32b_medical",
        type=Qwen3ModelType.BATCH if HAS_VLLM else Qwen3ModelType.SEQUENTIAL,
        model_path=model_path,
        batch_size=batch_size,
        max_tokens=300,
        temperature=0.05,
        safety_threshold=0.8,
        gpu_memory_utilization=0.9,
        confidence_threshold=0.6
    )
    
    # Initialize processor
    processor = Qwen3MedicalProcessor(qwen3_config)
    
    if not processor.model_available:
        print(" Qwen3 model not available - using medical safety fallbacks only")
    
    # Process in batches
    all_results = []
    num_batches = (len(questions_data) + batch_size - 1) // batch_size
    
    print(f"⚡ Processing {len(questions_data)} questions in {num_batches} batches...")
    
    with tqdm(total=len(questions_data), desc="🏥 CURE-Bench Processing") as pbar:
        for batch_number, start in enumerate(range(0, len(questions_data), batch_size), 1):
            batch_data = questions_data[start:start + batch_size]
            
            # Only the model call is guarded, so results are appended exactly
            # once per batch whichever path produced them.
            try:
                batch_results = processor.process_batch(batch_data)
            except Exception as e:
                print(f" Batch {batch_number}/{num_batches} failed ({type(e).__name__}: {e}) - using medical safety fallbacks")
                batch_results = [processor._medical_safety_fallback(data) for data in batch_data]
            
            all_results.extend(batch_results)
            pbar.update(len(batch_data))
            
            # Progress-bar statistics; purely informational.
            confidences = [r["confidence"] for r in batch_results if "confidence" in r]
            if confidences:
                distinct_domains = {r.get("domain", "general") for r in batch_results}
                pbar.set_postfix({
                    'Batch': f'{batch_number}/{num_batches}',
                    'Conf': f'{np.mean(confidences):.3f}',
                    'Domains': f'{len(distinct_domains)}'
                })
    
    # Index questions by id; setdefault keeps the first occurrence of an id.
    questions_by_id = {}
    for question in questions_data:
        questions_by_id.setdefault(question['id'], question)
    
    # Create submission DataFrame with OFFICIAL CURE-Bench format
    submission_data = []
    for result in all_results:
        question_type = questions_by_id.get(result['id'], {}).get('question_type', 'multi_choice')
        
        # Open-ended questions carry no letter choice.
        if question_type == "open_ended":
            choice_value = "NOTAVALUE"  # Official requirement for open-ended
        else:
            choice_value = result['answer']  # A, B, C, D for multiple choice
        
        submission_data.append({
            'id': str(result['id']),
            'prediction': str(result['reasoning']),  # Full reasoning as prediction
            'choice': str(choice_value),
            'reasoning': str(result['reasoning'])  # Same as prediction for internal reasoning
        })
    
    df_submission = pd.DataFrame(submission_data)
    
    # Official CURE-Bench NULL value cleaning
    null_replacements = {
        'id': 'unknown_id',
        'prediction': 'No prediction available',
        'choice': 'NOTAVALUE',
        'reasoning': 'No reasoning available'
    }
    null_like_values = ['nan', 'NaN', 'None', 'null', 'NULL', '<NA>', 'nat', 'NaT', '']
    
    for col in df_submission.columns:
        replacement = null_replacements.get(col, 'NOTAVALUE')
        # Real nulls first, then cells whose whole text is a null-like string.
        df_submission[col] = df_submission[col].fillna(replacement).replace(null_like_values, replacement)
        
        # The choice column must additionally never be a lone space.
        if col == 'choice':
            df_submission[col] = df_submission[col].replace(' ', 'NOTAVALUE')
    
    # Save CSV file
    csv_path = output_file
    df_submission.to_csv(csv_path, index=False, na_rep='NOTAVALUE', quoting=1)
    
    # Create official CURE-Bench metadata
    metadata = {
        "meta_data": {
            "model_name": "qwen3_32b_medical",
            "track": "internal_reasoning",
            "model_type": "LocalModel", 
            "base_model_type": "OpenWeighted",
            "base_model_name": "Qwen3-32B-AWQ",
            "dataset": "cure_bench_pharse_1",
            "additional_info": "Qwen3 with medical domain specialization and batch processing"
        }
    }
    
    # Derive side-file names from the stem so they can never equal csv_path.
    stem, _ = os.path.splitext(output_file)
    metadata_path = f"{stem}_meta_data.json"
    zip_path = f"{stem}_submission.zip"
    
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(csv_path, "submission.csv")
        zipf.write(metadata_path, "meta_data.json")
    
    return df_submission
    
 

# Run configuration: input JSONL, output CSV name, batch size and model path.
# batch_size and model_path repeat the function's own default values.
input_file = "/kaggle/input/cure-bench/curebench_testset_phase1.jsonl"
output_file = "qwen3_cure_bench_submission.csv"
batch_size = 28 
model_path = "/kaggle/input/qwen-3/transformers/32b-awq/1"


# Runs the whole pipeline; the function writes the CSV, metadata JSON and ZIP
# itself and returns the submission DataFrame.
submission_df = process_cure_bench_qwen3(
    input_file=input_file,
    output_file=output_file,
    batch_size=batch_size,
    model_path=model_path
)

# Kept disabled: the function above already saves the CSV.
# submission_df.to_csv(output_file, index=False)

 


2025-08-03 15:07:46.983770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754233667.194630      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754233667.251403      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


INFO 08-03 15:08:00 [__init__.py:235] Automatically detected platform cuda.
 vLLM available - Qwen3 batch inference enabled
 Transformers available for fallback
  Device: cuda
 Loading CURE-Bench medical dataset...
📊 Loaded 2079 medical questions
 Initializing qwen3_32b_medical with vLLM batch processing...
INFO 08-03 15:08:16 [config.py:1604] Using max model len 2048
INFO 08-03 15:08:18 [llm_engine.py:228] Initializing a V0 LLM engine (v0.10.0) with config: model='/kaggle/input/qwen-3/transformers/32b-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen-3/transformers/32b-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_conf

2025-08-03 15:08:23.414098: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754233703.434831     101 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754233703.441265     101 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


INFO 08-03 15:08:28 [__init__.py:235] Automatically detected platform cuda.
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:08:30 [multiproc_worker_utils.py:226] Worker ready; awaiting tasks
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:08:30 [cuda.py:346] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:08:30 [cuda.py:395] Using XFormers backend.


[W803 15:08:41.308442647 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W803 15:08:41.651379225 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W803 15:08:51.317434496 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 08-03 15:09:01 [__init__.py:1375] Found nccl from library libnccl.so.2
INFO 08-03 15:09:01 [pynccl.py:70] vLLM is using nccl==2.26.2


[W803 15:09:01.327869039 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:09:01 [__init__.py:1375] Found nccl from library libnccl.so.2
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:09:01 [pynccl.py:70] vLLM is using nccl==2.26.2
INFO 08-03 15:09:01 [custom_all_reduce_utils.py:208] generating GPU P2P access cache in /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 08-03 15:09:24 [custom_all_reduce_utils.py:246] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:09:24 [custom_all_reduce_utils.py:246] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 08-03 15:09:24 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_f98610b3'), local_subscribe_addr='ipc:///tmp/ae032088-06fa-4380-bd5b-c3997aa4c9a1', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 08-03 15:09:24 [parallel_state

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:12:07 [default_loader.py:262] Loading weights took 162.00 seconds
INFO 08-03 15:12:08 [default_loader.py:262] Loading weights took 162.20 seconds
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:12:08 [model_runner.py:1115] Model loading took 9.0570 GiB and 162.350997 seconds
INFO 08-03 15:12:09 [model_runner.py:1115] Model loading took 9.0570 GiB and 162.558890 seconds
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:12:19 [worker.py:295] Memory profiling takes 9.59 seconds
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:12:19 [worker.py:295] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 15:12:19 [worker.py:295] model weights take 9.06GiB; non_torch_memory takes 0.11GiB; PyTorch activation peak memory takes 0.43GiB; the rest of the memory reserved for KV Cache is 3.68GiB.
INFO 08-03 15:12:19 [worker.p

🏥 CURE-Bench Processing:   0%|          | 0/2079 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   1%|▏         | 28/2079 [01:41<2:04:06,  3.63s/it, Batch=1/75, Conf=0.630, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   3%|▎         | 56/2079 [03:16<1:57:23,  3.48s/it, Batch=2/75, Conf=0.630, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   4%|▍         | 84/2079 [04:50<1:54:16,  3.44s/it, Batch=3/75, Conf=0.640, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   5%|▌         | 112/2079 [06:24<1:51:25,  3.40s/it, Batch=4/75, Conf=0.630, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   7%|▋         | 140/2079 [08:02<1:51:06,  3.44s/it, Batch=5/75, Conf=0.627, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   8%|▊         | 168/2079 [09:38<1:49:21,  3.43s/it, Batch=6/75, Conf=0.648, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:   9%|▉         | 196/2079 [11:13<1:47:15,  3.42s/it, Batch=7/75, Conf=0.669, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  11%|█         | 224/2079 [12:47<1:45:10,  3.40s/it, Batch=8/75, Conf=0.629, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  12%|█▏        | 252/2079 [14:23<1:43:38,  3.40s/it, Batch=9/75, Conf=0.672, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  13%|█▎        | 280/2079 [15:57<1:41:44,  3.39s/it, Batch=10/75, Conf=0.603, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  15%|█▍        | 308/2079 [17:33<1:40:24,  3.40s/it, Batch=11/75, Conf=0.654, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  16%|█▌        | 336/2079 [19:10<1:39:21,  3.42s/it, Batch=12/75, Conf=0.646, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  18%|█▊        | 364/2079 [20:44<1:37:21,  3.41s/it, Batch=13/75, Conf=0.618, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  19%|█▉        | 392/2079 [22:19<1:35:37,  3.40s/it, Batch=14/75, Conf=0.611, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  20%|██        | 420/2079 [23:54<1:33:57,  3.40s/it, Batch=15/75, Conf=0.648, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  22%|██▏       | 448/2079 [25:29<1:32:20,  3.40s/it, Batch=16/75, Conf=0.645, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  23%|██▎       | 476/2079 [27:04<1:30:48,  3.40s/it, Batch=17/75, Conf=0.654, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  24%|██▍       | 504/2079 [28:40<1:29:19,  3.40s/it, Batch=18/75, Conf=0.635, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  26%|██▌       | 532/2079 [30:15<1:27:43,  3.40s/it, Batch=19/75, Conf=0.651, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  27%|██▋       | 560/2079 [31:50<1:25:54,  3.39s/it, Batch=20/75, Conf=0.633, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  28%|██▊       | 588/2079 [33:24<1:24:02,  3.38s/it, Batch=21/75, Conf=0.649, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  30%|██▉       | 616/2079 [34:59<1:22:33,  3.39s/it, Batch=22/75, Conf=0.625, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  31%|███       | 644/2079 [36:34<1:21:06,  3.39s/it, Batch=23/75, Conf=0.626, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  32%|███▏      | 672/2079 [38:08<1:19:18,  3.38s/it, Batch=24/75, Conf=0.638, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  34%|███▎      | 700/2079 [39:43<1:17:49,  3.39s/it, Batch=25/75, Conf=0.653, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  35%|███▌      | 728/2079 [41:18<1:16:18,  3.39s/it, Batch=26/75, Conf=0.632, Domains=5]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  36%|███▋      | 756/2079 [42:52<1:14:25,  3.38s/it, Batch=27/75, Conf=0.651, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  38%|███▊      | 784/2079 [44:26<1:12:43,  3.37s/it, Batch=28/75, Conf=0.640, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  39%|███▉      | 812/2079 [46:00<1:11:11,  3.37s/it, Batch=29/75, Conf=0.643, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  40%|████      | 840/2079 [47:37<1:10:14,  3.40s/it, Batch=30/75, Conf=0.656, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  42%|████▏     | 868/2079 [49:11<1:08:20,  3.39s/it, Batch=31/75, Conf=0.647, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  43%|████▎     | 896/2079 [50:46<1:06:49,  3.39s/it, Batch=32/75, Conf=0.645, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  44%|████▍     | 924/2079 [52:21<1:05:14,  3.39s/it, Batch=33/75, Conf=0.659, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  46%|████▌     | 952/2079 [53:56<1:03:39,  3.39s/it, Batch=34/75, Conf=0.633, Domains=5]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  47%|████▋     | 980/2079 [55:30<1:01:55,  3.38s/it, Batch=35/75, Conf=0.621, Domains=5]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  48%|████▊     | 1008/2079 [57:05<1:00:18,  3.38s/it, Batch=36/75, Conf=0.644, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  50%|████▉     | 1036/2079 [58:39<58:40,  3.38s/it, Batch=37/75, Conf=0.629, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  51%|█████     | 1064/2079 [1:00:16<57:31,  3.40s/it, Batch=38/75, Conf=0.629, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  53%|█████▎    | 1092/2079 [1:01:50<55:42,  3.39s/it, Batch=39/75, Conf=0.623, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  54%|█████▍    | 1120/2079 [1:03:24<54:02,  3.38s/it, Batch=40/75, Conf=0.660, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  55%|█████▌    | 1148/2079 [1:04:59<52:28,  3.38s/it, Batch=41/75, Conf=0.648, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  57%|█████▋    | 1176/2079 [1:06:34<50:54,  3.38s/it, Batch=42/75, Conf=0.618, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  58%|█████▊    | 1204/2079 [1:08:08<49:19,  3.38s/it, Batch=43/75, Conf=0.655, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  59%|█████▉    | 1232/2079 [1:09:44<47:54,  3.39s/it, Batch=44/75, Conf=0.635, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  61%|██████    | 1260/2079 [1:11:18<46:13,  3.39s/it, Batch=45/75, Conf=0.628, Domains=5]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  62%|██████▏   | 1288/2079 [1:12:52<44:31,  3.38s/it, Batch=46/75, Conf=0.628, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  63%|██████▎   | 1316/2079 [1:14:25<42:45,  3.36s/it, Batch=47/75, Conf=0.663, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  65%|██████▍   | 1344/2079 [1:15:59<41:07,  3.36s/it, Batch=48/75, Conf=0.640, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  66%|██████▌   | 1372/2079 [1:17:34<39:37,  3.36s/it, Batch=49/75, Conf=0.659, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  67%|██████▋   | 1400/2079 [1:19:08<38:03,  3.36s/it, Batch=50/75, Conf=0.645, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  69%|██████▊   | 1428/2079 [1:20:43<36:37,  3.38s/it, Batch=51/75, Conf=0.659, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  70%|███████   | 1456/2079 [1:22:18<35:01,  3.37s/it, Batch=52/75, Conf=0.613, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  71%|███████▏  | 1484/2079 [1:23:52<33:26,  3.37s/it, Batch=53/75, Conf=0.668, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  73%|███████▎  | 1512/2079 [1:25:26<31:52,  3.37s/it, Batch=54/75, Conf=0.655, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  74%|███████▍  | 1540/2079 [1:27:01<30:17,  3.37s/it, Batch=55/75, Conf=0.640, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  75%|███████▌  | 1568/2079 [1:28:35<28:40,  3.37s/it, Batch=56/75, Conf=0.624, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  77%|███████▋  | 1596/2079 [1:30:08<27:01,  3.36s/it, Batch=57/75, Conf=0.613, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  78%|███████▊  | 1624/2079 [1:31:42<25:27,  3.36s/it, Batch=58/75, Conf=0.630, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  79%|███████▉  | 1652/2079 [1:33:17<23:57,  3.37s/it, Batch=59/75, Conf=0.663, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  81%|████████  | 1680/2079 [1:34:53<22:32,  3.39s/it, Batch=60/75, Conf=0.650, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  82%|████████▏ | 1708/2079 [1:36:28<20:56,  3.39s/it, Batch=61/75, Conf=0.647, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  84%|████████▎ | 1736/2079 [1:38:03<19:21,  3.39s/it, Batch=62/75, Conf=0.623, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  85%|████████▍ | 1764/2079 [1:39:36<17:42,  3.37s/it, Batch=63/75, Conf=0.637, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  86%|████████▌ | 1792/2079 [1:41:11<16:08,  3.37s/it, Batch=64/75, Conf=0.671, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  88%|████████▊ | 1820/2079 [1:42:45<14:31,  3.37s/it, Batch=65/75, Conf=0.643, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  89%|████████▉ | 1848/2079 [1:44:19<12:58,  3.37s/it, Batch=66/75, Conf=0.660, Domains=5]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  90%|█████████ | 1876/2079 [1:45:53<11:22,  3.36s/it, Batch=67/75, Conf=0.672, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  92%|█████████▏| 1904/2079 [1:47:28<09:49,  3.37s/it, Batch=68/75, Conf=0.627, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  93%|█████████▎| 1932/2079 [1:49:03<08:16,  3.38s/it, Batch=69/75, Conf=0.646, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  94%|█████████▍| 1960/2079 [1:50:37<06:41,  3.37s/it, Batch=70/75, Conf=0.648, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  96%|█████████▌| 1988/2079 [1:52:12<05:06,  3.37s/it, Batch=71/75, Conf=0.643, Domains=4]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  97%|█████████▋| 2016/2079 [1:53:46<03:32,  3.37s/it, Batch=72/75, Conf=0.636, Domains=3]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing:  98%|█████████▊| 2044/2079 [1:55:20<01:57,  3.37s/it, Batch=73/75, Conf=0.666, Domains=2]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing: 100%|█████████▉| 2072/2079 [1:56:55<00:23,  3.37s/it, Batch=74/75, Conf=0.628, Domains=4]

Adding requests:   0%|          | 0/7 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/7 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

🏥 CURE-Bench Processing: 100%|██████████| 2079/2079 [1:57:35<00:00,  3.39s/it, Batch=75/75, Conf=0.659, Domains=2]


INFO 08-03 17:10:00 [multiproc_worker_utils.py:138] Terminating local vLLM worker processes
[1;36m(VllmWorkerProcess pid=101)[0;0m INFO 08-03 17:10:00 [multiproc_worker_utils.py:260] Worker exiting
