In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# The version specifier is quoted so the shell does not treat ">" as an output
# redirection (unquoted, the constraint is dropped and a file named "=4.51.0" is created).
!pip install "transformers>=4.51.0" torch torchvision torchaudio accelerate bitsandbytes -q
!pip install sentencepiece protobuf -q
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gc
import warnings
import re
warnings.filterwarnings('ignore')
# Report whether CUDA is usable and, if so, the name and total memory of device 0
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {device_props.total_memory / 1024**3:.1f} GB")

In [None]:
import torch
import gc
import re
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from typing import List, Dict, Tuple, Any
import warnings
warnings.filterwarnings('ignore')

class ArabicMedicalQuestionClassifier:
    """Prompt-based multi-label classifier for Arabic medical questions.

    Wraps a 4-bit quantized ``Qwen/Qwen3-14B`` causal language model. Each
    question is placed in an Arabic instruction prompt, a reply is generated,
    and the category letters are parsed out of the reply text.

    Constructing an instance immediately loads the tokenizer and the model.
    """

    # Labels accepted as predictions, in the order used by the fallback scan.
    VALID_CATEGORIES = ('A', 'B', 'C', 'D', 'E', 'F', 'Z')

    # Id of the "</think>" token that closes the model's reasoning section.
    THINK_END_TOKEN_ID = 151668

    # "<label>: [A,B,C]" answer formats, tried in order: the format requested
    # in the prompt first, then looser Arabic variants, then English fallbacks.
    ANSWER_PATTERNS = (
        r'التصنيف النهائي:\s*\[([ABCDEFZ,\s]+)\]',  # "التصنيف النهائي: [A,B,C]"
        r'الفئات:\s*\[([ABCDEFZ,\s]+)\]',  # "الفئات: [A,B,C]"
        r'التصنيف:\s*\[([ABCDEFZ,\s]+)\]',  # "التصنيف: [A,B,C]"
        r'النتيجة:\s*\[([ABCDEFZ,\s]+)\]',  # "النتيجة: [A,B,C]"
        r'Final Classification:\s*\[([ABCDEFZ,\s]+)\]',
        r'Categories:\s*\[([ABCDEFZ,\s]+)\]',
        r'Classification:\s*\[([ABCDEFZ,\s]+)\]',
    )

    def __init__(self):
        """Set up the category table and load the model right away."""
        self.model_name = "Qwen/Qwen3-14B"
        self.tokenizer = None
        self.model = None
        self.question_categories = {
            'A': 'Diagnosis (questions about interpreting clinical findings)',
            'B': 'Treatment (questions about seeking treatments)',
            'C': 'Anatomy and Physiology (questions about basic medical knowledge)',
            'D': 'Epidemiology (questions about the course, prognosis, and etiology of diseases)',
            'E': 'Healthy Lifestyle (questions related to diet, exercise, and mood control)',
            'F': 'Provider Choices (questions seeking recommendations for medical professionals and facilities)',
            'Z': 'Other (questions that do not fall under the above-mentioned categories)'
        }
        self.load_model()

    def load_model(self):
        """Load the tokenizer and the 4-bit (NF4) quantized Qwen3-14B model."""
        print("Loading Qwen3-14B model for Arabic Medical Question Classification...")

        # 4-bit NF4 weights with double quantization; matmuls run in float16.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )

        # device_map="auto" lets the loader decide where the weights are placed.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )

        print("Qwen3-14B model loaded successfully!")
        self.print_model_info()

    def print_model_info(self):
        """Print model name, tokenizer size and (if CUDA is available) GPU memory use.

        The parameter count and context length lines are fixed text, not
        values read from the loaded model.
        """
        print(f"\nModel Information:")
        print(f"Model Name: {self.model_name}")
        print(f"Model Parameters: 14.8B (13.2B non-embedding)")
        print(f"Context Length: 32,768 tokens")
        print(f"Tokenizer Vocab Size: {len(self.tokenizer):,}")

        if torch.cuda.is_available():
            print(f"GPU Memory Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

    def classify_question(self, question: str, use_thinking_mode: bool = True, max_new_tokens: int = 8000) -> List[str]:
        """
        Classify Arabic medical question into categories (Sub-Task 1)

        Generation uses sampling (``do_sample=True``), so repeated calls on the
        same question may return different results.

        Args:
            question: Arabic medical question
            use_thinking_mode: Enable Qwen3's thinking capabilities
            max_new_tokens: Maximum tokens to generate

        Returns:
            List[str]: extracted category letters; never empty ('Z' is the
            default when nothing can be parsed from the reply)
        """

        # Category descriptions are given in Arabic, matching the question language.
        category_descriptions = """
فئات الأسئلة الطبية:
(A) التشخيص - أسئلة حول تفسير النتائج السريرية والأعراض
(B) العلاج - أسئلة حول البحث عن علاجات وطرق العلاج
(C) التشريح وعلم وظائف الأعضاء - أسئلة حول المعرفة الطبية الأساسية
(D) علم الأوبئة - أسئلة حول مسار المرض وتشخيصه وأسبابه
(E) نمط الحياة الصحي - أسئلة متعلقة بالنظام الغذائي والرياضة والصحة النفسية
(F) اختيار مقدم الرعاية - أسئلة تطلب توصيات للمهنيين الطبيين والمرافق
(Z) أخرى - أسئلة لا تندرج تحت الفئات المذكورة أعلاه
"""

        messages = [
            {
                "role": "user", 
                "content": f"""أنت خبير في تصنيف الأسئلة الطبية باللغة العربية. مهمتك هي تصنيف السؤال التالي إلى فئة أو أكثر من الفئات المحددة.

{category_descriptions}

السؤال المراد تصنيفه:
{question}

تعليمات:
1. اقرأ السؤال بعناية وحلل محتواه
2. حدد الفئة أو الفئات المناسبة (يمكن أن يكون هناك أكثر من فئة واحدة)
3. اشرح سبب اختيارك لكل فئة
4. في النهاية، اكتب الإجابة بالتنسيق التالي:
   "التصنيف النهائي: [A,B,C]" (استخدم الأحرف المناسبة مفصولة بفواصل)

مثال على التنسيق:
- إذا كان السؤال عن التشخيص فقط: "التصنيف النهائي: [A]"
- إذا كان السؤال عن التشخيص والعلاج: "التصنيف النهائي: [A,B]"
"""
            }
        ]

        # Render the chat prompt; enable_thinking toggles the reasoning section.
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=use_thinking_mode
        )

        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        # Only temperature and top_p differ between the two modes.
        if use_thinking_mode:
            sampling = {"temperature": 0.6, "top_p": 0.95}
        else:
            sampling = {"temperature": 0.7, "top_p": 0.8}

        with torch.no_grad():
            generated_ids = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                top_k=20,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.1,
                **sampling
            )

        # Drop the prompt tokens so only newly generated ids remain.
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

        content = self._decode_answer(output_ids, use_thinking_mode)
        return self.extract_question_categories(content)

    def _decode_answer(self, output_ids: List[int], use_thinking_mode: bool) -> str:
        """Decode generated ids to text, skipping everything up to the last "</think>"."""
        if use_thinking_mode and self.THINK_END_TOKEN_ID in output_ids:
            # Index just past the LAST occurrence of the closing think token.
            answer_start = len(output_ids) - output_ids[::-1].index(self.THINK_END_TOKEN_ID)
            output_ids = output_ids[answer_start:]
        # Without a closing token (or with thinking off) the whole output is decoded.
        return self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()

    def _parse_category_list(self, categories_str: str) -> List[str]:
        """Split a "A, B,C" string into valid, de-duplicated, upper-case labels."""
        parsed = []
        for raw in categories_str.split(','):
            label = raw.strip().upper()
            if label in self.VALID_CATEGORIES and label not in parsed:
                parsed.append(label)
        return parsed

    def extract_question_categories(self, response: str) -> List[str]:
        """Extract question categories from the model's response text.

        Tries the labelled "<label>: [A,B]" formats first (Arabic, then
        English). If none yields a valid label, falls back to scanning for
        standalone "(A)" / "[A]" mentions.

        Args:
            response: Decoded model reply (may be empty or None).

        Returns:
            Non-empty list of category letters in first-seen order without
            duplicates; ['Z'] when nothing valid is found.
        """
        if not response:
            return ['Z']

        for pattern in self.ANSWER_PATTERNS:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                categories = self._parse_category_list(match.group(1))
                # A match such as "[ , ]" carries no labels; keep looking
                # instead of returning an empty prediction.
                if categories:
                    return categories

        # Look for individual category mentions
        found_categories = [
            category
            for category in self.VALID_CATEGORIES
            if f'({category})' in response or f'[{category}]' in response
        ]

        return found_categories if found_categories else ['Z']  # Default to 'Other' if nothing found

    def process_test_dataset(self, df: pd.DataFrame, use_thinking: bool = True, show_progress: bool = True) -> pd.DataFrame:
        """
        Process test dataset for question classification only

        Args:
            df: DataFrame with 'question' column
            use_thinking: Enable thinking mode
            show_progress: Show progress information

        Returns:
            DataFrame with a single 'prediction' column (one row per input
            row, in input order); each value is the sorted category letters
            joined by ', '. Rows whose classification raised get 'Z'.
        """

        print("🚀 Starting Arabic Medical Question Classification for Test Dataset...")
        print(f"Test dataset size: {len(df)} samples")
        print("-" * 60)

        predictions = []

        # Progress and cleanup are keyed on the row position, not the index
        # label, so frames with a non-default index are handled correctly.
        for position, (idx, row) in enumerate(df.iterrows()):
            if show_progress and position % 10 == 0:
                print(f"Processing sample {position+1}/{len(df)}")

            try:
                categories = self.classify_question(
                    row['question'], 
                    use_thinking_mode=use_thinking
                )
                # Convert list to comma-separated string as required by submission format
                prediction_str = ', '.join(sorted(categories or ['Z']))
                predictions.append(prediction_str)
            except Exception as e:
                # Best effort: one failing sample must not abort the whole run.
                print(f"Error processing question {idx}: {e}")
                predictions.append('Z')

            # Clean up memory periodically
            if position % 20 == 0:
                self.cleanup_memory()

        return pd.DataFrame({'prediction': predictions})

    def cleanup_memory(self):
        """Release cached CUDA memory (when CUDA is available) and run the garbage collector."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

# Initialize the classifier
# NOTE: the constructor calls load_model(), so the tokenizer and the quantized
# model are loaded as soon as this statement runs. The functions defined below
# use this module-level `classifier` instance.
print("Initializing Arabic Medical Question Classifier with Qwen3-14B...")
classifier = ArabicMedicalQuestionClassifier()

def generate_test_predictions(test_file_path: str, output_file_path: str = 'prediction_subtask_1.tsv', use_thinking: bool = True):
    """
    Generate predictions for test dataset and save to TSV file

    The test file is read as a header-less, tab-separated file whose single
    column is the question text. Classification uses the module-level
    `classifier` instance.

    Args:
        test_file_path: Path to test dataset TSV file
        output_file_path: Path for output predictions file
        use_thinking: Enable thinking mode

    Returns:
        DataFrame with a 'prediction' column, or None if the test file could
        not be loaded. The DataFrame is returned even if saving it failed.
    """

    print("🚀 Loading test dataset...")

    # NOTE(review): read_csv's default quote handling and blank-line skipping
    # could change the row count for free-text input; confirm the number of
    # loaded samples matches the number of lines in the test file.
    try:
        df = pd.read_csv(test_file_path, sep='\t', header=None)
        # Raises if the file does not have exactly one column.
        df.columns = ['question']
    except Exception as e:
        print(f"❌ Error loading test dataset: {e}")
        return None
    print(f"✅ Test dataset loaded successfully: {len(df)} samples")

    print("🔍 Processing test dataset...")
    results_df = classifier.process_test_dataset(df, use_thinking=use_thinking)

    try:
        # Save as TSV without header, just predictions
        results_df['prediction'].to_csv(output_file_path, sep='\t', index=False, header=False)
    except Exception as e:
        # Keep going so the in-memory results are still returned to the caller.
        print(f"❌ Error saving predictions: {e}")
    else:
        print(f"✅ Predictions saved to: {output_file_path}")
        print(f"📄 Total predictions: {len(results_df)}")

        print("\n📋 Sample predictions:")
        for i, prediction in enumerate(results_df['prediction'].head(5)):
            print(f"Sample {i+1}: {prediction}")

    classifier.cleanup_memory()

    return results_df

def get_model_status():
    """Print a short status report for the module-level classifier.

    Shows the model name, whether a model object is set, the currently
    allocated GPU memory (when CUDA is available), and then the classifier's
    own model information.
    """
    if classifier.model:
        loaded_mark = '✅'
    else:
        loaded_mark = '❌'

    status_lines = [
        "\n📊 Model Status:",
        f"Model: {classifier.model_name}",
        f"Model loaded: {loaded_mark}",
    ]
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / 1024**3
        status_lines.append(f"GPU memory usage: {allocated_gb:.2f} GB")

    for line in status_lines:
        print(line)
    classifier.print_model_info()

# Main execution
if __name__ == "__main__":
    print("🚀 Arabic Medical Question Classifier for Test Dataset Ready!")
    
    # Location of the test TSV; change it to wherever the dataset is mounted
    TEST_DATASET_PATH = '/kaggle/input/testdf/subtask1_input_test.tsv'  # Update this path
    
    # Classify every question in the test file and write the predictions,
    # using the function's default output path and thinking mode setting.
    generate_test_predictions(TEST_DATASET_PATH)
    
    # For demonstration with sample data (remove this in actual usage):
    # print("📚 Creating sample test to demonstrate functionality...")
    # sample_data = pd.DataFrame({
    #     'question': [
    #         'هل يمكن أن يكون الصداع المستمر علامة على مرض خطير؟'
    #     ]
    # })
    
    # print("Processing sample data...")
    # sample_results = classifier.process_test_dataset(sample_data, use_thinking=True)
    # print("\nSample predictions:")
    # for i, pred in enumerate(sample_results['prediction']):
    #     print(f"Question {i+1}: {pred}")
    
    # # Save sample predictions
    # sample_results['prediction'].to_csv('sample_prediction_subtask_1.tsv', sep='\t', index=False, header=False)
    # print("Sample predictions saved to: sample_prediction_subtask_1.tsv")