# AIMO3 Kaggle Submission - Multi-Model Support
## Fine-tuned LLM Solver with Model Selection

This notebook generates predictions for the AIMO3 competition using selectable models:
- **Fast Inference**: GPT-2, Gemma 3 (4B/12B)
- **Strong Reasoning**: Llama 4, Qwen 3, DeepSeek-R1, Mistral Large 3
- **Ensemble**: Combine multiple models for better accuracy

## 1. Install Dependencies

In [1]:
# Install required packages
!pip install --quiet torch transformers peft pandas tqdm

## 2. Import Libraries and Configure

## Phase 4: Import Verification & Metrics Components

In [None]:
# Add phase 4 source code to path using robust discovery
import sys
import os

def find_source_dir(start_path='/kaggle/input', target_file='monitoring.py'):
    """Recursively find directory containing target file"""
    # Check explicit paths first for speed
    common_paths = [
        '/kaggle/input/aimo-solver-phase4',
        '/kaggle/input/datasets/muzansano/aimo-solver-phase4',
        'src',
        '../src'
    ]
    for p in common_paths:
        if os.path.exists(os.path.join(p, target_file)):
            return p

    # Walk directory tree
    for root, dirs, files in os.walk(start_path):
        if target_file in files:
            return root
    return None

src_path = find_source_dir()
if src_path:
    if src_path not in sys.path:
        sys.path.insert(0, src_path)
    print(f"✅ Phase 4 source found and added: {src_path}")
else:
    print("⚠️ Could not find 'monitoring.py'. Phase 4 features will be disabled.")
    # Debugging: List input directory structure
    print("DEBUG: /kaggle/input structure:")
    try:
        for root, dirs, files in os.walk('/kaggle/input'):
            print(f"{root} -> {dirs}")
    except: pass

# Phase 4: Import Verification & Metrics Components
try:
    from monitoring import VerificationTracker, ExecutionMetrics
    from resilience import ErrorRecoveryHandler
    from computation import SymbolicCompute, AnswerValidator
    
    PHASE4_AVAILABLE = True
    print("✅ Phase 4 components imported successfully")
except ImportError as e:
    PHASE4_AVAILABLE = False
    print(f"⚠️ Phase 4 components not found: {e}")
    print("   Verification features will be disabled.")


In [None]:
# Phase 4: Initialize verification components if available
if PHASE4_AVAILABLE:
    verification_tracker = VerificationTracker()
    error_recovery = ErrorRecoveryHandler()
    execution_metrics = ExecutionMetrics()
    symbolic_compute = SymbolicCompute()
    answer_validator = AnswerValidator()
    
    print("✅ Phase 4 verification system initialized")
    print(f"   - Verification Tracker ready")
    print(f"   - Error Recovery Handler ready")
    print(f"   - Execution Metrics ready")
else:
    verification_tracker = None
    error_recovery = None
    execution_metrics = None
    print("⚠️ Phase 4 verification disabled - using basic predictions only")


In [2]:
import os
import re
import json
import torch
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from typing import Optional
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch: 2.10.0+cu128
CUDA Available: False
Using device: cpu


## 3. Define Model Loader

In [None]:
class AIOMInference:
    """Multi-model inference engine with model selection"""
    
    # Supported models
    SUPPORTED_MODELS = {
        "gpt2": {"name": "GPT-2 (Fast Baseline)", "vram": 1, "speed": "instant"},
        "gemma3-4b": {"name": "Gemma 3 4B", "vram": 8, "speed": "3s/q"},
        "gemma3-12b": {"name": "Gemma 3 12B", "vram": 24, "speed": "6s/q"},
        "llama4-scout": {"name": "Llama 4 Scout 8B", "vram": 16, "speed": "8s/q"},
        "qwen3-32b": {"name": "Qwen 3 32B", "vram": 64, "speed": "15s/q"},
        "deepseek-r1": {"name": "DeepSeek-R1 67B", "vram": 180, "speed": "45s/q", "reasoning": True},
        "mistral-large-3": {"name": "Mistral Large 3 123B", "vram": 280, "speed": "50s/q"},
    }
    
    def __init__(self, model_name="gpt2", lora_path=None, device="cuda"):
        self.device = device
        self.model_name = model_name
        self.lora_path = lora_path
        self.model = None
        self.tokenizer = None
        
    def load_model(self):
        """Load model checking local paths first for offline Kaggle use"""
        print(f"Loading {self.SUPPORTED_MODELS.get(self.model_name, {}).get('name', self.model_name)}...")
        
        from transformers import AutoTokenizer, AutoModelForCausalLM
        from peft import PeftModel
        
        # Check for local model paths (Kaggle input datasets)
        # Priority: 1. Exact path, 2. /kaggle/input/{name}, 3. Recursive search
        
        def find_model_path_recursive(base_path="/kaggle/input", model_identifier="gpt2"):
            """Recursively search for model directory containing specific identifier"""
            try:
                # First check common paths specific to this competition/user
                sys_paths = [
                    f"/kaggle/input/{model_identifier}",
                    f"/kaggle/input/model-{model_identifier}",
                    f"/kaggle/input/datasets/muzansano/model-{model_identifier}",
                    f"/kaggle/input/datasets/muzansano/{model_identifier}",
                ]
                for p in sys_paths:
                    if os.path.exists(p):
                        return p

                # Fallback: Recursive search
                if os.path.exists(base_path):
                    for root, dirs, files in os.walk(base_path):
                        if model_identifier in root.split(os.sep):
                            return root
                        for d in dirs:
                            if model_identifier in d or f"model-{model_identifier}" in d:
                                return os.path.join(root, d)
            except:
                pass
            return None

        # Try to find model path dynamically
        found_path = find_model_path_recursive(model_identifier=self.model_name.split('/')[-1])
        if found_path:
             print(f"✅ Dynamically found model path: {found_path}")
        
        potential_paths = [
            self.model_name,
            found_path if found_path else f"/kaggle/input/{self.model_name.split('/')[-1]}",
            f"/kaggle/input/{self.model_name.split('/')[-1]}",
            f"/kaggle/input/model-{self.model_name.split('/')[-1]}",
            f"/kaggle/input/{self.model_name.replace('/', '-')}"
        ]
        
        model_path = self.model_name
        local_files_only = False
        
        for path in potential_paths:
            if os.path.exists(path):
                print(f"✅ Found local model at: {path}")
                model_path = path
                local_files_only = True
                break
        
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=local_files_only)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                local_files_only=local_files_only
            )
            
            # Load LoRA weights if available
            if self.lora_path and os.path.exists(self.lora_path):
                print(f"Loading LoRA from {self.lora_path}...")
                self.model = PeftModel.from_pretrained(self.model, self.lora_path)
            
            if self.device == "cpu":
                self.model = self.model.to(self.device)
            
            self.model.eval()
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            print("✅ Model loaded!")
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            raise e
    
    def create_prompt(self, problem, include_cot=True):
        """Create chain-of-thought prompt"""
        if include_cot:
            return f"""Solve this mathematical problem step by step.

Problem: {problem}

Let's think through this:
1. What is being asked?
2. What approach should we use?
3. Let's work through the solution
4. Extract the numerical answer

Answer:"""
        else:
            return f"Problem: {problem}\n\nAnswer:"
    
    def extract_answer(self, text):
        """Extract numeric answer from generated text"""
        # Try patterns like "Answer: 42"
        match = re.search(r'(?:Answer|Final|Result)[:\s]*([-\d.]+)', text, re.IGNORECASE)
        if match:
            return match.group(1)
        
        # Extract last number
        numbers = re.findall(r'-?\d+(?:\.\d+)?', text)
        if numbers:
            return numbers[-1]
        
        return "0"
    
    def generate(self, problem, max_length=512, temperature=0.7):
        """Generate answer for problem"""
        prompt = self.create_prompt(problem, include_cot=True)
        inputs = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=max_length,
                temperature=temperature,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
        
        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = self.extract_answer(text.split("Answer:")[-1])
        return answer
    
    @staticmethod
    def list_available_models():
        """List all available models"""
        print("\n" + "="*80)
        print("AVAILABLE MODELS")
        print("="*80)
        for key, info in AIOMInference.SUPPORTED_MODELS.items():
            reasoning = " [Reasoning]" if info.get("reasoning") else ""
            print(f"{key:<20} | {info['name']:<40} | {info['vram']:<6}GB | {info['speed']}{reasoning}")
        print("="*80 + "\n")

print("✅ Inference class updated for Offline/Kaggle use")

✅ Inference class defined


## 4. Load Model

In [None]:
# Show available models
AIOMInference.list_available_models()

# Select model based on available VRAM
selected_model = "gpt2"  # Change to other models based on your GPU
# Options: "gemma3-4b", "gemma3-12b", "llama4-scout", "qwen3-32b", etc.

print(f"Selected model: {selected_model}")

# Initialize inference engine
inferencer = AIOMInference(
    model_name=selected_model,
    lora_path=None,  # Set to fine-tuned model path after training
    device=DEVICE
)
inferencer.load_model()

Loading gpt2...


`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 148/148 [00:00<00:00, 435.28it/s, Materializing param=transformer.wte.weight]             
GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


✅ Model loaded!


## 5. Load Test Data

In [6]:
# 5. Check environment and initialize data loader
try:
    import aimo
    env = aimo.make_env()
    iter_test = env.iter_test()
    KAGGLE_MODE = True
    print("✅ Kaggle AIMO environment initialized")
except ImportError:
    KAGGLE_MODE = False
    print("⚠️ AIMO API not found - running in LOCAL/DEBUG mode")
    
    # Check available test data sources for local debugging
    test_paths = [
        "datasets/aimo3_test.csv",  # Local path (absolute)
        "../datasets/aimo3_test.csv",  # Local path (relative)
        "/kaggle/input/ai-mathematical-olympiad-progress-prize-3/test.csv"  # Kaggle public data
    ]
    
    test_path = None
    for path in test_paths:
        if os.path.exists(path):
            test_path = path
            break
            
    if test_path:
        print(f"Loading local test data from: {test_path}")
        test_df = pd.read_csv(test_path)
        print(f"✅ Loaded {len(test_df)} test problems")
    else:
        # Create dummy data if no file found
        print("⚠️ No test data found. Creating dummy data.")
        test_df = pd.DataFrame([
            {"id": "000aaa", "problem": "What is $1+1$?"},
            {"id": "111bbb", "problem": "Solve $x^2=4$ for positive $x$."},
            {"id": "222ccc", "problem": "Find the sum of angles in a triangle."}
        ])


Loading test data from: ../datasets/aimo3_test.csv
✅ Loaded 3 test problems
       id                 problem
0  000aaa          What is $1-1$?
1  111bbb    What is $0\times10$?
2  222ccc  Solve $4+x=4$ for $x$.


## 6. Generate Predictions

In [None]:
# 6. Core Classes & Prediction Loop
import time
import re
import sympy as sp
from typing import Any, Optional, Dict, Tuple, Union

# --- Core Classes (Injected for Kaggle Standalone) ---

class SymbolicCompute:
    """SymPy-based symbolic computation and verification."""
    
    @staticmethod
    def evaluate_expression(expr_str: str) -> Optional[Union[int, float]]:
        """Evaluate a mathematical expression string using SymPy."""
        try:
            expr = sp.sympify(expr_str)
            result = expr.evalf()
            return int(result) if result == int(result) else float(result)
        except Exception as e:
            return None
    
    @staticmethod
    def parse_llm_output_for_expressions(llm_text: str) -> Dict[str, Any]:
        """Parse LLM output to extract mathematical expressions."""
        result = {
            "expressions": [],
            "values": [],
            "final_value": None
        }
        try:
            # Extract expressions
            expr_pattern = r"(?:=|equals|is)\s*(\d+(?:\.\d+)?|[\w\s\+\-\*\/\(\)\.]+)"
            expr_matches = re.findall(expr_pattern, llm_text, re.IGNORECASE)
            
            # Extract numeric values
            num_pattern = r"\b(\d+(?:\.\d+)?)\b"
            num_matches = re.findall(num_pattern, llm_text)
            
            result["expressions"] = expr_matches[:5]
            result["values"] = [float(n) if '.' in n else int(n) for n in num_matches[:10]]
            
            if num_matches:
                result["final_value"] = float(num_matches[-1]) if '.' in num_matches[-1] else int(num_matches[-1])
        except:
            pass
        return result
    
    @staticmethod
    def verify_symbolic_result(
        llm_answer: int,
        llm_output: str,
        tolerance: float = 0.01
    ) -> Tuple[bool, float]:
        """Verify LLM answer by symbolic computation. Returns (is_valid, confidence)."""
        try:
            parsed = SymbolicCompute.parse_llm_output_for_expressions(llm_output)
            
            # Try to evaluate extracted expressions
            if parsed["expressions"]:
                for expr_str in parsed["expressions"]:
                    try:
                        result = SymbolicCompute.evaluate_expression(expr_str)
                        if result is not None:
                            if isinstance(result, float):
                                diff_percent = abs(result - llm_answer) / max(abs(llm_answer), 1)
                                if diff_percent <= tolerance:
                                    return True, 1.0 - diff_percent
                            else:
                                if int(result) == llm_answer:
                                    return True, 1.0
                    except:
                        continue
            
            # Check final value
            if parsed["final_value"] is not None and parsed["final_value"] == llm_answer:
                return True, 0.8
                    
        except:
            pass
        return False, 0.5

class AnswerValidator:
    """Validates and enforces answer format constraints."""
    
    AIMO_MIN = 0
    AIMO_MAX = 99999
    
    @staticmethod
    def validate_integer(value: Any) -> Optional[int]:
        """Validate and convert value to valid AIMO integer."""
        try:
            int_value = int(float(str(value).strip()))
            if int_value < AnswerValidator.AIMO_MIN: return AnswerValidator.AIMO_MIN
            if int_value > AnswerValidator.AIMO_MAX: return AnswerValidator.AIMO_MAX
            return int_value
        except:
            return None
    
    @staticmethod
    def validate_with_fallback_strategies(
        llm_answer: Optional[int],
        llm_text: str
    ) -> Dict[str, Any]:
        """Validate answer with multiple fallback strategies."""
        result = {
            "final_answer": 0,
            "confidence": 0.0,
            "strategy_used": "default_fallback",
            "fallback_applied": False
        }
        try:
            # Strategy 1: Use primary answer if valid
            if llm_answer is not None:
                validated = AnswerValidator.validate_integer(llm_answer)
                if validated is not None:
                    result["final_answer"] = validated
                    result["confidence"] = 0.9
                    result["strategy_used"] = "primary_llm_answer"
                    return result
            
            # Strategy 2: Try symbolic verification
            is_valid, confidence = SymbolicCompute.verify_symbolic_result(
                llm_answer if llm_answer is not None else 0,
                llm_text
            )
            if is_valid and llm_answer is not None:
                validated = AnswerValidator.validate_integer(llm_answer)
                if validated is not None:
                    result["final_answer"] = validated
                    result["confidence"] = confidence
                    result["strategy_used"] = "symbolic_verification"
                    result["fallback_applied"] = True
                    return result
            
            result["fallback_applied"] = True
        except:
            pass
        return result
    
    @staticmethod
    def handle_edge_cases(answer: int) -> Tuple[int, str]:
        """Handle edge cases in answer validation."""
        try:
            if answer < 0: return 0, "Negative answer converted to 0"
            if answer > AnswerValidator.AIMO_MAX * 10: 
                return AnswerValidator.AIMO_MAX, "Very large answer capped"
            validated = AnswerValidator.validate_integer(answer)
            return validated if validated is not None else 0, ""
        except:
            return 0, "Edge case error"

class ExecutionMetrics:
    """Track execution metrics for the pipeline."""
    def __init__(self):
        self.metrics = {
            "total": 0, "successful": 0, "failed": 0, 
            "fallback": 0, "verified": 0
        }
    
    def record_result(self, success: bool, fallback_used: bool = False, verified: bool = False):
        self.metrics["total"] += 1
        if success: self.metrics["successful"] += 1
        else: self.metrics["failed"] += 1
        if fallback_used: self.metrics["fallback"] += 1
        if verified: self.metrics["verified"] += 1

# --- Global Instances ---
symbolic_compute = SymbolicCompute()
answer_validator = AnswerValidator()
execution_metrics = ExecutionMetrics()
inference_engine = AIOMInference(model_name="gpt2") # Initialize inference

try:
    inference_engine.load_model()
except Exception as e:
    print(f"⚠️ Inference engine load failed (expected in CI/CD without full weights): {e}")

# --- Prediction Logic ---

predictions = []

def solve_one_problem(problem_id, problem_text):
    """Solve a single problem and return the answer with verification."""
    try:
        # 1. Generate initial answer
        answer_str = inference_engine.generate(problem_text)
        
        # 2. Extract numeric candidate
        initial_answer = 0
        try:
            initial_answer = int(float(answer_str))
        except:
            pass

        # 3. Apply Verification & Validation
        # Symbolic verification
        is_valid, confidence = symbolic_compute.verify_symbolic_result(initial_answer, str(problem_text))
        
        # Fallback validation if needed
        if not is_valid:
            validation_result = answer_validator.validate_with_fallback_strategies(initial_answer, str(problem_text))
            final_answer = validation_result.get('final_answer', initial_answer)
            fallback_used = True
        else:
            final_answer = initial_answer
            fallback_used = False
        
        # 4. Final edge case check
        final_answer, note = answer_validator.handle_edge_cases(final_answer)
        
        # 5. Record metrics
        execution_metrics.record_result(success=True, verified=is_valid, fallback_used=fallback_used)
        
        return final_answer
            
    except Exception as e:
        print(f"Error on {problem_id}: {e}")
        return 0

# Main Execution Loop
if KAGGLE_MODE:
    print("🚀 Running in KAGGLE SUBMISSION MODE")
    for (test, sample_prediction) in iter_test:
        test_row = test.iloc[0]
        problem_id = str(test_row['id'])
        problem_text = str(test_row['problem'])
        
        pred = solve_one_problem(problem_id, problem_text)
        sample_prediction['answer'] = pred
        env.predict(sample_prediction)
        
    print("✅ Submission loop complete")
    
else:
    print("🔧 Running in LOCAL DEBUG MODE")
    local_preds = []
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
        problem_id = str(row['id'])
        problem_text = str(row['problem'])
        
        pred = solve_one_problem(problem_id, problem_text)
        local_preds.append({'id': problem_id, 'answer': pred})
        
    submission_df = pd.DataFrame(local_preds)
    print(f"✅ Generated {len(submission_df)} local predictions")


Generating predictions for 3 problems...



  0%|          | 0/3 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 3/3 [01:14<00:00, 24.75s/it]


✅ Generated 3 predictions
       id answer
0  000aaa     48
1  111bbb      2
2  222ccc      5





## 7. Save Submission

In [None]:
# 7. Save Submission (Only needed for local mode)
if not KAGGLE_MODE:
    submission_path = "submission.csv"
    submission_df.to_csv(submission_path, index=False)
    
    print(f"✅ Submission saved to {submission_path}")
    print(f"\nFile size: {os.path.getsize(submission_path)} bytes")
    print(f"Rows: {len(submission_df)}")
    print(f"\nFirst 5 rows:")
    print(submission_df.head())
else:
    print("✅ Submission file generated via AIMO API")


## 8. Verification

In [None]:
# Phase 4: Export verification metrics and analysis
if PHASE4_AVAILABLE:
    print("📊 Phase 4 Metrics Export")
    print("=" * 50)
    
    # Get execution metrics
    metrics_summary = execution_metrics.get_summary()
    print(f"\nExecution Metrics:")
    for key, value in metrics_summary.items():
        if isinstance(value, float):
            if 'rate' in key:
                print(f"  {key}: {value:.1%}")
            else:
                print(f"  {key}: {value:.2f}")
        else:
            print(f"  {key}: {value}")
    
    # Get verification statistics
    verification_stats = verification_tracker.get_summary_statistics()
    print(f"\nVerification Statistics:")
    for key, value in verification_stats.items():
        if isinstance(value, float):
            if 'rate' in key:
                print(f"  {key}: {value:.1%}")
            else:
                print(f"  {key}: {value:.2f}")
        else:
            print(f"  {key}: {value}")
    
    # Save metrics to file
    metrics_output = {
        'execution_metrics': metrics_summary,
        'verification_stats': verification_stats
    }
    
    import json
    metrics_file = 'phase4_metrics.json'
    with open(metrics_file, 'w') as f:
        json.dump(metrics_output, f, indent=2, default=str)
    
    print(f"\n✅ Metrics saved to {metrics_file}")
else:
    print("⚠️ Phase 4 metrics not available (verification disabled)")

In [None]:
# Verify submission format
print("✅ SUBMISSION VERIFICATION:")
print(f"- Columns: {list(submission_df.columns)} (expected: ['id', 'answer'])")
print(f"- Row count: {len(submission_df)}")
print(f"- No missing values: {not submission_df.isnull().any().any()}")
print(f"- Answer examples: {submission_df['answer'].head().tolist()}")
print(f"\n✅ Submission ready for upload!")

In [None]:

# COMPATIBILITY FIX: Generate submission.parquet if required
try:
    if os.path.exists('submission.csv'):
        # Convert API output to parquet
        sub_df = pd.read_csv('submission.csv')
        sub_df.to_parquet('submission.parquet')
        print("✅ Converted submission.csv to submission.parquet")
    elif 'submission_df' in locals():
        # Save local df
        submission_df.to_parquet('submission.parquet')
        print("✅ Saved local submission_df to submission.parquet")
    else:
        print("⚠️ Could not generate submission.parquet: Source data missing")
except Exception as e:
    print(f"❌ Error generating submission.parquet: {e}")


In [None]:

# COMPATIBILITY FIX: Generate submission.parquet if required
try:
    # Check for parquet dependencies
    try:
        import pyarrow
    except ImportError:
        try:
            import fastparquet
        except ImportError:
            print("⚠️ No parquet engine found (pyarrow/fastparquet). Skipping parquet generation.")
            sub_df = None
    
    if 'sub_df' in locals() or os.path.exists('submission.csv') or 'submission_df' in locals():
        df_to_save = None
        if 'submission_df' in locals():
            df_to_save = submission_df
        elif os.path.exists('submission.csv'):
            df_to_save = pd.read_csv('submission.csv')
        
        if df_to_save is not None and not df_to_save.empty:
            try:
                df_to_save.to_parquet('submission.parquet')
                print("✅ Generated submission.parquet")
            except Exception as pe:
                print(f"⚠️ Parquet export failed (likely missing engine): {pe}")
        else:
            print("⚠️ No data to save to parquet.")
except Exception as e:
    print(f"❌ Error in parquet generation block: {e}")
