# In [None]:
import pandas as pd
import json
import time
import os
from pathlib import Path
import signal
import sys
from datetime import datetime
import re
import numpy as np
from scipy import stats
from collections import defaultdict
import random
from typing import List, Dict, Optional, Tuple
import logging
import pickle
from dataclasses import dataclass, asdict
from datasets import load_dataset
import lmstudio as lms

# Configure logging with UTF-8 encoding to handle Unicode characters.
# Records at INFO level and above are sent to two handlers: a log file
# (opened relative to the current working directory) and a stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jeebench_qwen2_5vl_evaluation.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by every class and function in this file.
logger = logging.getLogger(__name__)

@dataclass
class EvaluationState:
    """Persistent state for resuming evaluation.

    Instances are pickled to disk by the evaluator so that an interrupted
    session can continue where it stopped.
    """
    # Identifier of the run in progress; a freshly created state starts at 1.
    current_run: int
    # Position inside the current run's shuffled question order.
    current_run_question_idx: int  # Which question within the current run
    # Count of questions answered so far, summed over all runs.
    completed_questions: int
    # Planned number of questions overall (dataset size times number of runs
    # when the state is first created).
    total_questions: int
    # One summary dict per finished run.
    all_run_summaries: List[Dict]
    # Intended for questions that could not be evaluated; starts empty.
    failed_questions: List[Dict]
    # Per-question result dicts gathered so far in the unfinished run.
    current_run_results: List[Dict]  # Results for the current incomplete run
    # Dataset row positions in the order the current run visits them.
    current_run_shuffled_indices: List[int]  # Shuffled indices for current run
    # time.time() value taken when the state was created.
    start_time: float
    # time.time() value of the most recent save.
    last_save_time: float

class Qwen25VLClient:
    """Thin wrapper around a Qwen 2.5 VL 7B model served by LM Studio."""

    def __init__(self, model_name: str = "qwen/qwen2.5-vl-7b"):
        """Load the model and send it one probe prompt.

        Args:
            model_name: Name under which LM Studio exposes the model.

        Raises:
            ValueError: If loading the model or the probe request fails.
        """
        self.model_name = model_name

        try:
            # Obtain the model handle from LM Studio.
            self.model = lms.llm(model_name)
            logger.info(f"‚úÖ Qwen 2.5 VL 7B model loaded successfully: {model_name}")

            # Smoke test: a trivial prompt proves the model actually answers.
            probe_reply = self.model.respond("Hello, can you see this?")
            probe_text = probe_reply.content
            logger.info(f"üìù Model test response: {probe_text[:100]}...")

        except Exception as e:
            logger.error(f"‚ùå Failed to initialize Qwen 2.5 VL model: {e}")
            raise ValueError(f"Could not load Qwen 2.5 VL model '{model_name}'. Please ensure LM Studio is running and the model is available.")

    def generate_content(self, prompt: str, max_retries: int = 3) -> Optional[str]:
        """Ask the model for a completion, retrying on errors.

        Args:
            prompt: Text sent to the model.
            max_retries: Maximum number of attempts.

        Returns:
            The response text, or ``None`` when every attempt either raised
            or produced an empty response.
        """
        final_attempt = max_retries - 1

        for attempt in range(max_retries):
            attempt_no = attempt + 1
            try:
                logger.debug(f"üîÑ Generating content (attempt {attempt_no}/{max_retries})")

                reply = self.model.respond(prompt)

                if reply and reply.content:
                    return reply.content

                # An empty reply is retried immediately (no back-off).
                logger.warning(f"‚ö†Ô∏è Empty response on attempt {attempt_no}")

            except Exception as e:
                logger.error(f"‚ùå Error on attempt {attempt_no}: {e}")
                if attempt == final_attempt:
                    logger.error(f"üí• All {max_retries} attempts failed")
                    return None

                # Exponential back-off before the next attempt: 1s, 2s, 4s, ...
                time.sleep(2 ** attempt)

        return None

class JEEBenchQwen25VLEvaluator:
    def __init__(self, num_runs: int = 10, model_name: str = "qwen/qwen2.5-vl-7b"):
        """Load the benchmark, start the model client and restore saved progress.

        Args:
            num_runs: Number of passes over the dataset to perform.
            model_name: Model identifier handed to ``Qwen25VLClient``.
        """
        self.num_runs = num_runs
        self.model_name = model_name

        # Benchmark data: the 'test' split, held as a DataFrame for row access.
        logger.info("üìö Loading JEEBench dataset...")
        self.dataset = load_dataset("daman1209arora/jeebench")
        self.df = pd.DataFrame(self.dataset['test'])
        logger.info(f"üìä Loaded JEEBench dataset with {len(self.df)} questions")

        # Short overview of what was loaded.
        logger.info(f"üìã Dataset columns: {list(self.df.columns)}")
        type_counts = self.df['type'].value_counts().to_dict()
        logger.info(f"üìù Question types: {type_counts}")
        subject_counts = self.df['subject'].value_counts().to_dict()
        logger.info(f"üìö Subjects: {subject_counts}")

        # Model client (raises if the model cannot be loaded).
        logger.info("üöÄ Initializing Qwen 2.5 VL 7B model...")
        self.client = Qwen25VLClient(model_name)

        # Output tree: one root directory plus three fixed subdirectories.
        self.results_dir = Path("jeebench_qwen25vl_evaluation_results")
        self.results_dir.mkdir(exist_ok=True)
        for subdir in ("partial_results", "completed_runs", "state_backups"):
            (self.results_dir / subdir).mkdir(exist_ok=True)

        # Saved progress, if any, is picked up here.
        self.state_file = self.results_dir / "evaluation_state.pkl"
        self.state = self.load_or_create_state()

        # Control flags consulted by the run loops.
        self.stop_requested = False
        self.interrupted = False

        logger.info(f"üíæ Results will be saved to: {self.results_dir}")
        logger.info(f"üéØ Running {num_runs} evaluations with Qwen 2.5 VL 7B")
    
    def load_or_create_state(self) -> EvaluationState:
        """Return the saved evaluation state, or a fresh one if none is usable.

        The primary state file is tried first. Its ``.backup`` sibling is
        consulted only when the primary file exists but cannot be loaded.
        If neither yields a state, a new state starting at run 1 is created.

        Returns:
            The restored or newly created ``EvaluationState``.
        """
        if self.state_file.exists():
            state = self._restore_state(self.state_file, "state", "Resumed from")

            if state is None:
                backup_file = self.state_file.with_suffix('.backup')
                if backup_file.exists():
                    state = self._restore_state(backup_file, "backup state", "Recovered from backup:")

            if state is not None:
                return state

        # Create new state
        logger.info("üÜï Creating new evaluation state")
        now = time.time()
        return EvaluationState(
            current_run=1,
            current_run_question_idx=0,
            completed_questions=0,
            total_questions=len(self.df) * self.num_runs,
            all_run_summaries=[],
            failed_questions=[],
            current_run_results=[],
            current_run_shuffled_indices=[],
            start_time=now,
            last_save_time=now
        )

    def _restore_state(self, path: Path, label: str, resumed_prefix: str) -> Optional[EvaluationState]:
        """Load, upgrade and re-save the state stored at ``path``.

        Args:
            path: Pickle file to read.
            label: Human-readable name of the file used in log messages.
            resumed_prefix: Leading words of the success log message.

        Returns:
            The loaded state, or ``None`` if anything went wrong (the error
            is logged, not raised).
        """
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # state files written by this tool should ever be placed here.
            with open(path, 'rb') as f:
                state = pickle.load(f)

            # Backward compatibility: older state files lack the per-run
            # resume fields, so fill them with "start of run" defaults.
            if not hasattr(state, 'current_run_question_idx'):
                logger.info(f"üîÑ Upgrading old {label} format...")
                state.current_run_question_idx = 0
            if not hasattr(state, 'current_run_results'):
                state.current_run_results = []
            if not hasattr(state, 'current_run_shuffled_indices'):
                state.current_run_shuffled_indices = []

            # The planned total is derived from the current configuration, so
            # refresh it in case num_runs or the dataset changed since saving.
            state.total_questions = len(self.df) * self.num_runs

            # Persist the upgraded state immediately (this also rewrites the
            # primary file when recovering from the backup).
            self.state = state
            self.save_state()

            logger.info(f"üîÑ {resumed_prefix} Run {state.current_run}, Question {state.current_run_question_idx + 1} within run, Total: {state.completed_questions}/{state.total_questions}")
            return state
        except Exception as e:
            logger.error(f"‚ùå Error loading {label}: {e}")
            return None
    
    def _convert_to_json_serializable(self, obj):
        """Convert numpy/pandas types to JSON serializable types"""
        if hasattr(obj, 'item'):  # numpy scalar
            return obj.item()
        elif hasattr(obj, 'tolist'):  # numpy array
            return obj.tolist()
        elif isinstance(obj, (np.integer, np.int64, np.int32)):
            return int(obj)
        elif isinstance(obj, (np.floating, np.float64, np.float32)):
            return float(obj)
        elif isinstance(obj, np.bool_):
            return bool(obj)
        elif isinstance(obj, dict):
            return {key: self._convert_to_json_serializable(value) for key, value in obj.items()}
        elif isinstance(obj, list):
            return [self._convert_to_json_serializable(item) for item in obj]
        else:
            return obj

    def _save_partial_run_results(self, run_id: int, results: List[Dict], questions_completed: int):
        """Write a JSON snapshot of an unfinished run.

        Args:
            run_id: Identifier of the run being snapshotted.
            results: Result dicts collected so far in this run.
            questions_completed: Number of questions processed so far.

        A failure to write the full snapshot is logged and followed by an
        attempt to write a reduced summary without the per-question results.
        Nothing is raised.
        """
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        partial_dir = self.results_dir / "partial_results"
        target = partial_dir / f"jeebench_qwen25vl_run_{run_id:02d}_partial_{questions_completed}q_{stamp}.json"

        # The directory normally exists already; make sure before writing.
        partial_dir.mkdir(exist_ok=True)

        # Accuracy over whatever has been answered up to now.
        hits = len([entry for entry in results if entry['is_correct']])
        if results:
            pct = (hits / len(results)) * 100
        else:
            pct = 0

        snapshot = {
            'run_id': int(run_id),
            'model_name': str(self.model_name),
            'questions_completed': int(questions_completed),
            'total_questions_in_run': int(len(self.df)),
            'partial_results_count': int(len(results)),
            'correct_answers': int(hits),
            'partial_accuracy': float(pct),
            'timestamp': str(stamp),
            'is_partial': True,
            'results': results
        }

        try:
            # Strip numpy/pandas types before handing the data to json.
            payload = self._convert_to_json_serializable(snapshot)

            with target.open('w', encoding='utf-8') as out:
                json.dump(payload, out, indent=2, ensure_ascii=False)

            logger.info(f"üíæ Partial results saved: {questions_completed} questions, {pct:.1f}% accuracy")
        except Exception as e:
            logger.error(f"‚ùå Error saving partial results: {e}")
            # Fallback: headline numbers only, no per-question payload.
            try:
                reduced = {
                    'run_id': int(run_id),
                    'questions_completed': int(questions_completed),
                    'correct_answers': int(hits),
                    'partial_accuracy': float(pct),
                    'timestamp': str(stamp),
                    'error': 'Full results could not be serialized'
                }
                reduced_path = partial_dir / f"jeebench_qwen25vl_run_{run_id:02d}_partial_simple_{questions_completed}q_{stamp}.json"
                with reduced_path.open('w', encoding='utf-8') as out:
                    json.dump(reduced, out, indent=2, ensure_ascii=False)
                logger.info(f"üíæ Simplified partial results saved as fallback")
            except Exception as fallback_error:
                logger.error(f"üí• Even simplified save failed: {fallback_error}")

    def save_state(self):
        """Pickle ``self.state`` to disk; errors are logged, never raised.

        The primary file is written through a temporary file that is then
        moved into place, and a second copy is written next to it with a
        ``.backup`` suffix. If any of that fails, one more attempt is made to
        dump the state into a timestamped emergency file.
        """
        state = self.state
        state.last_save_time = time.time()
        try:
            # Write to a scratch file first, then swap it in.
            scratch = self.state_file.with_suffix('.tmp')
            with scratch.open('wb') as fh:
                pickle.dump(state, fh)
            scratch.replace(self.state_file)

            # Second copy, used for recovery when the primary cannot be read.
            with self.state_file.with_suffix('.backup').open('wb') as fh:
                pickle.dump(state, fh)

            logger.debug(f"üíæ State saved: Run {state.current_run}, Question {state.current_run_question_idx + 1} within run, Total: {state.completed_questions}/{state.total_questions}")
        except Exception as e:
            logger.error(f"‚ùå Error saving state: {e}")
            # Last resort: a uniquely named file in the results directory.
            try:
                emergency_backup = self.results_dir / f"emergency_state_backup_{int(time.time())}.pkl"
                with emergency_backup.open('wb') as fh:
                    pickle.dump(state, fh)
                logger.info(f"üö® Emergency backup saved to: {emergency_backup}")
            except Exception as backup_error:
                logger.error(f"üí• Failed to create emergency backup: {backup_error}")
    
    def create_question_prompt(self, question_data: pd.Series) -> str:
        """Create appropriate prompt based on question type"""
        question_type = question_data['type']
        question_text = question_data['question']
        
        # Base prompt with question
        prompt = f"You are an expert at solving JEE (Joint Entrance Examination) problems. Please solve this question step by step.\n\nQuestion: {question_text}\n\n"
        
        # Add type-specific instructions
        if question_type == "MCQ":
            prompt += """This is a multiple choice question. Please analyze the question carefully, reason step-by-step and provide your answer.

For this question:
- Choose exactly ONE option (A, B, C, or D)
- Show your reasoning clearly
- Format your final answer in \\boxed{} as just one letter (e.g., \\boxed{A})

Your response should end with your final answer in the format \\boxed{X} where X is the correct option."""
        elif question_type == "MCQ(multiple)":
            prompt += """This is a multiple choice question where multiple options can be correct. Please analyze the question carefully, reason step-by-step and provide your answer.

For this question:
- Choose ONE OR MORE options (A, B, C, and/or D)
- Show your reasoning clearly
- Format your final answer in \\boxed{} with letters (e.g., \\boxed{ABC} or \\boxed{B})

Your response should end with your final answer in the format \\boxed{X} where X contains all correct options."""
        elif question_type == "Integer":
            prompt += """This is a numerical question. Please analyze the question carefully, reason step-by-step and provide your answer.

For this question:
- Provide a numerical value
- Show your complete calculation
- Round to appropriate decimal places if needed
- Format your final answer in \\boxed{} (e.g., \\boxed{2.5} or \\boxed{42})

Your response should end with your final answer in the format \\boxed{X} where X is the numerical answer."""
        else:
            # Default case
            prompt += """Please analyze the question carefully, reason step-by-step and provide your answer.
Show your complete reasoning and format your final answer in \\boxed{} (e.g., \\boxed{A} for MCQ or \\boxed{42} for numerical)"""
        
        return prompt
    
    def extract_answer(self, response_text: str, question_type: str) -> str:
        """Pull the final answer out of a model response.

        Lookup order:
          1. the LAST non-empty ``\\boxed{...}`` in the text (the prompt asks
             for the final answer at the end, and earlier boxes may hold
             intermediate or rejected values); nested braces are balanced so
             ``\\boxed{\\mathrm{B}}`` is captured whole;
          2. the first match of a few "Answer: ..." style phrases;
          3. the last line of the response.

        The candidate is then normalised by question type: one option letter
        for "MCQ", sorted unique letters for "MCQ(multiple)", the first
        number for "Integer"/"Numeric". Standalone letters are preferred over
        letters that merely occur inside words such as "AND" or "ANSWER".

        Args:
            response_text: Raw model output.
            question_type: Value of the question's ``type`` field.

        Returns:
            The cleaned answer string. If nothing of the expected shape is
            found, a truncated piece of the candidate text is returned. On an
            unexpected error the first 50 characters of the response.
        """
        try:
            # 1. Last non-empty \boxed{...}, with brace balancing.
            answer = None
            for opener in re.finditer(r'\\boxed\{', response_text):
                depth = 1
                pos = opener.end()
                while pos < len(response_text) and depth:
                    char = response_text[pos]
                    if char == '{':
                        depth += 1
                    elif char == '}':
                        depth -= 1
                    pos += 1
                if depth == 0:
                    content = response_text[opener.end():pos - 1].strip()
                    if content:
                        answer = content

            if not answer:
                # 2. Try other patterns
                answer_patterns = [
                    r'\*\*Answer:\*\*\s*(.+)',
                    r'Answer:\s*(.+)',
                    r'Final answer:\s*(.+)',
                    r'The answer is:\s*(.+)',
                    r'Therefore,?\s*(.+)',
                ]
                for pattern in answer_patterns:
                    match = re.search(pattern, response_text, re.IGNORECASE)
                    if match:
                        answer = match.group(1).strip()
                        break

                if not answer:
                    # 3. Take last line as fallback
                    lines = response_text.strip().split('\n')
                    answer = lines[-1].strip()

            # Drop simple LaTeX text wrappers so their command names are not
            # mistaken for option letters (e.g. the "A" in \mathrm).
            answer = re.sub(
                r'\\(?:text|textbf|mathrm|mathbf|mathit)\s*\{([^{}]*)\}',
                r'\1',
                answer,
            ).strip()

            # Clean up answer based on question type
            if question_type == "MCQ":
                upper = answer.upper()
                # A standalone letter wins; fall back to any A-D character.
                match = re.search(r'\b[ABCD]\b', upper) or re.search(r'[ABCD]', upper)
                return match.group(0) if match else answer[:10]
            elif question_type == "MCQ(multiple)":
                upper = answer.upper()
                # Words made only of option letters ("ABC", "A", "C");
                # fall back to any A-D character if there are none.
                groups = re.findall(r'\b[ABCD]+\b', upper)
                letters = ''.join(groups) if groups else ''.join(re.findall(r'[ABCD]', upper))
                unique_letters = sorted(set(letters))
                return ''.join(unique_letters) if unique_letters else answer[:20]
            elif question_type in ("Integer", "Numeric"):
                # First number, allowing a bare leading decimal point (".5").
                number_match = re.search(r'-?(?:\d+\.?\d*|\.\d+)', answer)
                return number_match.group(0) if number_match else answer[:20]

            return answer[:50]
        except Exception as e:
            logger.error(f"‚ùå Error extracting answer: {e}")
            return response_text[:50]
    
    def is_answer_correct(self, predicted_answer: str, question_data: pd.Series) -> bool:
        """Decide whether a predicted answer matches the gold answer.

        Both sides are stripped and upper-cased first. Then, by type:
          * "MCQ": exact string equality;
          * "MCQ(multiple)": equality of the sets of A-D letters;
          * "Integer" and "Numeric": exact string equality, otherwise a
            numeric comparison with a tolerance of 1% of the gold value when
            its magnitude exceeds 1, and 0.01 otherwise;
          * anything else: exact string equality.

        Args:
            predicted_answer: Answer extracted from the model response.
            question_data: Mapping-like row providing ``'gold'`` and
                ``'type'``.

        Returns:
            ``True`` if the answer counts as correct. ``False`` otherwise,
            including when the comparison itself raises.
        """
        try:
            predicted = str(predicted_answer).strip().upper()
            correct = str(question_data['gold']).strip().upper()
            question_type = question_data['type']

            if question_type == "MCQ(multiple)":
                # Order and repetition of letters do not matter.
                pred_letters = set(re.findall(r'[ABCD]', predicted))
                correct_letters = set(re.findall(r'[ABCD]', correct))
                return pred_letters == correct_letters

            if question_type in ("Integer", "Numeric"):
                # Identical text is always a match.
                if predicted == correct:
                    return True
                # Otherwise compare as numbers, so that "2.50" equals "2.5".
                try:
                    pred_num = float(predicted)
                    correct_num = float(correct)
                except ValueError:
                    # Not numeric, and the texts are already known to differ.
                    return False
                tolerance = abs(correct_num) * 0.01 if abs(correct_num) > 1 else 0.01
                return abs(pred_num - correct_num) <= tolerance

            # "MCQ" and any unknown type: exact match
            return predicted == correct
        except Exception as e:
            logger.error(f"‚ùå Error comparing answers: {e}")
            return False
    
    def evaluate_single_question(self, question_data: pd.Series, run_id: int, question_idx: int) -> Optional[Dict]:
        """Ask the model one question and score its reply.

        Args:
            question_data: Dataset row for the question.
            run_id: Identifier of the enclosing run (recorded in the result).
            question_idx: Position of the question within the run.

        Returns:
            A result dict (answers, correctness, timing, truncated texts), or
            ``None`` if the model produced no response or an error occurred.
        """
        try:
            prompt = self.create_question_prompt(question_data)

            # Wall-clock time of the model call, retries included.
            began = time.time()
            response_text = self.client.generate_content(prompt)
            inference_time = time.time() - began

            if not response_text:
                logger.error(f"‚ùå Failed to get response for question {question_data.get('index', question_idx)}")
                return None

            predicted_answer = self.extract_answer(response_text, question_data['type'])
            is_correct = self.is_answer_correct(predicted_answer, question_data)

            # One log line per answered question.
            if is_correct:
                status = "‚úÖ CORRECT"
            else:
                status = "‚ùå WRONG"
            logger.info(f"Run {run_id} | Q{question_idx+1}/{len(self.df)}: {status} ({inference_time:.1f}s) | Predicted: {predicted_answer} | Gold: {question_data['gold']}")

            # Long texts are clipped to keep the stored results small.
            question_preview = question_data['question']
            if len(question_preview) > 200:
                question_preview = question_preview[:200] + "..."

            response_preview = response_text
            if len(response_preview) > 1000:
                response_preview = response_preview[:1000] + "..."

            record = {
                'run_id': run_id,
                'question_idx': question_idx,
                'dataset_index': question_data.get('index', question_idx),
                'subject': question_data['subject'],
                'question_type': question_data['type'],
                'question_text': question_preview,
                'correct_answer': question_data['gold'],
                'predicted_answer': predicted_answer,
                'is_correct': is_correct,
                'inference_time': inference_time,
                'full_response': response_preview
            }
            return record
        except Exception as e:
            logger.error(f"‚ùå Error evaluating question {question_data.get('index', question_idx)}: {e}")
            return None
    
    def run_single_evaluation_run(self, run_id: int) -> Optional[Dict]:
        """Run one pass over the dataset, resuming a partial pass if one exists.

        Questions are visited sequentially in an order that is shuffled
        deterministically from ``run_id``. While the run is in progress,
        ``self.state.current_run_question_idx`` holds the position of the NEXT
        question to process, so a resumed run neither repeats nor skips a
        question. State files written with the older meaning ("last answered
        position") are still handled: positions that already have a stored
        result are skipped instead of being asked again.

        State is saved every 5 questions and a partial-results snapshot is
        written every 25.

        Args:
            run_id: Identifier of the run to execute.

        Returns:
            The run summary dict for a finished run. ``None`` if the run
            produced no results, or if it was cut short by
            ``self.stop_requested`` (progress is then saved so the same run
            can be resumed later rather than being recorded as complete).
        """
        logger.info(f"\n{'='*60}")
        logger.info(f"üöÄ Starting Run {run_id}/{self.num_runs}")
        logger.info(f"{'='*60}")

        num_questions = len(self.df)

        def fresh_order():
            # A private Random seeded with the run id gives the same
            # permutation as seeding the module-level generator, without
            # disturbing global random state.
            order = list(range(num_questions))
            random.Random(run_id).shuffle(order)
            return order

        # Check if we're resuming this run
        is_resuming_run = (run_id == self.state.current_run and
                           self.state.current_run_question_idx > 0)

        if is_resuming_run:
            logger.info(f"üîÑ Resuming Run {run_id} from question {self.state.current_run_question_idx + 1}")
            results = list(self.state.current_run_results)
            shuffled_indices = list(self.state.current_run_shuffled_indices)
            start_idx = self.state.current_run_question_idx
            if len(shuffled_indices) != num_questions:
                # Without a usable order the loop below would do nothing and
                # the run would be closed with partial data. The order is a
                # pure function of run_id, so it can simply be rebuilt.
                logger.warning("Saved question order is missing or does not match the dataset; regenerating it from the run seed")
                shuffled_indices = fresh_order()
        else:
            logger.info(f"üÜï Starting new Run {run_id}")
            shuffled_indices = fresh_order()
            results = []
            start_idx = 0
            self.state.current_run = run_id
            self.state.current_run_question_idx = 0

        # The state references the working list directly, so every append
        # below is visible to the next save without copying the list.
        self.state.current_run_results = results
        self.state.current_run_shuffled_indices = shuffled_indices

        # Positions that already have a stored result (guards against asking
        # the same question twice after a resume).
        answered = {entry['question_idx'] for entry in results}

        run_start_time = time.time()
        stopped_early = False

        # Process questions sequentially starting from the resume point
        for idx in range(start_idx, len(shuffled_indices)):
            if self.stop_requested:
                logger.info("‚èπÔ∏è Stop requested, ending run early")
                stopped_early = True
                break

            if idx in answered:
                self.state.current_run_question_idx = idx + 1
                continue

            # Get the actual question data using shuffled index
            question_data = self.df.iloc[shuffled_indices[idx]]

            result = self.evaluate_single_question(question_data, run_id, idx)
            if result:
                results.append(result)
                answered.add(idx)
                self.state.completed_questions += 1
            else:
                # Keep a trace of questions that produced no result.
                self.state.failed_questions.append({
                    'run_id': run_id,
                    'question_idx': idx,
                    'dataframe_row': int(shuffled_indices[idx]),
                })

            # Point at the next unprocessed question.
            self.state.current_run_question_idx = idx + 1

            # Save state every 5 questions for minimal data loss
            if (idx + 1) % 5 == 0:
                self.save_state()
                progress = ((idx + 1) / len(shuffled_indices)) * 100
                logger.info(f"üìä Run {run_id} Progress: {progress:.1f}% ({idx + 1}/{len(shuffled_indices)} questions) [State Saved]")

            # Also save partial results every 25 questions
            if (idx + 1) % 25 == 0:
                self._save_partial_run_results(run_id, results, idx + 1)

        # NOTE: for a resumed run this covers the current session only.
        run_duration = time.time() - run_start_time

        if stopped_early:
            # Leave the per-run state untouched so the run can be resumed.
            self.save_state()
            logger.info(f"Run {run_id} paused with {len(results)} results; progress saved for resume")
            return None

        if not results:
            logger.error(f"‚ùå No valid results for run {run_id}")
            return None

        # Calculate accuracy
        correct_count = sum(1 for r in results if r['is_correct'])
        accuracy = (correct_count / len(results)) * 100

        # Create run summary
        run_summary = {
            'run_id': run_id,
            'model_name': self.model_name,
            'total_questions': len(results),
            'correct_answers': correct_count,
            'accuracy': accuracy,
            'duration': run_duration,
            'avg_time_per_question': run_duration / len(results),
            'timestamp': datetime.now().strftime("%Y%m%d_%H%M%S"),
            'results': results
        }

        # Reset run state since this run is complete (rebinding leaves the
        # list held by run_summary untouched).
        self.state.current_run_question_idx = 0
        self.state.current_run_results = []
        self.state.current_run_shuffled_indices = []

        logger.info(f"‚úÖ Run {run_id} completed: {accuracy:.2f}% accuracy ({correct_count}/{len(results)}) in {run_duration:.1f}s")
        logger.info(f"‚è±Ô∏è Average time per question: {run_summary['avg_time_per_question']:.1f}s")

        return run_summary
    
    def save_run_results(self, run_summary: Dict):
        """Write a finished run's summary to ``completed_runs`` as JSON.

        The summary is first passed through
        ``_convert_to_json_serializable``, as the partial-results writer
        does, because per-question fields taken from the DataFrame may be
        numpy scalars that ``json`` cannot encode. Partial snapshot files of
        the same run are removed afterwards.

        Args:
            run_summary: Dict providing at least ``'run_id'`` and
                ``'timestamp'``.
        """
        timestamp = run_summary['timestamp']
        filename = f"jeebench_qwen25vl_run_{run_summary['run_id']:02d}_{timestamp}.json"
        filepath = self.results_dir / "completed_runs" / filename

        serializable_summary = self._convert_to_json_serializable(run_summary)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(serializable_summary, f, indent=2, ensure_ascii=False)

        logger.info(f"üíæ Run {run_summary['run_id']} results saved to: completed_runs/{filename}")

        # Clean up partial results for this run
        self._cleanup_partial_results(run_summary['run_id'])
    
    def _cleanup_partial_results(self, run_id: int):
        """Clean up partial result files after a run is completed"""
        partial_dir = self.results_dir / "partial_results"
        if partial_dir.exists():
            # Find and remove partial files for this run
            for partial_file in partial_dir.glob(f"*_run_{run_id:02d}_partial_*"):
                try:
                    partial_file.unlink()
                    logger.debug(f"üóëÔ∏è Cleaned up partial file: {partial_file.name}")
                except Exception as e:
                    logger.warning(f"‚ö†Ô∏è Could not clean up partial file {partial_file}: {e}")
    
    def calculate_statistics(self, all_run_summaries: List[Dict]) -> Dict:
        """Summarise accuracy across runs.

        Args:
            all_run_summaries: Run summaries, each with an ``'accuracy'`` key.

        Returns:
            Dict with the number of runs, mean, sample standard deviation,
            standard error, min, max, a 95% Student-t confidence interval and
            the list of individual accuracies. With a single run, or when all
            runs have the same accuracy (zero standard error), the interval
            collapses to ``(mean, mean)`` instead of being NaN.

        Raises:
            ValueError: If ``all_run_summaries`` is empty.
        """
        accuracies = [summary['accuracy'] for summary in all_run_summaries]
        if not accuracies:
            raise ValueError("calculate_statistics requires at least one run summary")

        n = len(accuracies)
        mean_accuracy = np.mean(accuracies)

        if n > 1:
            std_accuracy = np.std(accuracies, ddof=1)
            sem_accuracy = stats.sem(accuracies)
        else:
            std_accuracy = 0
            sem_accuracy = 0

        # t.interval needs a strictly positive scale; with scale == 0 it
        # yields NaN bounds, so use the degenerate interval in that case.
        if sem_accuracy > 0:
            confidence_interval = stats.t.interval(
                0.95, n - 1,
                loc=mean_accuracy,
                scale=sem_accuracy
            )
        else:
            confidence_interval = (mean_accuracy, mean_accuracy)

        return {
            'num_runs': n,
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'sem_accuracy': sem_accuracy,
            'min_accuracy': np.min(accuracies),
            'max_accuracy': np.max(accuracies),
            'confidence_interval_95': confidence_interval,
            'individual_accuracies': accuracies
        }
    
    def analyze_convergence_and_variance(self, all_run_summaries: List[Dict]) -> Dict:
        """Track how the accuracy estimate evolves as runs are added.

        For every prefix of the runs with length k >= 3, the running mean,
        sample standard deviation, standard error, 95% t-interval, relative
        change of the mean versus the previous k (in percent), coefficient of
        variation (in percent) and ``interval width * k`` are recorded.

        Args:
            all_run_summaries: Run summaries, each with an ``'accuracy'`` key.

        Returns:
            Dict of parallel lists, one entry per k. All lists are empty when
            fewer than three runs are supplied.
        """
        scores = [entry['accuracy'] for entry in all_run_summaries]

        series_names = (
            'k_values',
            'running_means',
            'running_stds',
            'running_sems',
            'confidence_intervals',
            'relative_changes',
            'stability_metrics',
            'cost_effectiveness',
        )
        convergence_analysis = {name: [] for name in series_names}
        means_so_far = convergence_analysis['running_means']

        for k in range(3, len(scores) + 1):
            window = scores[:k]

            mean_acc = np.mean(window)
            std_acc = np.std(window, ddof=1)
            sem_acc = stats.sem(window)

            # A zero standard error gives no usable t-interval; collapse it.
            if sem_acc > 0:
                ci = stats.t.interval(0.95, k-1, loc=mean_acc, scale=sem_acc)
            else:
                ci = (mean_acc, mean_acc)
            ci_width = ci[1] - ci[0]

            # The first window has no predecessor to compare against.
            if k == 3:
                relative_change = np.inf
            else:
                prev_mean = means_so_far[-1]
                if prev_mean > 0:
                    relative_change = abs(mean_acc - prev_mean) / prev_mean * 100
                else:
                    relative_change = 0

            if mean_acc > 0:
                cv = std_acc / mean_acc * 100
            else:
                cv = np.inf

            row = (k, mean_acc, std_acc, sem_acc, ci, relative_change, cv, ci_width * k)
            for name, value in zip(series_names, row):
                convergence_analysis[name].append(value)

        return convergence_analysis
    
    def run_evaluation(self):
        """Execute all remaining runs, starting at the saved run number.

        After each run that yields a summary, the summary is written to disk,
        appended to the state, and the state is saved. Whatever happens
        (normal end, Ctrl-C, or an error, which is logged), the state is
        saved once more and a final report is produced if at least one run
        summary exists.
        """
        logger.info("üéØ Starting JEEBench Qwen 2.5 VL 7B Evaluation with Resume Capability")
        logger.info(f"üìö Dataset: {len(self.df)} questions")
        logger.info(f"ü§ñ Model: {self.model_name}")
        logger.info(f"üîÑ Total runs planned: {self.num_runs}")

        try:
            first_run = self.state.current_run
            for run_id in range(first_run, self.num_runs + 1):
                if self.stop_requested:
                    break

                run_summary = self.run_single_evaluation_run(run_id)
                if not run_summary:
                    # Nothing to record for this run; move on to the next.
                    continue

                # Persist the run, then advance and persist the state.
                self.save_run_results(run_summary)
                self.state.all_run_summaries.append(run_summary)
                self.state.current_run = run_id + 1
                self.save_state()

                # Progress line with a simple linear time estimate.
                progress = (run_id / self.num_runs) * 100
                elapsed = time.time() - self.state.start_time
                if run_id > 0:
                    eta = (elapsed / run_id) * (self.num_runs - run_id)
                else:
                    eta = 0

                if self.state.all_run_summaries:
                    avg_accuracy = np.mean([s['accuracy'] for s in self.state.all_run_summaries])
                    logger.info(f"üìä Progress: {progress:.1f}% | ETA: {eta/3600:.1f}h | Avg accuracy so far: {avg_accuracy:.2f}%")

        except KeyboardInterrupt:
            logger.info("‚èπÔ∏è Evaluation interrupted by user")
            self.interrupted = True
        except Exception as e:
            logger.error(f"‚ùå Error during evaluation: {e}")
        finally:
            # Runs on every exit path so progress is never lost.
            self.save_state()

            if self.state.all_run_summaries:
                self.generate_final_report()

        logger.info("üèÅ Evaluation session ended!")
    
    def generate_final_report(self):
        """Generate comprehensive final report"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_file = self.results_dir / f"jeebench_qwen25vl_final_report_{timestamp}.txt"
        
        stats = self.calculate_statistics(self.state.all_run_summaries)
        convergence_data = self.analyze_convergence_and_variance(self.state.all_run_summaries)
        
        # Collect all results for detailed analysis
        all_results = []
        for run_summary in self.state.all_run_summaries:
            all_results.extend(run_summary['results'])
        
        report_content = f"""JEEBench Qwen 2.5 VL 7B Evaluation Report
{'='*80}

Evaluation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Model: {self.model_name}
Dataset: JEEBench ({len(self.df)} questions)
Completed Runs: {len(self.state.all_run_summaries)}/{self.num_runs}
Evaluation Mode: Sequential (Single GPU)

PERFORMANCE SUMMARY
{'-'*40}
Mean Accuracy: {stats['mean_accuracy']:.2f}% ¬± {stats['std_accuracy']:.2f}%
Standard Error: {stats['sem_accuracy']:.2f}%
95% Confidence Interval: [{stats['confidence_interval_95'][0]:.2f}%, {stats['confidence_interval_95'][1]:.2f}%]
Range: {stats['min_accuracy']:.2f}% - {stats['max_accuracy']:.2f}%

PERFORMANCE BREAKDOWN
{'-'*40}
"""
        
        # Performance by category
        categories = ['subject', 'question_type']
        for category in categories:
            report_content += f"\nBy {category.title()}:\n"
            category_stats = defaultdict(lambda: {'correct': 0, 'total': 0})
            
            for result in all_results:
                cat_value = result[category]
                category_stats[cat_value]['total'] += 1
                if result['is_correct']:
                    category_stats[cat_value]['correct'] += 1
            
            for cat_value, stats_dict in sorted(category_stats.items()):
                accuracy = (stats_dict['correct'] / stats_dict['total']) * 100
                report_content += f"  {cat_value}: {accuracy:.2f}% ({stats_dict['correct']}/{stats_dict['total']})\n"
        
        # Add individual run accuracies
        report_content += f"\nINDIVIDUAL RUN ACCURACIES\n{'-'*40}\n"
        for i, accuracy in enumerate(stats['individual_accuracies'], 1):
            report_content += f"Run {i}: {accuracy:.2f}%\n"
        
        # Add timing information
        if all_results:
            avg_inference_time = np.mean([r['inference_time'] for r in all_results])
            total_inference_time = sum([r['inference_time'] for r in all_results])
            report_content += f"\nTIMING ANALYSIS\n{'-'*40}\n"
            report_content += f"Average inference time per question: {avg_inference_time:.2f}s\n"
            report_content += f"Total inference time: {total_inference_time/3600:.2f}h\n"
        
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report_content)
        
        logger.info(f"üìÑ Final report saved to: {report_file}")
        logger.info(f"üéØ Final Pass@1 Accuracy: {stats['mean_accuracy']:.2f}% ¬± {stats['std_accuracy']:.2f}%")

# Configuration for Jupyter Notebook

# Model configuration
# Model identifier passed to the evaluator by the helper functions below.
MODEL_NAME = "qwen/qwen2.5-vl-7b"  # Change this if your model has a different name in LM Studio
# Number of evaluation runs requested from the evaluator.
NUM_RUNS = 10

def run_evaluation():
    """Build an evaluator from the notebook settings and run it.

    The evaluator is constructed with the module-level ``NUM_RUNS`` and
    ``MODEL_NAME`` and its ``run_evaluation()`` method is invoked; nothing
    is returned.
    """
    JEEBenchQwen25VLEvaluator(NUM_RUNS, MODEL_NAME).run_evaluation()

def resume_evaluation():
    """Log a resume notice and run the evaluator.

    Apart from the log line this performs the same two steps as a fresh
    run: construct the evaluator from ``NUM_RUNS`` / ``MODEL_NAME`` and call
    its ``run_evaluation()``.
    NOTE(review): picking up the saved state presumably happens inside the
    evaluator itself — confirm against the class.
    """
    # Construct first so that any construction error surfaces before the notice is logged.
    resumed_evaluator = JEEBenchQwen25VLEvaluator(NUM_RUNS, MODEL_NAME)
    logger.info("üîÑ Resuming evaluation from saved state...")
    resumed_evaluator.run_evaluation()

def check_progress():
    """Print a summary of the evaluation progress without running anything.

    Constructs an evaluator from ``NUM_RUNS`` / ``MODEL_NAME`` and reads its
    ``state``, ``df``, ``state_file`` and ``results_dir`` attributes to
    report:

    * the current run and, if a run is under way, the position inside it;
    * overall completed/total question counts and percentage;
    * mean and sample standard deviation of completed-run accuracies, plus
      elapsed time and a linear estimate of the time remaining;
    * accuracy so far of the current (incomplete) run;
    * last save time, state file location and number of partial result files.

    Everything is written to stdout; nothing is returned.
    """
    evaluator = JEEBenchQwen25VLEvaluator(NUM_RUNS, MODEL_NAME)
    state = evaluator.state
    separator = '=' * 60

    print(f"\n{separator}")
    print("CURRENT EVALUATION PROGRESS")
    print(f"{separator}")
    print(f"Current Run: {state.current_run}/{NUM_RUNS}")

    if state.current_run_question_idx > 0:
        question_count = len(evaluator.df)
        question_number = state.current_run_question_idx + 1
        print(f"Current Run Progress: Question {question_number}/{question_count} within Run {state.current_run}")
        current_run_progress = (question_number / question_count) * 100
        print(f"Current Run Progress: {current_run_progress:.1f}%")

    print(f"Total Completed Questions: {state.completed_questions:,}/{state.total_questions:,}")
    # A state with zero total questions must not crash the report with ZeroDivisionError.
    if state.total_questions:
        overall_progress = (state.completed_questions / state.total_questions) * 100
    else:
        overall_progress = 0.0
    print(f"Overall Progress: {overall_progress:.1f}%")

    if state.all_run_summaries:
        accuracies = [s['accuracy'] for s in state.all_run_summaries]
        completed_runs = len(accuracies)
        # The sample standard deviation (ddof=1) is undefined for one run:
        # numpy would emit a RuntimeWarning and the line would read "nan%".
        # Report 0.00 until at least two runs are available.
        accuracy_std = np.std(accuracies, ddof=1) if completed_runs > 1 else 0.0
        print(f"Completed Runs: {completed_runs}")
        print(f"Average Accuracy: {np.mean(accuracies):.2f}% ¬± {accuracy_std:.2f}%")

        # Linear extrapolation: average wall time per finished run times runs left.
        elapsed = time.time() - state.start_time
        eta = (elapsed / completed_runs) * (NUM_RUNS - completed_runs)
        print(f"Elapsed Time: {elapsed/3600:.1f}h")
        print(f"Estimated Time Remaining: {eta/3600:.1f}h")

    # Show current run results if any
    if state.current_run_results:
        current_correct = sum(1 for r in state.current_run_results if r['is_correct'])
        current_accuracy = (current_correct / len(state.current_run_results)) * 100
        print(f"Current Run {state.current_run} so far: {current_accuracy:.1f}% ({current_correct}/{len(state.current_run_results)})")

    print(f"Last Save: {datetime.fromtimestamp(state.last_save_time).strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"State File: {evaluator.state_file}")

    # Check for partial results
    partial_dir = evaluator.results_dir / "partial_results"
    if partial_dir.exists():
        partial_files = list(partial_dir.glob("*.json"))
        if partial_files:
            print(f"Partial Result Files: {len(partial_files)}")
            print("(These will be cleaned up when runs complete)")

    print(f"{separator}\n")

def recover_from_partial_results():
    """List the partial result files so progress can be recovered by hand.

    Looks for ``*.json`` files in ``<results_dir>/partial_results`` of a
    freshly constructed evaluator and prints, for each one, the ``run_id``,
    ``questions_completed`` and ``partial_accuracy`` fields it contains.
    Fields that are absent are shown as ``?``. Files that cannot be opened
    or parsed are reported individually and do not stop the listing.

    Nothing is modified and nothing is returned; this is a read-only
    diagnostic that prints to stdout.
    """
    evaluator = JEEBenchQwen25VLEvaluator(NUM_RUNS, MODEL_NAME)
    partial_dir = evaluator.results_dir / "partial_results"

    if not partial_dir.exists():
        print("‚ùå No partial results directory found")
        return

    partial_files = sorted(partial_dir.glob("*.json"))
    if not partial_files:
        print("‚ùå No partial result files found")
        return

    print(f"üîç Found {len(partial_files)} partial result files:")
    for pf in partial_files:
        try:
            with open(pf, 'r') as f:
                data = json.load(f)
            # Only apply the numeric format to an actual number. Formatting the
            # '?' placeholder with ':.1f' raises ValueError, which used to make
            # a perfectly readable file show up as "Error reading file".
            accuracy = data.get('partial_accuracy')
            accuracy_text = f"{accuracy:.1f}" if isinstance(accuracy, (int, float)) else '?'
            print(f"  üìÑ {pf.name}: Run {data.get('run_id', '?')}, {data.get('questions_completed', '?')} questions, {accuracy_text}% accuracy")
        except Exception as e:
            # Best-effort listing: report the problem file and carry on with the rest.
            print(f"  ‚ùå {pf.name}: Error reading file - {e}")

    print("\nüí° You can manually review these files if needed for data recovery")
    print("üîÑ Try running resume_evaluation() to continue from the last saved state")

def reset_evaluation():
    """Delete the saved evaluation state after an interactive confirmation.

    Prompts on stdin; anything other than the exact text ``RESET`` cancels.
    On confirmation, removes the evaluator's ``state_file`` and its sibling
    with the ``.backup`` suffix, if they exist. Only these two files are
    touched — files under ``partial_results`` are not removed here.

    The outcome is printed per file, followed by an overall status line that
    reports success only when every removal actually succeeded.
    """
    evaluator = JEEBenchQwen25VLEvaluator(NUM_RUNS, MODEL_NAME)

    print("‚ö†Ô∏è  WARNING: This will delete all progress and start fresh!")
    confirm = input("Type 'RESET' to confirm: ")

    if confirm != "RESET":
        print("‚ùå Reset cancelled.")
        return

    # Main state file first, then its backup; keep only those present on disk.
    candidates = (
        evaluator.state_file,
        evaluator.state_file.with_suffix('.backup'),
    )
    files_to_remove = [path for path in candidates if path.exists()]

    if not files_to_remove:
        print("‚ÑπÔ∏è  No existing state files found.")
        return

    failed = 0
    for file_path in files_to_remove:
        try:
            file_path.unlink()
        except OSError as e:
            failed += 1
            print(f"‚ùå Error removing {file_path.name}: {e}")
        else:
            print(f"‚úÖ Removed: {file_path.name}")

    # Do not claim success when stale state is still on disk: a later run
    # would silently resume from it.
    if failed:
        print(f"‚ùå Reset incomplete: {failed} file(s) could not be removed.")
    else:
        print("‚úÖ Evaluation state reset successfully!")

def force_upgrade_state():
    """Construct an evaluator, then print the current progress.

    NOTE(review): the upgrade of an old state file presumably happens as a
    side effect of constructing the evaluator — nothing in this function
    upgrades anything explicitly; confirm against the class.

    Any exception raised while constructing the evaluator or while printing
    the progress is caught and reported on stdout together with a hint.
    Returns nothing.
    """
    try:
        # The instance itself is not used; only its construction matters here.
        _upgraded = JEEBenchQwen25VLEvaluator(NUM_RUNS, MODEL_NAME)
        print("‚úÖ State file upgraded successfully!")
        check_progress()
    except Exception as err:
        print(f"‚ùå Error upgrading state: {err}")
        print("üí° Try reset_evaluation() if the issue persists")

def load_jeebench_sample(n_samples: int = 5):
    """Download JEEBench, print an overview and a few example questions.

    Args:
        n_samples: Number of questions to preview. At most ``len(df)`` rows
            are drawn; the draw uses ``random_state=42`` so it is repeatable.

    Returns:
        The DataFrame built from the ``test`` split of
        ``daman1209arora/jeebench`` (all rows, not just the previewed ones).
    """
    from datasets import load_dataset

    print("üìö Loading JEEBench dataset...")
    dataset = load_dataset("daman1209arora/jeebench")
    df = pd.DataFrame(dataset['test'])

    # Overview of the whole split.
    print("\nüìä Dataset Info:")
    print(f"Total questions: {len(df)}")
    print(f"Columns: {list(df.columns)}")
    print(f"Question types: {df['type'].value_counts().to_dict()}")
    print(f"Subjects: {df['subject'].value_counts().to_dict()}")

    print(f"\nüîç Sample {n_samples} questions:")
    print("=" * 80)

    preview_chars = 200
    preview_rows = df.sample(n=min(n_samples, len(df)), random_state=42)

    for row_label, record in preview_rows.iterrows():
        # The label is the row's index in the full DataFrame, shown 1-based.
        print(f"\nüìù Question {row_label + 1}:")
        print(f"Subject: {record['subject']}")
        print(f"Type: {record['type']}")
        question_text = record['question']
        # Long questions are cut to the first 200 characters and marked with an ellipsis.
        tail = '...' if len(question_text) > preview_chars else ''
        print(f"Question: {question_text[:preview_chars]}{tail}")
        print(f"Gold Answer: {record['gold']}")
        print("-" * 60)

    return df

def test_model_connection():
    """Send one trivial prompt to the model named by ``MODEL_NAME``.

    Obtains a model handle via ``lms.llm`` and asks it a simple arithmetic
    question, printing the reply.

    Returns:
        ``True`` if the handle was obtained and the prompt answered without
        an exception; ``False`` otherwise, after printing the error and a
        troubleshooting checklist.
    """
    try:
        print("üîÑ Testing Qwen 2.5 VL model connection...")
        handle = lms.llm(MODEL_NAME)
        reply = handle.respond(
            "Hello! Can you solve this simple math problem: What is 2 + 2?"
        )
    except Exception as err:
        print(f"‚ùå Model connection failed: {err}")
        checklist = (
            "üîß Please ensure:",
            "  1. LM Studio is running",
            "  2. Qwen 2.5 VL 7B model is loaded",
            "  3. API server is enabled in LM Studio",
        )
        for line in checklist:
            print(line)
        print(f"  4. Model name '{MODEL_NAME}' is correct")
        return False

    print("‚úÖ Model connection successful!")
    print(f"üìù Test response: {reply}")
    return True

# Usage cheat-sheet for this notebook. This is a bare string literal, not
# executed code: none of the calls listed inside it run, and
# test_single_question() is NOT defined unless its text is copied into a cell.
"""
# 1. First, test if the model is accessible:
test_model_connection()

# 2. To inspect the dataset first:
df = load_jeebench_sample(10)

# 3. To start new evaluation:
run_evaluation()

# 4. To resume from where you left off:
resume_evaluation()

# 5. To check current progress:
check_progress()

# 6. If you get state compatibility errors, try:
force_upgrade_state()

# 7. To recover from partial results if state is corrupted:
recover_from_partial_results()

# 8. To reset everything (use carefully!):
reset_evaluation()

# Example of running a single question for testing:
def test_single_question():
    evaluator = JEEBenchQwen25VLEvaluator(1, MODEL_NAME)
    sample_question = evaluator.df.iloc[0]
    result = evaluator.evaluate_single_question(sample_question, 1, 0)
    print("Test result:", result)
    return result
"""

'\n# 1. First, test if the model is accessible:\ntest_model_connection()\n\n# 2. To inspect the dataset first:\ndf = load_jeebench_sample(10)\n\n# 3. To start new evaluation:\nrun_evaluation()\n\n# 4. To resume from where you left off:\nresume_evaluation()\n\n# 5. To check current progress:\ncheck_progress()\n\n# 6. If you get state compatibility errors, try:\nforce_upgrade_state()\n\n# 7. To recover from partial results if state is corrupted:\nrecover_from_partial_results()\n\n# 8. To reset everything (use carefully!):\nreset_evaluation()\n\n# Example of running a single question for testing:\ndef test_single_question():\n    evaluator = JEEBenchQwen25VLEvaluator(1, MODEL_NAME)\n    sample_question = evaluator.df.iloc[0]\n    result = evaluator.evaluate_single_question(sample_question, 1, 0)\n    print("Test result:", result)\n    return result\n\n# Note: With the new implementation, state is saved every 5 questions\n# and partial results are saved every 25 questions, so you\'ll lose 

In [3]:
test_model_connection()

2025-07-28 16:54:54,786 - INFO - {"client": "<lmstudio.sync_api.Client object at 0x0000026C52D90740>", "event": "Websocket handling thread started", "thread_id": "Thread-3"}
2025-07-28 16:54:54,789 - INFO - {"event": "Websocket handling task started", "ws_url": "ws://localhost:1234/llm"}


🔄 Testing Qwen 2.5 VL model connection...


2025-07-28 16:54:55,265 - INFO - HTTP Request: GET ws://localhost:1234/llm "HTTP/1.1 101 Switching Protocols"
2025-07-28 16:54:55,266 - INFO - {"event": "Websocket session established (ws://localhost:1234/llm)", "ws_url": "ws://localhost:1234/llm"}


✅ Model connection successful!
📝 Test response: Certainly! The answer to the equation \(2 + 2\) is:

\[4\]


True