In [None]:
import pandas as pd
import json
from collections import Counter
from tqdm import tqdm

def load_jsonl_file(filepath, max_lines=None):
    """
    Load a JSONL file line by line.

    Blank (whitespace-only) lines are skipped silently. Lines that are not
    valid JSON are reported on stdout and skipped.

    Args:
        filepath: Path to the JSONL file
        max_lines: Maximum number of physical lines to read (for testing).
            None reads the whole file; 0 reads nothing.

    Returns:
        List with one parsed JSON value per successfully parsed line
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Compare against None explicitly so that max_lines=0 means
            # "read no lines" rather than "no limit".
            if max_lines is not None and i >= max_lines:
                break
            line = line.strip()
            if not line:
                # A trailing or separating blank line is not a parse error.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                # i is the 0-based physical line index, so report i+1.
                print(f"Error parsing line {i+1}: {e}")
    return data

def filter_prolific_reviewers(reviews_data, min_reviews=20):
    """
    Keep only the reviews written by users who have at least min_reviews reviews.

    Progress and summary counts are printed to stdout.

    Args:
        reviews_data: List of review dictionaries; each must have a 'user_id' key
        min_reviews: Minimum number of reviews a user must have

    Returns:
        List of the reviews (in their original order) whose user qualifies
    """
    print("Counting reviews per user...")
    # Tally how many reviews each user wrote.
    tally = Counter()
    for entry in reviews_data:
        tally[entry['user_id']] += 1

    # Collect the users that reach the threshold.
    qualifying = set()
    for uid, n_reviews in tally.items():
        if n_reviews >= min_reviews:
            qualifying.add(uid)

    print(f"Found {len(qualifying)} users with >= {min_reviews} reviews")
    print(f"Total users: {len(tally)}")

    # Second pass: keep only reviews from qualifying users.
    kept = list(filter(lambda entry: entry['user_id'] in qualifying, reviews_data))

    print(f"Filtered reviews: {len(kept)} out of {len(reviews_data)}")

    return kept

def create_product_lookup(products_data):
    """
    Index products by their 'parent_asin' for constant-time lookup.

    Products whose 'parent_asin' is missing or falsy are left out. When
    several products share a parent_asin, the last one wins.

    Args:
        products_data: List of product dictionaries

    Returns:
        Dictionary mapping parent_asin to the product dictionary
    """
    keyed = ((item.get('parent_asin'), item) for item in products_data)
    return {asin: item for asin, item in keyed if asin}

def _join_text_items(items, separator):
    """Join a sequence of text fragments; a plain string is returned unchanged."""
    if isinstance(items, str):
        # Joining a str would insert the separator between every character.
        return items
    return separator.join(str(item) for item in items)


def merge_reviews_with_products(reviews_data, products_data):
    """
    Merge review data with product metadata.
    Only includes rows with all required fields (no missing data).

    A review is joined to a product through its 'parent_asin'. Reviews with
    no matching product, or whose product title / features / price or review
    rating is missing or empty, are skipped and counted in the printed summary.

    Args:
        reviews_data: List of review dictionaries
        products_data: List of product dictionaries

    Returns:
        Pandas DataFrame with merged data (no missing required fields).
        The 'timestamp' column is converted from epoch milliseconds to
        datetime; unparseable values become NaT.
    """
    print("Creating product lookup...")
    product_lookup = create_product_lookup(products_data)

    print("Merging reviews with products...")
    merged_data = []
    skipped_missing_data = 0
    skipped_no_product = 0

    for review in tqdm(reviews_data, desc="Merging data"):
        parent_asin = review.get('parent_asin')
        product = product_lookup.get(parent_asin) if parent_asin else None

        if product is None:
            skipped_no_product += 1
            continue

        # Required fields: product_title, review_rating, product_features, product_price
        product_title = product.get('title')
        review_rating = review.get('rating')
        product_features = product.get('features')
        product_price = product.get('price')

        has_required_fields = (
            product_title is not None and product_title != ''
            and review_rating is not None
            and product_features is not None and len(product_features) > 0
            and product_price is not None and product_price != ''
        )
        if not has_required_fields:
            skipped_missing_data += 1
            continue

        # 'details' may be present but None (or not a mapping); .get('details', {})
        # only covers the missing-key case, so guard the type explicitly.
        details = product.get('details')
        product_brand = details.get('Brand') if isinstance(details, dict) else None

        description = product.get('description')

        merged_data.append({
            # Review fields
            'review_rating': review_rating,
            'review_title': review.get('title'),
            'review_text': review.get('text'),
            'user_id': review.get('user_id'),
            'timestamp': review.get('timestamp'),
            'helpful_vote': review.get('helpful_vote'),
            'verified_purchase': review.get('verified_purchase'),
            'asin': review.get('asin'),

            # Product fields (required)
            'product_title': product_title,
            'product_features': _join_text_items(product_features, ' | '),
            'product_price': product_price,

            # Product fields (optional)
            'product_category': product.get('main_category'),
            'product_avg_rating': product.get('average_rating'),
            'product_rating_count': product.get('rating_number'),
            'product_brand': product_brand,
            'product_store': product.get('store'),
            'parent_asin': parent_asin,

            # Description fragments are flattened into one string when present.
            'product_description': _join_text_items(description, ' ') if description else None,
        })

    print(f"Successfully merged {len(merged_data)} records")
    print(f"Skipped {skipped_missing_data} records due to missing required fields")
    print(f"Skipped {skipped_no_product} records due to no matching product")

    # Create DataFrame
    df = pd.DataFrame(merged_data)

    # Convert timestamp to datetime (values are treated as epoch milliseconds)
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', errors='coerce')

    # Final verification - ensure no nulls in required columns. This catches
    # values such as NaN that pass the "is not None" checks above.
    required_cols = ['product_title', 'review_rating', 'product_features', 'product_price']
    for col in required_cols:
        if col in df.columns:
            null_count = df[col].isnull().sum()
            if null_count > 0:
                print(f"WARNING: Found {null_count} null values in {col}")
                df = df.dropna(subset=[col])

    return df

def main(reviews_filepath='input/All_Beauty.jsonl', 
         products_filepath='input/meta_All_Beauty.jsonl',
         min_reviews=20,
         sample_size=None):
    """
    Main function to process and merge product review data.
    Only keeps rows with all required fields (no missing data).

    Loads both JSONL files, keeps reviews from users with at least
    min_reviews reviews, merges them with product metadata and prints a
    summary. Nothing is written to disk here.

    Args:
        reviews_filepath: Path to reviews JSONL file
        products_filepath: Path to products metadata JSONL file
        min_reviews: Minimum number of reviews for a user to be included
        sample_size: If provided, only load this many lines from EACH file
            (for testing)

    Returns:
        Merged DataFrame with complete data only (may be empty)
    """
    print(f"Loading reviews from {reviews_filepath}...")
    reviews_data = load_jsonl_file(reviews_filepath, max_lines=sample_size)
    print(f"Loaded {len(reviews_data)} reviews")

    print(f"\nLoading products from {products_filepath}...")
    products_data = load_jsonl_file(products_filepath, max_lines=sample_size)
    print(f"Loaded {len(products_data)} products")

    print(f"\nFiltering for users with >= {min_reviews} reviews...")
    filtered_reviews = filter_prolific_reviewers(reviews_data, min_reviews)

    print("\nMerging data (only complete records)...")
    df = merge_reviews_with_products(filtered_reviews, products_data)

    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    print(f"Final DataFrame shape: {df.shape}")

    # An empty merge result has no columns at all, so the column lookups and
    # the percentage computation below would raise. Report and return early.
    if df.empty:
        print("No complete records were produced; nothing to summarize.")
        return df

    print(f"Unique users: {df['user_id'].nunique()}")
    print(f"Unique products: {df['parent_asin'].nunique()}")
    print(f"\nDataFrame columns: {list(df.columns)}")

    # Verify required fields have no missing data
    print("\n" + "="*50)
    print("REQUIRED FIELDS VERIFICATION")
    print("="*50)
    required_fields = ['product_title', 'review_rating', 'product_features', 'product_price']
    total = len(df)
    for field in required_fields:
        if field in df.columns:
            non_null = df[field].notna().sum()
            print(f"{field}: {non_null}/{total} non-null ({non_null/total*100:.1f}%)")

    print(f"\nFirst few rows:")
    print(df[required_fields].head())

    return df


# Example usage
def parse(data_type):
    """
    Run the merge pipeline for one product category and save the result as CSV.

    Recognized values of data_type are 'beauty', 'industrial' and
    'electronic'; any other value does nothing. For 'beauty' a few dataset
    statistics are printed after saving.

    Args:
        data_type: Name of the category to process
    """
    # Category -> (extra keyword arguments for main, output CSV path).
    # 'beauty' relies on main's default file paths.
    categories = {
        'beauty': (
            {},
            'input/beauty_merged_data.csv',
        ),
        'industrial': (
            {
                'reviews_filepath': 'input/Industrial_and_Scientific.jsonl',
                'products_filepath': 'input/meta_Industrial_and_Scientific.jsonl',
            },
            'input/ind_merged_data.csv',
        ),
        'electronic': (
            {
                'reviews_filepath': 'input/Electronics.jsonl',
                'products_filepath': 'input/meta_Electronics.jsonl',
            },
            'input/electronic_merged_data.csv',
        ),
    }

    selected = next(
        (config for name, config in categories.items() if data_type == name),
        None,
    )
    if selected is None:
        return

    path_overrides, output_file = selected
    df = main(min_reviews=20, **path_overrides)

    # Save to CSV
    df.to_csv(output_file, index=False)

    # Only the beauty run reports statistics.
    if data_type == 'beauty':
        print(f"\nData saved to {output_file}")

        print("\nDataset Statistics:")
        print(f"Average review rating: {df['review_rating'].mean():.2f}")
        print(f"Reviews per user: {df.groupby('user_id').size().mean():.1f}")
        print(f"Reviews per product: {df.groupby('parent_asin').size().mean():.1f}")
        print(f"Verified purchases: {df['verified_purchase'].sum()} ({df['verified_purchase'].mean()*100:.1f}%)")

# Run the merge pipeline for the 'electronic' category as soon as this cell
# executes (loads the Electronics JSONL files and writes the merged CSV).
parse('electronic')

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
import json

def _review_payload(review):
    """Extract the fields stored for one side of a test pair from a review row."""
    timestamp = review['timestamp']
    return {
        'asin': review['asin'],
        'parent_asin': review['parent_asin'],
        'product_title': review['product_title'],
        'product_features': review['product_features'],
        'product_price': review['product_price'],
        'rating': review['review_rating'],
        'timestamp': timestamp.isoformat() if pd.notna(timestamp) else None,
        'review_text': review['review_text'],
    }


def create_train_test_split(df, first_n_reviews=10):
    """
    Split data into train and test sets where:
    - Train: First N reviews (by timestamp) for each user
    - Test: Positive/negative review pairs from remaining reviews

    A remaining review is "positive" when its rating is >= 4 and "negative"
    when it is <= 2. For every user that has both kinds, every positive
    review is paired with every negative review. The input DataFrame is not
    modified.

    Args:
        df: DataFrame with merged review and product data
        first_n_reviews: Number of first reviews to use for training

    Returns:
        train_df: Training DataFrame
        test_df: Test DataFrame holding each review that takes part in at
            least one pair (de-duplicated on user_id, asin, timestamp)
        test_pairs: List of dictionaries containing pair information
    """
    # Parse timestamps on a new frame so the caller's DataFrame is untouched.
    df_sorted = (
        df.assign(timestamp=pd.to_datetime(df['timestamp']))
          .sort_values(['user_id', 'timestamp'])
    )

    train_data = []
    test_data = []
    test_pairs = []

    print("Processing users to create train/test split...")

    user_groups = df_sorted.groupby('user_id')

    for user_id, user_reviews in tqdm(user_groups, desc="Processing users"):

        user_reviews = user_reviews.sort_values('timestamp').reset_index(drop=True)

        # First N reviews go to training
        train_data.append(user_reviews.iloc[:first_n_reviews])

        # Remaining reviews for potential test pairs
        remaining_reviews = user_reviews.iloc[first_n_reviews:]
        if remaining_reviews.empty:
            continue

        positive_reviews = remaining_reviews[remaining_reviews['review_rating'] >= 4]
        negative_reviews = remaining_reviews[remaining_reviews['review_rating'] <= 2]
        if positive_reviews.empty or negative_reviews.empty:
            continue

        # Every positive and every negative review appears in at least one
        # pair, so add each row to the test set once per user instead of
        # building two one-row DataFrames per pair. The row order
        # (first positive, all negatives, remaining positives) matches the
        # order in which a pair-by-pair append would first encounter them.
        test_data.append(pd.concat([
            positive_reviews.iloc[:1],
            negative_reviews,
            positive_reviews.iloc[1:],
        ]))

        # Build each review's payload once, then form the cross product.
        positive_payloads = [_review_payload(row) for _, row in positive_reviews.iterrows()]
        negative_payloads = [_review_payload(row) for _, row in negative_reviews.iterrows()]
        for positive in positive_payloads:
            for negative in negative_payloads:
                test_pairs.append({
                    'user_id': user_id,
                    # Copy so pairs never share mutable dictionaries.
                    'positive_review': dict(positive),
                    'negative_review': dict(negative),
                })

    # Combine all training data
    if train_data:
        train_df = pd.concat(train_data, ignore_index=True)
    else:
        train_df = pd.DataFrame()

    # Combine all test data and remove duplicates
    if test_data:
        test_df = pd.concat(test_data, ignore_index=True)
        test_df = test_df.drop_duplicates(subset=['user_id', 'asin', 'timestamp'])
    else:
        test_df = pd.DataFrame()

    print(f"\nSplit complete!")
    print(f"Training set: {len(train_df)} reviews")
    print(f"Test set: {len(test_df)} reviews")
    print(f"Test pairs: {len(test_pairs)} positive/negative pairs")

    return train_df, test_df, test_pairs

def analyze_split(train_df, test_df, test_pairs):
    """
    Summarize a train/test split.

    Args:
        train_df: Training DataFrame
        test_df: Test DataFrame
        test_pairs: List of test pairs

    Returns:
        Dictionary with a 'train' and a 'test' section (review count, unique
        users/products, mean rating, rating histogram; 'test' also carries
        'total_pairs'). An 'overlap' section with user-overlap counts is
        added only when both frames are non-empty.
    """
    def summarize(frame):
        # Empty frames may lack the expected columns, so never index them.
        if len(frame) == 0:
            return {
                'total_reviews': len(frame),
                'unique_users': 0,
                'unique_products': 0,
                'avg_rating': 0,
                'rating_distribution': {},
            }
        ratings = frame['review_rating']
        return {
            'total_reviews': len(frame),
            'unique_users': frame['user_id'].nunique(),
            'unique_products': frame['parent_asin'].nunique(),
            'avg_rating': ratings.mean(),
            'rating_distribution': ratings.value_counts().to_dict(),
        }

    test_summary = summarize(test_df)
    test_summary['total_pairs'] = len(test_pairs)

    stats = {'train': summarize(train_df), 'test': test_summary}

    # User overlap is only meaningful when both sides contain reviews.
    if len(train_df) > 0 and len(test_df) > 0:
        in_train = set(train_df['user_id'].unique())
        in_test = set(test_df['user_id'].unique())
        stats['overlap'] = {
            'users_in_both': len(in_train & in_test),
            'users_only_train': len(in_train - in_test),
            'users_only_test': len(in_test - in_train),
        }

    return stats

def save_splits(train_df, test_df, test_pairs, output_prefix='beauty_split'):
    """
    Write the train/test splits, the pair list and split statistics to disk.

    All files go under 'input/' and are named '<output_prefix>_train.csv',
    '<output_prefix>_test.csv', '<output_prefix>_test_pairs.json' and
    '<output_prefix>_statistics.json'.

    Args:
        train_df: Training DataFrame
        test_df: Test DataFrame
        test_pairs: List of test pairs
        output_prefix: Prefix for output filenames
    """
    def write_json(payload, destination):
        # default=str stringifies any value json cannot encode natively.
        with open(destination, 'w') as handle:
            json.dump(payload, handle, indent=2, default=str)

    # CSV outputs: training data first, then test data.
    for label, frame, suffix in (("Training", train_df, "train"), ("Test", test_df, "test")):
        target = f'input/{output_prefix}_{suffix}.csv'
        frame.to_csv(target, index=False)
        print(f"{label} data saved to {target}")

    # Pair details
    pairs_file = f'input/{output_prefix}_test_pairs.json'
    write_json(test_pairs, pairs_file)
    print(f"Test pairs information saved to {pairs_file}")

    # Split statistics
    stats = analyze_split(train_df, test_df, test_pairs)
    stats_file = f'input/{output_prefix}_statistics.json'
    write_json(stats, stats_file)
    print(f"Statistics saved to {stats_file}")

def _preference_side(review):
    """Build one side ('preferred' or 'rejected') of a preference record."""
    return {
        'product_id': review['parent_asin'],
        'product_title': review['product_title'],
        'product_price': review['product_price'],
        'product_features': review['product_features'],
        'review_text': review['review_text'],
        'rating': review['rating'],
        'timestamp': review['timestamp'],
    }


def create_preference_dataset(test_pairs, output_file='preference_pairs.jsonl'):
    """
    Create a preference dataset from test pairs in a format suitable for preference learning.

    Each pair becomes one record with the positive review as 'preferred' and
    the negative review as 'rejected'. Records are written to output_file as
    JSON Lines (one JSON object per line).

    Args:
        test_pairs: List of test pairs; each has 'user_id', 'positive_review'
            and 'negative_review' entries
        output_file: Output filename for preference data

    Returns:
        List of the preference records that were written
    """
    # Both sides go through the same helper so the rejected record takes
    # every field (including price and features) from the NEGATIVE review.
    preference_data = [
        {
            'user_id': pair['user_id'],
            'preferred': _preference_side(pair['positive_review']),
            'rejected': _preference_side(pair['negative_review']),
        }
        for pair in test_pairs
    ]

    # Save as JSONL for easy streaming
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record) + '\n' for record in preference_data)

    print(f"Preference dataset saved to {output_file}")
    return preference_data

def main(input_file='beauty_merged_data.csv', first_n_reviews=5):
    """
    Main function to create train/test split with preference pairs.

    Reads 'input/<input_file>', builds the split, prints an analysis, saves
    the split files and, when any pairs exist, writes the preference dataset.

    Args:
        input_file: Name of the merged data CSV file inside 'input/'
        first_n_reviews: Number of first reviews per user for training

    Returns:
        Tuple (train_df, test_df, test_pairs)
    """
    # Output files are prefixed with the first four characters of the input
    # file name (e.g. 'beau', 'elec').
    # NOTE(review): this differs from save_splits' own default prefix
    # 'beauty_split' - confirm which names downstream readers expect.
    output_prefix = input_file[:4]
    print(f"Loading data from {input_file}...")
    df = pd.read_csv('input/' + input_file)
    print(f"Loaded {len(df)} reviews from {df['user_id'].nunique()} users")

    # Create train/test split
    train_df, test_df, test_pairs = create_train_test_split(df, first_n_reviews)

    # Analyze the split
    print("\n" + "="*50)
    print("SPLIT ANALYSIS")
    print("="*50)
    stats = analyze_split(train_df, test_df, test_pairs)

    print("\nTraining Set:")
    print(f"  - Total reviews: {stats['train']['total_reviews']}")
    print(f"  - Unique users: {stats['train']['unique_users']}")
    print(f"  - Unique products: {stats['train']['unique_products']}")
    print(f"  - Average rating: {stats['train']['avg_rating']:.2f}")

    print("\nTest Set:")
    print(f"  - Total reviews: {stats['test']['total_reviews']}")
    print(f"  - Unique users: {stats['test']['unique_users']}")
    print(f"  - Unique products: {stats['test']['unique_products']}")
    print(f"  - Average rating: {stats['test']['avg_rating']:.2f}")
    print(f"  - Total preference pairs: {stats['test']['total_pairs']}")

    if 'overlap' in stats:
        print("\nUser Overlap:")
        print(f"  - Users in both sets: {stats['overlap']['users_in_both']}")
        print(f"  - Users only in train: {stats['overlap']['users_only_train']}")
        print(f"  - Users only in test: {stats['overlap']['users_only_test']}")

    # Save the splits. The full pair list is deliberately not echoed to
    # stdout: it can be very large and is written to disk by save_splits.
    print("\n" + "="*50)
    print("SAVING FILES")
    print("="*50)
    save_splits(train_df, test_df, test_pairs, output_prefix=output_prefix)

    # Create preference dataset
    if test_pairs:
        create_preference_dataset(test_pairs)

    return train_df, test_df, test_pairs

# Example usage
if __name__ == "__main__":
    # Build the split from the merged Electronics data; first_n_reviews is
    # left at main's default.
    train_df, test_df, test_pairs = main(input_file = 'electronic_merged_data.csv')
    # Alternative: build the split from the merged Beauty data instead.
    #train_df, test_df, test_pairs = main(input_file = 'beauty_merged_data.csv')
    
    # Or specify custom parameters
    # train_df, test_df, test_pairs = main(
    #     input_file='your_merged_data.csv',
    #     first_n_reviews=10
    # )

Processing users:  45%|████▍     | 58558/130231 [15:43<15:26, 77.35it/s] 

In [None]:
import pandas as pd
import json
import time
from typing import Dict, List, Tuple, Optional
from datetime import datetime
from tqdm import tqdm
import numpy as np
from pathlib import Path

# Import the prompt template (assumes it's in the same directory or installed)
from prompt import BeautyPromptTemplateAdapter#BeautyPreferencePromptTemplate, BeautyPreferenceBatchProcessor

# Import the LLM client
from remoteOss import get_local_client


class BeautyPreferenceLLMEvaluator:
    """
    Evaluates LLM performance on beauty product preference prediction.
    """
    
    def __init__(
        self,
        train_df: pd.DataFrame,
        test_pairs_file: str = 'input/beauty_split_test_pairs.json',
        model_id: str = None,
        include_product_features: bool = True,
        include_review_text: bool = True,
        reasoning: bool = True,
        max_review_text_length: int = 150,
        temperature: float = 0.0,
        max_new_tokens: int = 256,
        seed: int = 42
    ):
        """
        Set up the prompt template, the LLM client and empty result stores.

        Args:
            train_df: Training DataFrame with user reviews
            test_pairs_file: Path to test pairs JSON file
            model_id: Model ID for the LLM (a falsy value lets the client
                pick its own default)
            include_product_features: Include features in prompts
            include_review_text: Include review text in prompts
            reasoning: Request reasoning from the LLM
            max_review_text_length: Max length for review text
            temperature: LLM temperature setting
            max_new_tokens: Max tokens for LLM response
            seed: Seed forwarded to the LLM call
        """
        # Generation settings forwarded to every LLM call.
        self.temperature, self.max_new_tokens, self.seed = temperature, max_new_tokens, seed

        # Accumulators filled during evaluation.
        self.results, self.errors = [], []

        # Prompt builder: all prompt-shaping options are passed straight through.
        template_options = dict(
            train_df=train_df,
            test_pairs_file=test_pairs_file,
            include_product_features=include_product_features,
            include_review_text=include_review_text,
            reasoning=reasoning,
            max_review_text_length=max_review_text_length,
        )
        self.prompt_template = BeautyPromptTemplateAdapter(**template_options)

        # LLM client: only pass model_id when one was actually supplied.
        client_options = {'model_id': model_id} if model_id else {}
        self.llm_client = get_local_client(**client_options)
    
    def evaluate_single_pair(self, pair_index: int) -> Dict:
        """
        Evaluate a single test pair.
        
        Args:
            pair_index: Index of the test pair
            
        Returns:
            Dictionary with evaluation results
        """
        try:
            # Generate prompt
            prompt, pair_info = self.prompt_template.format_from_test_pair(pair_index)
            print('PROMPT ' , prompt , pair_info)
            # Get evaluation info (ground truth)
            eval_info = self.prompt_template.evaluate_pair(pair_index)
            
            # Call LLM
            start_time = time.time()
            
            # Format prompt for the LLM's call_oracle method
            # Extract schedule-like data from the pair for compatibility
            sched_a = {
                'product': eval_info['product_a_title'],
                'rating': eval_info['product_a_rating']
            }
            sched_b = {
                'product': eval_info['product_b_title'],
                'rating': eval_info['product_b_rating']
            }
            
            choice, raw_response = self.llm_client.call_oracle(
                prompt=prompt,
                sched_a=sched_a,
                sched_b=sched_b,
                temperature=self.temperature,
                max_new_tokens=self.max_new_tokens,
                seed=self.seed,
                stop=["===", "---", "\n\n\n"]  # Stop sequences to prevent rambling
            )
            
            print('cho' , choice, 'raw ,' , raw_response)
            inference_time = time.time() - start_time
            
            # Check if prediction is correct
            is_correct = (choice == eval_info['ground_truth'])
            
            result = {
                'pair_index': pair_index,
                'user_id': eval_info['user_id'],
                'predicted': choice,
                'ground_truth': eval_info['ground_truth'],
                'correct': is_correct,
                'product_a': eval_info['product_a_title'],
                'product_b': eval_info['product_b_title'],
                'rating_a': eval_info['product_a_rating'],
                'rating_b': eval_info['product_b_rating'],
                'raw_response': raw_response,
                'inference_time': inference_time,
                'prompt_length': len(prompt),
                'response_length': len(raw_response)
            }
            
            return result
            
        except Exception as e:
            error = {
                'pair_index': pair_index,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }
            self.errors.append(error)
            print(f"Error evaluating pair {pair_index}: {e}")
            return None
    
    def evaluate_batch(
        self,
        pair_indices: List[int] = None,
        sample_size: int = None,
        verbose: bool = True,
        save_intermediate: bool = True,
        output_dir: str = 'evaluation_results'
    ) -> Dict:
        """
        Evaluate multiple test pairs.
        
        Args:
            pair_indices: Specific indices to evaluate
            sample_size: Random sample size (if pair_indices not provided)
            verbose: Print progress
            save_intermediate: Save results periodically
            output_dir: Directory to save results
            
        Returns:
            Dictionary with evaluation metrics
        """
        # Determine which pairs to evaluate
        if pair_indices is None:
            total_pairs = len(self.prompt_template.test_pairs)
            if sample_size:
                import random
                pair_indices = random.sample(range(total_pairs), min(sample_size, total_pairs))
            else:
                pair_indices = range(total_pairs)
        
        # Create output directory
        if save_intermediate:
            Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        # Evaluate each pair
        for idx in tqdm(pair_indices, desc="Evaluating pairs", disable=not verbose):
            result = self.evaluate_single_pair(idx)
            if result:
                self.results.append(result)
            
            # Save intermediate results
            if save_intermediate and len(self.results) % 10 == 0:
                self._save_intermediate_results(output_dir)
        
        # Final save
        if save_intermediate:
            self._save_intermediate_results(output_dir)
        
        # Calculate metrics
        metrics = self.calculate_metrics()
        
        #if verbose:
        #    self.print_metrics(metrics)
        
        return metrics
    
    def calculate_metrics(self) -> Dict:
        """
        Calculate evaluation metrics from results.
        
        Returns:
            Dictionary with various metrics
        """
        if not self.results:
            return {'error': 'No results to evaluate'}
        
        df = pd.DataFrame(self.results)
        
        # Basic accuracy
        accuracy = df['correct'].mean()
        
        # Per-user accuracy
        user_accuracy = df.groupby('user_id')['correct'].agg(['mean', 'count'])
        
        # Rating difference analysis
        df['rating_diff'] = df['rating_a'] - df['rating_b']
        
        # Accuracy by rating difference magnitude
        df['rating_diff_abs'] = df['rating_diff'].abs()
        accuracy_by_diff = df.groupby('rating_diff_abs')['correct'].agg(['mean', 'count'])
        
        # Inference time statistics
        avg_inference_time = df['inference_time'].mean()
        median_inference_time = df['inference_time'].median()
        
        # Response analysis
        avg_response_length = df['response_length'].mean()
        avg_prompt_length = df['prompt_length'].mean()
        
        metrics = {
            'overall_accuracy': accuracy,
            'total_evaluated': len(df),
            'total_errors': len(self.errors),
            'user_metrics': {
                'unique_users': df['user_id'].nunique(),
                'avg_accuracy_per_user': user_accuracy['mean'].mean(),
                'std_accuracy_per_user': user_accuracy['mean'].std(),
                'min_user_accuracy': user_accuracy['mean'].min(),
                'max_user_accuracy': user_accuracy['mean'].max()
            },
            'rating_difference_metrics': {
                'accuracy_by_diff': accuracy_by_diff.to_dict() if not accuracy_by_diff.empty else {},
                'avg_rating_diff': df['rating_diff'].mean(),
                'accuracy_when_diff_1': df[df['rating_diff_abs'] == 1]['correct'].mean() if len(df[df['rating_diff_abs'] == 1]) > 0 else None,
                'accuracy_when_diff_2': df[df['rating_diff_abs'] == 2]['correct'].mean() if len(df[df['rating_diff_abs'] == 2]) > 0 else None,
                'accuracy_when_diff_3': df[df['rating_diff_abs'] == 3]['correct'].mean() if len(df[df['rating_diff_abs'] == 3]) > 0 else None,
                'accuracy_when_diff_4': df[df['rating_diff_abs'] == 4]['correct'].mean() if len(df[df['rating_diff_abs'] == 4]) > 0 else None,
            },
            'performance_metrics': {
                'avg_inference_time': avg_inference_time,
                'median_inference_time': median_inference_time,
                'total_inference_time': df['inference_time'].sum(),
                'avg_response_length': avg_response_length,
                'avg_prompt_length': avg_prompt_length
            },
            'prediction_distribution': {
                'predicted_A': (df['predicted'] == 'A').sum(),
                'predicted_B': (df['predicted'] == 'B').sum(),
                'ground_truth_A': (df['ground_truth'] == 'A').sum(),
                'ground_truth_B': (df['ground_truth'] == 'B').sum()
            }
        }
        
        # Add confusion matrix
        from sklearn.metrics import confusion_matrix, classification_report
        
        try:
            cm = confusion_matrix(df['ground_truth'], df['predicted'], labels=['A', 'B'])
            metrics['confusion_matrix'] = {
                'true_A_pred_A': int(cm[0, 0]),
                'true_A_pred_B': int(cm[0, 1]),
                'true_B_pred_A': int(cm[1, 0]),
                'true_B_pred_B': int(cm[1, 1])
            }
            
            # Add classification report
            report = classification_report(
                df['ground_truth'], 
                df['predicted'], 
                labels=['A', 'B'],
                output_dict=True
            )
            metrics['classification_report'] = report
        except:
            pass
        
        return metrics
    
    def print_metrics(self, metrics: Dict):
        """
        Print formatted evaluation metrics.

        When ``metrics`` lacks the summary keys (for example an
        ``{'error': ...}`` placeholder produced when nothing was evaluated),
        only the header and that message are printed instead of raising
        ``KeyError``.

        Args:
            metrics: Dictionary of metrics to print
        """
        print("\n" + "="*60)
        print("EVALUATION RESULTS")
        print("="*60)

        # Nothing was evaluated: there are no sections to print.
        if 'total_evaluated' not in metrics:
            print(f"\n{metrics.get('error', 'No metrics available')}")
            return

        total_evaluated = metrics['total_evaluated']

        # NOTE(review): the overall-accuracy line was already disabled; it is
        # kept disabled here -- confirm whether it should be restored.
        #print(f"\nOverall Accuracy: {metrics['overall_accuracy']:.2%}")
        print(f"Total Pairs Evaluated: {total_evaluated}")
        print(f"Total Errors: {metrics['total_errors']}")

        print("\n--- User Metrics ---")
        user_m = metrics['user_metrics']
        print(f"Unique Users: {user_m['unique_users']}")
        print(f"Avg Accuracy per User: {user_m['avg_accuracy_per_user']:.2%}")
        print(f"Std Accuracy per User: {user_m['std_accuracy_per_user']:.2%}")
        print(f"Min User Accuracy: {user_m['min_user_accuracy']:.2%}")
        print(f"Max User Accuracy: {user_m['max_user_accuracy']:.2%}")

        print("\n--- Accuracy by Rating Difference ---")
        diff_m = metrics['rating_difference_metrics']
        for diff in [1, 2, 3, 4]:
            key = f'accuracy_when_diff_{diff}'
            # Buckets without any evaluated pair are stored as None and skipped.
            if diff_m.get(key) is not None:
                print(f"Rating Diff = {diff}: {diff_m[key]:.2%}")

        print("\n--- Performance Metrics ---")
        perf_m = metrics['performance_metrics']
        print(f"Avg Inference Time: {perf_m['avg_inference_time']:.2f}s")
        print(f"Median Inference Time: {perf_m['median_inference_time']:.2f}s")
        print(f"Avg Response Length: {perf_m['avg_response_length']:.0f} chars")
        print(f"Avg Prompt Length: {perf_m['avg_prompt_length']:.0f} chars")

        print("\n--- Prediction Distribution ---")
        pred_d = metrics['prediction_distribution']
        for label in ('A', 'B'):
            count = pred_d[f'predicted_{label}']
            # Guard the share so a zero total cannot raise ZeroDivisionError.
            share = count / total_evaluated if total_evaluated else 0
            print(f"Predicted {label}: {count} ({share:.1%})")

        if 'confusion_matrix' in metrics:
            print("\n--- Confusion Matrix ---")
            cm = metrics['confusion_matrix']
            print("           Predicted")
            print("           A      B")
            print(f"True A:  {cm['true_A_pred_A']:4d}  {cm['true_A_pred_B']:4d}")
            print(f"True B:  {cm['true_B_pred_A']:4d}  {cm['true_B_pred_B']:4d}")
    
    def _save_intermediate_results(self, output_dir: str):
        """Save intermediate results to files."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save results
        if self.results:
            results_file = Path(output_dir) / f"results_{timestamp}.json"
            with open(results_file, 'w') as f:
                json.dump(self.results, f, indent=2)
        
        # Save errors
        if self.errors:
            errors_file = Path(output_dir) / f"errors_{timestamp}.json"
            with open(errors_file, 'w') as f:
                json.dump(self.errors, f, indent=2)
        
        # Save metrics
        if self.results:
            metrics = self.calculate_metrics()
            metrics_file = Path(output_dir) / f"metrics_{timestamp}.json"
            with open(metrics_file, 'w') as f:
                json.dump(metrics, f, indent=2, default=str)
    
    def analyze_failures(self, n_examples: int = 5) -> Dict:
        """
        Analyze failure cases to understand where the model fails.

        ``failures_by_rating_diff`` counts the failures per absolute rating
        difference ``|rating_a - rating_b|``.

        Args:
            n_examples: Number of example failures to show

        Returns:
            Dictionary with failure analysis. ``{'error': ...}`` when there
            are no results and ``{'message': ...}`` when nothing failed.
        """
        if not self.results:
            return {'error': 'No results to analyze'}

        df = pd.DataFrame(self.results)
        failures = df[~df['correct'].astype(bool)]

        if failures.empty:
            return {'message': 'No failures found!'}

        # Group by the rating gap the key name promises, not by rating_a.
        rating_gap = (failures['rating_a'] - failures['rating_b']).abs()

        analysis = {
            'total_failures': len(failures),
            'failure_rate': len(failures) / len(df),
            'failures_by_rating_diff': failures.groupby(rating_gap).size().to_dict(),
            'example_failures': []
        }

        # Get example failures
        for _, row in failures.head(n_examples).iterrows():
            raw_response = row['raw_response']
            # Long responses are cut to 200 characters and marked with "...".
            if len(raw_response) > 200:
                snippet = raw_response[:200] + "..."
            else:
                snippet = raw_response

            analysis['example_failures'].append({
                'user_id': row['user_id'],
                'predicted': row['predicted'],
                'should_be': row['ground_truth'],
                'product_a': f"{row['product_a']} (rating: {row['rating_a']})",
                'product_b': f"{row['product_b']} (rating: {row['rating_b']})",
                'response_snippet': snippet
            })

        return analysis


def main():
    """
    Main function to run the evaluation.

    Loads the training reviews, builds the evaluator, runs a one-pair smoke
    test and, if the user answers ``y`` at the prompt, evaluates a sample of
    100 pairs, prints a failure analysis and writes a JSON summary into
    ``evaluation_results_full/``.
    """
    full_output_dir = 'evaluation_results_full'

    # Load training data
    print("Loading training data...")
    train_df = pd.read_csv('input/beauty_split_train.csv')
    print(f"Loaded {len(train_df)} training reviews from {train_df['user_id'].nunique()} users")

    # Initialize evaluator
    print("\nInitializing evaluator...")
    evaluator = BeautyPreferenceLLMEvaluator(
        train_df=train_df,
        test_pairs_file='input/beauty_split_test_pairs.json',
        include_product_features=True,
        include_review_text=True,
        reasoning=True,  # Request reasoning from LLM
        temperature=0.0,  # Deterministic for evaluation
        max_new_tokens=2560,
        seed=42
    )

    # Run evaluation
    print("\nStarting evaluation...")
    print("This may take a while depending on the number of test pairs and LLM speed.\n")

    # Evaluate a sample first to test; its metrics are only printed, not reused.
    print("Running initial test on 1 pairs...")
    evaluator.evaluate_batch(
        sample_size=1,
        verbose=True,
        save_intermediate=True,
        output_dir='evaluation_results_test'
    )

    # Ask user if they want to continue with full evaluation
    response = input("\nTest complete. Run full evaluation? (y/n): ")
    if response.lower() != 'y':
        return

    # Clear previous results so the smoke-test pair is not counted twice
    evaluator.results = []
    evaluator.errors = []

    # Run full evaluation
    print("\nRunning full evaluation...")
    metrics = evaluator.evaluate_batch(
        sample_size=100,  # Evaluates a sample of 100 pairs (not all pairs)
        verbose=True,
        save_intermediate=True,
        output_dir=full_output_dir
    )

    # Analyze failures
    print("\n" + "="*60)
    print("FAILURE ANALYSIS")
    print("="*60)
    failure_analysis = evaluator.analyze_failures(n_examples=5)

    if 'example_failures' in failure_analysis:
        print(f"\nTotal Failures: {failure_analysis['total_failures']}")
        print(f"Failure Rate: {failure_analysis['failure_rate']:.2%}")

        print("\nExample Failures:")
        for i, example in enumerate(failure_analysis['example_failures'], 1):
            print(f"\n{i}. User: {example['user_id']}")
            print(f"   Predicted: {example['predicted']}, Should be: {example['should_be']}")
            print(f"   Product A: {example['product_a']}")
            print(f"   Product B: {example['product_b']}")
            print(f"   Response: {example['response_snippet']}")

    # Save final results
    print("\nSaving final results...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    final_results = {
        'metrics': metrics,
        'failure_analysis': failure_analysis,
        'evaluation_summary': {
            'total_pairs_evaluated': len(evaluator.results),
            # .get(): the metrics dict has no accuracy when nothing was evaluated
            'overall_accuracy': metrics.get('overall_accuracy'),
            'timestamp': timestamp,
            'model_used': evaluator.llm_client.model_id
        }
    }

    # Make sure the target directory exists before writing the summary.
    Path(full_output_dir).mkdir(parents=True, exist_ok=True)
    summary_path = f'{full_output_dir}/final_summary_{timestamp}.json'

    with open(summary_path, 'w') as f:
        # default=str stringifies any value json cannot encode natively
        json.dump(final_results, f, indent=2, default=str)

    print(f"Results saved to {summary_path}")


# Run the evaluation only when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import json
import time
from typing import Dict, List, Tuple, Optional
from datetime import datetime
from tqdm import tqdm
import numpy as np
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio

# Import the prompt template (assumes it's in the same directory or installed)
from prompt import BeautyPromptTemplateAdapter

# Import the LLM client
from remoteOss import get_local_client


class BeautyPreferenceLLMEvaluator:
    """
    Evaluates LLM performance on beauty product preference prediction with batch processing support.

    Test pairs are turned into prompts by ``BeautyPromptTemplateAdapter``,
    sent to the LLM client in batches (several batches may run at once on a
    thread pool) and each answer is compared with the pair's ground truth.
    Successful evaluations accumulate in ``self.results``; failures are
    recorded in ``self.errors``.
    """

    # Stop sequences sent with every LLM request.
    _STOP_SEQUENCES = ("===", "---", "\n\n\n")

    # Number of completed batches between two intermediate saves.
    _SAVE_EVERY_N_BATCHES = 2

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_pairs_file: str = 'input/beauty_split_test_pairs.json',
        model_id: Optional[str] = None,
        include_product_features: bool = True,
        include_review_text: bool = True,
        reasoning: bool = True,
        max_review_text_length: int = 150,
        temperature: float = 0.0,
        max_new_tokens: int = 256,
        seed: int = 42,
        batch_size: int = 32,
        max_concurrent_batches: int = 2
    ):
        """
        Initialize the evaluator with batch processing support.

        Args:
            train_df: Training DataFrame with user reviews
            test_pairs_file: Path to test pairs JSON file
            model_id: Model ID for the LLM (None uses default)
            include_product_features: Include features in prompts
            include_review_text: Include review text in prompts
            reasoning: Request reasoning from the LLM
            max_review_text_length: Max length for review text
            temperature: LLM temperature setting
            max_new_tokens: Max tokens for LLM response
            seed: Random seed for reproducibility (LLM calls and pair sampling)
            batch_size: Number of queries to process in a single batch
            max_concurrent_batches: Maximum number of concurrent batch requests
        """
        # Initialize prompt template
        self.prompt_template = BeautyPromptTemplateAdapter(
            train_df=train_df,
            test_pairs_file=test_pairs_file,
            include_product_features=include_product_features,
            include_review_text=include_review_text,
            reasoning=reasoning,
            max_review_text_length=max_review_text_length
        )

        # Initialize LLM client
        if model_id:
            self.llm_client = get_local_client(model_id=model_id)
        else:
            self.llm_client = get_local_client()

        # LLM parameters
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens
        self.seed = seed

        # Batch processing parameters
        self.batch_size = batch_size
        self.max_concurrent_batches = max_concurrent_batches

        # Results storage
        self.results = []
        self.errors = []

    @staticmethod
    def _build_schedules(info: Dict) -> Tuple[Dict, Dict]:
        """Return the (product A, product B) descriptors passed to the LLM client."""
        sched_a = {
            'product': info['product_a_title'],
            'rating': info['product_a_rating']
        }
        sched_b = {
            'product': info['product_b_title'],
            'rating': info['product_b_rating']
        }
        return sched_a, sched_b

    @staticmethod
    def _json_default(value):
        """Fallback JSON encoder: unwrap scalars exposing ``.item()``, else use ``str``."""
        item = getattr(value, 'item', None)
        if callable(item):
            try:
                # numpy/pandas scalars become plain Python numbers or bools
                return item()
            except (TypeError, ValueError):
                pass
        return str(value)

    def prepare_batch_data(self, pair_indices: List[int]) -> Tuple[List[str], List[Dict]]:
        """
        Prepare batch data for multiple test pairs.

        A pair whose preparation raises is recorded in ``self.errors`` and
        left out of the batch, so both returned lists always have equal length.

        Args:
            pair_indices: List of test pair indices

        Returns:
            Tuple of (prompts, pair_infos) for the batch
        """
        prompts = []
        pair_infos = []

        for idx in pair_indices:
            try:
                prompt, pair_info = self.prompt_template.format_from_test_pair(idx)
                eval_info = self.prompt_template.evaluate_pair(idx)

                # Combine pair_info with eval_info for complete context;
                # on duplicate keys the eval_info value wins.
                complete_info = {
                    'pair_index': idx,
                    'prompt': prompt,
                    **pair_info,
                    **eval_info
                }
            except Exception as e:
                self.errors.append({
                    'pair_index': idx,
                    'error': f"Error preparing pair: {str(e)}",
                    'timestamp': datetime.now().isoformat()
                })
                continue

            prompts.append(prompt)
            pair_infos.append(complete_info)

        return prompts, pair_infos

    def process_batch_responses(
        self,
        responses: List[Tuple[str, str]],
        pair_infos: List[Dict],
        inference_time: float
    ) -> List[Dict]:
        """
        Process batch responses from the LLM.

        Responses are matched to pairs by position. A response that cannot be
        processed, and any pair left without a response, is recorded in
        ``self.errors`` instead of being dropped silently.

        Args:
            responses: List of (choice, raw_response) tuples from LLM
            pair_infos: List of pair information dictionaries
            inference_time: Total inference time for the batch

        Returns:
            List of result dictionaries
        """
        batch_results = []
        # The batch is timed as a whole; every item gets an equal share.
        avg_inference_time = inference_time / len(responses) if responses else 0

        for response, pair_info in zip(responses, pair_infos):
            try:
                # Unpack inside the try so one malformed response does not
                # abort the whole batch.
                choice, raw_response = response

                # Check if prediction is correct
                is_correct = (choice == pair_info['ground_truth'])

                result = {
                    'pair_index': pair_info['pair_index'],
                    'user_id': pair_info['user_id'],
                    'predicted': choice,
                    'ground_truth': pair_info['ground_truth'],
                    'correct': is_correct,
                    'product_a': pair_info['product_a_title'],
                    'product_b': pair_info['product_b_title'],
                    'rating_a': pair_info['product_a_rating'],
                    'rating_b': pair_info['product_b_rating'],
                    'raw_response': raw_response,
                    'inference_time': avg_inference_time,
                    'prompt_length': len(pair_info['prompt']),
                    'response_length': len(raw_response)
                }

                batch_results.append(result)

            except Exception as e:
                self.errors.append({
                    'pair_index': pair_info.get('pair_index', -1),
                    'error': f"Error processing response: {str(e)}",
                    'timestamp': datetime.now().isoformat()
                })

        # zip() stops at the shorter list: report pairs that got no response.
        for pair_info in pair_infos[len(responses):]:
            self.errors.append({
                'pair_index': pair_info.get('pair_index', -1),
                'error': "Error processing response: no response returned for this pair",
                'timestamp': datetime.now().isoformat()
            })

        return batch_results

    def call_batch_oracle(self, prompts: List[str], pair_infos: List[Dict]) -> List[Tuple[str, str]]:
        """
        Call the LLM with a batch of prompts.

        Uses the client's ``call_batch_oracle`` when it exists and otherwise
        falls back to one ``call_oracle`` request per prompt.

        Args:
            prompts: List of prompts to process
            pair_infos: List of pair information for context

        Returns:
            List of (choice, raw_response) tuples
        """
        schedules = [self._build_schedules(info) for info in pair_infos]
        stop = list(self._STOP_SEQUENCES)

        # Check if the LLM client supports batch processing
        if hasattr(self.llm_client, 'call_batch_oracle'):
            return self.llm_client.call_batch_oracle(
                prompts=prompts,
                schedules_a=[sched_a for sched_a, _ in schedules],
                schedules_b=[sched_b for _, sched_b in schedules],
                temperature=self.temperature,
                max_new_tokens=self.max_new_tokens,
                seed=self.seed,
                stop=stop
            )

        # Fallback to sequential processing if batch method not available
        responses = []
        for prompt, (sched_a, sched_b) in zip(prompts, schedules):
            choice, raw_response = self.llm_client.call_oracle(
                prompt=prompt,
                sched_a=sched_a,
                sched_b=sched_b,
                temperature=self.temperature,
                max_new_tokens=self.max_new_tokens,
                seed=self.seed,
                stop=stop
            )
            responses.append((choice, raw_response))

        return responses

    def evaluate_batch_concurrent(
        self,
        pair_indices: List[int] = None,
        sample_size: int = None,
        verbose: bool = True,
        save_intermediate: bool = True,
        output_dir: str = 'evaluation_results'
    ) -> Dict:
        """
        Evaluate multiple test pairs using batch processing with concurrency.

        Args:
            pair_indices: Specific indices to evaluate
            sample_size: Random sample size (if pair_indices not provided);
                the sample is drawn with ``self.seed`` so it is reproducible
            verbose: Print progress
            save_intermediate: Save results periodically
            output_dir: Directory to save results

        Returns:
            Dictionary with evaluation metrics

        Raises:
            ValueError: If ``self.batch_size`` is smaller than 1
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Determine which pairs to evaluate
        if pair_indices is None:
            total_pairs = len(self.prompt_template.test_pairs)
            if sample_size:
                import random
                # A dedicated seeded generator keeps the sample reproducible
                # without touching the global random state.
                rng = random.Random(self.seed)
                pair_indices = rng.sample(range(total_pairs), min(sample_size, total_pairs))
            else:
                pair_indices = list(range(total_pairs))
        else:
            pair_indices = list(pair_indices)

        # Create output directory
        if save_intermediate:
            Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Split into batches
        batches = [pair_indices[i:i + self.batch_size]
                   for i in range(0, len(pair_indices), self.batch_size)]

        if verbose:
            print(f"Processing {len(pair_indices)} pairs in {len(batches)} batches of size {self.batch_size}")

        # Process batches with concurrency control
        with ThreadPoolExecutor(max_workers=self.max_concurrent_batches) as executor:
            # Submit batches for processing, remembering each future's batch
            future_to_batch = {
                executor.submit(self._process_single_batch, batch_indices, batch_idx): batch_idx
                for batch_idx, batch_indices in enumerate(batches)
            }

            completed_batches = 0

            # Process completed batches
            for future in tqdm(as_completed(future_to_batch), total=len(future_to_batch),
                               desc="Processing batches", disable=not verbose):
                try:
                    batch_results = future.result()
                    self.results.extend(batch_results)
                    completed_batches += 1

                    # Save intermediate results on a batch count: the result
                    # count stops being a multiple of the batch size as soon
                    # as any pair fails, which would skip the saves.
                    if save_intermediate and completed_batches % self._SAVE_EVERY_N_BATCHES == 0:
                        self._save_intermediate_results(output_dir)

                except Exception as e:
                    print(f"Batch processing error: {e}")
                    self.errors.append({
                        'batch_idx': future_to_batch[future],
                        'error': f"Batch processing error: {str(e)}",
                        'timestamp': datetime.now().isoformat()
                    })

        # Final save
        if save_intermediate:
            self._save_intermediate_results(output_dir)

        # Calculate metrics
        metrics = self.calculate_metrics()

        if verbose:
            self.print_metrics(metrics)

        return metrics

    def _process_single_batch(self, batch_indices: List[int], batch_idx: int) -> List[Dict]:
        """
        Process a single batch of test pairs.

        Any exception is recorded in ``self.errors`` and an empty list is
        returned, so a failing batch never stops the others.

        Args:
            batch_indices: Indices for this batch
            batch_idx: Batch number (for logging)

        Returns:
            List of results for this batch
        """
        try:
            # Prepare batch data
            prompts, pair_infos = self.prepare_batch_data(batch_indices)
            if not prompts:
                return []

            # Call LLM with batch
            start_time = time.time()
            responses = self.call_batch_oracle(prompts, pair_infos)
            inference_time = time.time() - start_time

            # Process responses
            return self.process_batch_responses(responses, pair_infos, inference_time)

        except Exception as e:
            print(f"Error processing batch {batch_idx}: {e}")
            self.errors.append({
                'batch_idx': batch_idx,
                'batch_indices': batch_indices,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            })
            return []

    def calculate_metrics(self) -> Dict:
        """
        Calculate evaluation metrics from results.

        Counts are returned as plain ``int`` values so they stay numeric when
        the metrics are written as JSON. The classification report is
        best-effort: if scikit-learn is missing or rejects the data, the
        reason is stored under ``'classification_report_error'``.

        Returns:
            Dictionary with various metrics, or ``{'error': ...}`` when there
            are no results
        """
        if not self.results:
            return {'error': 'No results to evaluate'}

        df = pd.DataFrame(self.results)

        # Basic accuracy
        accuracy = df['correct'].mean()

        # Per-user accuracy
        user_accuracy = df.groupby('user_id')['correct'].agg(['mean', 'count'])

        # Rating difference analysis (signed and absolute)
        df['rating_diff'] = df['rating_a'] - df['rating_b']
        df['rating_diff_abs'] = df['rating_diff'].abs()
        accuracy_by_diff = df.groupby('rating_diff_abs')['correct'].agg(['mean', 'count'])

        rating_difference_metrics = {
            'accuracy_by_diff': accuracy_by_diff.to_dict() if not accuracy_by_diff.empty else {},
            'avg_rating_diff': df['rating_diff'].mean(),
        }
        for diff in (1, 2, 3, 4):
            bucket = df.loc[df['rating_diff_abs'] == diff, 'correct']
            # None marks a rating gap without any evaluated pair.
            rating_difference_metrics[f'accuracy_when_diff_{diff}'] = (
                bucket.mean() if len(bucket) > 0 else None
            )

        # Inference time statistics (per-item times are batch averages)
        total_inference_time = df['inference_time'].sum()

        truth = df['ground_truth']
        predicted = df['predicted']

        metrics = {
            'overall_accuracy': accuracy,
            'total_evaluated': len(df),
            'total_errors': len(self.errors),
            'batch_size_used': self.batch_size,
            'max_concurrent_batches': self.max_concurrent_batches,
            'user_metrics': {
                'unique_users': df['user_id'].nunique(),
                'avg_accuracy_per_user': user_accuracy['mean'].mean(),
                'std_accuracy_per_user': user_accuracy['mean'].std(),
                'min_user_accuracy': user_accuracy['mean'].min(),
                'max_user_accuracy': user_accuracy['mean'].max()
            },
            'rating_difference_metrics': rating_difference_metrics,
            'performance_metrics': {
                'avg_inference_time_per_item': df['inference_time'].mean(),
                'median_inference_time_per_item': df['inference_time'].median(),
                'total_inference_time': total_inference_time,
                'avg_response_length': df['response_length'].mean(),
                'avg_prompt_length': df['prompt_length'].mean(),
                'throughput_items_per_second': len(df) / total_inference_time if total_inference_time > 0 else 0
            },
            'prediction_distribution': {
                'predicted_A': int((predicted == 'A').sum()),
                'predicted_B': int((predicted == 'B').sum()),
                'ground_truth_A': int((truth == 'A').sum()),
                'ground_truth_B': int((truth == 'B').sum())
            }
        }

        # Add confusion matrix (rows: ground truth, columns: prediction).
        # Counted directly, so it does not depend on scikit-learn.
        metrics['confusion_matrix'] = {
            f'true_{t}_pred_{p}': int(((truth == t) & (predicted == p)).sum())
            for t in ('A', 'B')
            for p in ('A', 'B')
        }

        # Add classification report (optional extra)
        try:
            from sklearn.metrics import classification_report

            metrics['classification_report'] = classification_report(
                truth,
                predicted,
                labels=['A', 'B'],
                output_dict=True
            )
        except ImportError:
            metrics['classification_report_error'] = 'scikit-learn is not installed'
        except Exception as e:
            metrics['classification_report_error'] = str(e)

        return metrics

    def print_metrics(self, metrics: Dict):
        """
        Print formatted evaluation metrics.

        When ``metrics`` is the ``{'error': ...}`` dictionary returned by
        ``calculate_metrics`` for an empty result list, only the header and
        that message are printed instead of raising ``KeyError``.

        Args:
            metrics: Dictionary of metrics to print
        """
        print("\n" + "="*60)
        print("EVALUATION RESULTS")
        print("="*60)

        # Nothing was evaluated: there are no sections to print.
        if 'overall_accuracy' not in metrics:
            print(f"\n{metrics.get('error', 'No metrics available')}")
            return

        total_evaluated = metrics['total_evaluated']

        print(f"\nOverall Accuracy: {metrics['overall_accuracy']:.2%}")
        print(f"Total Pairs Evaluated: {total_evaluated}")
        print(f"Total Errors: {metrics['total_errors']}")
        print(f"Batch Size: {metrics['batch_size_used']}")
        print(f"Max Concurrent Batches: {metrics['max_concurrent_batches']}")

        print("\n--- User Metrics ---")
        user_m = metrics['user_metrics']
        print(f"Unique Users: {user_m['unique_users']}")
        print(f"Avg Accuracy per User: {user_m['avg_accuracy_per_user']:.2%}")
        print(f"Std Accuracy per User: {user_m['std_accuracy_per_user']:.2%}")
        print(f"Min User Accuracy: {user_m['min_user_accuracy']:.2%}")
        print(f"Max User Accuracy: {user_m['max_user_accuracy']:.2%}")

        print("\n--- Accuracy by Rating Difference ---")
        diff_m = metrics['rating_difference_metrics']
        for diff in [1, 2, 3, 4]:
            key = f'accuracy_when_diff_{diff}'
            # Buckets without any evaluated pair are stored as None and skipped.
            if diff_m.get(key) is not None:
                print(f"Rating Diff = {diff}: {diff_m[key]:.2%}")

        print("\n--- Performance Metrics ---")
        perf_m = metrics['performance_metrics']
        print(f"Avg Inference Time per Item: {perf_m['avg_inference_time_per_item']:.3f}s")
        print(f"Median Inference Time per Item: {perf_m['median_inference_time_per_item']:.3f}s")
        print(f"Total Inference Time: {perf_m['total_inference_time']:.1f}s")
        print(f"Throughput: {perf_m['throughput_items_per_second']:.1f} items/second")
        print(f"Avg Response Length: {perf_m['avg_response_length']:.0f} chars")
        print(f"Avg Prompt Length: {perf_m['avg_prompt_length']:.0f} chars")

        print("\n--- Prediction Distribution ---")
        pred_d = metrics['prediction_distribution']
        for label in ('A', 'B'):
            count = pred_d[f'predicted_{label}']
            # Guard the share so a zero total cannot raise ZeroDivisionError.
            share = count / total_evaluated if total_evaluated else 0
            print(f"Predicted {label}: {count} ({share:.1%})")

        if 'confusion_matrix' in metrics:
            print("\n--- Confusion Matrix ---")
            cm = metrics['confusion_matrix']
            print("           Predicted")
            print("           A      B")
            print(f"True A:  {cm['true_A_pred_A']:4d}  {cm['true_A_pred_B']:4d}")
            print(f"True B:  {cm['true_B_pred_A']:4d}  {cm['true_B_pred_B']:4d}")

    def _save_intermediate_results(self, output_dir: str):
        """
        Save intermediate results to files.

        Writes ``results_<ts>.json``, ``errors_<ts>.json`` and
        ``metrics_<ts>.json`` into ``output_dir``; empty collections are
        skipped. Values json cannot encode natively go through
        ``_json_default`` so they do not abort the save.

        Args:
            output_dir: Directory that receives the JSON files
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        def _write(kind, payload):
            target = Path(output_dir) / f"{kind}_{timestamp}.json"
            with open(target, 'w', encoding='utf-8') as f:
                json.dump(payload, f, indent=2, default=self._json_default)

        # Save results
        if self.results:
            _write("results", self.results)

        # Save errors
        if self.errors:
            _write("errors", self.errors)

        # Save metrics
        if self.results:
            _write("metrics", self.calculate_metrics())

    def analyze_failures(self, n_examples: int = 5) -> Dict:
        """
        Analyze failure cases to understand where the model fails.

        ``failures_by_rating_diff`` counts the failures per absolute rating
        difference ``|rating_a - rating_b|``.

        Args:
            n_examples: Number of example failures to show

        Returns:
            Dictionary with failure analysis. ``{'error': ...}`` when there
            are no results and ``{'message': ...}`` when nothing failed.
        """
        if not self.results:
            return {'error': 'No results to analyze'}

        df = pd.DataFrame(self.results)
        failures = df[~df['correct'].astype(bool)]

        if failures.empty:
            return {'message': 'No failures found!'}

        # Group by the rating gap the key name promises, not by rating_a.
        rating_gap = (failures['rating_a'] - failures['rating_b']).abs()

        analysis = {
            'total_failures': len(failures),
            'failure_rate': len(failures) / len(df),
            'failures_by_rating_diff': failures.groupby(rating_gap).size().to_dict(),
            'example_failures': []
        }

        # Get example failures
        for _, row in failures.head(n_examples).iterrows():
            raw_response = row['raw_response']
            # Long responses are cut to 200 characters and marked with "...".
            if len(raw_response) > 200:
                snippet = raw_response[:200] + "..."
            else:
                snippet = raw_response

            analysis['example_failures'].append({
                'user_id': row['user_id'],
                'predicted': row['predicted'],
                'should_be': row['ground_truth'],
                'product_a': f"{row['product_a']} (rating: {row['rating_a']})",
                'product_b': f"{row['product_b']} (rating: {row['rating_b']})",
                'response_snippet': snippet
            })

        return analysis


def main():
    """
    Main function to run the batch evaluation.

    Steps:
      1. Load the training split from ``input/beauty_split_train.csv``.
      2. Build the evaluator and run a 10-pair smoke test.
      3. Ask on stdin whether to continue; if the answer is not ``y``, stop.
      4. Optionally override batch size / concurrency, run the larger
         evaluation, print a failure analysis and write a JSON summary to
         ``evaluation_results_full/final_summary_<timestamp>.json``.
    """
    # Function-scope import: only needed to create the summary directory.
    from pathlib import Path

    def _read_positive_int(prompt):
        """Prompt once; return a positive int, or None to keep the current value."""
        answer = input(prompt).strip()
        # isdecimal() only admits characters int() can parse (isdigit() also
        # admits e.g. superscript digits), and 0 is rejected because a batch
        # size or concurrency of zero cannot process anything.
        if answer.isdecimal() and int(answer) > 0:
            return int(answer)
        return None

    full_output_dir = 'evaluation_results_full'

    # Load training data
    print("Loading training data...")
    train_df = pd.read_csv('input/beauty_split_train.csv')
    print(f"Loaded {len(train_df)} training reviews from {train_df['user_id'].nunique()} users")

    # Initialize evaluator with batch processing
    print("\nInitializing evaluator with batch processing...")
    evaluator = BeautyPreferenceLLMEvaluator(
        train_df=train_df,
        test_pairs_file='input/beauty_split_test_pairs.json',
        include_product_features=True,
        include_review_text=True,
        reasoning=True,
        temperature=0.0,
        max_new_tokens=2048,
        seed=42,
        batch_size=32,  # Process 32 queries at once
        max_concurrent_batches=2  # Allow 2 concurrent batch requests
    )

    # Run evaluation
    print("\nStarting batch evaluation...")
    print("This should be significantly faster with batch processing.\n")

    # Test with a small batch first
    print("Running initial test on 10 pairs...")
    test_metrics = evaluator.evaluate_batch_concurrent(
        sample_size=10,
        verbose=True,
        save_intermediate=True,
        output_dir='evaluation_results_test'
    )

    print(f"\nTest batch completed. Throughput: {test_metrics['performance_metrics']['throughput_items_per_second']:.1f} items/second")

    # Ask user if they want to continue with full evaluation
    response = input("\nTest complete. Run full evaluation? (y/n): ")
    if response.strip().lower() != 'y':
        return

    # Clear previous results so the smoke-test pairs are not counted twice
    evaluator.results = []
    evaluator.errors = []

    # You can adjust batch size based on your VLLM server capacity
    batch_size = _read_positive_int("Enter batch size (default 32, max recommended 128): ")
    if batch_size is not None:
        evaluator.batch_size = batch_size

    concurrent = _read_positive_int("Enter max concurrent batches (default 2): ")
    if concurrent is not None:
        evaluator.max_concurrent_batches = concurrent

    # Run full evaluation
    print(f"\nRunning full evaluation with batch_size={evaluator.batch_size}, max_concurrent={evaluator.max_concurrent_batches}...")

    start_time = time.time()
    metrics = evaluator.evaluate_batch_concurrent(
        # NOTE(review): hard-coded sample size; the previous comment claimed
        # this covers all pairs - confirm against the test-pairs file.
        sample_size=320,
        verbose=True,
        save_intermediate=True,
        output_dir=full_output_dir
    )
    total_time = time.time() - start_time

    print(f"\nTotal evaluation time: {total_time:.1f} seconds")
    print(f"Average throughput: {metrics['performance_metrics']['throughput_items_per_second']:.1f} items/second")

    # Analyze failures
    print("\n" + "="*60)
    print("FAILURE ANALYSIS")
    print("="*60)
    failure_analysis = evaluator.analyze_failures(n_examples=5)

    # 'example_failures' is only present when at least one failure exists
    if 'example_failures' in failure_analysis:
        print(f"\nTotal Failures: {failure_analysis['total_failures']}")
        print(f"Failure Rate: {failure_analysis['failure_rate']:.2%}")

        print("\nExample Failures:")
        for i, example in enumerate(failure_analysis['example_failures'], 1):
            print(f"\n{i}. User: {example['user_id']}")
            print(f"   Predicted: {example['predicted']}, Should be: {example['should_be']}")
            print(f"   Product A: {example['product_a']}")
            print(f"   Product B: {example['product_b']}")
            print(f"   Response: {example['response_snippet']}")

    # Save final results
    print("\nSaving final results...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    final_results = {
        'metrics': metrics,
        'failure_analysis': failure_analysis,
        'evaluation_summary': {
            'total_pairs_evaluated': len(evaluator.results),
            'overall_accuracy': metrics['overall_accuracy'],
            'total_evaluation_time': total_time,
            'throughput_items_per_second': metrics['performance_metrics']['throughput_items_per_second'],
            'batch_size': evaluator.batch_size,
            'max_concurrent_batches': evaluator.max_concurrent_batches,
            'timestamp': timestamp,
            'model_used': getattr(evaluator.llm_client, 'model_id', 'unknown')
        }
    }

    # Do not rely on the evaluator having created the directory already.
    Path(full_output_dir).mkdir(parents=True, exist_ok=True)
    summary_path = f'{full_output_dir}/final_summary_{timestamp}.json'
    with open(summary_path, 'w') as f:
        # default=str stringifies anything json cannot encode natively
        json.dump(final_results, f, indent=2, default=str)

    print(f"Results saved to {summary_path}")


# Entry point: run the batch evaluation only when this code is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    main()

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

def load_results(filepath):
    """Load the JSON results file.

    Args:
        filepath: Path to the JSON file

    Returns:
        The decoded JSON content

    Raises:
        FileNotFoundError: If ``filepath`` does not exist
        json.JSONDecodeError: If the file content is not valid JSON
    """
    # Read as UTF-8 explicitly (like load_jsonl_file does) so the result does
    # not depend on the platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def compute_metrics(data):
    """Compute various evaluation metrics from the results.

    Args:
        data: Records convertible by ``pd.DataFrame``. The columns read here
            are ``correct``, ``ground_truth``, ``predicted``, ``rating_a``,
            ``rating_b``, ``user_id``, ``inference_time``, ``prompt_length``
            and ``response_length``.

    Returns:
        Dictionary with overall and per-class accuracy, the 2x2 confusion
        matrix (label order ``['A', 'B']``), the classification report,
        accuracy per absolute rating gap, per-user accuracy, timing/length
        statistics and the augmented DataFrame (key ``dataframe``).

    Raises:
        ValueError: If ``data`` contains no records
    """
    # Convert to DataFrame for easier analysis
    df = pd.DataFrame(data)
    if df.empty:
        # Fail with a clear message instead of a KeyError on the first column
        raise ValueError("No predictions to compute metrics from")

    labels = ['A', 'B']

    # Basic accuracy
    accuracy = df['correct'].mean()
    total_predictions = len(df)
    correct_predictions = df['correct'].sum()

    # Get predictions and ground truth
    y_true = df['ground_truth'].values
    y_pred = df['predicted'].values

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Classification report
    class_report = classification_report(y_true, y_pred, labels=labels, output_dict=True)

    # Per-class accuracy (0 when a class has no ground-truth samples)
    class_a_mask = df['ground_truth'] == 'A'
    class_b_mask = df['ground_truth'] == 'B'

    accuracy_a = df[class_a_mask]['correct'].mean() if class_a_mask.any() else 0
    accuracy_b = df[class_b_mask]['correct'].mean() if class_b_mask.any() else 0

    # Rating difference analysis
    df['rating_diff'] = df['rating_a'] - df['rating_b']
    df['abs_rating_diff'] = df['rating_diff'].abs()

    # Accuracy by rating difference magnitude.
    # Edges sit on the half-steps so every whole-number gap 0..5 gets its own
    # bin; edges on the integers themselves merged gap 0 and gap 1 into one
    # bin because the intervals are right-closed.
    gap_labels = [0, 1, 2, 3, 4, 5]
    gap_edges = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
    df['rating_diff_bin'] = pd.cut(df['abs_rating_diff'], bins=gap_edges, labels=gap_labels)
    # observed=False keeps empty bins in the table and makes the behavior
    # independent of the changing pandas default for categorical groupers.
    accuracy_by_rating_diff = (
        df.groupby('rating_diff_bin', observed=False)['correct'].agg(['mean', 'count'])
    )

    # User-level accuracy (if multiple predictions per user)
    user_accuracy = df.groupby('user_id')['correct'].mean()

    # Inference time statistics
    inference_time = df['inference_time']
    inference_stats = {
        'mean': inference_time.mean(),
        'median': inference_time.median(),
        'std': inference_time.std(),
        'min': inference_time.min(),
        'max': inference_time.max()
    }

    # Prompt and response length statistics
    prompt_length = df['prompt_length']
    prompt_stats = {
        'mean': prompt_length.mean(),
        'median': prompt_length.median(),
        'std': prompt_length.std()
    }

    response_length = df['response_length']
    response_stats = {
        'mean': response_length.mean(),
        'median': response_length.median(),
        'std': response_length.std()
    }

    return {
        'overall_accuracy': accuracy,
        'total_predictions': total_predictions,
        'correct_predictions': correct_predictions,
        'confusion_matrix': cm,
        'classification_report': class_report,
        'accuracy_class_A': accuracy_a,
        'accuracy_class_B': accuracy_b,
        'accuracy_by_rating_diff': accuracy_by_rating_diff,
        'user_accuracy': user_accuracy,
        'inference_time_stats': inference_stats,
        'prompt_length_stats': prompt_stats,
        'response_length_stats': response_stats,
        'dataframe': df
    }

def plot_confusion_matrix(cm, title='Confusion Matrix'):
    """Draw ``cm`` as an annotated heatmap and display it.

    Args:
        cm: 2x2 matrix of integer counts; rows are labelled as true classes
            and columns as predicted classes (A, then B)
        title: Text shown above the heatmap
    """
    plt.figure(figsize=(8, 6))

    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': ['Predicted A', 'Predicted B'],
        'yticklabels': ['True A', 'True B'],
    }
    # heatmap() draws into the current axes and hands that axes back
    axes = sns.heatmap(cm, **heatmap_options)

    axes.set_title(title)
    axes.set_ylabel('True Label')
    axes.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

def plot_accuracy_by_rating_diff(accuracy_by_diff):
    """Show two side-by-side bar charts: accuracy and sample count per rating gap.

    Args:
        accuracy_by_diff: DataFrame with ``mean`` and ``count`` columns,
            indexed by rating-difference bin. Nothing is drawn when it is empty.
    """
    if accuracy_by_diff.empty:
        return

    _, axes = plt.subplots(1, 2, figsize=(12, 5))

    # (column, bar colour, panel title, y-axis label) for the left and right panel
    panels = [
        ('mean', 'skyblue', 'Accuracy by Rating Difference', 'Accuracy'),
        ('count', 'lightcoral', 'Sample Count by Rating Difference', 'Number of Samples'),
    ]
    for axis, (column, colour, heading, y_label) in zip(axes, panels):
        accuracy_by_diff[column].plot(kind='bar', ax=axis, color=colour)
        axis.set_title(heading)
        axis.set_xlabel('Absolute Rating Difference')
        axis.set_ylabel(y_label)
        # Tilt the bin labels so they stay readable
        axis.set_xticklabels(axis.get_xticklabels(), rotation=45)

    plt.tight_layout()
    plt.show()

def print_summary(metrics):
    """Write a human-readable report of the evaluation metrics to stdout.

    Args:
        metrics: Dictionary providing the keys read below: overall and
            per-class accuracy, prediction counts, a 2x2 ``confusion_matrix``
            indexable as ``cm[i, j]``, a ``classification_report`` keyed by
            ``'A'``/``'B'``, the three ``*_stats`` dictionaries, the
            ``accuracy_by_rating_diff`` table and the ``user_accuracy`` series.
    """
    rule = "=" * 60
    class_labels = ('A', 'B')

    print(rule)
    print("EVALUATION RESULTS SUMMARY")
    print(rule)

    print("\nOverall Performance:")
    print(f"  Total Predictions: {metrics['total_predictions']}")
    print(f"  Correct Predictions: {metrics['correct_predictions']}")
    overall = metrics['overall_accuracy']
    print(f"  Overall Accuracy: {overall:.4f} ({overall*100:.2f}%)")

    print("\nPer-Class Accuracy:")
    for label in class_labels:
        class_accuracy = metrics[f'accuracy_class_{label}']
        print(f"  Class {label} Accuracy: {class_accuracy:.4f} ({class_accuracy*100:.2f}%)")

    print("\nConfusion Matrix:")
    print("  Rows: True labels, Columns: Predicted labels")
    cm = metrics['confusion_matrix']
    print("           Pred A   Pred B")
    for row_pos, label in enumerate(class_labels):
        print(f"  True {label}:   {cm[row_pos,0]:5d}   {cm[row_pos,1]:5d}")

    print("\nClassification Report:")
    for label in class_labels:
        report = metrics['classification_report'][label]
        print(f"  Class {label}:")
        print(f"    Precision: {report['precision']:.4f}")
        print(f"    Recall: {report['recall']:.4f}")
        print(f"    F1-Score: {report['f1-score']:.4f}")
        print(f"    Support: {report['support']}")

    print("\nInference Time Statistics (seconds):")
    timing = metrics['inference_time_stats']
    timing_rows = (('Mean', 'mean'), ('Median', 'median'), ('Std Dev', 'std'),
                   ('Min', 'min'), ('Max', 'max'))
    for caption, key in timing_rows:
        print(f"  {caption}: {timing[key]:.2f}")

    # Prompt and response lengths share the same three-line layout
    length_sections = (
        ("\nPrompt Length Statistics:", 'prompt_length_stats'),
        ("\nResponse Length Statistics:", 'response_length_stats'),
    )
    for heading, stats_key in length_sections:
        print(heading)
        stats = metrics[stats_key]
        for caption, key in (('Mean', 'mean'), ('Median', 'median'), ('Std Dev', 'std')):
            print(f"  {caption}: {stats[key]:.0f}")

    print("\nAccuracy by Rating Difference:")
    by_gap = metrics['accuracy_by_rating_diff']
    if not by_gap.empty:
        for bin_label, mean_value, sample_count in zip(by_gap.index, by_gap['mean'], by_gap['count']):
            print(f"  {bin_label}: {mean_value:.4f} (n={int(sample_count)})")

    print("\nUser-Level Statistics:")
    per_user = metrics['user_accuracy']
    print(f"  Number of unique users: {len(per_user)}")
    print(f"  Mean user accuracy: {per_user.mean():.4f}")
    print(f"  Std dev of user accuracy: {per_user.std():.4f}")

def main(filepath=None):
    """Load a results file, print the metric summary and show the plots.

    Args:
        filepath: Path to the JSON results file. When omitted, the path of the
            original evaluation run is used, so ``main()`` behaves as before.

    Returns:
        The metrics dictionary produced by ``compute_metrics``, or ``None``
        when the file is missing, is not valid JSON, or holds no predictions.
    """
    if filepath is None:
        # Historical default kept for backward compatibility
        filepath = '/home/asj53/LISTEN/LISTEN/evaluation_results_full/results_20250824_131702.json'

    # Load the results
    try:
        data = load_results(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in file {filepath}")
        return None
    print(f"Successfully loaded {len(data)} predictions from {filepath}")

    if not data:
        # Nothing to score; the metric code needs at least one record
        print(f"Error: No predictions found in {filepath}")
        return None

    # Compute metrics
    metrics = compute_metrics(data)

    # Print summary
    print_summary(metrics)

    # Plot visualizations
    plot_confusion_matrix(metrics['confusion_matrix'])
    plot_accuracy_by_rating_diff(metrics['accuracy_by_rating_diff'])

    # Additional analysis on the frame that compute_metrics augmented
    # (it carries the 'abs_rating_diff' column used below)
    df = metrics['dataframe']

    # Analyze errors
    errors = df[~df['correct']]
    if not errors.empty:
        print(f"\n{'='*60}")
        print("ERROR ANALYSIS")
        print(f"{'='*60}")
        print(f"Total errors: {len(errors)}")
        print(f"Error rate: {len(errors)/len(df):.4f}")

        # Error distribution
        error_dist = errors.groupby(['ground_truth', 'predicted']).size()
        print("\nError distribution:")
        for (true_label, pred_label), count in error_dist.items():
            print(f"  True: {true_label}, Predicted: {pred_label}: {count} errors")

        # Errors by rating difference
        print(f"\nAverage rating difference for errors: {errors['abs_rating_diff'].mean():.2f}")
        print(f"Average rating difference for correct: {df[df['correct']]['abs_rating_diff'].mean():.2f}")

    return metrics

# Entry point: run the analysis when executed directly. The result is bound to
# the module-level name `metrics`, so it stays available after main() returns.
if __name__ == "__main__":
    metrics = main()