In [None]:
# Async-optimized Goodreads Book Emotion Analysis for Jupyter
import torch
import pandas as pd
import numpy as np
import asyncio
import aiohttp
import nest_asyncio
from PIL import Image
import io
import os
import sys
import json
import pickle
import time
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import threading
from queue import Queue
import warnings
from tqdm.notebook import tqdm
import re

# Enable nested async loops for Jupyter
# (analyze_goodreads_emotions() below calls asyncio.run(), which needs this
# patch when the notebook kernel already has an event loop running.)
nest_asyncio.apply()
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Setup logging for Jupyter
# INFO-level records go both to a log file in the working directory and to
# the default stream handler, using a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('jupyter_emotion_analysis.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger for this notebook's code.
logger = logging.getLogger(__name__)

# Configuration optimized for Jupyter
class JupyterConfig:
    """Central settings for the Goodreads cover emotion-analysis run.

    Every setting is a class attribute, so it can be read from the class or
    from an instance. Instantiating the class creates the results and
    checkpoint directories and prints a short summary.
    """

    ARTEMIS_PATH = r'data/artemis/artemis'  # Your artemis path from the notebook
    CHECKPOINT_FILE = r'data/artemis/predictions/best_model_good_data.pt'  # Your model path
    DATA_FILE = 'preprocessed_books_2025_04_20.parquet'  # Your data file
    RESULTS_DIR = 'goodreads_emotion_results'

    # GPU configuration (auto-detect). Without CUDA the list holds the single
    # placeholder entry 0 so that exactly one model is still loaded.
    GPU_IDS = list(range(torch.cuda.device_count())) if torch.cuda.is_available() else [0]
    BATCH_SIZE = 256  # Adjust based on your GPU memory

    # Async processing - optimized for Jupyter
    MAX_CONCURRENT_DOWNLOADS = 64  # Conservative for Jupyter
    MAX_CONCURRENT_PREPROCESSING = 32
    SEMAPHORE_LIMIT = 128
    DOWNLOAD_TIMEOUT = 10
    MAX_RETRIES = 3

    # Processing batches
    PROCESSING_BATCH_SIZE = 25000  # Large batches for efficiency
    CHECKPOINT_FREQUENCY = 10000   # Checkpoint every 10k books

    # Progress tracking (more frequent for Jupyter)
    PROGRESS_UPDATE_FREQUENCY = 500
    DETAILED_LOG_FREQUENCY = 2000

    # English language filtering (values are compared lower-cased and stripped)
    ENGLISH_CODES = {
        'en', 'eng', 'en-us', 'en-gb', 'en-ca', 'en-au', 'en-nz', 'en-za',
        'en-in', 'english', 'en_us', 'en_gb', 'en_ca', 'en_au'
    }

    # Emotion labels (matching your notebook)
    EMOTION_LABELS = ['amusement', 'anger', 'awe', 'contentment', 'disgust',
                     'excitement', 'fear', 'sadness', 'something else']

    def __init__(self):
        """Create the output directories and print a configuration summary."""
        os.makedirs(self.RESULTS_DIR, exist_ok=True)
        os.makedirs(f"{self.RESULTS_DIR}/checkpoints", exist_ok=True)

        # Report 0 GPUs whenever CUDA is unavailable. Indexing GPU_IDS[0] (as
        # the previous expression did) raised IndexError for an empty list.
        gpu_count = len(self.GPU_IDS) if torch.cuda.is_available() else 0

        print(f"✅ Configuration initialized")
        print(f"📊 Available GPUs: {gpu_count}")
        print(f"🔄 Max concurrent downloads: {self.MAX_CONCURRENT_DOWNLOADS}")

# Instantiating the config creates the results/checkpoint directories and
# prints a short summary (see JupyterConfig.__init__).
config = JupyterConfig()

# # Add artemis to path
# if config.ARTEMIS_PATH not in sys.path:
#     sys.path.append(config.ARTEMIS_PATH)
# NOTE(review): with the lines above commented out, ARTEMIS_PATH is never
# added to sys.path, so the imports below presumably rely on `artemis` being
# importable some other way (e.g. installed as a package) — confirm.

# Import artemis modules (matching your notebook)
try:
    from artemis.emotions import ARTEMIS_EMOTIONS
    from artemis.neural_models.mlp import MLP
    from artemis.neural_models.resnet_encoder import ResnetEncoder
    from artemis.neural_models.image_emotion_clf import ImageEmotionClassifier
    from artemis.in_out.neural_net_oriented import torch_load_model
    print("✅ Artemis modules imported successfully")
except ImportError as e:
    # The failure is only reported here; execution continues, so any later use
    # of these names (e.g. torch_load_model) would fail with NameError.
    print(f"❌ Error importing artemis modules: {e}")
    print("Please check your ARTEMIS_PATH in the config above")

def filter_english_books(books_df):
    """Return a copy of the rows whose ``language_code`` marks them as English.

    A row is kept when its ``language_code``, lower-cased and stripped of
    surrounding whitespace, is a member of ``config.ENGLISH_CODES``. Missing
    codes (NaN/None) are treated as non-English.

    Parameters:
    - books_df: DataFrame with a ``language_code`` column

    Returns:
    - A new DataFrame (copy) containing only the English rows. An empty input
      yields an empty result instead of raising ZeroDivisionError.
    """
    total_count = len(books_df)
    print(f"🔍 Filtering for English books from {total_count:,} total books")

    def is_english(lang_code):
        if pd.isna(lang_code):
            return False
        return str(lang_code).lower().strip() in config.ENGLISH_CODES

    # Apply on the raw values so the missing-value check above is effective,
    # and force a boolean dtype so that an empty column still produces a
    # valid boolean mask.
    english_mask = books_df['language_code'].apply(is_english).astype(bool)
    english_books = books_df[english_mask].copy()

    dropped_count = total_count - len(english_books)
    # Guard the percentage against an empty input frame.
    dropped_pct = dropped_count / total_count * 100 if total_count else 0.0
    print(f"✅ Filtered to {len(english_books):,} English books")
    print(f"📉 Dropped {dropped_count:,} non-English books ({dropped_pct:.1f}%)")

    return english_books

class JupyterProgressTracker:
    """Progress tracker with Jupyter-friendly output.

    Keeps running counts of successful and failed books, derives throughput
    from the time elapsed since construction, and formats a one-line status.
    Counter updates and status snapshots are guarded by a lock.
    """

    def __init__(self, total_books):
        """
        Parameters:
        - total_books: Number of books expected overall (may be 0)
        """
        self.total_books = total_books
        self.processed_books = 0
        self.successful_books = 0
        self.failed_books = 0
        self.start_time = time.time()
        self.lock = threading.Lock()

        # Performance metrics
        self.books_per_second = 0
        self.last_update_time = time.time()

    def update(self, successful=0, failed=0):
        """Add newly finished books to the counters and refresh throughput.

        Parameters:
        - successful: Number of books that produced a prediction
        - failed: Number of books that failed
        """
        with self.lock:
            self.successful_books += successful
            self.failed_books += failed
            self.processed_books = self.successful_books + self.failed_books

            # Throughput is averaged over the whole lifetime of the tracker
            elapsed = time.time() - self.start_time
            if elapsed > 0:
                self.books_per_second = self.processed_books / elapsed

    def should_log(self, frequency):
        """Return True when the processed count is a multiple of ``frequency``.

        A non-positive ``frequency`` disables logging (returns False) instead
        of raising ZeroDivisionError.
        """
        if frequency <= 0:
            return False
        return self.processed_books % frequency == 0

    def get_jupyter_status(self):
        """Get status formatted for Jupyter output.

        Returns:
        - dict with keys 'progress_percent', 'processed', 'total',
          'successful', 'failed', 'success_rate', 'books_per_second',
          'elapsed_hours' and 'eta_hours'
        """
        with self.lock:
            # max(1, ...) keeps both ratios defined when nothing is expected
            # or nothing has been processed yet.
            progress_pct = (self.processed_books / max(1, self.total_books)) * 100
            success_rate = (self.successful_books / max(1, self.processed_books)) * 100

            remaining = self.total_books - self.processed_books
            # Floor the rate at 0.1 books/sec so the ETA stays finite
            eta_seconds = remaining / max(0.1, self.books_per_second)
            eta_hours = eta_seconds / 3600

            elapsed_hours = (time.time() - self.start_time) / 3600

            status = {
                'progress_percent': progress_pct,
                'processed': self.processed_books,
                'total': self.total_books,
                'successful': self.successful_books,
                'failed': self.failed_books,
                'success_rate': success_rate,
                'books_per_second': self.books_per_second,
                'elapsed_hours': elapsed_hours,
                'eta_hours': eta_hours
            }

            return status

    def display_progress(self):
        """Display progress in Jupyter-friendly format"""
        status = self.get_jupyter_status()

        # One-line textual summary (a plain string, printed as-is)
        progress_bar = f"📊 Progress: {status['progress_percent']:.1f}% "
        progress_bar += f"({status['processed']:,}/{status['total']:,}) | "
        progress_bar += f"✅ Success: {status['success_rate']:.1f}% | "
        progress_bar += f"🚀 Speed: {status['books_per_second']:.1f} books/sec | "
        progress_bar += f"⏱️ ETA: {status['eta_hours']:.1f}h"

        print(progress_bar)

class JupyterCheckpointManager:
    """Checkpoint manager for Jupyter (compatible with existing checkpoints).

    A checkpoint is a pair of files inside ``checkpoint_dir``:
    ``progress_checkpoint.json`` (metadata and processed indices) and
    ``partial_results.pkl`` (the pickled results and failures).
    """

    def __init__(self, checkpoint_dir):
        """
        Parameters:
        - checkpoint_dir: Directory that holds the two checkpoint files
        """
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint_file = os.path.join(checkpoint_dir, 'progress_checkpoint.json')
        self.results_file = os.path.join(checkpoint_dir, 'partial_results.pkl')

    @staticmethod
    def _write_atomically(path, mode, write):
        """Write via a temporary sibling file, then rename it over ``path``.

        If writing fails, the temporary file is removed and ``path`` keeps
        its previous content (or stays absent).
        """
        tmp_path = f"{path}.tmp"
        try:
            with open(tmp_path, mode) as handle:
                write(handle)
            os.replace(tmp_path, path)
        finally:
            # Only still present when the write or the rename failed
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def save_checkpoint(self, processed_indices, results, failed_books, progress_tracker):
        """Save checkpoint compatible with previous format.

        Parameters:
        - processed_indices: Iterable of already-processed row indices
        - results: List of result records
        - failed_books: List of failure records
        - progress_tracker: Object exposing ``get_jupyter_status()``

        Raises whatever the underlying serialization or file operations raise.
        """
        status = progress_tracker.get_jupyter_status()

        checkpoint_data = {
            'processed_indices': list(processed_indices),
            'num_results': len(results),
            'num_failed': len(failed_books),
            'timestamp': datetime.now().isoformat(),
            'books_per_second': status['books_per_second'],
            'success_rate': status['success_rate'],
            'jupyter_version': True
        }

        os.makedirs(self.checkpoint_dir, exist_ok=True)

        # Save actual results first: the metadata file is only replaced once
        # the results it describes are safely on disk.
        self._write_atomically(
            self.results_file, 'wb',
            lambda f: pickle.dump({
                'results': results,
                'failed_books': failed_books
            }, f)
        )

        # Save checkpoint metadata
        self._write_atomically(
            self.checkpoint_file, 'w',
            lambda f: json.dump(checkpoint_data, f, indent=2)
        )

        print(f"💾 Checkpoint saved: {len(results):,} results, {len(failed_books):,} failed")

    def load_checkpoint(self):
        """Load checkpoint (compatible with previous format).

        Returns:
        - (processed_indices, results, failed_books) as (set, list, list);
          three empty containers when no checkpoint exists or it cannot be read
        """
        if os.path.exists(self.checkpoint_file) and os.path.exists(self.results_file):
            try:
                # Load metadata
                with open(self.checkpoint_file, 'r') as f:
                    checkpoint_data = json.load(f)

                # Load results
                # SECURITY: pickle.load executes arbitrary code from the file;
                # only load checkpoint files written by this notebook.
                with open(self.results_file, 'rb') as f:
                    saved_data = pickle.load(f)

                print(f"📂 Loaded checkpoint from {checkpoint_data['timestamp']}")
                print(f"📊 Previous progress: {checkpoint_data['num_results']:,} results, {checkpoint_data['num_failed']:,} failed")

                return (
                    set(checkpoint_data['processed_indices']),
                    saved_data['results'],
                    saved_data['failed_books']
                )
            except Exception as e:
                # Best effort: an unreadable checkpoint means starting over
                print(f"❌ Error loading checkpoint: {e}")
                return set(), [], []

        return set(), [], []

class JupyterModelManager:
    """Holds one loaded model per configured device and hands them out in turn."""

    def __init__(self, checkpoint_file, gpu_ids):
        """
        Parameters:
        - checkpoint_file: Path passed to torch_load_model for every device
        - gpu_ids: Device ids to load a model for
        """
        self.gpu_ids = gpu_ids
        self.models = {}
        self.current_gpu = 0
        self.load_models(checkpoint_file)

    def load_models(self, checkpoint_file):
        """Load one model per entry of ``self.gpu_ids``.

        Falls back to the CPU (stored under the key 'cpu') when the id is
        None or CUDA is unavailable. A failure for one device is printed and
        does not stop the remaining devices from loading.
        """
        for gpu_id in self.gpu_ids:
            try:
                if gpu_id is None or not torch.cuda.is_available():
                    device = torch.device("cpu")
                    gpu_id = "cpu"
                else:
                    device = torch.device(f"cuda:{gpu_id}")
                    torch.cuda.set_device(gpu_id)

                loaded = torch_load_model(checkpoint_file)
                loaded.to(device)
                loaded.eval()
                self.models[gpu_id] = (loaded, device)

                where = 'CPU' if gpu_id == 'cpu' else f'GPU {gpu_id}'
                print(f"🔧 Model loaded on {where}")

            except Exception as e:
                print(f"❌ Error loading model on GPU {gpu_id}: {e}")

    def get_model(self):
        """Return the next ``(model, device)`` pair in round-robin order.

        Returns ``(None, None)`` when no model was loaded.
        """
        if not self.models:
            return None, None

        keys = list(self.models)
        chosen = keys[self.current_gpu % len(keys)]
        self.current_gpu += 1
        return self.models[chosen]

    def predict_batch(self, images, model, device):
        """Run ``model`` on the stacked ``images`` and return exp(output).

        Returns a NumPy array, or None when ``images`` is empty or any step
        raises (the error is printed).
        """
        try:
            if not images:
                return None

            stacked = torch.stack(images).to(device, non_blocking=True)

            with torch.no_grad():
                if device.type == 'cuda':
                    torch.cuda.synchronize()

                outputs = model(stacked)
                # The model output is exponentiated before being returned
                probabilities = torch.exp(outputs).cpu().numpy()

            return probabilities

        except Exception as e:
            print(f"❌ Prediction error on {device}: {e}")
            return None

class JupyterImageProcessor:
    """Async image processor for Jupyter.

    Downloads cover images through a shared aiohttp session and converts
    them into normalized tensors.
    """

    # Columns that may hold a cover URL, in order of preference
    URL_COLUMNS = ('image_url_large', 'image_url')

    def __init__(self):
        self.session = None

    async def create_session(self):
        """Create aiohttp session"""
        timeout = aiohttp.ClientTimeout(total=config.DOWNLOAD_TIMEOUT)
        connector = aiohttp.TCPConnector(
            limit=config.MAX_CONCURRENT_DOWNLOADS,
            limit_per_host=30,
            keepalive_timeout=30
        )

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        self.session = aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
            headers=headers
        )

    async def close_session(self):
        """Close the session, if one is open, and forget it."""
        if self.session:
            await self.session.close()
            self.session = None

    @classmethod
    def _select_cover_url(cls, row):
        """Return the first usable cover URL of ``row``, or None.

        ``row`` only needs a ``get`` method (dict or pandas Series). A value
        is usable when it is a non-empty string that does not contain
        'nophoto'. Missing values such as NaN are skipped rather than being
        treated as a URL.
        """
        for column in cls.URL_COLUMNS:
            candidate = row.get(column)
            if isinstance(candidate, str) and candidate.strip() and 'nophoto' not in candidate:
                return candidate
        return None

    async def download_image(self, url):
        """Download a single image and return it as an RGB PIL image.

        Makes up to ``config.MAX_RETRIES`` attempts and backs off a little
        longer after every failed attempt, whether the attempt raised or got
        a non-200 response. Returns None when all attempts fail.
        """
        for attempt in range(config.MAX_RETRIES):
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        image_data = await response.read()
                        return Image.open(io.BytesIO(image_data)).convert('RGB')
            except Exception as e:
                # Best effort: network and decoding errors both count as a
                # failed attempt.
                logger.debug("Download attempt %d failed for %s: %s", attempt + 1, url, e)

            if attempt < config.MAX_RETRIES - 1:
                await asyncio.sleep(0.1 * (attempt + 1))

        return None

    def preprocess_image(self, image, img_dim=256):
        """Preprocess image (matching your notebook's preprocessing).

        The image is shrunk in place to fit ``img_dim`` x ``img_dim``,
        centered on a white square, scaled to [0, 1] and normalized with the
        mean/std constants below.

        Returns a float tensor with channels first, or None on any error.
        """
        try:
            # Resize maintaining aspect ratio
            image.thumbnail((img_dim, img_dim), Image.Resampling.LANCZOS)

            # Pad to square
            new_image = Image.new('RGB', (img_dim, img_dim), (255, 255, 255))
            x = (img_dim - image.width) // 2
            y = (img_dim - image.height) // 2
            new_image.paste(image, (x, y))

            # Convert to tensor and normalize (matching Artemis preprocessing)
            image_array = np.array(new_image).astype(np.float32) / 255.0
            image_tensor = torch.from_numpy(image_array).permute(2, 0, 1)

            # ImageNet normalization
            mean = torch.tensor([0.485, 0.456, 0.406]).reshape(3, 1, 1)
            std = torch.tensor([0.229, 0.224, 0.225]).reshape(3, 1, 1)
            return (image_tensor - mean) / std

        except Exception:
            return None

    async def process_batch_async(self, batch_df):
        """Download and preprocess the covers of every row in ``batch_df``.

        Returns:
        - (valid_images, valid_indices, valid_book_data): the preprocessed
          tensors, their positions within the list of attempted URLs, and
          the matching ``(index, row)`` pairs. Rows without a usable URL or
          whose image could not be fetched/preprocessed are left out.
        """
        # Extract URLs and book data
        urls = []
        book_data = []

        for idx, row in batch_df.iterrows():
            url = self._select_cover_url(row)
            if url is not None:
                urls.append(url)
                book_data.append((idx, row))

        if not urls:
            return [], [], []

        # Download all images concurrently
        download_tasks = [self.download_image(url) for url in urls]
        download_results = await asyncio.gather(*download_tasks, return_exceptions=True)

        # Process downloaded images
        valid_images = []
        valid_indices = []
        valid_book_data = []

        # Use thread pool for CPU-intensive preprocessing
        with ThreadPoolExecutor(max_workers=config.MAX_CONCURRENT_PREPROCESSING) as executor:
            # Failed downloads (None or an exception object) are skipped here
            preprocess_tasks = [
                (i, executor.submit(self.preprocess_image, result))
                for i, result in enumerate(download_results)
                if isinstance(result, Image.Image)
            ]

            # Collect preprocessing results
            for i, task in preprocess_tasks:
                try:
                    processed_image = task.result(timeout=5)
                except Exception:
                    # A timeout or worker error drops just this image
                    continue
                if processed_image is not None:
                    valid_images.append(processed_image)
                    valid_indices.append(i)
                    valid_book_data.append(book_data[i])

        return valid_images, valid_indices, valid_book_data

class JupyterEmotionPredictor:
    """Main predictor class for Jupyter.

    Ties together checkpointing, model inference and cover download to
    produce one emotion prediction per book, and writes the final result
    files.
    """

    def __init__(self):
        self.checkpoint_manager = JupyterCheckpointManager(f"{config.RESULTS_DIR}/checkpoints")
        self.model_manager = JupyterModelManager(config.CHECKPOINT_FILE, config.GPU_IDS)
        self.image_processor = JupyterImageProcessor()
        self.progress_tracker = None

    async def process_all_books_async(self, resume_from_checkpoint=True):
        """Main async processing function.

        Loads the data file, keeps English books, optionally resumes from a
        checkpoint, processes the remaining books in large batches and saves
        the final results.

        Parameters:
        - resume_from_checkpoint: Whether to continue from a saved checkpoint

        Returns:
        - DataFrame of predictions, or None when loading or processing fails.
          On a processing failure the progress made so far is checkpointed
          (best effort) so a later run can resume.
        """

        print("🚀 Starting Goodreads emotion analysis...")

        # Load and filter data
        print("📂 Loading Goodreads data...")
        try:
            books_df = pd.read_parquet(config.DATA_FILE)
            print(f"📊 Loaded {len(books_df):,} total books")

            # Filter for English books
            books_df = filter_english_books(books_df)

        except Exception as e:
            print(f"❌ Error loading data: {e}")
            return None

        # Initialize progress tracking
        self.progress_tracker = JupyterProgressTracker(len(books_df))

        # Load previous progress if resuming
        processed_indices = set()
        all_results = []
        all_failed = []

        if resume_from_checkpoint:
            processed_indices, all_results, all_failed = self.checkpoint_manager.load_checkpoint()
            if processed_indices:
                print(f"🔄 Resuming from checkpoint: {len(processed_indices):,} books already processed")
                # Seed the tracker with the restored counts.
                # NOTE(review): the tracker then appears to average these over
                # this session's elapsed time, inflating books/sec — confirm.
                self.progress_tracker.update(
                    successful=len(all_results),
                    failed=len(all_failed)
                )

        # Filter out already processed books
        remaining_books = books_df[~books_df.index.isin(processed_indices)].copy()
        print(f"📋 Processing {len(remaining_books):,} remaining English books")

        if len(remaining_books) == 0:
            print("✅ All books already processed!")
            return pd.DataFrame(all_results)

        # Create aiohttp session
        await self.image_processor.create_session()

        try:
            # Process in batches with progress bar
            total_batches = (len(remaining_books) + config.PROCESSING_BATCH_SIZE - 1) // config.PROCESSING_BATCH_SIZE

            print(f"🔄 Processing {total_batches} batches of up to {config.PROCESSING_BATCH_SIZE:,} books each")

            # Checkpoint every N batches (at least every batch)
            checkpoint_interval = max(1, config.CHECKPOINT_FREQUENCY // config.PROCESSING_BATCH_SIZE)

            # Create tqdm progress bar for batches
            batch_progress = tqdm(range(total_batches), desc="Processing batches")

            for batch_idx in batch_progress:
                start_idx = batch_idx * config.PROCESSING_BATCH_SIZE
                end_idx = min(start_idx + config.PROCESSING_BATCH_SIZE, len(remaining_books))
                batch_df = remaining_books.iloc[start_idx:end_idx]

                # Process this batch asynchronously
                batch_results, batch_failed = await self.process_batch_async(batch_df)

                # Update results
                all_results.extend(batch_results)
                all_failed.extend(batch_failed)

                # Update processed indices
                processed_indices.update(batch_df.index)

                # Update progress
                self.progress_tracker.update(
                    successful=len(batch_results),
                    failed=len(batch_failed)
                )

                # Update progress bar description
                status = self.progress_tracker.get_jupyter_status()
                batch_progress.set_postfix({
                    'Success Rate': f"{status['success_rate']:.1f}%",
                    'Speed': f"{status['books_per_second']:.1f} books/sec",
                    'ETA': f"{status['eta_hours']:.1f}h"
                })

                # Log detailed progress
                if self.progress_tracker.should_log(config.PROGRESS_UPDATE_FREQUENCY):
                    self.progress_tracker.display_progress()

                # Save checkpoint
                if (batch_idx + 1) % checkpoint_interval == 0:
                    self.checkpoint_manager.save_checkpoint(
                        processed_indices, all_results, all_failed, self.progress_tracker
                    )

        except Exception as e:
            print(f"❌ Error during processing: {e}")
            # Only fully completed batches are in these collections, so they
            # form a consistent state worth keeping for a later resume.
            self._save_checkpoint_quietly(processed_indices, all_results, all_failed)
            return None

        finally:
            await self.image_processor.close_session()

        # Save final results
        return self.save_final_results(all_results, all_failed)

    def _save_checkpoint_quietly(self, processed_indices, results, failed):
        """Try to save a checkpoint; report (do not raise) if that fails."""
        try:
            self.checkpoint_manager.save_checkpoint(
                processed_indices, results, failed, self.progress_tracker
            )
        except Exception as save_error:
            print(f"⚠️ Could not save checkpoint after error: {save_error}")

    @staticmethod
    def _failure_records(rows, reason):
        """Build one failure record per ``(index, row)`` pair in ``rows``."""
        return [{'book_id': row['book_id'], 'reason': reason} for _, row in rows]

    @staticmethod
    def _build_result(row, emotion_probs):
        """Combine a book's metadata with its emotion probabilities."""
        top_emotion_idx = np.argmax(emotion_probs)

        result = {
            'book_id': row['book_id'],
            'title': row['title'],
            'authors': row['authors'],
            'average_rating': row.get('average_rating'),
            'ratings_count': row.get('ratings_count'),
            'publication_year': row.get('publication_year'),
            'language_code': row.get('language_code'),
            'popular_shelves': row.get('popular_shelves'),
            'predicted_emotion': config.EMOTION_LABELS[top_emotion_idx],
            'confidence': emotion_probs[top_emotion_idx],
            'emotion_probs': emotion_probs.tolist()
        }

        # Add individual emotion probabilities
        for j, emotion in enumerate(config.EMOTION_LABELS):
            result[f'prob_{emotion.replace(" ", "_")}'] = emotion_probs[j]

        return result

    async def process_batch_async(self, batch_df):
        """Process a single batch asynchronously.

        Every row of ``batch_df`` ends up either in the results or in the
        failures, and failures carry the real ``book_id`` and a reason.

        Returns:
        - (batch_results, batch_failed): lists of result and failure dicts
        """
        batch_results = []
        batch_failed = []

        # Process in GPU-sized sub-batches
        for start in range(0, len(batch_df), config.BATCH_SIZE):
            sub_batch = batch_df.iloc[start:start + config.BATCH_SIZE]

            # Download and preprocess images asynchronously
            images, _valid_positions, valid_book_data = \
                await self.image_processor.process_batch_async(sub_batch)

            if not images:
                # All failed in this sub-batch
                batch_failed.extend(
                    self._failure_records(sub_batch.iterrows(), 'image_processing_failed')
                )
                continue

            # Rows whose image never made it through download/preprocessing.
            # Matching is by index label, which assumes labels are unique.
            ready = {idx for idx, _ in valid_book_data}
            batch_failed.extend(self._failure_records(
                ((idx, row) for idx, row in sub_batch.iterrows() if idx not in ready),
                'download_or_preprocessing_failed'
            ))

            # Get model and predict
            model, device = self.model_manager.get_model()
            if model is None:
                print("❌ No model available")
                batch_failed.extend(
                    self._failure_records(valid_book_data, 'no_model_available')
                )
                continue

            predictions = self.model_manager.predict_batch(images, model, device)
            if predictions is None:
                # Do not lose the books silently when inference fails
                batch_failed.extend(
                    self._failure_records(valid_book_data, 'prediction_failed')
                )
                continue

            # Process successful predictions
            for pred_idx, (_, row) in enumerate(valid_book_data):
                batch_results.append(self._build_result(row, predictions[pred_idx]))

        return batch_results, batch_failed

    def save_final_results(self, results, failed):
        """Save final results.

        Writes the predictions (parquet) and failures (CSV) into
        ``config.RESULTS_DIR``, removes the checkpoint files and prints a
        summary.

        Returns:
        - DataFrame built from ``results``
        """
        results_df = pd.DataFrame(results)
        failed_df = pd.DataFrame(failed)

        # Save files
        results_file = f"{config.RESULTS_DIR}/goodreads_emotion_predictions_english.parquet"
        failed_file = f"{config.RESULTS_DIR}/failed_books_english.csv"

        results_df.to_parquet(results_file, index=False)
        failed_df.to_csv(failed_file, index=False)

        # Clear checkpoint files (best effort: the results are already saved)
        for stale_path in (self.checkpoint_manager.checkpoint_file,
                           self.checkpoint_manager.results_file):
            try:
                if os.path.exists(stale_path):
                    os.remove(stale_path)
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove checkpoint file {stale_path}: {cleanup_error}")

        # Final statistics
        total_processed = len(results) + len(failed)
        success_rate = len(results) / total_processed * 100 if total_processed > 0 else 0

        print("\n" + "="*60)
        print("🎉 PROCESSING COMPLETED!")
        print("="*60)
        print(f"📊 Total processed: {total_processed:,}")
        print(f"✅ Successful: {len(results):,} ({success_rate:.2f}%)")
        print(f"❌ Failed: {len(failed):,}")
        print(f"💾 Results saved to: {results_file}")
        print("="*60)

        # Quick emotion analysis
        if len(results) > 0:
            emotion_counts = results_df['predicted_emotion'].value_counts()
            print("\n📈 Top predicted emotions:")
            for emotion, count in emotion_counts.head().items():
                pct = count / len(results_df) * 100
                print(f"  {emotion}: {count:,} ({pct:.1f}%)")

        return results_df

# Main execution function for Jupyter
async def run_emotion_analysis(resume_from_checkpoint=True):
    """
    Entry coroutine for the emotion analysis in Jupyter.

    Prints a banner, verifies that the model and data files named in the
    config exist, then builds a predictor and runs it.

    Parameters:
    - resume_from_checkpoint: Whether to resume from existing checkpoint (default: True)

    Returns:
    - The predictor's result, or None when a required file is missing
    """

    banner = (
        "🎯 Goodreads Book Cover Emotion Analysis",
        "📚 Processing English books only",
        "🔄 Using async optimization for speed",
        "-" * 50,
    )
    for banner_line in banner:
        print(banner_line)

    # Both input files must exist before any model is loaded
    model_path = config.CHECKPOINT_FILE
    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        print("Please update CHECKPOINT_FILE in the config above")
        return None

    data_path = config.DATA_FILE
    if not os.path.exists(data_path):
        print(f"❌ Data file not found: {data_path}")
        print("Please update DATA_FILE in the config above")
        return None

    # Build the predictor and hand over control
    return await JupyterEmotionPredictor().process_all_books_async(resume_from_checkpoint)

# Convenience function for one-line execution
def analyze_goodreads_emotions(resume_from_checkpoint=True):
    """
    Synchronous wrapper that runs the whole analysis to completion.

    Usage:
    results = analyze_goodreads_emotions()
    """
    analysis = run_emotion_analysis(resume_from_checkpoint)
    return asyncio.run(analysis)

# Display configuration
# Emit the whole summary with a single print call (one line per entry)
print("\n".join([
    "📋 CONFIGURATION SUMMARY",
    "-" * 40,
    f"📁 Artemis path: {config.ARTEMIS_PATH}",
    f"🤖 Model file: {config.CHECKPOINT_FILE}",
    f"📊 Data file: {config.DATA_FILE}",
    f"💾 Results directory: {config.RESULTS_DIR}",
    f"🖥️ GPUs available: {len(config.GPU_IDS) if config.GPU_IDS[0] != 0 or torch.cuda.is_available() else 0}",
    f"🔄 Max concurrent downloads: {config.MAX_CONCURRENT_DOWNLOADS}",
    f"📦 Batch size: {config.BATCH_SIZE}",
    "-" * 40,
]))
# Kick off the full pipeline; an existing checkpoint is picked up automatically
results = analyze_goodreads_emotions()

✅ Configuration initialized
📊 Available GPUs: 4
🔄 Max concurrent downloads: 64
✅ Artemis modules imported successfully
📋 CONFIGURATION SUMMARY
----------------------------------------
📁 Artemis path: data/artemis/artemis
🤖 Model file: data/artemis/predictions/best_model_good_data.pt
📊 Data file: preprocessed_books_2025_04_20.parquet
💾 Results directory: goodreads_emotion_results
🖥️ GPUs available: 4
🔄 Max concurrent downloads: 64
📦 Batch size: 256
----------------------------------------
🎯 Goodreads Book Cover Emotion Analysis
📚 Processing English books only
🔄 Using async optimization for speed
--------------------------------------------------
🔧 Model loaded on GPU 0
🔧 Model loaded on GPU 1
🔧 Model loaded on GPU 2
🔧 Model loaded on GPU 3
🚀 Starting Goodreads emotion analysis...
📂 Loading Goodreads data...
📊 Loaded 931,229 total books
🔍 Filtering for English books from 931,229 total books
✅ Filtered to 687,029 English books
📉 Dropped 244,200 non-English books (26.2%)
📂 Loaded checkpoin

Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]

📊 Progress: 98.2% (675,000/687,029) | ✅ Success: 100.0% | 🚀 Speed: 2576.2 books/sec | ⏱️ ETA: 0.0h
💾 Checkpoint saved: 674,962 results, 38 failed
💾 Checkpoint saved: 686,990 results, 39 failed

🎉 PROCESSING COMPLETED!
📊 Total processed: 687,029
✅ Successful: 686,990 (99.99%)
❌ Failed: 39
💾 Results saved to: goodreads_emotion_results/goodreads_emotion_predictions_english.parquet

📈 Top predicted emotions:
  awe: 203,032 (29.6%)
  amusement: 179,719 (26.2%)
  fear: 172,886 (25.2%)
  contentment: 44,676 (6.5%)
  something else: 34,407 (5.0%)


In [4]:
# ResNet Emotion Analysis Visualization Functions
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import warnings
import os
from datetime import datetime
warnings.filterwarnings('ignore')

def _save_current_figure(save_dir, filename):
    """
    Apply a tight layout to the current pyplot figure, save it and close it

    Parameters:
    - save_dir: Directory the image is written to
    - filename: Name of the image file inside save_dir
    """
    path = f'{save_dir}/{filename}'
    plt.tight_layout()
    plt.savefig(path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Saved: {path}")


def create_resnet_visualizations(results_df, save_dir=None):
    """
    Create comprehensive visualizations for ResNet emotion analysis results
    Each plot is saved as a separate image file

    Parameters:
    - results_df: DataFrame with ResNet emotion predictions from image analysis.
      The 'predicted_emotion', 'confidence' and 'title' columns are read
      unconditionally; 'average_rating', 'publication_year' and any 'prob_*'
      columns are optional and only enable additional plots.
    - save_dir: Directory to save plots (optional, defaults to
      'resnet_visualizations'; created if missing)

    Returns:
    - The directory the plots were written to, or None when results_df is
      None or empty (nothing is written in that case).
    """

    if results_df is None or len(results_df) == 0:
        print("❌ No results to visualize")
        return

    if save_dir is None:
        save_dir = 'resnet_visualizations'

    os.makedirs(save_dir, exist_ok=True)

    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("husl")

    print("🎨 Creating ResNet emotion analysis visualizations...")

    # Aggregates shared by several plots
    total_books = len(results_df)
    confidence = results_df['confidence']
    mean_confidence = confidence.mean()
    median_confidence = confidence.median()

    # 1. Emotion Distribution Bar Chart
    plt.figure(figsize=(12, 8))
    emotion_counts = results_df['predicted_emotion'].value_counts()
    bars = plt.bar(range(len(emotion_counts)), emotion_counts.values,
                   alpha=0.8, color='steelblue', edgecolor='black')
    plt.xticks(range(len(emotion_counts)), emotion_counts.index, rotation=45, ha='right')
    plt.title('Emotion Distribution in Goodreads Book Covers\n(ResNet Image Analysis)',
              fontsize=14, pad=20)
    plt.ylabel('Number of Books', fontsize=12)
    plt.xlabel('Predicted Emotion', fontsize=12)
    plt.grid(axis='y', alpha=0.3)

    # Add value labels on bars (offset is 1% of the tallest bar)
    label_offset = emotion_counts.max() * 0.01
    for bar, value in zip(bars, emotion_counts.values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                f'{value:,}\n({value/total_books*100:.1f}%)',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

    _save_current_figure(save_dir, '01_emotion_distribution.png')

    # 2. Confidence Distribution Histogram
    plt.figure(figsize=(10, 6))
    plt.hist(confidence, bins=50, alpha=0.7, color='skyblue',
             edgecolor='black', density=True)
    plt.axvline(mean_confidence, color='red', linestyle='--', linewidth=2,
                label=f'Mean: {mean_confidence:.3f}')
    plt.axvline(median_confidence, color='orange', linestyle='--', linewidth=2,
                label=f'Median: {median_confidence:.3f}')
    plt.title('Prediction Confidence Distribution (ResNet)', fontsize=14, pad=20)
    plt.xlabel('Confidence Score', fontsize=12)
    plt.ylabel('Density', fontsize=12)
    plt.legend(fontsize=11)
    plt.grid(alpha=0.3)

    # Add statistics text box
    stats_text = f'Mean: {mean_confidence:.3f}\n'
    stats_text += f'Std: {confidence.std():.3f}\n'
    stats_text += f'Min: {confidence.min():.3f}\n'
    stats_text += f'Max: {confidence.max():.3f}'
    plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes,
             bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
             verticalalignment='top', fontsize=10)

    _save_current_figure(save_dir, '02_confidence_distribution.png')

    # 3. Average Confidence by Emotion
    plt.figure(figsize=(12, 8))
    emotion_confidence = (results_df.groupby('predicted_emotion')['confidence']
                          .agg(['mean', 'std'])
                          .sort_values('mean', ascending=False))
    # std is NaN for an emotion with a single book; use 0 so its error bar
    # and value label are still positioned
    confidence_std = emotion_confidence['std'].fillna(0).values

    x_pos = range(len(emotion_confidence))
    bars = plt.bar(x_pos, emotion_confidence['mean'].values,
                   yerr=confidence_std,
                   alpha=0.8, capsize=5, color='lightcoral', edgecolor='black')

    plt.xticks(x_pos, emotion_confidence.index, rotation=45, ha='right')
    plt.title('Average Prediction Confidence by Emotion (ResNet)', fontsize=14, pad=20)
    plt.ylabel('Average Confidence Score', fontsize=12)
    plt.xlabel('Predicted Emotion', fontsize=12)
    plt.grid(axis='y', alpha=0.3)

    # Add value labels just above each error bar
    for bar, mean_val, std_val in zip(bars, emotion_confidence['mean'].values, confidence_std):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + std_val + 0.01,
                f'{mean_val:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

    _save_current_figure(save_dir, '03_confidence_by_emotion.png')

    # 4. High Confidence Predictions Analysis
    plt.figure(figsize=(12, 8))
    confidence_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
    threshold_counts = [(confidence >= threshold).sum() for threshold in confidence_thresholds]
    coverage_data = [(count / total_books) * 100 for count in threshold_counts]

    bars = plt.bar(range(len(confidence_thresholds)), coverage_data,
                   alpha=0.8, color='gold', edgecolor='black')
    plt.xticks(range(len(confidence_thresholds)),
               [f'≥{t}' for t in confidence_thresholds])
    plt.title('Coverage at Different Confidence Thresholds (ResNet)', fontsize=14, pad=20)
    plt.ylabel('Coverage (%)', fontsize=12)
    plt.xlabel('Confidence Threshold', fontsize=12)
    plt.grid(axis='y', alpha=0.3)

    # Add value labels
    for bar, value, count in zip(bars, coverage_data, threshold_counts):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{value:.1f}%\n({count:,})', ha='center', va='bottom',
                fontsize=10, fontweight='bold')

    _save_current_figure(save_dir, '04_confidence_thresholds.png')

    # 5. Book Ratings vs Predicted Emotions
    if 'average_rating' in results_df.columns:
        plt.figure(figsize=(12, 8))

        # Remove books without ratings
        rated_books = results_df.dropna(subset=['average_rating'])

        if len(rated_books) > 0:
            emotion_ratings = rated_books.groupby('predicted_emotion')['average_rating'].agg(['mean', 'std', 'count'])
            emotion_ratings = emotion_ratings.sort_values('mean', ascending=False)

            x_pos = range(len(emotion_ratings))
            bars = plt.bar(x_pos, emotion_ratings['mean'].values,
                          yerr=emotion_ratings['std'].fillna(0).values,
                          alpha=0.8, capsize=5, color='lightgreen', edgecolor='black')

            plt.xticks(x_pos, emotion_ratings.index, rotation=45, ha='right')
            plt.title('Average Book Rating by Predicted Emotion (ResNet)', fontsize=14, pad=20)
            plt.ylabel('Average Rating', fontsize=12)
            plt.xlabel('Predicted Emotion', fontsize=12)
            plt.grid(axis='y', alpha=0.3)
            plt.ylim(0, 5)

            # Add value labels and count
            for bar, mean_val, count in zip(bars, emotion_ratings['mean'].values, emotion_ratings['count'].values):
                plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                        f'{mean_val:.2f}\n(n={count:,})', ha='center', va='bottom',
                        fontsize=9, fontweight='bold')
        else:
            plt.text(0.5, 0.5, 'No rating data available', ha='center', va='center',
                    transform=plt.gca().transAxes, fontsize=14)

        _save_current_figure(save_dir, '05_ratings_by_emotion.png')

    # 6. Publication Year vs Emotions (if available)
    if 'publication_year' in results_df.columns:
        fig, ax = plt.subplots(figsize=(14, 8))

        # Unparseable years become NaN and fail both comparisons below, so no
        # NaN ever reaches the integer cast. "< 2025" keeps fractional values
        # that truncate to 2024.
        years = pd.to_numeric(results_df['publication_year'], errors='coerce')
        in_range = (years >= 1800) & (years < 2025)

        # Build a fresh frame instead of writing into a slice of results_df
        year_data = pd.DataFrame({
            'decade': ((years[in_range] // 10) * 10).astype(int).to_numpy(),
            'predicted_emotion': results_df.loc[in_range, 'predicted_emotion'].to_numpy(),
        })

        if len(year_data) > 0:
            decade_emotions = year_data.groupby(['decade', 'predicted_emotion']).size().unstack(fill_value=0)

            # Convert to percentages
            decade_emotions_pct = decade_emotions.div(decade_emotions.sum(axis=1), axis=0) * 100

            # Plot stacked bar chart on the axes created above; without ax=
            # pandas opens a second figure and the first one is never closed
            decade_emotions_pct.plot(kind='bar', stacked=True, ax=ax,
                                   colormap='tab10', alpha=0.8)
            plt.title('Emotion Distribution by Publication Decade (ResNet)', fontsize=14, pad=20)
            plt.xlabel('Publication Decade', fontsize=12)
            plt.ylabel('Percentage of Books', fontsize=12)
            plt.legend(title='Predicted Emotion', bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.xticks(rotation=45)
            plt.grid(axis='y', alpha=0.3)
        else:
            plt.text(0.5, 0.5, 'No publication year data available', ha='center', va='center',
                    transform=ax.transAxes, fontsize=14)

        _save_current_figure(save_dir, '06_emotions_by_decade.png')

    # 7. Emotion Probability Heatmap
    plt.figure(figsize=(12, 10))

    # Get probability columns
    prob_columns = [col for col in results_df.columns if col.startswith('prob_')]

    if prob_columns:
        # Sample for visualization (to avoid overcrowding)
        sample_size = min(100, total_books)
        sample_df = results_df.sample(sample_size, random_state=42)

        # Create probability matrix (one row per emotion, one column per book)
        prob_matrix = sample_df[prob_columns].values.T
        emotion_names = [col.replace('prob_', '').replace('_', ' ') for col in prob_columns]

        # Create heatmap
        im = plt.imshow(prob_matrix, cmap='YlOrRd', aspect='auto', vmin=0, vmax=1)
        plt.colorbar(im, shrink=0.8, label='Probability')
        plt.yticks(range(len(emotion_names)), emotion_names)
        plt.xlabel(f'Book Samples (Random {sample_size} books)', fontsize=12)
        plt.ylabel('Emotions', fontsize=12)
        plt.title('Emotion Probability Heatmap (ResNet)', fontsize=14, pad=20)

        # Add grid
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No probability data available', ha='center', va='center',
                transform=plt.gca().transAxes, fontsize=14)

    _save_current_figure(save_dir, '07_emotion_probability_heatmap.png')

    # 8. Confidence vs Rating Scatter Plot
    if 'average_rating' in results_df.columns:
        plt.figure(figsize=(10, 8))

        # Both coordinates are needed for the scatter, fit and correlation
        rated_confident = results_df.dropna(subset=['average_rating', 'confidence'])

        if len(rated_confident) > 0:
            x_conf = rated_confident['confidence']
            y_rating = rated_confident['average_rating']

            # Create scatter plot with alpha for overlapping points
            scatter = plt.scatter(x_conf, y_rating,
                                alpha=0.6, s=20, c=x_conf,
                                cmap='viridis', edgecolors='black', linewidth=0.5)

            # Add trend line; a linear fit needs at least two distinct x values
            if x_conf.nunique() > 1:
                trend = np.poly1d(np.polyfit(x_conf, y_rating, 1))
                # A straight line is fully determined by its two end points
                x_ends = np.array([x_conf.min(), x_conf.max()])
                plt.plot(x_ends, trend(x_ends),
                        "r--", alpha=0.8, linewidth=2, label='Trend line')
                plt.legend()

            plt.colorbar(scatter, label='Confidence Score')
            plt.xlabel('Prediction Confidence', fontsize=12)
            plt.ylabel('Average Book Rating', fontsize=12)
            plt.title('Prediction Confidence vs Book Rating (ResNet)', fontsize=14, pad=20)
            plt.grid(alpha=0.3)

            # Calculate correlation
            correlation = x_conf.corr(y_rating)
            plt.text(0.05, 0.95, f'Correlation: {correlation:.3f}',
                    transform=plt.gca().transAxes,
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
                    fontsize=12, fontweight='bold')
        else:
            plt.text(0.5, 0.5, 'No rating data available', ha='center', va='center',
                    transform=plt.gca().transAxes, fontsize=14)

        _save_current_figure(save_dir, '08_confidence_vs_rating.png')

    # 9. Top Confident Predictions by Emotion
    plt.figure(figsize=(14, 10))

    # Get most confident prediction for each emotion (rows without a
    # confidence value cannot be ranked and are left out)
    scored = results_df.dropna(subset=['confidence'])
    top_confident_by_emotion = scored.loc[scored.groupby('predicted_emotion')['confidence'].idxmax()]

    if len(top_confident_by_emotion) > 0:
        emotions = top_confident_by_emotion['predicted_emotion'].values
        confidences = top_confident_by_emotion['confidence'].values

        bars = plt.bar(range(len(emotions)), confidences,
                      alpha=0.8, color='purple', edgecolor='black')
        plt.xticks(range(len(emotions)), emotions, rotation=45, ha='right')
        plt.title('Highest Confidence Prediction per Emotion (ResNet)', fontsize=14, pad=20)
        plt.ylabel('Confidence Score', fontsize=12)
        plt.xlabel('Predicted Emotion', fontsize=12)
        plt.grid(axis='y', alpha=0.3)
        plt.ylim(0, 1)

        # Add value labels and book titles
        for bar, conf, title in zip(bars, confidences, top_confident_by_emotion['title']):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                    f'{conf:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')
            # Add truncated title below bar; convert first so a non-string
            # title (e.g. NaN) cannot break the slicing
            title_text = str(title)
            truncated_title = title_text[:20] + '...' if len(title_text) > 20 else title_text
            plt.text(bar.get_x() + bar.get_width()/2, -0.05, truncated_title,
                    ha='center', va='top', fontsize=8, rotation=45)
    else:
        plt.text(0.5, 0.5, 'No data available', ha='center', va='center',
                transform=plt.gca().transAxes, fontsize=14)

    _save_current_figure(save_dir, '09_top_confident_predictions.png')

    # 10. Summary Statistics Visualization
    create_resnet_summary_plot(results_df, save_dir)

    print(f"\n🎉 All ResNet visualizations saved to: {save_dir}")
    return save_dir

def create_resnet_summary_plot(results_df, save_dir):
    """
    Create a 2x2 summary dashboard and save it as 10_summary_dashboard.png

    The panels are: a statistics table, a pie chart of the six most frequent
    emotions (the rest grouped as 'Others'), the confidence quartiles, and a
    text panel with processing information.

    Parameters:
    - results_df: DataFrame with 'predicted_emotion' and 'confidence' columns;
      'average_rating' is optional and adds two rows to the statistics table
    - save_dir: Existing directory the image is written to

    Returns:
    - None. When results_df is None or empty a message is printed and no
      file is written.
    """

    if results_df is None or len(results_df) == 0:
        print("❌ No results to visualize")
        return

    # Aggregates reused across the panels
    total_books = len(results_df)
    confidence = results_df['confidence']
    emotion_counts = results_df['predicted_emotion'].value_counts()
    most_common_emotion = results_df['predicted_emotion'].mode().iloc[0]
    unique_emotions = results_df['predicted_emotion'].nunique()
    high_confidence = confidence > 0.8

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

    # Summary statistics table
    ax1.axis('off')

    # Calculate summary statistics
    stats = {
        'Total Books Analyzed': f"{total_books:,}",
        'Mean Confidence': f"{confidence.mean():.3f}",
        'Median Confidence': f"{confidence.median():.3f}",
        'Std Confidence': f"{confidence.std():.3f}",
        'High Confidence (>0.8)': f"{high_confidence.sum():,} ({high_confidence.mean()*100:.1f}%)",
        'Most Common Emotion': f"{most_common_emotion}",
        'Least Common Emotion': f"{emotion_counts.index[-1]}",
        'Unique Emotions Found': f"{unique_emotions}"
    }

    if 'average_rating' in results_df.columns:
        rated_books = results_df.dropna(subset=['average_rating'])
        if len(rated_books) > 0:
            stats['Books with Ratings'] = f"{len(rated_books):,} ({len(rated_books)/total_books*100:.1f}%)"
            stats['Avg Book Rating'] = f"{rated_books['average_rating'].mean():.2f}"

    # Create table
    table_data = [[key, value] for key, value in stats.items()]
    table = ax1.table(cellText=table_data, colLabels=['Metric', 'Value'],
                     cellLoc='left', loc='center', bbox=[0, 0, 1, 1])
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1.2, 2)

    # Style the table (row 0 is the header row added by colLabels)
    for i in range(len(stats) + 1):
        table[(i, 0)].set_facecolor('#E8E8E8')
        table[(i, 1)].set_facecolor('#F5F5F5')
        if i == 0:  # Header
            table[(i, 0)].set_facecolor('#D0D0D0')
            table[(i, 1)].set_facecolor('#D0D0D0')

    ax1.set_title('ResNet Analysis Summary Statistics', fontsize=14, pad=20, fontweight='bold')

    # Emotion distribution pie chart
    top_emotions = emotion_counts.head(6)  # Top 6 emotions
    if len(emotion_counts) > 6:
        # Build a new Series rather than enlarging the head() result in place
        others = pd.Series({'Others': emotion_counts.iloc[6:].sum()})
        top_emotions = pd.concat([top_emotions, others])

    colors = plt.cm.Set3(np.linspace(0, 1, len(top_emotions)))
    wedges, texts, autotexts = ax2.pie(top_emotions.values, labels=top_emotions.index,
                                      autopct='%1.1f%%', colors=colors, startangle=90)
    ax2.set_title('Top Emotions Distribution', fontsize=14, pad=20, fontweight='bold')

    # Enhance pie chart text
    for autotext in autotexts:
        autotext.set_color('black')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(10)

    # Confidence quartiles (the 1.0 quantile is the maximum)
    quantile_levels = [0.25, 0.5, 0.75, 1.0]
    quartiles = confidence.quantile(quantile_levels)
    quartile_labels = ['Q1 (25%)', 'Q2 (50%)', 'Q3 (75%)', 'Q4 (100%)']
    quartile_values = [quartiles[level] for level in quantile_levels]

    bars = ax3.bar(quartile_labels, quartile_values, alpha=0.8, color='lightblue', edgecolor='black')
    ax3.set_title('Confidence Score Quartiles', fontsize=14, pad=20, fontweight='bold')
    ax3.set_ylabel('Confidence Score', fontsize=12)
    ax3.grid(axis='y', alpha=0.3)

    for bar, value in zip(bars, quartile_values):
        ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{value:.3f}', ha='center', va='bottom', fontsize=11, fontweight='bold')

    # Processing info (if available)
    ax4.axis('off')

    # nlargest picks the first row with the highest confidence and always
    # yields a single value, even when index labels are duplicated
    most_confident_emotion = results_df.nlargest(1, 'confidence')['predicted_emotion'].iloc[0]

    processing_info = [
        f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        "Model Type: ResNet Image Analysis",
        f"Total Books: {total_books:,}",
        "Success Rate: 100%",  # Assuming all in results_df were successful
        "",
        "Key Findings:",
        f"• Most confident emotion: {most_confident_emotion}",
        f"• Highest confidence: {confidence.max():.3f}",
        f"• Most common emotion: {most_common_emotion}",
        # NOTE(review): the denominator 9 is hard-coded — confirm it matches the model's label set
        f"• Emotion diversity: {unique_emotions}/9 emotions found"
    ]

    for i, info in enumerate(processing_info):
        weight = 'bold' if info.startswith('•') or info.endswith(':') else 'normal'
        size = 12 if info.startswith('Analysis') or info.startswith('Key') else 11
        ax4.text(0.05, 0.95 - i*0.08, info, transform=ax4.transAxes,
                fontsize=size, fontweight=weight, verticalalignment='top')

    ax4.set_title('Processing Information', fontsize=14, pad=20, fontweight='bold')

    plt.tight_layout()
    plt.savefig(f'{save_dir}/10_summary_dashboard.png', dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Saved: {save_dir}/10_summary_dashboard.png")

def print_resnet_summary(results_df):
    """
    Print a detailed text summary of the ResNet emotion analysis

    Parameters:
    - results_df: DataFrame with 'title', 'predicted_emotion' and 'confidence'
      columns; 'average_rating' is optional and enables the rating section

    Returns:
    - None. When results_df is None or empty only a short notice is printed.
    """

    if results_df is None or len(results_df) == 0:
        print("❌ No results to summarize")
        return

    total_books = len(results_df)
    confidence = results_df['confidence']

    print("="*60)
    print("🖼️ RESNET IMAGE EMOTION ANALYSIS SUMMARY")
    print("="*60)

    # Basic statistics
    print(f"📚 Total books analyzed: {total_books:,}")
    print(f"🎯 Mean confidence: {confidence.mean():.3f}")
    print(f"📊 Median confidence: {confidence.median():.3f}")
    print(f"📈 Std confidence: {confidence.std():.3f}")
    print(f"🔍 Min confidence: {confidence.min():.3f}")
    print(f"🔍 Max confidence: {confidence.max():.3f}")

    # Emotion distribution
    print("\n🎭 EMOTION DISTRIBUTION:")
    emotion_counts = results_df['predicted_emotion'].value_counts()
    for emotion, count in emotion_counts.items():
        pct = count / total_books * 100
        print(f"  {emotion:15}: {count:6,} ({pct:5.1f}%)")

    # Confidence analysis
    print("\n🎯 CONFIDENCE ANALYSIS:")
    conf_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
    for threshold in conf_thresholds:
        high_conf = (confidence >= threshold).sum()
        pct = high_conf / total_books * 100
        print(f"  Confidence ≥ {threshold}: {high_conf:6,} ({pct:5.1f}%)")

    # Rating analysis (if available)
    if 'average_rating' in results_df.columns:
        rated_books = results_df.dropna(subset=['average_rating'])
        if len(rated_books) > 0:
            print("\n⭐ RATING ANALYSIS:")
            print(f"  Books with ratings: {len(rated_books):,} ({len(rated_books)/total_books*100:.1f}%)")
            print(f"  Average rating: {rated_books['average_rating'].mean():.2f}")
            print(f"  Rating std: {rated_books['average_rating'].std():.2f}")

            # Top rated emotions
            emotion_ratings = rated_books.groupby('predicted_emotion')['average_rating'].mean().sort_values(ascending=False)
            print(f"  Highest rated emotion: {emotion_ratings.index[0]} ({emotion_ratings.iloc[0]:.2f})")
            print(f"  Lowest rated emotion: {emotion_ratings.index[-1]} ({emotion_ratings.iloc[-1]:.2f})")

    # Most confident predictions
    print("\n🏆 MOST CONFIDENT PREDICTIONS:")
    top_confident = results_df.nlargest(5, 'confidence')[['title', 'predicted_emotion', 'confidence']]
    for _, row in top_confident.iterrows():
        # Convert before slicing so a non-string title cannot raise
        title_text = str(row['title'])
        title_short = title_text[:40] + '...' if len(title_text) > 40 else title_text
        print(f"  {row['predicted_emotion']:12} ({row['confidence']:.3f}): {title_short}")

    # Emotion confidence rankings
    print("\n📊 AVERAGE CONFIDENCE BY EMOTION:")
    emotion_confidence = results_df.groupby('predicted_emotion')['confidence'].mean().sort_values(ascending=False)
    for emotion, conf in emotion_confidence.items():
        # Reuse the per-emotion counts instead of rescanning the frame
        count = emotion_counts[emotion]
        print(f"  {emotion:15}: {conf:.3f} (n={count:,})")

    print("="*60)

def analyze_resnet_results(results_df, save_dir=None):
    """
    Run complete analysis and visualization of ResNet results

    Prints the text summary, writes the plots and reports how many PNG files
    the output directory contains.

    Parameters:
    - results_df: DataFrame with ResNet emotion predictions
    - save_dir: Directory to save plots (optional, defaults to
      'resnet_visualizations')

    Returns:
    - results_df, unchanged. When it is None or empty nothing is summarized
      or plotted.

    Usage after running ResNet analysis:
    analyze_resnet_results(results, 'resnet_visualizations')
    """

    # Without this guard the visualization step returns None for empty input
    # and os.listdir(None) below would silently count PNGs in the current
    # working directory
    if results_df is None or len(results_df) == 0:
        print("❌ No results to analyze")
        return results_df

    if save_dir is None:
        save_dir = 'resnet_visualizations'

    print("🎨 Creating comprehensive ResNet emotion analysis visualizations...")
    print("📊 Each plot will be saved as a separate image file...")

    # Print text summary
    print_resnet_summary(results_df)

    # Create visualizations
    viz_dir = create_resnet_visualizations(results_df, save_dir)

    # Counts every PNG in the directory, including files left by earlier runs
    png_count = sum(1 for name in os.listdir(viz_dir) if name.endswith('.png'))

    print(f"\n✅ Analysis complete! All visualizations saved to: {viz_dir}")
    print(f"📁 Created {png_count} visualization files")

    return results_df

# Summarize and plot the `results` produced by the analysis cell above.
# The call returns the same DataFrame it was given.
analyze_resnet_results(results, 'resnet_visualizations')

🎨 Creating comprehensive ResNet emotion analysis visualizations...
📊 Each plot will be saved as a separate image file...
🖼️ RESNET IMAGE EMOTION ANALYSIS SUMMARY
📚 Total books analyzed: 686,990
🎯 Mean confidence: 0.196
📊 Median confidence: 0.185
📈 Std confidence: 0.048
🔍 Min confidence: 0.117
🔍 Max confidence: 0.699

🎭 EMOTION DISTRIBUTION:
  awe            : 203,032 ( 29.6%)
  amusement      : 179,719 ( 26.2%)
  fear           : 172,886 ( 25.2%)
  contentment    : 44,676 (  6.5%)
  something else : 34,407 (  5.0%)
  sadness        : 19,314 (  2.8%)
  anger          : 19,093 (  2.8%)
  excitement     : 13,863 (  2.0%)

🎯 CONFIDENCE ANALYSIS:
  Confidence ≥ 0.5:    284 (  0.0%)
  Confidence ≥ 0.6:     13 (  0.0%)
  Confidence ≥ 0.7:      0 (  0.0%)
  Confidence ≥ 0.8:      0 (  0.0%)
  Confidence ≥ 0.9:      0 (  0.0%)
  Confidence ≥ 0.95:      0 (  0.0%)

⭐ RATING ANALYSIS:
  Books with ratings: 686,990 (100.0%)
  Average rating: 3.90
  Rating std: 0.47
  Highest rated emotion: anger (

Unnamed: 0,book_id,title,authors,average_rating,ratings_count,publication_year,language_code,popular_shelves,predicted_emotion,confidence,emotion_probs,prob_amusement,prob_anger,prob_awe,prob_contentment,prob_disgust,prob_excitement,prob_fear,prob_sadness,prob_something_else
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[{'author_id': '10333', 'role': ''}]",4.03,140.0,1987,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",awe,0.167223,"[0.13469205796718597, 0.09353847801685333, 0.1...",0.134692,0.093538,0.167223,0.102514,0.038228,0.078381,0.139589,0.143998,0.101838
1,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,"[{'author_id': '19158', 'role': ''}]",4.22,98.0,2009,en,"[{'count': '515', 'name': 'to-read'}, {'count'...",anger,0.191088,"[0.12145131081342697, 0.19108839333057404, 0.1...",0.121451,0.191088,0.170192,0.170070,0.026302,0.056422,0.124048,0.082300,0.058126
2,34883016,Playmaker: A Venom Series Novella,"[{'author_id': '5807700', 'role': ''}]",3.86,5.0,2017,en,"[{'count': '4', 'name': 'to-read'}, {'count': ...",fear,0.154665,"[0.09836606681346893, 0.09562262892723083, 0.1...",0.098366,0.095623,0.108461,0.129003,0.083867,0.093636,0.154665,0.116781,0.119598
3,287149,The Devil's Notebook,"[{'author_id': '2983296', 'role': ''}, {'autho...",3.81,986.0,2000,en,"[{'count': '961', 'name': 'to-read'}, {'count'...",fear,0.245942,"[0.06065783277153969, 0.08587128669023514, 0.1...",0.060658,0.085871,0.122800,0.075539,0.043163,0.068727,0.245942,0.142215,0.155085
4,6066814,"Crowner Royal (Crowner John Mystery, #13)","[{'author_id': '37778', 'role': ''}]",3.93,186.0,2009,en,"[{'count': '159', 'name': 'to-read'}, {'count'...",contentment,0.180720,"[0.16852310299873352, 0.10087335109710693, 0.1...",0.168523,0.100873,0.113825,0.180720,0.043875,0.066154,0.147142,0.088792,0.090095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686985,23252156,Wicked Reflection,"[{'author_id': '4590885', 'role': ''}]",4.00,16.0,2014,eng,"[{'count': '36', 'name': 'to-read'}, {'count':...",amusement,0.165978,"[0.1659782975912094, 0.09699578583240509, 0.10...",0.165978,0.096996,0.100686,0.148526,0.058673,0.100618,0.138341,0.086296,0.103887
686986,18069148,Different Breeds,"[{'author_id': '5344314', 'role': ''}]",2.00,2.0,2013,eng,"[{'count': '4', 'name': 'to-read'}, {'count': ...",fear,0.139925,"[0.12173610925674438, 0.09961151331663132, 0.1...",0.121736,0.099612,0.126545,0.079603,0.076664,0.106225,0.139925,0.111887,0.137802
686987,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...","[{'author_id': '4015', 'role': ''}, {'author_i...",4.05,12.0,1999,eng,"[{'count': '11', 'name': 'to-read'}, {'count':...",awe,0.240141,"[0.13476848602294922, 0.10167520493268967, 0.2...",0.134768,0.101675,0.240141,0.100583,0.038896,0.066142,0.073695,0.145328,0.098772
686988,26168430,Sherlock Holmes and the July Crisis,"[{'author_id': '2448', 'role': ''}, {'author_i...",3.50,6.0,2015,eng,"[{'count': '702', 'name': 'to-read'}, {'count'...",awe,0.171474,"[0.12546782195568085, 0.14953885972499847, 0.1...",0.125468,0.149539,0.171474,0.086329,0.028095,0.084800,0.112041,0.100494,0.141759
