In [2]:
# Goodreads Book Description Emotion Analysis using Fine-tuned BERT
import torch
import pandas as pd
import numpy as np
import asyncio
import aiohttp
import nest_asyncio
import os
import sys
import json
import pickle
import time
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import threading
from queue import Queue
import warnings
from tqdm.notebook import tqdm
import re
import unicodedata
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Enable nested async loops for Jupyter
nest_asyncio.apply()
warnings.filterwarnings('ignore')

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('goodreads_bert_emotion_analysis.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Configuration matching your BERT training script
class BERTEmotionConfig:
    # Paths - UPDATE THESE TO MATCH YOUR SETUP
    ARTEMIS_PATH = r'data/artemis/artemis'  # Your artemis path
    BERT_MODEL_DIR = r'data/artemis/artemis/predictions/bert_based/best_model'  # Your trained BERT model path
    DATA_FILE = 'preprocessed_books_2025_04_20.parquet'  # Your Goodreads data
    RESULTS_DIR = 'goodreads_bert_emotion_results'
    
    # BERT configuration (matching your training script exactly)
    MAX_LENGTH = 512  # Same as your training
    BATCH_SIZE = 16   # Same as your training script
    MODEL_NAME = 'google-bert/bert-base-uncased'  # Same base model
    
    # Processing configuration
    PROCESSING_BATCH_SIZE = 5000  # Smaller batches for BERT
    CHECKPOINT_FREQUENCY = 2500
    
    # Progress tracking
    PROGRESS_UPDATE_FREQUENCY = 500
    DETAILED_LOG_FREQUENCY = 1000
    
    # English language filtering
    ENGLISH_CODES = {
        'en', 'eng', 'en-us', 'en-gb', 'en-ca', 'en-au', 'en-nz', 'en-za', 
        'en-in', 'english', 'en_us', 'en_gb', 'en_ca', 'en_au'
    }
    
    # GPU configuration
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    def __init__(self):
        os.makedirs(self.RESULTS_DIR, exist_ok=True)
        os.makedirs(f"{self.RESULTS_DIR}/checkpoints", exist_ok=True)
        print(f"✅ BERT Emotion Analysis Configuration initialized")
        print(f"🤖 Device: {self.DEVICE}")
        print(f"📦 Batch size: {self.BATCH_SIZE}")
        print(f"📏 Max length: {self.MAX_LENGTH}")

config = BERTEmotionConfig()

# Add artemis to path
# if config.ARTEMIS_PATH not in sys.path:
#     sys.path.append(config.ARTEMIS_PATH)

# Import artemis modules (matching your training script)
try:
    from artemis.emotions import ARTEMIS_EMOTIONS, IDX_TO_EMOTION, positive_negative_else
    from artemis.in_out.basics import create_dir
    print("✅ Artemis emotion modules imported successfully")
    print(f"📊 Emotion classes: {ARTEMIS_EMOTIONS}")
    print(f"📊 Number of labels: {len(ARTEMIS_EMOTIONS)}")
except ImportError as e:
    print(f"❌ Error importing artemis modules: {e}")
    print("Please check your ARTEMIS_PATH in the config above")

def preprocess_text_artemis_style(text):
    """
    Preprocess text following ArtEmis conventions
    This should match the preprocessing used in utterance_spelled
    """
    if pd.isna(text) or text is None:
        return ""
    
    # Convert to string and normalize unicode (like ArtEmis does)
    text = str(text)
    text = unicodedata.normalize('NFC', text)
    
    # Basic cleaning while preserving meaningful content
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    
    # Remove very long sequences that might cause issues
    # But keep the text mostly intact since BERT can handle various formats
    if len(text) > 10000:  # Very long descriptions
        text = text[:10000] + "..."
    
    return text

def filter_english_books(books_df):
    """Filter books to include only English language books"""
    print(f"🔍 Filtering for English books from {len(books_df):,} total books")
    
    def is_english(lang_code):
        if pd.isna(lang_code) or lang_code == '':
            return False
        lang_code_clean = str(lang_code).lower().strip()
        return lang_code_clean in config.ENGLISH_CODES
    english_mask = books_df['language_code'].astype(str).apply(is_english)
    english_books = books_df[english_mask].copy()
    
    dropped_count = len(books_df) - len(english_books)
    print(f"✅ Filtered to {len(english_books):,} English books")
    print(f"📉 Dropped {dropped_count:,} non-English books ({dropped_count/len(books_df)*100:.1f}%)")
    
    return english_books

def filter_books_with_descriptions(books_df):
    """Filter books that have descriptions"""
    print(f"🔍 Filtering books with descriptions from {len(books_df):,} books")
    
    # Check for description fields (common field names in Goodreads data)
    description_fields = ['description', 'book_description', 'summary', 'plot', 'desc']
    description_field = None
    
    for field in description_fields:
        if field in books_df.columns:
            description_field = field
            break
    
    if description_field is None:
        print("❌ No description field found in the dataset")
        print(f"Available columns: {list(books_df.columns)}")
        # Return empty dataframe if no description field
        return books_df.iloc[:0].copy(), None
    
    print(f"📝 Using description field: '{description_field}'")
    
    # Filter books with non-empty descriptions
    has_description = (
        books_df[description_field].notna() & 
        (books_df[description_field].astype(str).str.strip() != '') &
        (books_df[description_field].astype(str).str.len() > 10)  # At least 10 characters
    )
    
    books_with_desc = books_df[has_description].copy()
    
    dropped_count = len(books_df) - len(books_with_desc)
    print(f"✅ Found {len(books_with_desc):,} books with meaningful descriptions")
    print(f"📉 Dropped {dropped_count:,} books without descriptions ({dropped_count/len(books_df)*100:.1f}%)")
    
    return books_with_desc, description_field

class BERTProgressTracker:
    """Progress tracker for BERT emotion analysis"""
    
    def __init__(self, total_books):
        self.total_books = total_books
        self.processed_books = 0
        self.successful_books = 0
        self.failed_books = 0
        self.start_time = time.time()
        self.lock = threading.Lock()
        
        # Performance metrics
        self.books_per_second = 0
        
    def update(self, successful=0, failed=0):
        with self.lock:
            self.successful_books += successful
            self.failed_books += failed
            self.processed_books = self.successful_books + self.failed_books
            
            # Calculate performance
            elapsed = time.time() - self.start_time
            if elapsed > 0:
                self.books_per_second = self.processed_books / elapsed
    
    def should_log(self, frequency):
        return self.processed_books % frequency == 0
    
    def get_status_message(self):
        with self.lock:
            progress_pct = (self.processed_books / self.total_books) * 100
            success_rate = (self.successful_books / max(1, self.processed_books)) * 100
            
            remaining = self.total_books - self.processed_books
            eta_seconds = remaining / max(0.1, self.books_per_second)
            eta_hours = eta_seconds / 3600
            
            elapsed_hours = (time.time() - self.start_time) / 3600
            
            msg = f"📊 Progress: {progress_pct:.1f}% "
            msg += f"({self.processed_books:,}/{self.total_books:,}) | "
            msg += f"✅ Success: {success_rate:.1f}% | "
            msg += f"🚀 Speed: {self.books_per_second:.1f} books/sec | "
            msg += f"⏱️ ETA: {eta_hours:.1f}h"
            
            return msg

class BERTCheckpointManager:
    """Checkpoint manager for BERT processing"""
    
    def __init__(self, checkpoint_dir):
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint_file = os.path.join(checkpoint_dir, 'bert_progress_checkpoint.json')
        self.results_file = os.path.join(checkpoint_dir, 'bert_partial_results.pkl')
        
    def save_checkpoint(self, processed_indices, results, failed_books, progress_tracker):
        """Save checkpoint"""
        checkpoint_data = {
            'processed_indices': list(processed_indices),
            'num_results': len(results),
            'num_failed': len(failed_books),
            'timestamp': datetime.now().isoformat(),
            'books_per_second': progress_tracker.books_per_second,
            'success_rate': progress_tracker.successful_books / max(1, progress_tracker.processed_books),
            'bert_version': True
        }
        
        # Save checkpoint metadata
        with open(self.checkpoint_file, 'w') as f:
            json.dump(checkpoint_data, f, indent=2)
        
        # Save actual results
        with open(self.results_file, 'wb') as f:
            pickle.dump({
                'results': results,
                'failed_books': failed_books
            }, f)
        
        print(f"💾 BERT Checkpoint saved: {len(results):,} results, {len(failed_books):,} failed")
    
    def load_checkpoint(self):
        """Load checkpoint"""
        if os.path.exists(self.checkpoint_file) and os.path.exists(self.results_file):
            try:
                # Load metadata
                with open(self.checkpoint_file, 'r') as f:
                    checkpoint_data = json.load(f)
                
                # Load results
                with open(self.results_file, 'rb') as f:
                    saved_data = pickle.load(f)
                
                print(f"📂 Loaded BERT checkpoint from {checkpoint_data['timestamp']}")
                print(f"📊 Previous progress: {checkpoint_data['num_results']:,} results, {checkpoint_data['num_failed']:,} failed")
                
                return (
                    set(checkpoint_data['processed_indices']),
                    saved_data['results'],
                    saved_data['failed_books']
                )
            except Exception as e:
                print(f"❌ Error loading BERT checkpoint: {e}")
                return set(), [], []
        
        return set(), [], []

class BERTEmotionPredictor:
    """BERT-based emotion predictor for Goodreads descriptions"""
    
    def __init__(self):
        self.checkpoint_manager = BERTCheckpointManager(f"{config.RESULTS_DIR}/checkpoints")
        self.model = None
        self.tokenizer = None
        self.progress_tracker = None
        self.load_model()
        
    def load_model(self):
        """Load the fine-tuned BERT model and tokenizer"""
        try:
            print(f"🤖 Loading fine-tuned BERT model from {config.BERT_MODEL_DIR}")
            
            # Check if model directory exists
            if not os.path.exists(config.BERT_MODEL_DIR):
                print(f"❌ Model directory not found: {config.BERT_MODEL_DIR}")
                print("Please ensure you have trained the model using utterance_to_emotion_with_transformer_my_try.py")
                print("And that the paths match your artemis directory structure")
                raise FileNotFoundError(f"Model directory not found: {config.BERT_MODEL_DIR}")
            
            # List files in model directory for debugging
            model_files = os.listdir(config.BERT_MODEL_DIR)
            print(f"📁 Files in model directory: {model_files}")
            
            # Load tokenizer and model
            self.tokenizer = AutoTokenizer.from_pretrained(config.BERT_MODEL_DIR)
            self.model = AutoModelForSequenceClassification.from_pretrained(config.BERT_MODEL_DIR)
            
            # Move to device
            self.model.to(config.DEVICE)
            self.model.eval()
            
            print(f"✅ BERT model loaded successfully")
            print(f"📊 Model expects {self.model.config.num_labels} emotion classes")
            print(f"🎯 Expected emotions: {len(ARTEMIS_EMOTIONS)}")
            
            # Verify model configuration
            if self.model.config.num_labels != len(ARTEMIS_EMOTIONS):
                print(f"⚠️ Warning: Model has {self.model.config.num_labels} labels, but ArtEmis has {len(ARTEMIS_EMOTIONS)} emotions")
            
        except Exception as e:
            print(f"❌ Error loading BERT model: {e}")
            print("Please ensure you have:")
            print("1. Trained the model using utterance_to_emotion_with_transformer_my_try.py")
            print("2. Set do_training=True in that script")
            print("3. Updated the paths to match your directory structure")
            raise
    
    def predict_emotions_batch(self, texts):
        """Predict emotions for a batch of texts"""
        try:
            # Tokenize texts (same parameters as training)
            encodings = self.tokenizer(
                texts,
                truncation=True,
                padding='max_length',  # Same as training
                max_length=config.MAX_LENGTH,
                return_tensors='pt'
            )
            
            # Move to device
            input_ids = encodings['input_ids'].to(config.DEVICE)
            attention_mask = encodings['attention_mask'].to(config.DEVICE)
            
            # Predict
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                
                # Convert to probabilities
                probabilities = torch.softmax(logits, dim=-1)
                predictions = torch.argmax(probabilities, dim=-1)
            
            return predictions.cpu().numpy(), probabilities.cpu().numpy()
            
        except Exception as e:
            print(f"❌ Error in BERT prediction: {e}")
            return None, None
    
    def process_batch(self, batch_df, description_field):
        """Process a batch of books"""
        batch_results = []
        batch_failed = []
        
        # Extract and preprocess descriptions
        descriptions = []
        book_data = []
        
        for idx, row in batch_df.iterrows():
            description = row.get(description_field, '')
            
            if pd.isna(description) or str(description).strip() == '':
                batch_failed.append({
                    'book_id': row.get('book_id', idx),
                    'reason': 'no_description'
                })
                continue
            
            # Preprocess description (minimal preprocessing to match training)
            processed_desc = preprocess_text_artemis_style(description)
            
            if len(processed_desc.strip()) < 5:  # Very short descriptions
                batch_failed.append({
                    'book_id': row.get('book_id', idx),
                    'reason': 'description_too_short'
                })
                continue
            
            descriptions.append(processed_desc)
            book_data.append((idx, row))
        
        if not descriptions:
            return batch_results, batch_failed
        
        # Process in smaller sub-batches for BERT (same batch size as training)
        for i in range(0, len(descriptions), config.BATCH_SIZE):
            sub_descriptions = descriptions[i:i+config.BATCH_SIZE]
            sub_book_data = book_data[i:i+config.BATCH_SIZE]
            
            # Predict emotions
            predictions, probabilities = self.predict_emotions_batch(sub_descriptions)
            
            if predictions is None:
                # All failed in this sub-batch
                for j, (idx, row) in enumerate(sub_book_data):
                    batch_failed.append({
                        'book_id': row.get('book_id', idx),
                        'reason': 'prediction_failed'
                    })
                continue
            
            # Process results
            for j, (idx, row) in enumerate(sub_book_data):
                pred_idx = predictions[j]
                emotion_probs = probabilities[j]
                
                # Get emotion label using IDX_TO_EMOTION (same as training)
                predicted_emotion = IDX_TO_EMOTION[pred_idx]
                confidence = emotion_probs[pred_idx]
                
                # Get positive/negative/else classification (same as training)
                emotion_pne = positive_negative_else(predicted_emotion)
                
                result = {
                    'book_id': row.get('book_id', idx),
                    'title': row.get('title', ''),
                    'authors': row.get('authors', ''),
                    'average_rating': row.get('average_rating'),
                    'ratings_count': row.get('ratings_count'),
                    'publication_year': row.get('publication_year'),
                    'language_code': row.get('language_code'),
                    'popular_shelves': row.get('popular_shelves'),
                    'description_preview': sub_descriptions[j][:200] + '...' if len(sub_descriptions[j]) > 200 else sub_descriptions[j],
                    'description_length': len(sub_descriptions[j]),
                    'predicted_emotion': predicted_emotion,
                    'emotion_category': emotion_pne,
                    'confidence': float(confidence),
                    'emotion_probs': emotion_probs.tolist()
                }
                
                # Add individual emotion probabilities (same as training order)
                for k, emotion in enumerate(ARTEMIS_EMOTIONS):
                    result[f'prob_{emotion.replace(" ", "_")}'] = float(emotion_probs[k])
                
                batch_results.append(result)
        
        return batch_results, batch_failed
    
    def process_all_books(self, resume_from_checkpoint=True):
        """Main processing function"""
        
        print("🚀 Starting BERT-based Goodreads emotion analysis...")
        print(f"🤖 Using model from: {config.BERT_MODEL_DIR}")
        
        # Load and filter data
        print("📂 Loading Goodreads data...")
        try:
            books_df = pd.read_parquet(config.DATA_FILE)
            print(f"📊 Loaded {len(books_df):,} total books")
            
            # Filter for English books
            books_df = filter_english_books(books_df)
            
            # Filter for books with descriptions
            books_df, description_field = filter_books_with_descriptions(books_df)
            
            if description_field is None:
                print("❌ No description field found. Cannot proceed.")
                return None
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            return None
        
        # Initialize progress tracking
        self.progress_tracker = BERTProgressTracker(len(books_df))
        
        # Load previous progress if resuming
        processed_indices = set()
        all_results = []
        all_failed = []
        
        if resume_from_checkpoint:
            processed_indices, all_results, all_failed = self.checkpoint_manager.load_checkpoint()
            if processed_indices:
                print(f"🔄 Resuming from checkpoint: {len(processed_indices):,} books already processed")
                # Update progress tracker
                self.progress_tracker.update(
                    successful=len(all_results),
                    failed=len(all_failed)
                )
        
        # Filter out already processed books
        remaining_books = books_df[~books_df.index.isin(processed_indices)].copy()
        print(f"📋 Processing {len(remaining_books):,} remaining English books with descriptions")
        
        if len(remaining_books) == 0:
            print("✅ All books already processed!")
            return pd.DataFrame(all_results)
        
        try:
            # Process in batches
            total_batches = (len(remaining_books) + config.PROCESSING_BATCH_SIZE - 1) // config.PROCESSING_BATCH_SIZE
            
            print(f"🔄 Processing {total_batches} batches of up to {config.PROCESSING_BATCH_SIZE:,} books each")
            print(f"🎯 Using BERT batch size of {config.BATCH_SIZE} for predictions")
            
            # Create tqdm progress bar
            batch_progress = tqdm(range(total_batches), desc="Processing BERT batches")
            
            for batch_idx in batch_progress:
                start_idx = batch_idx * config.PROCESSING_BATCH_SIZE
                end_idx = min(start_idx + config.PROCESSING_BATCH_SIZE, len(remaining_books))
                batch_df = remaining_books.iloc[start_idx:end_idx]
                
                # Process this batch
                batch_results, batch_failed = self.process_batch(batch_df, description_field)
                
                # Update results
                all_results.extend(batch_results)
                all_failed.extend(batch_failed)
                
                # Update processed indices
                for idx in batch_df.index:
                    processed_indices.add(idx)
                
                # Update progress
                self.progress_tracker.update(
                    successful=len(batch_results),
                    failed=len(batch_failed)
                )
                
                # Update progress bar
                success_rate = self.progress_tracker.successful_books / max(1, self.progress_tracker.processed_books) * 100
                batch_progress.set_postfix({
                    'Success Rate': f"{success_rate:.1f}%",
                    'Speed': f"{self.progress_tracker.books_per_second:.1f} books/sec"
                })
                
                # Log progress
                if self.progress_tracker.should_log(config.PROGRESS_UPDATE_FREQUENCY):
                    print(self.progress_tracker.get_status_message())
                
                # Save checkpoint
                checkpoint_interval = max(1, config.CHECKPOINT_FREQUENCY // config.PROCESSING_BATCH_SIZE)
                if (batch_idx + 1) % checkpoint_interval == 0:
                    self.checkpoint_manager.save_checkpoint(
                        processed_indices, all_results, all_failed, self.progress_tracker
                    )
        
        except Exception as e:
            print(f"❌ Error during processing: {e}")
            # Save emergency checkpoint
            self.checkpoint_manager.save_checkpoint(
                processed_indices, all_results, all_failed, self.progress_tracker
            )
            return None
        
        # Save final results
        return self.save_final_results(all_results, all_failed)
    
    def save_final_results(self, results, failed):
        """Save final results"""
        results_df = pd.DataFrame(results)
        failed_df = pd.DataFrame(failed)
        
        # Save files
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f"{config.RESULTS_DIR}/goodreads_bert_emotion_predictions_{timestamp}.parquet"
        failed_file = f"{config.RESULTS_DIR}/goodreads_bert_failed_books_{timestamp}.csv"
        
        results_df.to_parquet(results_file, index=False)
        failed_df.to_csv(failed_file, index=False)
        
        # Clear checkpoint files
        try:
            if os.path.exists(self.checkpoint_manager.checkpoint_file):
                os.remove(self.checkpoint_manager.checkpoint_file)
            if os.path.exists(self.checkpoint_manager.results_file):
                os.remove(self.checkpoint_manager.results_file)
        except:
            pass
        
        # Final statistics
        total_processed = len(results) + len(failed)
        success_rate = len(results) / total_processed * 100 if total_processed > 0 else 0
        
        print("\n" + "="*60)
        print("🎉 BERT EMOTION ANALYSIS COMPLETED!")
        print("="*60)
        print(f"📊 Total processed: {total_processed:,}")
        print(f"✅ Successful: {len(results):,} ({success_rate:.2f}%)")
        print(f"❌ Failed: {len(failed):,}")
        print(f"💾 Results saved to: {results_file}")
        print("="*60)
        
        # Emotion analysis
        if len(results) > 0:
            print("\n📈 Emotion Distribution:")
            emotion_counts = results_df['predicted_emotion'].value_counts()
            for emotion, count in emotion_counts.items():
                pct = count / len(results_df) * 100
                print(f"  {emotion}: {count:,} ({pct:.1f}%)")
            
            print("\n📊 Emotion Categories (Positive/Negative/Else):")
            category_counts = results_df['emotion_category'].value_counts()
            for category, count in category_counts.items():
                pct = count / len(results_df) * 100
                print(f"  {category}: {count:,} ({pct:.1f}%)")
            
            print("\n🎯 Confidence Statistics:")
            print(f"  Mean confidence: {results_df['confidence'].mean():.3f}")
            print(f"  Median confidence: {results_df['confidence'].median():.3f}")
            high_conf = (results_df['confidence'] > 0.8).sum()
            print(f"  High confidence (>0.8): {high_conf:,} ({high_conf/len(results_df)*100:.1f}%)")
        
        return results_df

# Main execution functions
def run_bert_emotion_analysis(resume_from_checkpoint=True):
    """
    Main function to run BERT emotion analysis
    
    Parameters:
    - resume_from_checkpoint: Whether to resume from existing checkpoint (default: True)
    """
    
    print("🤖 BERT-based Goodreads Book Description Emotion Analysis")
    print("📚 Processing English books with descriptions only")
    print("🔄 Using fine-tuned BERT model from ArtEmis training")
    print("-" * 60)
    
    # Check configuration
    if not os.path.exists(config.BERT_MODEL_DIR):
        print(f"❌ BERT model directory not found: {config.BERT_MODEL_DIR}")
        print("Please ensure you have:")
        print("1. Trained the model using utterance_to_emotion_with_transformer_my_try.py")
        print("2. Set do_training=True in that script")
        print("3. Updated the path to match your artemis directory structure")
        
        # Suggest alternative paths
        possible_paths = [
            r'artemis\predictions\bert_based\best_model',
            r'data\artemis\artemis\predictions\bert_based\best_model',
            r'predictions\bert_based\best_model'
        ]
        print("\nPossible model paths to check:")
        for path in possible_paths:
            print(f"  - {path}")
        
        return None
    
    if not os.path.exists(config.DATA_FILE):
        print(f"❌ Data file not found: {config.DATA_FILE}")
        print("Please update DATA_FILE in the config above")
        return None
    
    # Initialize and run predictor
    predictor = BERTEmotionPredictor()
    results_df = predictor.process_all_books(resume_from_checkpoint)
    
    return results_df

# Convenience function for one-line execution
def analyze_goodreads_bert_emotions(resume_from_checkpoint=True):
    """
    One-line function to run the complete BERT emotion analysis
    
    Usage:
    results = analyze_goodreads_bert_emotions()
    """
    return run_bert_emotion_analysis(resume_from_checkpoint)

# Display configuration
print("📋 BERT EMOTION ANALYSIS CONFIGURATION")
print("-" * 50)
print(f"📁 Artemis path: {config.ARTEMIS_PATH}")
print(f"🤖 BERT model directory: {config.BERT_MODEL_DIR}")
print(f"📊 Data file: {config.DATA_FILE}")
print(f"💾 Results directory: {config.RESULTS_DIR}")
print(f"🖥️ Device: {config.DEVICE}")
print(f"📦 Batch size: {config.BATCH_SIZE}")
print(f"📏 Max sequence length: {config.MAX_LENGTH}")
print(f"🎯 Expected emotion classes: {len(ARTEMIS_EMOTIONS) if 'ARTEMIS_EMOTIONS' in globals() else 'Not loaded'}")
print("-" * 50)

# Check if model exists and show status
if os.path.exists(config.BERT_MODEL_DIR):
    model_files = os.listdir(config.BERT_MODEL_DIR)
    print(f"✅ Model directory found with files: {model_files}")
    print("Ready to run! Execute: results = analyze_goodreads_bert_emotions()")
else:
    print(f"❌ Model directory not found: {config.BERT_MODEL_DIR}")
    print("Please train the BERT model first using utterance_to_emotion_with_transformer_my_try.py")

results = analyze_goodreads_bert_emotions()

✅ BERT Emotion Analysis Configuration initialized
🤖 Device: cuda
📦 Batch size: 16
📏 Max length: 512
✅ Artemis emotion modules imported successfully
📊 Emotion classes: ['amusement', 'awe', 'contentment', 'excitement', 'anger', 'disgust', 'fear', 'sadness', 'something else']
📊 Number of labels: 9
📋 BERT EMOTION ANALYSIS CONFIGURATION
--------------------------------------------------
📁 Artemis path: data/artemis/artemis
🤖 BERT model directory: data/artemis/artemis/predictions/bert_based/best_model
📊 Data file: preprocessed_books_2025_04_20.parquet
💾 Results directory: goodreads_bert_emotion_results
🖥️ Device: cuda
📦 Batch size: 16
📏 Max sequence length: 512
🎯 Expected emotion classes: 9
--------------------------------------------------
✅ Model directory found with files: ['model.safetensors', 'config.json', 'tokenizer_config.json', 'vocab.txt', 'special_tokens_map.json', 'tokenizer.json']
Ready to run! Execute: results = analyze_goodreads_bert_emotions()
🤖 BERT-based Goodreads Book Desc

Processing BERT batches:   0%|          | 0/62 [00:00<?, ?it/s]

📊 Progress: 56.0% (385,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 12827.0 books/sec | ⏱️ ETA: 0.0h
💾 BERT Checkpoint saved: 385,000 results, 0 failed
📊 Progress: 56.8% (390,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 5830.6 books/sec | ⏱️ ETA: 0.0h
💾 BERT Checkpoint saved: 390,000 results, 0 failed
📊 Progress: 57.5% (395,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 3873.0 books/sec | ⏱️ ETA: 0.0h
💾 BERT Checkpoint saved: 395,000 results, 0 failed
📊 Progress: 58.2% (400,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 2900.9 books/sec | ⏱️ ETA: 0.0h
💾 BERT Checkpoint saved: 400,000 results, 0 failed
📊 Progress: 59.0% (405,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 2298.5 books/sec | ⏱️ ETA: 0.0h
💾 BERT Checkpoint saved: 405,000 results, 0 failed
📊 Progress: 59.7% (410,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 1896.5 books/sec | ⏱️ ETA: 0.0h
💾 BERT Checkpoint saved: 410,000 results, 0 failed
📊 Progress: 60.4% (415,000/686,946) | ✅ Success: 100.0% | 🚀 Speed: 1628.1 books/sec | ⏱️ ETA: 0.0h


In [None]:
# BERT Emotion Analysis Visualization Functions
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

def create_bert_visualizations(results_df, save_dir=None):
    """
    Create comprehensive visualizations for BERT emotion analysis results
    
    Parameters:
    - results_df: DataFrame with BERT emotion predictions
    - save_dir: Directory to save plots (optional)
    """
    
    if results_df is None or len(results_df) == 0:
        print("❌ No results to visualize")
        return
    
    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("husl")
    
    # Create figure with subplots
    fig = plt.figure(figsize=(20, 16))
    
    # 1. Emotion Distribution (Bar Plot)
    plt.subplot(3, 3, 1)
    emotion_counts = results_df['predicted_emotion'].value_counts()
    bars = plt.bar(range(len(emotion_counts)), emotion_counts.values, alpha=0.8)
    plt.xticks(range(len(emotion_counts)), emotion_counts.index, rotation=45, ha='right')
    plt.title('Emotion Distribution in Goodreads Books\n(BERT Predictions)', fontsize=12, pad=20)
    plt.ylabel('Number of Books')
    plt.grid(axis='y', alpha=0.3)
    
    # Add value labels on bars
    for i, (bar, value) in enumerate(zip(bars, emotion_counts.values)):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 100, 
                f'{value:,}', ha='center', va='bottom', fontsize=9)
    
    # 2. Emotion Categories Pie Chart
    plt.subplot(3, 3, 2)
    category_counts = results_df['emotion_category'].value_counts()
    colors = ['#2ecc71', '#e74c3c', '#95a5a6']  # Green, Red, Gray
    wedges, texts, autotexts = plt.pie(category_counts.values, labels=category_counts.index, 
                                      autopct='%1.1f%%', colors=colors, startangle=90)
    plt.title('Emotion Categories Distribution\n(Positive/Negative/Else)', fontsize=12, pad=20)
    
    # Enhance pie chart text
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
    
    # 3. Confidence Distribution Histogram
    plt.subplot(3, 3, 3)
    plt.hist(results_df['confidence'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.axvline(results_df['confidence'].mean(), color='red', linestyle='--', 
                label=f'Mean: {results_df["confidence"].mean():.3f}')
    plt.axvline(results_df['confidence'].median(), color='orange', linestyle='--', 
                label=f'Median: {results_df["confidence"].median():.3f}')
    plt.title('Prediction Confidence Distribution', fontsize=12, pad=20)
    plt.xlabel('Confidence Score')
    plt.ylabel('Number of Books')
    plt.legend()
    plt.grid(alpha=0.3)
    
    # 4. Average Confidence by Emotion
    plt.subplot(3, 3, 4)
    emotion_confidence = results_df.groupby('predicted_emotion')['confidence'].agg(['mean', 'std']).sort_values('mean', ascending=False)
    bars = plt.bar(range(len(emotion_confidence)), emotion_confidence['mean'].values, 
                   yerr=emotion_confidence['std'].values, alpha=0.8, capsize=5)
    plt.xticks(range(len(emotion_confidence)), emotion_confidence.index, rotation=45, ha='right')
    plt.title('Average Prediction Confidence by Emotion', fontsize=12, pad=20)
    plt.ylabel('Average Confidence')
    plt.grid(axis='y', alpha=0.3)
    
    # Add value labels
    for i, (bar, value) in enumerate(zip(bars, emotion_confidence['mean'].values)):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                f'{value:.3f}', ha='center', va='bottom', fontsize=9)
    
    # 5. Confidence vs Emotion Category
    plt.subplot(3, 3, 5)
    categories = results_df['emotion_category'].unique()
    confidence_by_category = [results_df[results_df['emotion_category'] == cat]['confidence'].values 
                             for cat in categories]
    
    box_plot = plt.boxplot(confidence_by_category, labels=categories, patch_artist=True)
    colors = ['lightgreen', 'lightcoral', 'lightgray']
    for patch, color in zip(box_plot['boxes'], colors):
        patch.set_facecolor(color)
    
    plt.title('Confidence Distribution by Emotion Category', fontsize=12, pad=20)
    plt.ylabel('Confidence Score')
    plt.grid(axis='y', alpha=0.3)
    
    # 6. Description Length vs Confidence
    plt.subplot(3, 3, 6)
    plt.scatter(results_df['description_length'], results_df['confidence'], alpha=0.5, s=10)
    
    # Add trend line
    z = np.polyfit(results_df['description_length'], results_df['confidence'], 1)
    p = np.poly1d(z)
    plt.plot(results_df['description_length'], p(results_df['description_length']), "r--", alpha=0.8)
    
    plt.title('Description Length vs Prediction Confidence', fontsize=12, pad=20)
    plt.xlabel('Description Length (characters)')
    plt.ylabel('Confidence Score')
    plt.grid(alpha=0.3)
    
    # Calculate correlation
    correlation = results_df['description_length'].corr(results_df['confidence'])
    plt.text(0.05, 0.95, f'Correlation: {correlation:.3f}', transform=plt.gca().transAxes, 
             bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))
    
    # 7. High Confidence Predictions by Emotion
    plt.subplot(3, 3, 7)
    high_conf_threshold = 0.8
    high_conf_emotions = results_df[results_df['confidence'] > high_conf_threshold]['predicted_emotion'].value_counts()
    
    if len(high_conf_emotions) > 0:
        bars = plt.bar(range(len(high_conf_emotions)), high_conf_emotions.values, alpha=0.8, color='gold')
        plt.xticks(range(len(high_conf_emotions)), high_conf_emotions.index, rotation=45, ha='right')
        plt.title(f'High Confidence Predictions (>{high_conf_threshold})', fontsize=12, pad=20)
        plt.ylabel('Number of Books')
        plt.grid(axis='y', alpha=0.3)
        
        # Add percentages
        total_high_conf = len(results_df[results_df['confidence'] > high_conf_threshold])
        for i, (bar, value) in enumerate(zip(bars, high_conf_emotions.values)):
            pct = value / total_high_conf * 100
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10, 
                    f'{pct:.1f}%', ha='center', va='bottom', fontsize=9)
    else:
        plt.text(0.5, 0.5, 'No high confidence predictions', ha='center', va='center', 
                transform=plt.gca().transAxes, fontsize=12)
        plt.title(f'High Confidence Predictions (>{high_conf_threshold})', fontsize=12, pad=20)
    
    # 8. Emotion Probability Heatmap (Top emotions)
    plt.subplot(3, 3, 8)
    
    # Get top 6 most frequent emotions for readability
    top_emotions = emotion_counts.head(6).index
    prob_columns = [f'prob_{emotion.replace(" ", "_")}' for emotion in top_emotions]
    
    # Sample data for heatmap (use first 100 books or less)
    sample_size = min(100, len(results_df))
    sample_df = results_df.head(sample_size)
    prob_matrix = sample_df[prob_columns].values.T
    
    im = plt.imshow(prob_matrix, cmap='YlOrRd', aspect='auto')
    plt.colorbar(im, shrink=0.8)
    plt.yticks(range(len(top_emotions)), top_emotions)
    plt.xlabel('Book Samples')
    plt.title(f'Emotion Probability Heatmap\n(First {sample_size} books)', fontsize=12, pad=20)
    
    # 9. Summary Statistics Table
    plt.subplot(3, 3, 9)
    plt.axis('off')
    
    # Calculate summary statistics
    stats = {
        'Total Books Analyzed': f"{len(results_df):,}",
        'Mean Confidence': f"{results_df['confidence'].mean():.3f}",
        'Median Confidence': f"{results_df['confidence'].median():.3f}",
        'High Confidence (>0.8)': f"{(results_df['confidence'] > 0.8).sum():,} ({(results_df['confidence'] > 0.8).mean()*100:.1f}%)",
        'Most Common Emotion': f"{emotion_counts.index[0]} ({emotion_counts.iloc[0]:,})",
        'Positive Emotions': f"{(results_df['emotion_category'] == 'positive').sum():,} ({(results_df['emotion_category'] == 'positive').mean()*100:.1f}%)",
        'Negative Emotions': f"{(results_df['emotion_category'] == 'negative').sum():,} ({(results_df['emotion_category'] == 'negative').mean()*100:.1f}%)",
        'Avg Description Length': f"{results_df['description_length'].mean():.0f} chars"
    }
    
    # Create table
    table_data = [[key, value] for key, value in stats.items()]
    table = plt.table(cellText=table_data, colLabels=['Metric', 'Value'],
                     cellLoc='left', loc='center', bbox=[0, 0, 1, 1])
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 2)
    
    # Style the table
    for i in range(len(stats) + 1):
        table[(i, 0)].set_facecolor('#E8E8E8')
        table[(i, 1)].set_facecolor('#F5F5F5')
    
    plt.title('Summary Statistics', fontsize=12, pad=20)
    
    plt.tight_layout()
    
    # Save the plot if directory provided
    if save_dir:
        import os
        os.makedirs(save_dir, exist_ok=True)
        plt.savefig(f'{save_dir}/bert_emotion_analysis_comprehensive.png', 
                   dpi=300, bbox_inches='tight', facecolor='white')
        print(f"📊 Comprehensive visualization saved to {save_dir}/bert_emotion_analysis_comprehensive.png")
    
    plt.show()
    
    # Additional detailed visualizations
    create_detailed_emotion_analysis(results_df, save_dir)

def create_detailed_emotion_analysis(results_df, save_dir=None):
    """Create additional detailed analysis plots"""
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # 1. Confidence distribution by emotion (violin plot)
    ax1 = axes[0, 0]
    emotions = results_df['predicted_emotion'].unique()
    confidence_data = [results_df[results_df['predicted_emotion'] == emotion]['confidence'].values 
                      for emotion in emotions]
    
    parts = ax1.violinplot(confidence_data, positions=range(len(emotions)), showmeans=True)
    ax1.set_xticks(range(len(emotions)))
    ax1.set_xticklabels(emotions, rotation=45, ha='right')
    ax1.set_title('Confidence Distribution by Emotion (Violin Plot)')
    ax1.set_ylabel('Confidence Score')
    ax1.grid(axis='y', alpha=0.3)
    
    # 2. Emotion transitions/correlations heatmap
    ax2 = axes[0, 1]
    from artemis.emotions import ARTEMIS_EMOTIONS
    
    # Create correlation matrix of emotion probabilities
    prob_columns = [f'prob_{emotion.replace(" ", "_")}' for emotion in ARTEMIS_EMOTIONS]
    corr_matrix = results_df[prob_columns].corr()
    
    im = ax2.imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
    ax2.set_xticks(range(len(ARTEMIS_EMOTIONS)))
    ax2.set_yticks(range(len(ARTEMIS_EMOTIONS)))
    ax2.set_xticklabels(ARTEMIS_EMOTIONS, rotation=45, ha='right')
    ax2.set_yticklabels(ARTEMIS_EMOTIONS)
    ax2.set_title('Emotion Probability Correlations')
    
    # Add correlation values
    for i in range(len(ARTEMIS_EMOTIONS)):
        for j in range(len(ARTEMIS_EMOTIONS)):
            text = ax2.text(j, i, f'{corr_matrix.iloc[i, j]:.2f}',
                           ha="center", va="center", color="black" if abs(corr_matrix.iloc[i, j]) < 0.5 else "white")
    
    plt.colorbar(im, ax=ax2, shrink=0.8)
    
    # 3. Description length distribution by emotion category
    ax3 = axes[1, 0]
    categories = results_df['emotion_category'].unique()
    length_data = [results_df[results_df['emotion_category'] == cat]['description_length'].values 
                  for cat in categories]
    
    box_plot = ax3.boxplot(length_data, labels=categories, patch_artist=True)
    colors = ['lightgreen', 'lightcoral', 'lightgray']
    for patch, color in zip(box_plot['boxes'], colors[:len(categories)]):
        patch.set_facecolor(color)
    
    ax3.set_title('Description Length by Emotion Category')
    ax3.set_ylabel('Description Length (characters)')
    ax3.grid(axis='y', alpha=0.3)
    
    # 4. Confidence threshold analysis
    ax4 = axes[1, 1]
    thresholds = np.arange(0.1, 1.0, 0.05)
    coverage = []
    accuracies = []
    
    for threshold in thresholds:
        high_conf_mask = results_df['confidence'] >= threshold
        coverage.append(high_conf_mask.mean() * 100)
        
        # For accuracy, we'd need ground truth labels
        # For now, just show coverage
    
    ax4.plot(thresholds, coverage, 'b-', linewidth=2, label='Coverage %')
    ax4.set_xlabel('Confidence Threshold')
    ax4.set_ylabel('Coverage (%)')
    ax4.set_title('Coverage vs Confidence Threshold')
    ax4.grid(alpha=0.3)
    ax4.legend()
    
    # Add some key points
    for i, (thresh, cov) in enumerate(zip(thresholds[::5], coverage[::5])):
        ax4.annotate(f'{cov:.1f}%', (thresh, cov), textcoords="offset points", 
                    xytext=(0,10), ha='center', fontsize=8)
    
    plt.tight_layout()
    
    if save_dir:
        plt.savefig(f'{save_dir}/bert_detailed_analysis.png', 
                   dpi=300, bbox_inches='tight', facecolor='white')
        print(f"📊 Detailed analysis saved to {save_dir}/bert_detailed_analysis.png")
    
    plt.show()

def print_emotion_summary(results_df):
    """Print a detailed text summary of the emotion analysis"""
    
    print("="*60)
    print("🎭 BERT EMOTION ANALYSIS SUMMARY")
    print("="*60)
    
    # Basic statistics
    print(f"📚 Total books analyzed: {len(results_df):,}")
    print(f"🎯 Mean confidence: {results_df['confidence'].mean():.3f}")
    print(f"📊 Median confidence: {results_df['confidence'].median():.3f}")
    print(f"📏 Average description length: {results_df['description_length'].mean():.0f} characters")
    
    # Emotion distribution
    print(f"\n🎭 EMOTION DISTRIBUTION:")
    emotion_counts = results_df['predicted_emotion'].value_counts()
    for emotion, count in emotion_counts.items():
        pct = count / len(results_df) * 100
        print(f"  {emotion:15}: {count:6,} ({pct:5.1f}%)")
    
    # Category distribution
    print(f"\n📈 EMOTION CATEGORIES:")
    category_counts = results_df['emotion_category'].value_counts()
    for category, count in category_counts.items():
        pct = count / len(results_df) * 100
        print(f"  {category:15}: {count:6,} ({pct:5.1f}%)")
    
    # Confidence analysis
    print(f"\n🎯 CONFIDENCE ANALYSIS:")
    conf_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
    for threshold in conf_thresholds:
        high_conf = (results_df['confidence'] >= threshold).sum()
        pct = high_conf / len(results_df) * 100
        print(f"  Confidence ≥ {threshold}: {high_conf:6,} ({pct:5.1f}%)")
    
    # Most confident predictions
    print(f"\n🏆 MOST CONFIDENT PREDICTIONS:")
    top_confident = results_df.nlargest(5, 'confidence')[['title', 'predicted_emotion', 'confidence']]
    for _, row in top_confident.iterrows():
        print(f"  {row['predicted_emotion']:12} ({row['confidence']:.3f}): {row['title'][:50]}...")
    
    print("="*60)

# Usage function to run all visualizations
def analyze_bert_results(results_df, save_dir=None):
    """
    Run complete analysis and visualization of BERT results
    
    Usage after running BERT analysis:
    analyze_bert_results(results, 'visualizations')
    """
    
    if save_dir is None:
        save_dir = config.RESULTS_DIR if 'config' in globals() else 'bert_visualizations'
    
    print("🎨 Creating comprehensive BERT emotion analysis visualizations...")
    
    # Print text summary
    print_emotion_summary(results_df)
    
    # Create visualizations
    create_bert_visualizations(results_df, save_dir)
    
    print(f"\n✅ Analysis complete! Visualizations saved to: {save_dir}")
    
    return results_df

# Example usage:
# After running your BERT analysis:
# results = analyze_goodreads_bert_emotions()

vis_dir = f"{config.RESULTS_DIR}/visualizations"
os.makedirs(vis_dir, exist_ok=True)
analyze_bert_results(results, vis_dir)

🎨 Creating comprehensive BERT emotion analysis visualizations...
🎭 BERT EMOTION ANALYSIS SUMMARY
📚 Total books analyzed: 686,946
🎯 Mean confidence: 0.481
📊 Median confidence: 0.441
📏 Average description length: 891 characters

🎭 EMOTION DISTRIBUTION:
  sadness        : 192,494 ( 28.0%)
  amusement      : 110,417 ( 16.1%)
  excitement     : 100,331 ( 14.6%)
  fear           : 94,141 ( 13.7%)
  awe            : 89,756 ( 13.1%)
  anger          : 35,642 (  5.2%)
  contentment    : 26,742 (  3.9%)
  something else : 19,989 (  2.9%)
  disgust        : 17,434 (  2.5%)

📈 EMOTION CATEGORIES:
                1: 339,711 ( 49.5%)
                0: 327,246 ( 47.6%)
                2: 19,989 (  2.9%)

🎯 CONFIDENCE ANALYSIS:
  Confidence ≥ 0.5: 265,905 ( 38.7%)
  Confidence ≥ 0.6: 169,487 ( 24.7%)
  Confidence ≥ 0.7: 102,818 ( 15.0%)
  Confidence ≥ 0.8: 54,237 (  7.9%)
  Confidence ≥ 0.9: 18,565 (  2.7%)

🏆 MOST CONFIDENT PREDICTIONS:
  amusement    (0.994): Zombie High Yearbook '64...
  amusement