In [None]:
import sys
import torch
import transformers
import pandas as pd
import numpy as np
from azure.eventhub import EventHubConsumerClient
import psutil
import time

# -----------------------------------------------------------------------------
# Environment verification report.
# Prints interpreter, hardware and library details, then smoke-tests the
# sentiment pipeline and confirms the Azure Event Hubs client can be imported.
# -----------------------------------------------------------------------------
_GIB = 1024**3  # bytes per GiB; every memory figure below is reported in GiB
_RULE = "=" * 50  # horizontal rule shared by the report header and footer
_cuda_ok = torch.cuda.is_available()

print("🔍 ENVIRONMENT VERIFICATION")
print(_RULE)

print(f"Python: {sys.version}")

# Host resources
print(f"\nCPU cores: {psutil.cpu_count()}")
print(f"RAM: {psutil.virtual_memory().total / _GIB:.1f} GB")

# Accelerator details (first CUDA device only)
if not _cuda_ok:
    print("GPU: Not available (CPU only)")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / _GIB:.1f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

print("\nLibrary Versions:")
for _lib_name, _lib_module in (
    ("PyTorch", torch),
    ("Transformers", transformers),
    ("Pandas", pd),
    ("NumPy", np),
):
    print(f"  {_lib_name}: {_lib_module.__version__}")

# Smoke test: load the sentiment model, score one sentence, then time a batch.
print("\n🧠 Testing RoBERTa Model Loading...")
try:
    start_time = time.time()
    from transformers import pipeline

    classifier = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        device=0 if _cuda_ok else -1,
    )

    # The reported load time deliberately includes this first inference call.
    result = classifier("This is a test sentence for sentiment analysis!")
    load_time = time.time() - start_time

    print(f"✅ RoBERTa loaded successfully in {load_time:.2f} seconds")
    print(f"   Test result: {result}")

    test_texts = ["I love this!"] * 100
    start_time = time.time()
    results = classifier(test_texts)
    batch_time = time.time() - start_time

    print(f"✅ Batch processing: {len(test_texts)} texts in {batch_time:.2f} seconds")
    print(f"   Speed: {len(test_texts) / batch_time:.1f} texts/second")

except Exception as e:
    print(f"❌ RoBERTa loading failed: {e}")

print("\n🔗 Testing Azure Event Hubs connectivity...")
try:
    # Import check only; no client is created and no connection is opened.
    from azure.eventhub import EventHubConsumerClient
    print("✅ Azure Event Hubs library available")
except Exception as e:
    print(f"❌ Azure Event Hubs failed: {e}")

print("\n" + _RULE)
print("🎯 Environment verification complete!")

# Resource recommendations
total_ram = psutil.virtual_memory().total / _GIB
if total_ram >= 32:
    print("🚀 Excellent RAM for heavy processing")
elif total_ram >= 16:
    print("✅ RAM sufficient for basic processing")
else:
    print("⚠️  WARNING: Less than 16GB RAM - may struggle with large models")

if _cuda_ok:
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / _GIB
    if gpu_memory >= 8:
        print("🎮 Excellent GPU setup for ML models")
    else:
        print("⚠️  WARNING: Less than 8GB GPU memory - may need CPU processing")

In [None]:
# COMPLETE FIXED COLLEGE CLUSTER PIPELINE
# All logical issues resolved - ready to run

import os
import pandas as pd
import json
import subprocess
from datetime import datetime
import torch
from transformers import pipeline
import requests
import base64
import time
# =============================================================================
# CLASS 1: GitHub Data Manager
# =============================================================================

class UpdatedGitHubDataManager:
    """Manage a local clone of the GitHub data repository and its batch history.

    The manager clones ``https://github.com/<username>/<repo_name>.git`` into
    ``./<repo_name>/``, hard-resets it to ``origin/main`` on every sync,
    discovers ``batch_*.json`` files under ``data/incremental/``, loads them
    into DataFrames, and records which files were handled in
    ``processed_batches.json`` inside the clone.
    """

    # Name of the JSON file (inside the clone) that stores the processed set.
    HISTORY_FILENAME = "processed_batches.json"

    def __init__(self, username, repo_name):
        """Store the repo coordinates and immediately clone/sync the repo.

        Args:
            username: GitHub account that owns the repository.
            repo_name: Repository name; also the local clone directory name.
        """
        self.username = username
        self.repo_name = repo_name
        self.repo_url = f"https://github.com/{username}/{repo_name}.git"
        self.repo_dir = f"./{repo_name}/"
        self.processed_batches = set()  # Track processed batches
        self.setup_repo()

    def _repo_path(self, *parts):
        """Return a path inside the local clone built from *parts*."""
        return os.path.join(self.repo_dir, *parts)

    def setup_repo(self):
        """Clone the repo if it is missing, then sync it and load batch history.

        Returns:
            True when the clone exists (or was created) and was synced,
            False when cloning failed.
        """
        print("📁 Setting up GitHub repository...")

        if not os.path.exists(self.repo_dir):
            print("📥 Cloning repository...")
            try:
                # Name the target explicitly so the clone always lands in repo_dir.
                result = subprocess.run(
                    ['git', 'clone', self.repo_url, self.repo_dir],
                    capture_output=True, text=True,
                )
            except OSError as e:  # e.g. git executable not found
                print(f"❌ Clone failed: {e}")
                return False
            if result.returncode != 0:
                print(f"❌ Clone failed: {result.stderr}")
                return False
            print("✅ Repository cloned")
        else:
            print("✅ Repository exists")

        self.force_sync()
        self.load_processed_batches()
        return True

    def force_sync(self):
        """Fetch ``origin`` and hard-reset the clone to ``origin/main``.

        Best effort: any git or OS failure is reported as a warning rather
        than raised, so callers can continue with the data already on disk.
        """
        try:
            # ``cwd=`` scopes git to the clone without changing the process-wide
            # working directory, and keeps a missing clone inside the handler.
            subprocess.run(['git', 'fetch', 'origin'], cwd=self.repo_dir, check=True)
            subprocess.run(['git', 'reset', '--hard', 'origin/main'],
                           cwd=self.repo_dir, check=True)
            print("✅ Synced with GitHub")
        except (subprocess.SubprocessError, OSError) as e:
            print(f"⚠️ Sync warning: {e}")

    def load_processed_batches(self):
        """Load the set of already-processed batch filenames from the clone.

        A missing, unreadable or malformed history file leaves
        ``processed_batches`` untouched and is reported on stdout.
        """
        batch_history_file = self._repo_path(self.HISTORY_FILENAME)

        if not os.path.exists(batch_history_file):
            print("📋 No batch history - starting fresh")
            return

        try:
            with open(batch_history_file, 'r', encoding='utf-8') as f:
                history = json.load(f)
        except (OSError, ValueError) as e:  # ValueError covers JSON/Unicode decode errors
            print(f"⚠️ Could not read batch history ({e}) - starting fresh")
            return

        if not isinstance(history, dict):
            print("⚠️ Could not read batch history (unexpected format) - starting fresh")
            return

        self.processed_batches = set(history.get('processed_batches', []))
        print(f"📋 Batch history: {len(self.processed_batches)} batches already processed")

    def find_all_unprocessed_batches(self):
        """Find every ``batch_*.json`` in ``data/incremental/`` not yet processed.

        Returns:
            List of ``(filename, mtime, filepath)`` tuples, oldest first.
            Files with equal modification times (common right after a clone)
            are ordered by filename so the order is deterministic.
        """
        print("🔍 Finding ALL unprocessed batches...")

        incremental_dir = self._repo_path("data", "incremental")

        if not os.path.exists(incremental_dir):
            print("❌ No incremental directory - using fallback method")
            return []

        unprocessed_batches = []
        for f in os.listdir(incremental_dir):
            if not (f.startswith('batch_') and f.endswith('.json')):
                continue
            if f in self.processed_batches:  # Only unprocessed batches
                continue
            filepath = os.path.join(incremental_dir, f)
            unprocessed_batches.append((f, os.path.getmtime(filepath), filepath))

        # Oldest first; the filename breaks ties between identical mtimes.
        unprocessed_batches.sort(key=lambda item: (item[1], item[0]))

        print(f"📦 Found {len(unprocessed_batches)} unprocessed batches:")
        for filename, mtime, _ in unprocessed_batches:
            print(f"   - {filename} (modified: {datetime.fromtimestamp(mtime)})")

        return unprocessed_batches

    def find_unprocessed_incremental_batch(self):
        """LEGACY: return ``(filepath, filename)`` of the oldest unprocessed batch.

        Falls back to the newest file in ``data/raw/`` when no incremental
        batch is pending; that fallback may return ``(None, None)``.
        """
        unprocessed_batches = self.find_all_unprocessed_batches()

        if not unprocessed_batches:
            print("📋 No unprocessed incremental batches found")
            return self.find_latest_data_file_fallback()

        # Return first (oldest) unprocessed batch
        filename, _mtime, filepath = unprocessed_batches[0]
        print(f"📦 Next unprocessed batch: {filename}")
        return filepath, filename

    def find_latest_data_file_fallback(self):
        """Return ``(filepath, filename)`` of the newest file in ``data/raw/``.

        Only ``.parquet`` and ``.json`` files are considered. Returns
        ``(None, None)`` when the directory or any matching file is missing.
        """
        print("🔄 Using fallback method (raw data folder)...")

        data_dir = self._repo_path("data", "raw")

        if not os.path.exists(data_dir):
            print("❌ No data directories found")
            return None, None

        files = []
        for f in os.listdir(data_dir):
            if f.endswith(('.parquet', '.json')):
                filepath = os.path.join(data_dir, f)
                files.append((f, os.path.getmtime(filepath), filepath))

        if not files:
            print("❌ No data files found")
            return None, None

        filename, _mtime, filepath = max(files, key=lambda item: item[1])

        print(f"📁 Latest file (fallback): {filename}")
        return filepath, filename

    def load_data_file(self, filepath):
        """Load a ``.parquet`` or ``.json`` data file into a DataFrame.

        JSON objects with a ``posts`` key are treated as batch files and the
        posts list becomes the frame; any other JSON payload is passed to
        ``pd.DataFrame`` as-is.

        Returns:
            The DataFrame, or None when the format is unknown, the batch has
            metadata but no ``posts``, or loading fails for any reason.
        """
        try:
            if filepath.endswith('.parquet'):
                df = pd.read_parquet(filepath)
                print(f"✅ Loaded parquet: {len(df)} posts")
            elif filepath.endswith('.json'):
                # Read as UTF-8 explicitly so non-ASCII text does not depend on
                # the platform's default encoding.
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                is_mapping = isinstance(data, dict)
                if is_mapping and 'posts' in data:
                    df = pd.DataFrame(data['posts'])
                    print(f"✅ Loaded incremental batch: {len(df)} posts")
                elif is_mapping and 'batch_info' in data:
                    # Metadata without posts: nothing to process. The old code
                    # indexed data['posts'] here and always hit a KeyError.
                    print(f"❌ Batch file has 'batch_info' but no 'posts': {filepath}")
                    return None
                else:
                    df = pd.DataFrame(data)
                    print(f"✅ Loaded JSON: {len(df)} posts")
            else:
                print(f"❌ Unknown file format: {filepath}")
                return None

            return df

        except Exception as e:
            # Boundary handler: a bad file must not abort the whole run.
            print(f"❌ Failed to load {filepath}: {e}")
            return None

    def get_latest_data(self):
        """LEGACY: sync, then return ``(df, filename)`` for a single batch.

        Returns ``(None, None)`` when no file is found or it fails to load.
        """
        self.force_sync()

        # Try incremental batch first, then fallback
        filepath, filename = self.find_unprocessed_incremental_batch()
        if not filepath:
            return None, None

        df = self.load_data_file(filepath)
        if df is None:
            return None, None

        print(f"🎯 Ready to process {len(df)} posts from {filename}")
        return df, filename

    def get_all_unprocessed_data(self):
        """Sync, then load every unprocessed batch.

        Returns:
            List of ``(df, filename)`` tuples in processing order. Batches
            that fail to load or contain no rows are skipped.
        """
        print("🚀 GETTING ALL UNPROCESSED DATA")
        print("=" * 50)

        self.force_sync()

        unprocessed_batches = self.find_all_unprocessed_batches()

        if not unprocessed_batches:
            print("✅ No unprocessed batches found")
            return []

        batch_data_list = []

        for filename, _mtime, filepath in unprocessed_batches:
            df = self.load_data_file(filepath)

            if df is not None and len(df) > 0:
                batch_data_list.append((df, filename))
                print(f"   ✅ Added {filename}: {len(df)} posts")
            else:
                print(f"   ⚠️ Skipped {filename}: no valid data")

        total_posts = sum(len(df) for df, _ in batch_data_list)
        print("\n📊 READY TO PROCESS:")
        print(f"   Batches: {len(batch_data_list)}")
        print(f"   Total posts: {total_posts}")

        return batch_data_list

    def mark_batch_processed(self, batch_filename):
        """Record *batch_filename* as processed and persist the history file.

        Any filename is accepted, not only ``batch_*`` files.
        """
        self.processed_batches.add(batch_filename)

        batch_history = {
            # Sorted so the history file is stable from run to run.
            'processed_batches': sorted(self.processed_batches),
            'last_processed': datetime.now().isoformat()
        }

        batch_history_file = self._repo_path(self.HISTORY_FILENAME)
        with open(batch_history_file, 'w', encoding='utf-8') as f:
            json.dump(batch_history, f, indent=2)

        print(f"✅ Marked batch as processed: {batch_filename}")

    def save_enhanced_data(self, enhanced_df, original_filename):
        """Write *enhanced_df* to a timestamped parquet file and mark the batch done.

        The file is written to the current working directory as
        ``enhanced_incremental_<YYYYmmdd_HHMMSS>.parquet``.

        Returns:
            The output filename.
        """
        print("💾 Saving enhanced data...")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"enhanced_incremental_{timestamp}.parquet"

        enhanced_df.to_parquet(output_file)

        # Mark the original batch as processed
        self.mark_batch_processed(original_filename)

        print(f"✅ Enhanced data saved: {output_file}")
        print(f"📊 Sentiment coverage: {enhanced_df['ml_sentiment_score'].notna().sum()}/{len(enhanced_df)}")
        print(f"📊 Emotion coverage: {enhanced_df['dominant_emotion'].notna().sum()}/{len(enhanced_df)}")

        return output_file
        
# =============================================================================
# CLASS 2: ML Sentiment Processor 
# =============================================================================

class MLSentimentProcessor:
    """Add ML sentiment and emotion columns to a DataFrame of posts.

    Sentiment comes from ``cardiffnlp/twitter-roberta-base-sentiment-latest``
    and is reduced to a continuous score ``P(positive) - P(negative)``;
    emotions come from ``j-hartmann/emotion-english-distilroberta-base``.
    Rows whose text is too short, or whose chunk failed in the model, keep
    ``None`` in the ML columns.
    """

    # Result columns every enhanced frame carries, even when nothing was scored.
    ML_COLUMNS = (
        'ml_sentiment_score',
        'ml_sentiment_label',
        'dominant_emotion',
        'emotion_confidence',
    )

    def __init__(self):
        """Create the processor and load both models right away."""
        self.roberta_model = None
        self.emotion_model = None
        self.setup_models()

    def setup_models(self):
        """Load both Hugging Face pipelines on GPU 0 when CUDA is available, else CPU."""
        print("🧠 Loading ML models...")

        device = 0 if torch.cuda.is_available() else -1
        print(f"🎮 Device: {'GPU' if device == 0 else 'CPU'}")

        # Load RoBERTa. ``top_k=None`` returns the score of every label; it
        # replaces the deprecated ``return_all_scores=True`` and matches the
        # emotion pipeline below.
        self.roberta_model = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            device=device,
            truncation=True,
            max_length=512,
            batch_size=4,
            top_k=None
        )

        print("✅ RoBERTa loaded")

        # Load emotion model
        self.emotion_model = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            device=device,
            truncation=True,
            max_length=512,
            batch_size=4,
            top_k=None
        )
        print("✅ Emotion model loaded")

    def clean_texts(self, texts):
        """Keep texts with at least 10 non-blank characters, truncated to 500.

        Returns:
            ``(cleaned, valid_indices)`` where ``valid_indices[i]`` is the
            position in *texts* that ``cleaned[i]`` came from.
        """
        cleaned = []
        valid_indices = []

        for position, text in enumerate(texts):
            if not text:
                continue
            stripped = str(text).strip()
            if len(stripped) >= 10:
                cleaned.append(stripped[:500])  # Truncate long texts
                valid_indices.append(position)

        print(f"✅ Cleaned {len(cleaned)} valid texts from {len(texts)} total")
        return cleaned, valid_indices

    def _run_in_chunks(self, model, texts, chunk_size, chunk_label):
        """Run *model* over *texts* chunk by chunk, one result per input text.

        A chunk that raises contributes one empty list per text, so the
        output stays aligned with *texts* and failed rows stay unscored.
        """
        total_chunks = (len(texts) - 1) // chunk_size + 1
        all_results = []

        for chunk_no, start in enumerate(range(0, len(texts), chunk_size), 1):
            chunk = texts[start:start + chunk_size]
            print(f"   {chunk_label} {chunk_no}/{total_chunks}")

            try:
                chunk_results = list(model(chunk))
            except Exception as e:
                # Best effort: one bad chunk must not abort the whole batch.
                print(f"   ❌ {chunk_label} failed: {e}")
                chunk_results = [[] for _ in chunk]
            all_results.extend(chunk_results)

        return all_results

    def process_sentiment(self, texts, chunk_size=50):
        """Score *texts* with the sentiment model in chunks of *chunk_size*.

        Returns:
            One list of ``{'label', 'score'}`` dicts per text; the list is
            empty for texts whose chunk failed.
        """
        print("🤖 Processing sentiment...")

        if not texts:
            return []

        all_results = self._run_in_chunks(self.roberta_model, texts, chunk_size, "Chunk")

        print(f"✅ Sentiment processing complete: {len(all_results)} results")
        return all_results

    def process_emotions(self, texts, chunk_size=50):
        """Score *texts* with the emotion model in chunks of *chunk_size*.

        Returns:
            One list of ``{'label', 'score'}`` dicts per text; the list is
            empty for texts whose chunk failed.
        """
        print("😊 Processing emotions...")

        if not texts:
            return []

        all_results = self._run_in_chunks(self.emotion_model, texts, chunk_size, "Emotion chunk")

        print(f"✅ Emotion processing complete: {len(all_results)} results")
        return all_results

    def _with_empty_ml_columns(self, df):
        """Return a copy of *df* with every ML result column set to None."""
        enhanced_df = df.copy()
        for column in self.ML_COLUMNS:
            enhanced_df[column] = None
        return enhanced_df

    @staticmethod
    def _object_column(values):
        """Turn *values* into an object array so None is kept as None, not NaN."""
        return pd.Series(values, dtype=object).to_numpy()

    @staticmethod
    def _score_sentiment(all_scores):
        """Collapse per-label probabilities into ``(continuous_score, label)``.

        *all_scores* looks like
        ``[{'label': 'negative', 'score': 0.1}, {'label': 'neutral', ...}, ...]``.
        """
        probs = {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}
        for score_dict in all_scores:
            label = score_dict['label'].lower()
            if label in probs:
                probs[label] = score_dict['score']

        # Continuous sentiment in [-1, +1].
        continuous_score = probs['positive'] - probs['negative']

        # Damp the score when the model is very confident the text is neutral.
        if probs['neutral'] > 0.8:
            continuous_score = continuous_score * 0.3

        # Narrow neutral band: anything beyond +/-0.01 gets a polar label.
        if continuous_score >= 0.01:
            return continuous_score, 'positive'
        if continuous_score <= -0.01:
            return continuous_score, 'negative'
        return continuous_score, 'neutral'

    def enhance_dataframe(self, df):
        """Run both models over the text column of *df* and attach the results.

        Uses ``text_cleaned`` when present, otherwise ``text``.

        Returns:
            A copy of *df* with the ML columns added. When there is no text
            column or no usable text, the ML columns are present but all
            None, so callers can check coverage without a KeyError.
        """
        print(f"⚡ Enhancing {len(df)} posts with ML...")

        start_time = datetime.now()

        # Prepare texts - handle missing text_cleaned column
        if 'text_cleaned' in df.columns:
            texts = df['text_cleaned'].fillna('').tolist()
        elif 'text' in df.columns:
            texts = df['text'].fillna('').tolist()
        else:
            print("❌ No text column found")
            return self._with_empty_ml_columns(df)

        cleaned_texts, valid_indices = self.clean_texts(texts)

        if not cleaned_texts:
            print("❌ No valid texts")
            return self._with_empty_ml_columns(df)

        # Process with ML models
        sentiment_results = self.process_sentiment(cleaned_texts)
        emotion_results = self.process_emotions(cleaned_texts)

        # Calculate timing
        processing_time = (datetime.now() - start_time).total_seconds()
        speed = len(cleaned_texts) / processing_time if processing_time > 0 else 0

        print(f"⏱️ Processing time: {processing_time:.2f}s ({speed:.1f} posts/sec)")

        return self.apply_ml_results_FIXED(
            df, sentiment_results, emotion_results, valid_indices, speed
        )

    def apply_ml_results_FIXED(self, df, sentiment_results, emotion_results, valid_indices, speed):
        """Write model outputs into a copy of *df*.

        Args:
            df: Source posts; it is not modified.
            sentiment_results: Per-text score lists from ``process_sentiment``.
            emotion_results: Per-text score lists from ``process_emotions``.
            valid_indices: Row *positions* in *df* that each result belongs
                to, as produced by ``clean_texts``.
            speed: Posts per second, stored in ``processing_speed``.

        Returns:
            The enhanced copy. A result set whose length differs from
            *valid_indices* is ignored; empty per-text results leave the row
            unscored.
        """
        print("📊 Applying CONTINUOUS sentiment results (FIXED)...")

        enhanced_df = self._with_empty_ml_columns(df)
        n_rows = len(enhanced_df)

        # Columns are assembled by position and assigned whole: valid_indices
        # are positions, so label-based ``.at`` would hit the wrong rows (or
        # append new ones) whenever the index is not 0..n-1.
        if sentiment_results and len(sentiment_results) == len(valid_indices):
            scores = [None] * n_rows
            labels = [None] * n_rows
            counts = {'positive': 0, 'negative': 0, 'neutral': 0}

            for row_pos, all_scores in zip(valid_indices, sentiment_results):
                if not all_scores:
                    continue  # chunk failed in the model; leave the row unscored
                scores[row_pos], labels[row_pos] = self._score_sentiment(all_scores)
                counts[labels[row_pos]] += 1

            enhanced_df['ml_sentiment_score'] = self._object_column(scores)
            enhanced_df['ml_sentiment_label'] = self._object_column(labels)

            print(f"🎯 CONTINUOUS conversion: {counts['positive']} positive, "
                  f"{counts['negative']} negative, {counts['neutral']} neutral")

            scored = [s for s in scores if s is not None]
            if scored:
                exact_zeros = sum(1 for s in scored if s == 0.0)
                print(f"📊 Score range: {min(scored):.3f} to {max(scored):.3f}")
                print(f"📊 Exact zeros: {exact_zeros} (should be much fewer!)")

        if emotion_results and len(emotion_results) == len(valid_indices):
            dominant = [None] * n_rows
            confidence = [None] * n_rows

            for row_pos, emotions in zip(valid_indices, emotion_results):
                if emotions:
                    top_emotion = max(emotions, key=lambda x: x['score'])
                    dominant[row_pos] = top_emotion['label']
                    confidence[row_pos] = top_emotion['score']

            enhanced_df['dominant_emotion'] = self._object_column(dominant)
            enhanced_df['emotion_confidence'] = self._object_column(confidence)

        # Add metadata
        enhanced_df['ml_processed_at'] = datetime.now()
        enhanced_df['processing_speed'] = speed
        enhanced_df['processed_by'] = 'continuous_sentiment_fixed'

        sentiment_count = enhanced_df['ml_sentiment_score'].notna().sum()
        emotion_count = enhanced_df['dominant_emotion'].notna().sum()

        print("✅ CONTINUOUS enhancement complete:")
        print(f"   Sentiment: {sentiment_count}/{len(enhanced_df)} posts")
        print(f"   Emotions: {emotion_count}/{len(enhanced_df)} posts")

        return enhanced_df

# =============================================================================
# CLASS 3: GitHub Enhanced Uploader (Fixed return types)
# =============================================================================

class SmartGitHubEnhancedUploader:
    """Upload enhanced batches to GitHub through the repository contents API.

    New files go to ``data/enhanced/new/`` and are tracked in
    ``data/enhanced/new_uploads_manifest.json``; ``move_file_to_processed``
    copies a file to ``data/enhanced/processed/`` and removes the original.
    All writes target the ``main`` branch.
    """

    # Seconds to wait on any single GitHub request before giving up, so a
    # stalled connection cannot hang the pipeline indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, username, repo_name, token=None):
        """Configure the uploader for ``<username>/<repo_name>``.

        Args:
            username: GitHub account that owns the repository.
            repo_name: Repository name.
            token: GitHub API token. When omitted, the ``GITHUB_TOKEN``
                environment variable is used, then a placeholder that makes
                uploads fail with an explicit warning.
        """
        self.username = username
        self.repo_name = repo_name
        self.token = token or os.environ.get('GITHUB_TOKEN', 'your_github_token')
        self.api_base = f"https://api.github.com/repos/{username}/{repo_name}"

        if self.token == 'your_github_token':
            print("⚠️ WARNING: Using placeholder token - GitHub upload will fail")
        else:
            print("✅ GitHub token configured")

    def _headers(self):
        """Return the request headers shared by every GitHub API call."""
        return {
            "Authorization": f"token {self.token}",
            "Accept": "application/vnd.github.v3+json"
        }

    def upload_enhanced_batch(self, enhanced_df, original_batch_filename):
        """Serialize *enhanced_df* to JSON and upload it to the 'new' folder.

        Expects the ``ml_sentiment_score``, ``dominant_emotion``,
        ``processing_speed`` and ``processed_by`` columns to be present.

        Returns:
            The uploaded filename (``enhanced_<timestamp>.json``), or None
            when the upload failed or any error occurred.
        """
        print("📤 Uploading enhanced data to GitHub NEW folder...")

        try:
            # Prepare enhanced data for upload
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            enhanced_filename = f"enhanced_{timestamp}.json"

            total_posts = len(enhanced_df)

            enhanced_data = {
                'enhancement_info': {
                    'timestamp': timestamp,
                    'original_batch': original_batch_filename,
                    'enhanced_post_count': total_posts,
                    'ml_processing_complete': True,
                    'college_cluster_processing': {
                        'sentiment_coverage': int(enhanced_df['ml_sentiment_score'].notna().sum()),
                        'emotion_coverage': int(enhanced_df['dominant_emotion'].notna().sum()),
                        'processing_speed': float(enhanced_df['processing_speed'].iloc[0]) if total_posts > 0 else 0,
                        'processed_by': enhanced_df['processed_by'].iloc[0] if total_posts > 0 else 'unknown'
                    },
                    'databricks_status': 'waiting_for_import'  # Track import status
                },
                'enhanced_posts': enhanced_df.fillna('').to_dict('records')
            }

            if not self.upload_to_github_new_folder(enhanced_data, enhanced_filename):
                print("❌ Upload to GitHub failed")
                return None

            # Update the manifest to track new uploads
            self.update_new_upload_manifest(timestamp, enhanced_data['enhancement_info'])
            print(f"✅ Enhanced data uploaded to NEW folder: {enhanced_filename}")
            return enhanced_filename

        except Exception as e:
            # Boundary handler: an upload problem must not abort the pipeline.
            print(f"❌ Enhanced upload error: {e}")
            return None

    def upload_to_github_new_folder(self, enhanced_data, filename):
        """PUT *enhanced_data* as ``data/enhanced/new/<filename>``.

        Returns:
            True on HTTP 200/201, False on any other status or on error.
        """
        try:
            # Convert to JSON and encode
            json_content = json.dumps(enhanced_data, indent=2, default=str)
            encoded_content = base64.b64encode(json_content.encode()).decode()

            filepath = f"data/enhanced/new/{filename}"

            payload = {
                "message": f"NEW enhanced data: {enhanced_data['enhancement_info']['enhanced_post_count']} posts",
                "content": encoded_content,
                "branch": "main"
            }

            upload_url = f"{self.api_base}/contents/{filepath}"
            response = requests.put(upload_url, headers=self._headers(), json=payload,
                                    timeout=self.REQUEST_TIMEOUT)

            if response.status_code in [200, 201]:
                print(f"✅ Uploaded to GitHub NEW folder: {filepath}")
                return True

            print(f"❌ GitHub upload failed: {response.status_code}")
            return False

        except Exception as e:
            print(f"❌ GitHub upload error: {e}")
            return False

    def update_new_upload_manifest(self, timestamp, enhancement_info):
        """Append this upload to the pending-imports manifest on GitHub.

        The manifest is created when missing and keeps only the 50 most
        recent pending entries. Failures are reported as warnings, never
        raised.
        """
        new_upload_manifest = {
            'pending_imports': [
                {
                    'timestamp': timestamp,
                    'filename': f"enhanced_{timestamp}.json",
                    'post_count': enhancement_info['enhanced_post_count'],
                    'original_batch': enhancement_info['original_batch'],
                    'status': 'waiting_for_databricks_import',
                    'uploaded_at': datetime.now().isoformat()
                }
            ],
            'databricks_import_instructions': {
                'new_files_location': 'data/enhanced/new/',
                'processed_files_location': 'data/enhanced/processed/',
                'table_target': 'social_media.bluesky_enhanced_clean',
                'import_type': 'incremental_append'
            }
        }

        try:
            manifest_url = f"{self.api_base}/contents/data/enhanced/new_uploads_manifest.json"
            headers = self._headers()

            # Check if manifest exists
            check_response = requests.get(manifest_url, headers=headers,
                                          timeout=self.REQUEST_TIMEOUT)

            sha = None
            if check_response.status_code == 200:
                # Merge with existing manifest
                existing_file = check_response.json()
                existing_manifest = json.loads(
                    base64.b64decode(existing_file['content']).decode('utf-8')
                )

                # Tolerate a manifest that lacks the key instead of failing the merge.
                pending = existing_manifest.setdefault('pending_imports', [])
                pending.extend(new_upload_manifest['pending_imports'])

                # Keep only last 50 imports to avoid huge manifest
                existing_manifest['pending_imports'] = pending[-50:]

                new_upload_manifest = existing_manifest
                sha = existing_file["sha"]

            # Upload updated manifest
            manifest_content = json.dumps(new_upload_manifest, indent=2, default=str)
            encoded_manifest = base64.b64encode(manifest_content.encode()).decode()

            payload = {
                "message": f"Track new upload: enhanced_{timestamp}.json",
                "content": encoded_manifest,
                "branch": "main"
            }

            # The sha is only sent when replacing an existing manifest.
            if sha:
                payload["sha"] = sha

            response = requests.put(manifest_url, headers=headers, json=payload,
                                    timeout=self.REQUEST_TIMEOUT)

            if response.status_code in [200, 201]:
                print("✅ New upload manifest updated")
            else:
                print(f"⚠️ Manifest update failed: {response.status_code}")

        except Exception as e:
            print(f"⚠️ Manifest error: {e}")

    def move_file_to_processed(self, filename):
        """Copy ``new/<filename>`` to ``processed/<filename>``, then delete the original.

        Returns:
            True when the file is available under ``processed/`` (even if
            deleting the original failed), False otherwise.
        """
        try:
            headers = self._headers()

            # 1. Get file from 'new' folder
            new_file_url = f"{self.api_base}/contents/data/enhanced/new/{filename}"
            response = requests.get(new_file_url, headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)

            if response.status_code != 200:
                print(f"❌ Could not find file in NEW folder: {filename}")
                return False

            file_data = response.json()
            file_content = file_data.get('content') or ''
            file_sha = file_data['sha']

            # Refuse to move when the API did not return the content inline
            # (the contents endpoint can do this, e.g. for large files):
            # copying an empty payload and deleting the source would lose it.
            if file_data.get('encoding') != 'base64' or not file_content.strip():
                print(f"❌ File content not available inline, cannot move: {filename}")
                return False

            # 2. Create file in 'processed' folder
            processed_file_url = f"{self.api_base}/contents/data/enhanced/processed/{filename}"

            upload_payload = {
                "message": f"Move to processed: {filename}",
                # The API returns base64 wrapped across lines; send it unwrapped.
                "content": file_content.replace("\n", ""),
                "branch": "main"
            }

            upload_response = requests.put(processed_file_url, headers=headers,
                                           json=upload_payload,
                                           timeout=self.REQUEST_TIMEOUT)

            if upload_response.status_code not in [200, 201]:
                print("❌ Could not create file in PROCESSED folder")
                return False

            # 3. Delete file from 'new' folder
            delete_payload = {
                "message": f"Remove from new folder: {filename}",
                "sha": file_sha,
                "branch": "main"
            }

            delete_response = requests.delete(new_file_url, headers=headers,
                                              json=delete_payload,
                                              timeout=self.REQUEST_TIMEOUT)

            if delete_response.status_code == 200:
                print(f"✅ Moved {filename}: new → processed")
            else:
                # Still a success: the file is accessible in processed.
                print("⚠️ File copied but not deleted from NEW folder")
            return True

        except Exception as e:
            print(f"❌ Move file error: {e}")
            return False

# =============================================================================
# CLASS 4: Complete Pipeline Orchestrator (Fixed variable usage)
# =============================================================================

class CompletePipelineOrchestratorWithUpload:
    """Drive the fetch -> ML enhancement -> save -> GitHub upload workflow.

    The orchestrator owns three collaborators, all built from the same GitHub
    coordinates:

    * ``github_manager``  - supplies unprocessed batches and saves enhanced data
    * ``ml_processor``    - adds the ML columns (``ml_sentiment_score`` etc.)
    * ``github_uploader`` - pushes an enhanced batch back to GitHub
    """

    def __init__(self, github_username, repo_name):
        """Build the data manager, ML processor and uploader.

        Args:
            github_username: Owner of the repository holding the data.
            repo_name: Name of that repository.
        """
        print("🏗️ INITIALIZING COMPLETE PIPELINE WITH GITHUB UPLOAD")
        print("=" * 60)

        self.github_manager = UpdatedGitHubDataManager(github_username, repo_name)
        self.ml_processor = MLSentimentProcessor()
        self.github_uploader = SmartGitHubEnhancedUploader(github_username, repo_name)

        print("✅ Pipeline + GitHub uploader initialized!")

    @staticmethod
    def _percentage(part, whole):
        """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
        return (part / whole) * 100 if whole else 0.0

    def run_complete_cycle_with_upload(self):
        """Process every unprocessed batch in one run and upload the results.

        Each batch is enhanced, saved through the data manager and handed to
        the uploader. A batch that yields no sentiment scores, or that raises,
        is counted as failed and skipped. A batch whose upload returns a falsy
        filename still counts as successful, but the final summary reports
        that not everything reached GitHub.

        Returns:
            The concatenation of all successfully enhanced DataFrames, or
            ``None`` when there was nothing to process or every batch failed.
        """
        print("\n🚀 COMPLETE CYCLE: PROCESS ALL BATCHES + UPLOAD TO GITHUB")
        print("=" * 70)

        # Step 1: Get ALL unprocessed data
        batch_data_list = self.github_manager.get_all_unprocessed_data()

        if not batch_data_list:
            print("📋 No new data to process - all caught up!")
            return None

        batch_total = len(batch_data_list)
        print(f"\n🎯 PROCESSING {batch_total} BATCHES...")

        # Step 2: Process each batch
        all_enhanced_data = []
        successful_batches = 0
        failed_batches = 0
        upload_failures = 0

        for i, (df, filename) in enumerate(batch_data_list, 1):
            print(f"\n📦 BATCH {i}/{batch_total}: {filename}")
            print(f"   Posts in batch: {len(df)}")

            try:
                # ML enhancement
                enhanced_df = self.ml_processor.enhance_dataframe(df)

                # A batch without a single score is treated as a failed run.
                if enhanced_df['ml_sentiment_score'].notna().sum() == 0:
                    print("   ❌ No sentiment analysis results")
                    failed_batches += 1
                    continue

                # Save locally and mark as processed
                self.github_manager.save_enhanced_data(enhanced_df, filename)

                # Upload to GitHub
                github_filename = self.github_uploader.upload_enhanced_batch(enhanced_df, filename)

                if github_filename:
                    print(f"   ✅ Successfully processed and uploaded: {github_filename}")
                else:
                    print("   ⚠️ Local processing succeeded, GitHub upload failed")
                    upload_failures += 1

                # The batch counts as a success either way: the enhanced data
                # has already been saved by the data manager.
                successful_batches += 1
                all_enhanced_data.append(enhanced_df)

                # Small delay to avoid overwhelming APIs
                time.sleep(1)

            except Exception as e:
                # Boundary handler: one bad batch must not abort the others.
                print(f"   ❌ Batch processing failed: {e}")
                failed_batches += 1

        # Step 3: Summary
        print("\n📊 BATCH PROCESSING SUMMARY")
        print("=" * 50)
        print(f"   ✅ Successful batches: {successful_batches}")
        print(f"   ❌ Failed batches: {failed_batches}")
        print(f"   📈 Total processed: {successful_batches + failed_batches}")

        if not all_enhanced_data:
            print("\n❌ No batches were successfully processed")
            return None

        # Combine all enhanced data for analysis
        combined_enhanced_df = pd.concat(all_enhanced_data, ignore_index=True)

        total_posts = len(combined_enhanced_df)
        sentiment_posts = combined_enhanced_df['ml_sentiment_score'].notna().sum()

        print("\n🎉 COMPLETE SUCCESS!")
        print(f"✅ Processed {successful_batches} batches")
        print(f"✅ Enhanced {total_posts} total posts")
        print(f"✅ Sentiment analysis: {sentiment_posts}/{total_posts} posts")
        if upload_failures == 0:
            print("✅ All enhanced data uploaded to GitHub")
        else:
            # Do not claim a full upload when some batches never reached GitHub.
            print(f"⚠️ {upload_failures} batch(es) were NOT uploaded to GitHub (saved locally only)")
        print("✅ Ready for Databricks import")

        return combined_enhanced_df

    def run_single_batch_cycle(self):
        """LEGACY: process only the latest batch (kept for compatibility).

        Returns:
            The enhanced DataFrame (even when the GitHub upload fails), or
            ``None`` when the data manager has no new data.
        """
        print("\n🚀 SINGLE BATCH CYCLE: PROCESS + UPLOAD TO GITHUB")
        print("=" * 60)

        # Step 1: Get latest data
        df, filename = self.github_manager.get_latest_data()

        if df is None:
            print("📋 No new data to process")
            return None

        # Step 2: ML enhancement
        print(f"🎯 Processing {len(df)} posts with ML...")
        enhanced_df = self.ml_processor.enhance_dataframe(df)

        # Step 3: Save locally
        local_file = self.github_manager.save_enhanced_data(enhanced_df, filename)

        # Step 4: Upload to GitHub for Databricks
        print("\n📤 Uploading enhanced data to GitHub for Databricks...")
        github_filename = self.github_uploader.upload_enhanced_batch(enhanced_df, filename)

        if github_filename:
            print("🎉 SINGLE BATCH SUCCESS!")
            print("✅ Data processed with ML")
            print("✅ Enhanced data uploaded to GitHub")
            print("✅ Ready for Databricks import")

            print("\n🔍 ENHANCED DATA LOCATIONS:")
            print(f"   Local file: {local_file}")
            print(f"   GitHub file: data/enhanced/{github_filename}")
            print("   Ready for: Databricks import")
        else:
            print("❌ GitHub upload failed - but local processing succeeded")

        # The enhanced frame is returned whether or not the upload worked.
        return enhanced_df

    def show_detailed_results(self, enhanced_df):
        """Print a summary of the sentiment/emotion columns of ``enhanced_df``.

        Only ``ml_sentiment_score`` is required. ``ml_sentiment_label``,
        ``dominant_emotion`` and the text column (``text`` or, failing that,
        ``text_cleaned``) are optional; the matching sections are skipped or
        shortened when they are missing. An empty frame is reported with 0.0%
        figures.

        Args:
            enhanced_df: DataFrame produced by the ML enhancement step.
        """
        print("\n📊 DETAILED RESULTS SUMMARY")
        print("=" * 50)

        columns = enhanced_df.columns
        if 'ml_sentiment_score' not in columns:
            print("⚠️ No 'ml_sentiment_score' column found - nothing to summarize")
            return

        has_labels = 'ml_sentiment_label' in columns
        has_emotions = 'dominant_emotion' in columns

        total_posts = len(enhanced_df)
        sentiment_posts = enhanced_df['ml_sentiment_score'].notna().sum()
        emotion_posts = enhanced_df['dominant_emotion'].notna().sum() if has_emotions else 0

        print("📈 Processing Summary:")
        print(f"   Total posts: {total_posts}")
        print(f"   Sentiment analysis: {sentiment_posts} posts "
              f"({self._percentage(sentiment_posts, total_posts):.1f}%)")
        print(f"   Emotion analysis: {emotion_posts} posts "
              f"({self._percentage(emotion_posts, total_posts):.1f}%)")

        # Sentiment distribution
        if has_labels:
            print("\n🏷️ Sentiment Distribution:")
            for label, count in enhanced_df['ml_sentiment_label'].value_counts().items():
                print(f"   {label}: {count} posts ({self._percentage(count, total_posts):.1f}%)")

        # Score statistics
        scores = enhanced_df['ml_sentiment_score'].dropna()
        if len(scores) > 0:
            print("\n📊 Sentiment Score Statistics:")
            print(f"   Range: {scores.min():.3f} to {scores.max():.3f}")
            print(f"   Mean: {scores.mean():.3f}")
            print(f"   Std: {scores.std():.3f}")

            # Check improvement: count scores that are exactly or nearly zero.
            exact_zeros = (scores == 0.0).sum()
            near_zeros = ((scores >= -0.01) & (scores <= 0.01)).sum()
            print(f"   Exact zeros: {exact_zeros}")
            print(f"   Near-zero (-0.01 to 0.01): {near_zeros}")

        # Emotion distribution (top entries only; share is relative to the
        # posts that actually have an emotion)
        if has_emotions:
            print("\n😊 Emotion Distribution:")
            for emotion, count in enhanced_df['dominant_emotion'].value_counts().head().items():
                print(f"   {emotion}: {count} posts ({self._percentage(count, emotion_posts):.1f}%)")

        # Sample results
        print("\n🔍 Sample Enhanced Posts:")
        if 'text' in columns:
            text_col = 'text'
        elif 'text_cleaned' in columns:
            text_col = 'text_cleaned'
        else:
            # Without any text column there is nothing to show per post.
            return

        sample_cols = [text_col, 'ml_sentiment_score']
        sample_cols += [col for col in ('ml_sentiment_label', 'dominant_emotion') if col in columns]
        sample_df = enhanced_df[sample_cols].dropna(subset=['ml_sentiment_score']).head(3)

        # Number the samples by position: index labels may be non-integer or
        # have gaps after dropna().
        for position, (_, row) in enumerate(sample_df.iterrows(), start=1):
            print(f"\nPost {position}:")
            print(f"   Text: {str(row[text_col])[:100]}...")

            sentiment_line = f"   Sentiment: {row['ml_sentiment_score']:.3f}"
            if has_labels:
                sentiment_line += f" ({row['ml_sentiment_label']})"
            print(sentiment_line)

            if has_emotions and pd.notna(row['dominant_emotion']):
                print(f"   Emotion: {row['dominant_emotion']}")


def run_fixed_pipeline(github_username="AlexanderHuynhKoehler",
                       repo_name="bluesky-data-pipeline"):
    """✅ UPDATED: Run the complete fixed pipeline - processes ALL batches.

    Builds the orchestrator, runs the all-batches cycle, prints a detailed
    report, and publishes the result as the module-level name
    ``enhanced_data`` for interactive exploration.

    Args:
        github_username: Owner of the data repository. Defaults to the
            account this pipeline was originally written for.
        repo_name: Name of the data repository.

    Returns:
        The combined enhanced DataFrame, or ``None`` when no data was
        processed or the pipeline raised (the traceback is printed).
    """
    print("🚀 STARTING COMPLETE FIXED COLLEGE CLUSTER PIPELINE")
    print("🔄 NOW PROCESSES ALL AVAILABLE BATCHES AUTOMATICALLY")
    print("=" * 70)

    try:
        complete_pipeline = CompletePipelineOrchestratorWithUpload(github_username, repo_name)

        # ✅ This will now process ALL available batches
        enhanced_results = complete_pipeline.run_complete_cycle_with_upload()

        if enhanced_results is None:
            print("\n❌ No data processed - check if new batches are available")
            return None

        print("\n" + "=" * 70)
        print("🎉 COMPLETE PIPELINE SUCCESS!")
        print("=" * 70)

        # Show detailed results
        complete_pipeline.show_detailed_results(enhanced_results)

        # Expose the frame at module level so it can be explored afterwards.
        globals()['enhanced_data'] = enhanced_results
        print("\n💡 Enhanced data available as 'enhanced_data' variable")
        print("   Use: enhanced_data.head() to explore")
        print("   Use: enhanced_data['ml_sentiment_label'].value_counts() to see distribution")

        return enhanced_results

    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        print(f"\n❌ Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the pipeline as soon as this cell/module executes. The result is the
# combined enhanced DataFrame, or None when nothing was processed or the run failed.
enhanced_results = run_fixed_pipeline()
