# Visualizer
### Receives processed data from producer to display in real time.

In [7]:
import pandas as pd
import numpy as np
import re
import sys
import subprocess
import os
import glob
import json
import time
from datetime import datetime, timedelta
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo

# Jupyter and interactive components
from IPython.display import display, HTML, clear_output
try:
    import ipywidgets as widgets
    from ipywidgets import interact, interact_manual
    print("✅ Interactive widgets available")
except ImportError:
    print("⚠️ Interactive widgets not available")

try:
    import nltk
    from textblob import TextBlob
    from wordcloud import WordCloud
    print("✅ NLP libraries loaded successfully")
except ImportError as e:
    print(f"📦 NLP libraries not available: {e}")
try:
    from nltk.corpus import stopwords
    from nltk.sentiment import SentimentIntensityAnalyzer
    print("✅ NLTK sentiment tools available")
except ImportError:
    print("⚠️ NLTK sentiment tools will be installed automatically if needed")

import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Plotly offline mode for Jupyter
pyo.init_notebook_mode(connected=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

✅ Interactive widgets available
📦 NLP libraries not available: No module named 'wordcloud'
✅ NLTK sentiment tools available


In [22]:
def simple_data_loader():
    """Simple approach - monitor the data directory directly"""
    
    print("🔍 Scanning for any Reddit data files...")
    
    # Scan all possible locations
    search_patterns = [
        "/tmp/reddit_raw_data/**/*",
        "/tmp/reddit_metrics_data/**/*", 
        "/tmp/*reddit*",
        "./data/**/*",  # In case data is in local directory
    ]
    
    all_files = []
    for pattern in search_patterns:
        files = glob.glob(pattern, recursive=True)
        all_files.extend([f for f in files if os.path.isfile(f)])
    
    print(f"📄 Found {len(all_files)} total files")
    
    # Filter for data files
    data_files = [f for f in all_files if f.endswith(('.csv', '.json', '.parquet', '.txt'))]
    print(f"💾 Found {len(data_files)} data files")
    
    if data_files:
        print("\n📋 Available data files:")
        for i, file in enumerate(data_files[:10]):  # Show first 10
            size = os.path.getsize(file)
            mtime = datetime.fromtimestamp(os.path.getmtime(file))
            print(f"  {i+1}. {file}")
            print(f"      Size: {size} bytes, Modified: {mtime}")
        
        # Try to load the most recent file
        latest_file = max(data_files, key=os.path.getmtime)
        print(f"\n🎯 Loading latest file: {latest_file}")
        
        try:
            if latest_file.endswith('.csv'):
                df = pd.read_csv(latest_file)
            elif latest_file.endswith('.json'):
                try:
                    df = pd.read_json(latest_file, lines=True)
                except:
                    df = pd.read_json(latest_file)
            elif latest_file.endswith('.parquet'):
                df = pd.read_parquet(latest_file)
            else:
                # Try as text/CSV
                df = pd.read_csv(latest_file)
                
            print(f"✅ Successfully loaded {len(df)} records!")
            print(f"📊 Columns: {list(df.columns)}")
            print("\n🔍 First few rows:")
            display(df.head())
            
            return df, None
            
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            print("📄 Let's check the file content:")
            with open(latest_file, 'r') as f:
                print(f.read()[:500] + "..." if len(f.read()) > 500 else f.read())
    
    return None, None

# Try the simple loader
simple_raw_df, simple_metrics_df = simple_data_loader()

🔍 Scanning for any Reddit data files...
📄 Found 331 total files
💾 Found 233 data files

📋 Available data files:
  1. /tmp/reddit_raw_data/batch_16/subreddit=AskReddit/part-00001-2addee42-631c-4f37-bd56-36db74254d63.c000.snappy.parquet
      Size: 7615 bytes, Modified: 2025-06-09 13:24:00.559907
  2. /tmp/reddit_raw_data/batch_16/subreddit=AskReddit/part-00003-2addee42-631c-4f37-bd56-36db74254d63.c000.snappy.parquet
      Size: 8313 bytes, Modified: 2025-06-09 13:24:00.549906
  3. /tmp/reddit_raw_data/batch_16/subreddit=AskReddit/part-00000-2addee42-631c-4f37-bd56-36db74254d63.c000.snappy.parquet
      Size: 7490 bytes, Modified: 2025-06-09 13:24:00.569908
  4. /tmp/reddit_raw_data/batch_16/subreddit=AskReddit/part-00004-2addee42-631c-4f37-bd56-36db74254d63.c000.snappy.parquet
      Size: 9576 bytes, Modified: 2025-06-09 13:24:00.559907
  5. /tmp/reddit_raw_data/batch_16/subreddit=AskReddit/part-00002-2addee42-631c-4f37-bd56-36db74254d63.c000.snappy.parquet
      Size: 11050 bytes, Modi

Unnamed: 0,author,posts_count,avg_score
0,FriendshipBudget1341,1,1


In [23]:
def load_all_reddit_data():
    """Load all Reddit data from all batches and combine them"""
    try:
        # Get all parquet files
        all_parquet_files = glob.glob("/tmp/reddit_raw_data/**/*.parquet", recursive=True)
        all_metrics_files = glob.glob("/tmp/reddit_metrics_data/**/*.parquet", recursive=True)
        
        print(f"🔍 Found {len(all_parquet_files)} raw data files")
        print(f"🔍 Found {len(all_metrics_files)} metrics files")
        
        if not all_parquet_files:
            return pd.DataFrame(), pd.DataFrame()
        
        # Load and combine all raw data
        all_dfs = []
        batch_info = {}
        
        for file_path in all_parquet_files:
            try:
                # Extract batch number from path
                batch_match = re.search(r'batch_(\d+)', file_path)
                batch_num = int(batch_match.group(1)) if batch_match else 0
                
                # Load parquet file
                df = pd.read_parquet(file_path)
                df['batch_number'] = batch_num
                df['file_path'] = file_path
                df['file_timestamp'] = pd.to_datetime(os.path.getmtime(file_path), unit='s')
                
                all_dfs.append(df)
                
                if batch_num not in batch_info:
                    batch_info[batch_num] = {'files': 0, 'records': 0}
                batch_info[batch_num]['files'] += 1
                batch_info[batch_num]['records'] += len(df)
                
            except Exception as e:
                print(f"⚠️ Error loading {file_path}: {e}")
                continue
        
        # Combine all data
        if all_dfs:
            combined_df = pd.concat(all_dfs, ignore_index=True)
            
            # Data cleaning and type conversion
            if 'created_datetime' in combined_df.columns:
                combined_df['created_datetime'] = pd.to_datetime(combined_df['created_datetime'])
            if 'timestamp_received' in combined_df.columns:
                combined_df['timestamp_received'] = pd.to_datetime(combined_df['timestamp_received'])
            
            print(f"✅ Combined {len(combined_df)} total records from {len(batch_info)} batches")
            
            # Print batch summary
            print("\n📊 Batch Summary:")
            for batch_num, info in sorted(batch_info.items()):
                print(f"   Batch {batch_num}: {info['records']} records from {info['files']} files")
            
            return combined_df, pd.DataFrame()  # Return empty metrics for now
        
        return pd.DataFrame(), pd.DataFrame()
        
    except Exception as e:
        print(f"❌ Error in load_all_reddit_data: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), pd.DataFrame()

# Load all historical data
import re
historical_raw, historical_metrics = load_all_reddit_data()

# Use the latest data as current
raw_df = simple_raw_df  # From our simple loader
if not historical_raw.empty:
    latest_batch = historical_raw['batch_number'].max()
    current_batch_df = historical_raw[historical_raw['batch_number'] == latest_batch]
    print(f"\n🎯 Current batch ({latest_batch}) has {len(current_batch_df)} records")
else:
    current_batch_df = raw_df if raw_df is not None else pd.DataFrame()

print(f"\n📈 Data Summary:")
print(f"   • Historical records: {len(historical_raw)}")
print(f"   • Current batch records: {len(current_batch_df)}")


🔍 Found 114 raw data files
🔍 Found 0 metrics files
✅ Combined 115 total records from 23 batches

📊 Batch Summary:
   Batch 1: 13 records from 13 files
   Batch 2: 20 records from 19 files
   Batch 3: 18 records from 18 files
   Batch 4: 7 records from 7 files
   Batch 5: 4 records from 4 files
   Batch 6: 1 records from 1 files
   Batch 7: 3 records from 3 files
   Batch 8: 5 records from 5 files
   Batch 9: 5 records from 5 files
   Batch 10: 1 records from 1 files
   Batch 11: 3 records from 3 files
   Batch 12: 4 records from 4 files
   Batch 13: 5 records from 5 files
   Batch 14: 4 records from 4 files
   Batch 15: 1 records from 1 files
   Batch 16: 6 records from 6 files
   Batch 17: 1 records from 1 files
   Batch 18: 3 records from 3 files
   Batch 19: 4 records from 4 files
   Batch 20: 1 records from 1 files
   Batch 21: 2 records from 2 files
   Batch 22: 3 records from 3 files
   Batch 23: 1 records from 1 files

🎯 Current batch (23) has 1 records

📈 Data Summary:
   • His

In [24]:
def display_current_stats():
    """Display current batch statistics with beautiful cards"""
    
    # Use historical data if available, otherwise current batch
    display_data = historical_raw if not historical_raw.empty else (raw_df if raw_df is not None else pd.DataFrame())
    
    if display_data is not None and not display_data.empty:
        
        # Calculate statistics
        total_records = len(display_data)
        posts_count = len(display_data[display_data['type'] == 'post']) if 'type' in display_data.columns else 0
        comments_count = len(display_data[display_data['type'] == 'comment']) if 'type' in display_data.columns else 0
        subreddits_count = display_data['subreddit'].nunique() if 'subreddit' in display_data.columns else 0
        authors_count = display_data['author'].nunique() if 'author' in display_data.columns else 0
        avg_score = display_data['score'].mean() if 'score' in display_data.columns else 0
        
        # Get unique batch numbers
        if 'batch_number' in display_data.columns:
            batch_numbers = sorted(display_data['batch_number'].unique())
            batches_text = f"Batches: {min(batch_numbers)}-{max(batch_numbers)}" if len(batch_numbers) > 1 else f"Batch: {batch_numbers[0]}"
        else:
            batches_text = "Single Batch"
        
        # Create enhanced info cards
        html_template = """
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); 
                    padding: 25px; border-radius: 15px; margin: 20px 0; color: white;">
            <h2 style="text-align: center; margin: 0 0 20px 0; font-size: 28px;">
                🚀 Reddit Analytics Dashboard - Live Data
            </h2>
            <p style="text-align: center; margin: 0 0 20px 0; font-size: 16px; opacity: 0.9;">
                {batches_info} | Last Updated: {timestamp}
            </p>
        </div>
        
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); 
                    gap: 20px; margin: 20px 0;">
            
            <div style="background: linear-gradient(45deg, #667eea 0%, #764ba2 100%); 
                        color: white; padding: 25px; border-radius: 15px; text-align: center; 
                        box-shadow: 0 8px 32px rgba(0,0,0,0.1);">
                <div style="font-size: 36px; font-weight: bold; margin-bottom: 10px;">{total}</div>
                <div style="font-size: 16px; opacity: 0.9;">📊 Total Records</div>
            </div>
            
            <div style="background: linear-gradient(45deg, #f093fb 0%, #f5576c 100%); 
                        color: white; padding: 25px; border-radius: 15px; text-align: center;
                        box-shadow: 0 8px 32px rgba(0,0,0,0.1);">
                <div style="font-size: 36px; font-weight: bold; margin-bottom: 10px;">{posts}</div>
                <div style="font-size: 16px; opacity: 0.9;">📝 Posts</div>
            </div>
            
            <div style="background: linear-gradient(45deg, #4facfe 0%, #00f2fe 100%); 
                        color: white; padding: 25px; border-radius: 15px; text-align: center;
                        box-shadow: 0 8px 32px rgba(0,0,0,0.1);">
                <div style="font-size: 36px; font-weight: bold; margin-bottom: 10px;">{comments}</div>
                <div style="font-size: 16px; opacity: 0.9;">💬 Comments</div>
            </div>
            
            <div style="background: linear-gradient(45deg, #43e97b 0%, #38f9d7 100%); 
                        color: white; padding: 25px; border-radius: 15px; text-align: center;
                        box-shadow: 0 8px 32px rgba(0,0,0,0.1);">
                <div style="font-size: 36px; font-weight: bold; margin-bottom: 10px;">{subreddits}</div>
                <div style="font-size: 16px; opacity: 0.9;">🏠 Subreddits</div>
            </div>
            
            <div style="background: linear-gradient(45deg, #fa709a 0%, #fee140 100%); 
                        color: white; padding: 25px; border-radius: 15px; text-align: center;
                        box-shadow: 0 8px 32px rgba(0,0,0,0.1);">
                <div style="font-size: 36px; font-weight: bold; margin-bottom: 10px;">{authors}</div>
                <div style="font-size: 16px; opacity: 0.9;">👥 Authors</div>
            </div>
            
            <div style="background: linear-gradient(45deg, #a8edea 0%, #fed6e3 100%); 
                        color: #333; padding: 25px; border-radius: 15px; text-align: center;
                        box-shadow: 0 8px 32px rgba(0,0,0,0.1);">
                <div style="font-size: 36px; font-weight: bold; margin-bottom: 10px;">{avg_score:.1f}</div>
                <div style="font-size: 16px; opacity: 0.8;">⭐ Avg Score</div>
            </div>
        </div>
        """
        
        html_content = html_template.format(
            total=total_records,
            posts=posts_count,
            comments=comments_count,
            subreddits=subreddits_count,
            authors=authors_count,
            avg_score=avg_score,
            batches_info=batches_text,
            timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        )
        
        display(HTML(html_content))
        
        # Additional detailed stats
        print(f"\n🔍 Detailed Breakdown:")
        print(f"   • Posts vs Comments ratio: {posts_count}:{comments_count}")
        if 'subreddit' in display_data.columns:
            top_subreddits = display_data['subreddit'].value_counts().head(3)
            print(f"   • Top subreddits: {', '.join([f'{k}({v})' for k, v in top_subreddits.items()])}")
        if 'score' in display_data.columns:
            print(f"   • Score range: {display_data['score'].min():.1f} to {display_data['score'].max():.1f}")
            
    else:
        display(HTML('''
        <div style="background: linear-gradient(45deg, #ff6b6b 0%, #feca57 100%); 
                    color: white; padding: 30px; border-radius: 15px; text-align: center; margin: 20px 0;">
            <div style="font-size: 48px; margin-bottom: 15px;">⚠️</div>
            <h2 style="margin: 0;">No Data Available</h2>
            <p style="margin: 10px 0 0 0; opacity: 0.9;">
                Waiting for Reddit data stream... Make sure the producer and consumer are running.
            </p>
        </div>
        '''))

# Display current stats
display_current_stats()


🔍 Detailed Breakdown:
   • Posts vs Comments ratio: 62:53
   • Score range: 0.0 to 1548.0


In [25]:
def create_smart_text_analysis():
    """Create comprehensive text analysis with enhanced sentiment detection for nuanced content"""
    
    data = historical_raw if not historical_raw.empty else (raw_df if raw_df is not None else pd.DataFrame())
    
    if data is None or data.empty:
        print("⚠️ No data available for text analysis")
        return
    
    # Prepare text data
    text_columns = ['title', 'text', 'full_text', 'body', 'content']
    available_text_cols = [col for col in text_columns if col in data.columns]
    
    if not available_text_cols:
        print("⚠️ No text columns available for analysis")
        return
    
    print(f"🔍 Analyzing text from columns: {available_text_cols}")
    
    # Enhanced text preprocessing
    def preprocess_text(text):
        if pd.isna(text) or not str(text).strip():
            return ""
        
        text = str(text)
        
        # Preserve some context while cleaning
        text = re.sub(r'http\S+|www\S+|https\S+', '[URL]', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '[USER]', text)
        text = re.sub(r'#\w+', '[TAG]', text)
        text = re.sub(r'[^\w\s\[\]]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text.lower()
    
    # EXPANDED and more nuanced sentiment word lists
    strong_positive = set(['amazing', 'awesome', 'excellent', 'fantastic', 'incredible', 'outstanding', 
                          'brilliant', 'wonderful', 'perfect', 'superb', 'magnificent', 'spectacular'])
    
    positive_words = set(['good', 'great', 'nice', 'love', 'like', 'best', 'better', 'beautiful', 
                         'helpful', 'useful', 'cool', 'fun', 'interesting', 'solid', 'decent', 
                         'recommend', 'recommended', 'worth', 'quality', 'smooth', 'easy', 'clear',
                         'glad', 'happy', 'pleased', 'satisfied', 'works', 'working', 'fixed'])
    
    mild_positive = set(['ok', 'okay', 'fine', 'alright', 'fair', 'reasonable', 'acceptable', 'adequate'])
    
    strong_negative = set(['terrible', 'awful', 'horrible', 'disgusting', 'pathetic', 'useless', 
                          'garbage', 'trash', 'worst', 'hate', 'despise', 'nightmare'])
    
    negative_words = set(['bad', 'worse', 'suck', 'sucks', 'annoying', 'stupid', 'dumb', 'ridiculous', 
                         'frustrating', 'disappointing', 'failed', 'broken', 'problem', 'problems',
                         'issue', 'issues', 'wrong', 'error', 'bug', 'bugs', 'difficult', 'hard',
                         'confusing', 'complicated', 'slow', 'laggy', 'glitch', 'crash', 'crashed'])
    
    mild_negative = set(['meh', 'boring', 'bland', 'mediocre', 'average', 'lacking', 'outdated'])
    
    # Context-aware sentiment analysis
    def calculate_enhanced_sentiment(text):
        words = text.lower().split()
        
        # Count different sentiment intensities
        strong_pos = sum(1 for word in words if word in strong_positive)
        pos_count = sum(1 for word in words if word in positive_words)
        mild_pos = sum(1 for word in words if word in mild_positive)
        
        strong_neg = sum(1 for word in words if word in strong_negative) 
        neg_count = sum(1 for word in words if word in negative_words)
        mild_neg = sum(1 for word in words if word in mild_negative)
        
        # Look for negation patterns
        negation_words = {'not', 'no', 'never', 'dont', 'doesn', 'isn', 'aren', 'wasn', 'weren'}
        negated_positive = 0
        for i, word in enumerate(words):
            if word in negation_words and i + 1 < len(words):
                if words[i + 1] in positive_words or words[i + 1] in strong_positive:
                    negated_positive += 1
        
        # Weighted sentiment calculation
        sentiment_score = (
            (strong_pos * 2.0) + (pos_count * 1.0) + (mild_pos * 0.3) -
            (strong_neg * 2.0) - (neg_count * 1.0) - (mild_neg * 0.3) -
            (negated_positive * 1.5)
        )
        
        # Normalize by text length (but give minimum weight to avoid division issues)
        text_length = max(len(words), 5)
        normalized_score = sentiment_score / text_length
        
        return normalized_score
    
    # Enhanced topic detection using keyword clusters
    topic_keywords = {
        'technical': set(['code', 'programming', 'software', 'app', 'website', 'development', 'tech', 'bug', 'feature']),
        'gaming': set(['game', 'gaming', 'play', 'player', 'level', 'fps', 'console', 'pc', 'steam']),
        'question': set(['how', 'what', 'why', 'when', 'where', 'help', 'question', 'anyone', 'does']),
        'discussion': set(['think', 'opinion', 'thoughts', 'discussion', 'debate', 'view', 'perspective']),
        'news': set(['news', 'update', 'announcement', 'breaking', 'report', 'article', 'source']),
        'personal': set(['my', 'me', 'i', 'personal', 'experience', 'story', 'happened', 'today'])
    }
    
    def detect_topic(text):
        words = set(text.lower().split())
        topic_scores = {}
        
        for topic, keywords in topic_keywords.items():
            score = len(words.intersection(keywords))
            if score > 0:
                topic_scores[topic] = score
        
        if topic_scores:
            return max(topic_scores.items(), key=lambda x: x[1])[0]
        return 'general'
    
    # Process all text data with enhanced metrics
    text_data = []
    
    for idx, row in data.iterrows():
        combined_text = []
        for col in available_text_cols:
            if pd.notna(row[col]) and str(row[col]).strip():
                cleaned_text = preprocess_text(row[col])
                if cleaned_text:
                    combined_text.append(cleaned_text)
        
        if combined_text:
            full_text = ' '.join(combined_text)
            original_text = ' '.join([str(row[col]) for col in available_text_cols if pd.notna(row[col])])
            
            # Calculate enhanced metrics
            word_count = len(full_text.split())
            char_count = len(full_text)
            sentiment = calculate_enhanced_sentiment(full_text)
            topic = detect_topic(full_text)
            
            # More nuanced sentiment categories
            if sentiment > 0.15:
                sentiment_cat = 'positive'
            elif sentiment > 0.05:
                sentiment_cat = 'slightly_positive'
            elif sentiment < -0.15:
                sentiment_cat = 'negative'
            elif sentiment < -0.05:
                sentiment_cat = 'slightly_negative'
            else:
                sentiment_cat = 'neutral'
            
            text_data.append({
                'text': full_text,
                'original_text': original_text,
                'length': char_count,
                'word_count': word_count,
                'subreddit': row.get('subreddit', 'unknown'),
                'author': row.get('author', 'unknown'),
                'score': row.get('score', 0),
                'sentiment': sentiment,
                'sentiment_category': sentiment_cat,
                'topic': topic,
                'has_question': any(word in full_text for word in ['?', 'how', 'what', 'why', 'help']),
                'engagement_score': row.get('score', 0) + row.get('num_comments', 0) * 2  # Weighted engagement
            })
    
    if not text_data:
        print("⚠️ No valid text content found")
        return
    
    text_df = pd.DataFrame(text_data)
    print(f"✅ Processed {len(text_df)} text entries with enhanced analysis")
    
    # Enhanced stop words (Reddit-specific additions)
    stop_words = set(['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                     'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                     'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
                     'a', 'an', 'as', 'if', 'it', 'he', 'she', 'they', 'we', 'you', 'i', 'me',
                     'my', 'myself', 'this', 'that', 'these', 'those', 'am', 'get', 'got', 'can', 
                     'just', 'like', 'one', 'also', 'even', 'way', 'use', 'used', 'using', 'make', 
                     'made', 'making', 'take', 'know', 'think', 'see', 'go', 'going', 'went', 'come',
                     'now', 'time', 'people', 'than', 'first', 'been', 'its', 'who', 'user', 'tag', 'url'])
    
    # Extract meaningful content
    all_words = []
    bigrams = []
    topics_found = []
    
    for idx, row in text_df.iterrows():
        text = row['text']
        topic = row['topic']
        topics_found.append(topic)
        
        # Extract meaningful words (improved filtering)
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
        meaningful_words = [word for word in words 
                          if word not in stop_words 
                          and not word.isdigit() 
                          and len(word) > 2
                          and word not in ['user', 'tag', 'url']]
        all_words.extend(meaningful_words)
        
        # Create contextual bigrams
        for i in range(len(meaningful_words) - 1):
            bigram = f"{meaningful_words[i]} {meaningful_words[i+1]}"
            bigrams.append(bigram)
    
    # Create enhanced visualizations
    fig = make_subplots(
        rows=3, cols=3,
        subplot_titles=(
            '📊 Enhanced Sentiment Distribution', '🎯 Content Topics', '📝 Text Complexity',
            '🔤 Most Discussed Terms', '🔍 Popular Phrases', '📈 Engagement vs Sentiment',
            '💬 Question vs Statement Ratio', '📚 Author Contribution', '⚖️ Sentiment Intensity Scale'
        ),
        specs=[
            [{"type": "bar"}, {"type": "pie"}, {"type": "scatter"}],
            [{"type": "bar"}, {"type": "bar"}, {"type": "scatter"}],
            [{"type": "pie"}, {"type": "bar"}, {"type": "histogram"}]
        ]
    )
    
    # 1. Enhanced Sentiment Distribution (with more categories)
    sentiment_counts = text_df['sentiment_category'].value_counts()
    sentiment_colors = {
        'positive': '#27ae60', 
        'slightly_positive': '#58d68d',
        'neutral': '#f39c12', 
        'slightly_negative': '#f1948a',
        'negative': '#e74c3c'
    }
    
    fig.add_trace(
        go.Bar(x=sentiment_counts.index, y=sentiment_counts.values,
               marker_color=[sentiment_colors.get(cat, '#95a5a6') for cat in sentiment_counts.index],
               name='Enhanced Sentiment',
               hovertemplate='Sentiment: %{x}<br>Count: %{y}<br>Percentage: %{y/' + str(len(text_df)) + '*100:.1f}%<extra></extra>'),
        row=1, col=1
    )
    
    # 2. Content Topics
    topic_counts = Counter(topics_found)
    fig.add_trace(
        go.Pie(labels=list(topic_counts.keys()), values=list(topic_counts.values()),
               name='Content Topics', hole=0.3,
               hovertemplate='Topic: %{label}<br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'),
        row=1, col=2
    )
    
    # 3. Text Complexity (Word Count vs Character Count)
    fig.add_trace(
        go.Scatter(x=text_df['word_count'], y=text_df['length'],
                  mode='markers', name='Text Complexity',
                  marker=dict(color=text_df['sentiment'], 
                            colorscale='RdYlGn',
                            size=6, opacity=0.6,
                            colorbar=dict(title="Sentiment Score")),
                  hovertemplate='Words: %{x}<br>Characters: %{y}<br>Sentiment: %{marker.color:.3f}<extra></extra>'),
        row=1, col=3
    )
    
    # 4. Most Discussed Terms (improved filtering)
    if all_words:
        word_freq = Counter(all_words)
        # Filter out very common but meaningless words
        meaningful_words = {word: count for word, count in word_freq.items() 
                          if count > 1 and len(word) > 3}
        top_words = dict(Counter(meaningful_words).most_common(12))
        
        fig.add_trace(
            go.Bar(x=list(top_words.values()), y=list(top_words.keys()),
                   orientation='h', name='Key Terms',
                   marker_color='#43e97b',
                   hovertemplate='Term: %{y}<br>Frequency: %{x}<br>Relative: %{x/' + str(len(text_df)) + '*100:.1f}%<extra></extra>'),
            row=2, col=1
        )
    
    # 5. Popular Phrases (improved bigrams)
    if bigrams:
        bigram_freq = Counter(bigrams)
        # Filter meaningful phrases
        meaningful_bigrams = {phrase: count for phrase, count in bigram_freq.items() 
                            if count > 1}
        top_bigrams = dict(Counter(meaningful_bigrams).most_common(8))
        
        fig.add_trace(
            go.Bar(x=list(top_bigrams.values()), y=list(top_bigrams.keys()),
                   orientation='h', name='Key Phrases',
                   marker_color='#9b59b6',
                   hovertemplate='Phrase: %{y}<br>Frequency: %{x}<extra></extra>'),
            row=2, col=2
        )
    
    # 6. Engagement vs Sentiment (enhanced)
    fig.add_trace(
        go.Scatter(x=text_df['sentiment'], y=text_df['engagement_score'],
                  mode='markers', name='Engagement Analysis',
                  marker=dict(color=text_df['word_count'], 
                            colorscale='viridis',
                            size=8, opacity=0.7,
                            colorbar=dict(title="Word Count")),
                  hovertemplate='Sentiment: %{x:.3f}<br>Engagement: %{y}<br>Words: %{marker.color}<extra></extra>'),
        row=2, col=3
    )
    
    # 7. Question vs Statement Analysis
    question_ratio = text_df['has_question'].value_counts()
    fig.add_trace(
        go.Pie(labels=['Statements', 'Questions'], values=[question_ratio.get(False, 0), question_ratio.get(True, 0)],
               name='Content Type', hole=0.4),
        row=3, col=1
    )
    
    # 8. Top Authors (if available)
    if 'author' in text_df.columns and text_df['author'].nunique() > 1:
        author_engagement = text_df.groupby('author')['engagement_score'].sum().sort_values(ascending=True).tail(8)
        fig.add_trace(
            go.Bar(x=author_engagement.values, y=author_engagement.index,
                   orientation='h', name='Top Contributors',
                   marker_color='#1abc9c',
                   hovertemplate='Author: %{y}<br>Total Engagement: %{x}<extra></extra>'),
            row=3, col=2
        )
    
    # 9. Sentiment Intensity Distribution
    fig.add_trace(
        go.Histogram(x=text_df['sentiment'], nbinsx=30, name='Sentiment Intensity',
                    marker_color='#e67e22', opacity=0.7,
                    hovertemplate='Sentiment Range: %{x}<br>Count: %{y}<extra></extra>'),
        row=3, col=3
    )
    
    # Enhanced layout
    fig.update_layout(
        height=1400,
        title_text="🧠 Enhanced Text Analytics Dashboard",
        title_x=0.5,
        title_font_size=24,
        showlegend=True,
        template="plotly_white",
        font=dict(size=11)
    )
    
    fig.show()
    
    # Enhanced Statistics Summary
    print(f"\n🧠 ENHANCED TEXT ANALYTICS SUMMARY:")
    print("=" * 70)
    print(f"📊 Total Entries Analyzed: {len(text_df):,}")
    print(f"📝 Average Text Length: {text_df['length'].mean():.1f} characters (±{text_df['length'].std():.1f})")
    print(f"🔤 Average Word Count: {text_df['word_count'].mean():.1f} words (±{text_df['word_count'].std():.1f})")
    print(f"📚 Unique Terms Found: {len(set(all_words)):,}")
    print(f"🔗 Unique Phrases Found: {len(set(bigrams)):,}")
    
    # Enhanced sentiment analysis with distribution
    print(f"\n💭 DETAILED SENTIMENT ANALYSIS:")
    sentiment_summary = text_df['sentiment_category'].value_counts().sort_index()
    total_posts = len(text_df)
    
    sentiment_emojis = {
        'positive': '😊', 'slightly_positive': '🙂', 'neutral': '😐',
        'slightly_negative': '😕', 'negative': '😞'
    }
    
    for sentiment, count in sentiment_summary.items():
        percentage = (count / total_posts) * 100
        emoji = sentiment_emojis.get(sentiment, '📊')
        print(f"   {emoji} {sentiment.replace('_', ' ').title()}: {count:,} posts ({percentage:.1f}%)")
    
    # Sentiment statistics
    avg_sentiment = text_df['sentiment'].mean()
    sentiment_std = text_df['sentiment'].std()
    print(f"\n📈 Sentiment Statistics:")
    print(f"   • Average Score: {avg_sentiment:.4f} (±{sentiment_std:.4f})")
    print(f"   • Score Range: {text_df['sentiment'].min():.4f} to {text_df['sentiment'].max():.4f}")
    
    # Topic analysis
    print(f"\n🎯 TOPIC ANALYSIS:")
    topic_summary = Counter(topics_found).most_common()
    for topic, count in topic_summary:
        percentage = (count / total_posts) * 100
        print(f"   📋 {topic.title()}: {count} posts ({percentage:.1f}%)")
    
    # Enhanced insights with correlations
    if 'score' in text_df.columns:
        sentiment_corr = text_df['sentiment'].corr(text_df['engagement_score'])
        length_corr = text_df['word_count'].corr(text_df['engagement_score'])
        
        print(f"\n📈 ENGAGEMENT CORRELATIONS:")
        print(f"   • Sentiment ↔ Engagement: {sentiment_corr:.3f}")
        print(f"   • Length ↔ Engagement: {length_corr:.3f}")
        
        # More nuanced insights
        if abs(sentiment_corr) > 0.1:
            direction = "positively" if sentiment_corr > 0 else "negatively"
            strength = "strongly" if abs(sentiment_corr) > 0.3 else "moderately" if abs(sentiment_corr) > 0.2 else "weakly"
            print(f"   💡 Sentiment {strength} correlates {direction} with engagement")
        else:
            print(f"   📊 Sentiment shows minimal correlation with engagement")
        
        if abs(length_corr) > 0.1:
            direction = "longer" if length_corr > 0 else "shorter"
            strength = "strongly" if abs(length_corr) > 0.3 else "moderately" if abs(length_corr) > 0.2 else "slightly"
            print(f"   💡 {direction.title()} posts {strength} correlate with higher engagement")
    
    # Content insights
    questions_pct = (text_df['has_question'].sum() / len(text_df)) * 100
    print(f"\n💬 CONTENT INSIGHTS:")
    print(f"   • Questions: {text_df['has_question'].sum()} ({questions_pct:.1f}%)")
    print(f"   • Statements: {(~text_df['has_question']).sum()} ({100-questions_pct:.1f}%)")
    
    # Top terms and phrases
    if all_words:
        top_5_words = Counter(all_words).most_common(5)
        word_list = ', '.join([f"'{word}' ({count})" for word, count in top_5_words])
        print(f"\n🔝 Most Discussed Terms: {word_list}")
    
    if bigrams:
        top_3_bigrams = Counter(bigrams).most_common(3)
        phrase_list = ', '.join([f'"{phrase}" ({count})' for phrase, count in top_3_bigrams])
        print(f"🔗 Key Phrases: {phrase_list}")

# Run the enhanced analysis
create_smart_text_analysis()

🔍 Analyzing text from columns: ['title', 'text', 'full_text']
✅ Processed 115 text entries with enhanced analysis



🧠 ENHANCED TEXT ANALYTICS SUMMARY:
📊 Total Entries Analyzed: 115
📝 Average Text Length: 490.1 characters (±990.8)
🔤 Average Word Count: 86.1 words (±157.9)
📚 Unique Terms Found: 1,617
🔗 Unique Phrases Found: 2,700

💭 DETAILED SENTIMENT ANALYSIS:
   😞 Negative: 1 posts (0.9%)
   😐 Neutral: 106 posts (92.2%)
   😕 Slightly Negative: 2 posts (1.7%)
   🙂 Slightly Positive: 6 posts (5.2%)

📈 Sentiment Statistics:
   • Average Score: 0.0026 (±0.0356)
   • Score Range: -0.2500 to 0.1250

🎯 TOPIC ANALYSIS:
   📋 Question: 47 posts (40.9%)
   📋 General: 40 posts (34.8%)
   📋 Personal: 21 posts (18.3%)
   📋 Technical: 4 posts (3.5%)
   📋 News: 1 posts (0.9%)
   📋 Gaming: 1 posts (0.9%)
   📋 Discussion: 1 posts (0.9%)

📈 ENGAGEMENT CORRELATIONS:
   • Sentiment ↔ Engagement: 0.001
   • Length ↔ Engagement: -0.017
   📊 Sentiment shows minimal correlation with engagement

💬 CONTENT INSIGHTS:
   • Questions: 50 (43.5%)
   • Statements: 65 (56.5%)

🔝 Most Discussed Terms: 'what' (82), 'your' (46), 'abo

In [26]:
def create_user_score_analysis():
    """Analyze user activity and scoring patterns"""
    
    data = historical_raw if not historical_raw.empty else (raw_df if raw_df is not None else pd.DataFrame())
    
    if data is None or data.empty:
        print("⚠️ No data available for user analysis")
        return
    
    required_cols = ['author', 'score']
    if not all(col in data.columns for col in required_cols):
        print(f"⚠️ Missing required columns. Available: {list(data.columns)}")
        return
    
    # User statistics
    user_stats = data.groupby('author').agg({
        'score': ['count', 'sum', 'mean', 'std'],
        'id': 'count'
    }).round(2)
    
    user_stats.columns = ['Score_Count', 'Total_Score', 'Avg_Score', 'Score_StdDev', 'Post_Count']
    user_stats = user_stats.sort_values('Total_Score', ascending=False)
    
    # Remove any users with null/invalid names
    user_stats = user_stats[user_stats.index.notna()]
    
    print(f"✅ Analyzing {len(user_stats)} unique authors")
    
    # Create visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('🏆 Top Users by Total Score', '⭐ Average Score Distribution',
                       '📊 Posts vs Average Score', '🎯 User Activity Levels'),
        specs=[[{"type": "bar"}, {"type": "histogram"}],
               [{"type": "scatter"}, {"type": "pie"}]]
    )
    
    # Top users by total score
    top_users = user_stats.head(10)
    fig.add_trace(
        go.Bar(x=top_users.index, y=top_users['Total_Score'],
               name='Total Score', marker_color='#667eea',
               text=top_users['Total_Score'], textposition='auto'),
        row=1, col=1
    )
    
    # Average score distribution
    fig.add_trace(
        go.Histogram(x=user_stats['Avg_Score'], nbinsx=25, 
                    name='Avg Score Distribution', 
                    marker_color='#f093fb', opacity=0.7),
        row=1, col=2
    )
    
    # Posts vs Average Score scatter
    # Sample data if too many points
    sample_size = min(100, len(user_stats))
    sampled_users = user_stats.sample(sample_size) if len(user_stats) > sample_size else user_stats
    
    fig.add_trace(
        go.Scatter(x=sampled_users['Post_Count'], y=sampled_users['Avg_Score'],
                   mode='markers', name='User Activity',
                   marker=dict(size=8, color=sampled_users['Total_Score'],
                              colorscale='Viridis', showscale=True,
                              colorbar=dict(title="Total Score")),
                   text=sampled_users.index, textposition="top center"),
        row=2, col=1
    )
    
    # User activity levels pie chart
    activity_levels = pd.cut(user_stats['Post_Count'], 
                           bins=[0, 1, 3, 10, float('inf')],
                           labels=['Single Post', 'Low (2-3)', 'Medium (4-10)', 'High (10+)'])
    activity_counts = activity_levels.value_counts()
    
    fig.add_trace(
        go.Pie(labels=activity_counts.index, values=activity_counts.values,
               name="Activity Levels", hole=0.4,
               marker_colors=['#43e97b', '#4facfe', '#f093fb', '#667eea']),
        row=2, col=2
    )
    
    fig.update_layout(
        height=900,
        title_text="👥 User Activity & Score Analysis",
        title_x=0.5,
        template="plotly_white"
    )
    
    # Update x-axes for better readability
    fig.update_xaxes(tickangle=45, row=1, col=1)
    
    fig.show()
    
    # Display top performers table
    print("\n🏆 Top 10 Users by Total Score:")
    display(top_users[['Post_Count', 'Total_Score', 'Avg_Score']].head(10))
    
    # Summary statistics
    print(f"\n📈 User Activity Summary:")
    print(f"   • Total unique authors: {len(user_stats)}")
    print(f"   • Average posts per user: {user_stats['Post_Count'].mean():.1f}")
    print(f"   • Average score per user: {user_stats['Avg_Score'].mean():.2f}")
    print(f"   • Most active user: {user_stats['Post_Count'].idxmax()} ({user_stats['Post_Count'].max()} posts)")
    print(f"   • Highest scoring user: {user_stats['Total_Score'].idxmax()} ({user_stats['Total_Score'].max()} total score)")

create_user_score_analysis()

✅ Analyzing 99 unique authors



🏆 Top 10 Users by Total Score:


Unnamed: 0_level_0,Post_Count,Total_Score,Avg_Score
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Street_Anon,1,1548,1548.0
its-groit-craic,1,399,399.0
EndofGods,1,308,308.0
whatareyousomekinda,1,94,94.0
Datashot,1,43,43.0
Aggravating_Money992,2,40,20.0
corgibestie,1,35,35.0
hammer-jon,1,31,31.0
dr-steve,1,22,22.0
snozburger,1,20,20.0



📈 User Activity Summary:
   • Total unique authors: 99
   • Average posts per user: 1.2
   • Average score per user: 27.24
   • Most active user: Original_Garbage8557 (6 posts)
   • Highest scoring user: Street_Anon (1548 total score)


In [27]:
# 📈 COMPREHENSIVE HISTORICAL DATA ANALYSIS
# ========================================

def create_historical_analysis():
    """
    Analyze historical Reddit data with smart column detection and flexible visualizations.
    
    This function:
    1. Checks what data columns are available
    2. Creates appropriate visualizations based on available data
    3. Provides summary statistics and insights
    4. Handles missing columns gracefully
    """
    
    # Step 1: Validate data availability
    if historical_raw.empty:
        print("⚠️ No historical data available for analysis")
        return
    
    print("🔍 STARTING HISTORICAL ANALYSIS")
    print("=" * 50)
    print(f"📊 Dataset contains: {len(historical_raw):,} records")
    print(f"📋 Available columns: {list(historical_raw.columns)}")
    
    # Step 2: Identify key columns for analysis
    key_columns = {
        'batch_tracker': 'batch_number',
        'content_score': 'score', 
        'content_type': 'type',
        'community': 'subreddit',
        'user': 'author',
        'timestamp': 'file_timestamp',
        'identifier': 'id'
    }
    
    available_data = {}
    for purpose, column in key_columns.items():
        if column in historical_raw.columns:
            available_data[purpose] = column
            print(f"✅ {purpose.replace('_', ' ').title()}: Using '{column}' column")
        else:
            print(f"⚠️ {purpose.replace('_', ' ').title()}: Column '{column}' not found")
    
    # Step 3: Determine if we can group by batches
    has_batches = 'batch_tracker' in available_data
    batch_column = available_data.get('batch_tracker', 'batch_number')
    
    if has_batches:
        num_batches = historical_raw[batch_column].nunique()
        print(f"📦 Found {num_batches} different batches to analyze")
    else:
        print("📦 No batch information - will analyze as single dataset")
    
    # Step 4: Calculate summary statistics
    print("\n🧮 CALCULATING STATISTICS...")
    
    # Build aggregation rules based on available columns
    aggregation_rules = {}
    
    if 'identifier' in available_data:
        aggregation_rules[available_data['identifier']] = 'count'
    
    if 'content_score' in available_data:
        aggregation_rules[available_data['content_score']] = ['mean', 'median', 'std', 'min', 'max']
    
    if 'content_type' in available_data:
        aggregation_rules[available_data['content_type']] = lambda x: (x == 'post').sum()
    
    if 'user' in available_data:
        aggregation_rules[available_data['user']] = 'nunique'
    
    if 'community' in available_data:
        aggregation_rules[available_data['community']] = 'nunique'
    
    # Group data by batches or treat as single group
    if has_batches and len(aggregation_rules) > 0:
        summary_stats = historical_raw.groupby(batch_column).agg(aggregation_rules).round(2)
    elif len(aggregation_rules) > 0:
        # Create single-row summary for entire dataset
        summary_stats = historical_raw.agg(aggregation_rules).to_frame().T
        summary_stats.index = ['Overall']
    else:
        print("⚠️ No suitable columns found for statistical analysis")
        return
    
    # Step 5: Clean up column names for readability
    clean_columns = []
    for col in summary_stats.columns:
        if isinstance(col, tuple):
            # Handle multi-level column names from groupby aggregation
            base_name = col[0].replace('_', ' ').title()
            stat_name = col[1] if col[1] != '<lambda>' else 'Posts'
            clean_columns.append(f"{base_name} ({stat_name.title()})")
        else:
            clean_columns.append(col.replace('_', ' ').title())
    
    summary_stats.columns = clean_columns
    summary_stats = summary_stats.reset_index()
    
    print(f"✅ Generated {len(summary_stats)} summary records")
    
    # Step 6: Create visualizations based on available data
    print("\n📊 CREATING VISUALIZATIONS...")
    
    # Determine what charts we can create
    chart_plan = []
    
    if has_batches and len(summary_stats) > 1:
        chart_plan.extend([
            ('Timeline Analysis', 'line'),
            ('Batch Comparison', 'bar'),
            ('Score Distribution', 'box'),
            ('Growth Trends', 'area'),
            ('Community Analysis', 'scatter'),
            ('User Activity', 'heatmap')
        ])
    else:
        chart_plan.extend([
            ('Data Overview', 'bar'),
            ('Score Analysis', 'histogram'),
            ('Content Distribution', 'pie'),
            ('Top Contributors', 'bar')
        ])
    
    # Create subplot layout
    num_charts = min(6, len(chart_plan))
    rows, cols = 3, 2
    
    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=[chart[0] for chart in chart_plan[:num_charts]],
        specs=[[{"type": "xy"}] * cols for _ in range(rows)]
    )
    
    # Step 7: Generate charts based on available data
    chart_positions = [(1,1), (1,2), (2,1), (2,2), (3,1), (3,2)]
    
    x_axis = summary_stats[batch_column] if has_batches else summary_stats.index
    
    for i, ((chart_name, chart_type), (row, col)) in enumerate(zip(chart_plan[:num_charts], chart_positions)):
        
        try:
            if chart_name == 'Timeline Analysis' and has_batches:
                # Show data volume over time
                count_col = [col for col in summary_stats.columns if 'count' in col.lower()]
                if count_col:
                    fig.add_trace(
                        go.Scatter(
                            x=x_axis, 
                            y=summary_stats[count_col[0]],
                            mode='lines+markers',
                            name='Data Volume',
                            line=dict(color='#e74c3c', width=3)
                        ), row=row, col=col
                    )
            
            elif chart_name == 'Batch Comparison':
                # Compare different metrics across batches
                numeric_cols = summary_stats.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > 0:
                    fig.add_trace(
                        go.Bar(
                            x=x_axis,
                            y=summary_stats[numeric_cols[0]],
                            name=numeric_cols[0],
                            marker_color='#3498db'
                        ), row=row, col=col
                    )
            
            elif chart_name == 'Score Distribution' and 'content_score' in available_data:
                # Box plot of scores across batches
                if has_batches:
                    unique_batches = historical_raw[batch_column].unique()[:8]  # Limit for performance
                    for batch in unique_batches:
                        batch_data = historical_raw[historical_raw[batch_column] == batch]
                        fig.add_trace(
                            go.Box(
                                y=batch_data[available_data['content_score']], 
                                name=f'Batch {batch}',
                                boxpoints='outliers'
                            ), row=row, col=col
                        )
                else:
                    fig.add_trace(
                        go.Histogram(
                            x=historical_raw[available_data['content_score']], 
                            name='Score Distribution',
                            marker_color='#27ae60'
                        ), row=row, col=col
                    )
            
            elif chart_name == 'Growth Trends' and has_batches:
                # Show growth trends
                if 'user' in available_data:
                    user_col = [col for col in summary_stats.columns if 'user' in col.lower() or 'author' in col.lower()]
                    if user_col:
                        fig.add_trace(
                            go.Scatter(
                                x=x_axis,
                                y=summary_stats[user_col[0]],
                                fill='tonexty',
                                name='Unique Users',
                                line=dict(color='#27ae60')
                            ), row=row, col=col
                        )
            
            else:
                # Default: simple bar chart with available data
                numeric_cols = summary_stats.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > i % len(numeric_cols):
                    col_to_plot = numeric_cols[i % len(numeric_cols)]
                    fig.add_trace(
                        go.Bar(
                            x=x_axis,
                            y=summary_stats[col_to_plot],
                            name=col_to_plot,
                            #marker_color=f'hsl({(i*60) % 360}, 70%, 50%)'
                        ), row=row, col=col
                    )
                    
        except Exception as e:
            print(f"⚠️ Could not create {chart_name}: {e}")
    
    # Step 8: Finalize and display visualization
    fig.update_layout(
        title_text="📈 Historical Reddit Data Analysis Dashboard",
        showlegend=True,
        height=1000,
        template="plotly_white",
        title_x=0.5
    )
    
    fig.show()
    
    # Step 9: Display summary table and insights
    print("\n📊 SUMMARY STATISTICS TABLE:")
    print("=" * 80)
    display(summary_stats)
    
    # Step 10: Generate key insights
    print(f"\n🔍 KEY INSIGHTS FROM ANALYSIS:")
    print("=" * 50)
    
    if has_batches:
        print(f"📦 Analyzed {len(summary_stats)} different data batches")
    else:
        print(f"📊 Analyzed complete dataset as single unit")
    
    # Find numeric columns for insights
    numeric_columns = summary_stats.select_dtypes(include=[np.number]).columns
    
    for col in numeric_columns[:5]:  # Show insights for first 5 numeric columns
        try:
            col_data = summary_stats[col]
            avg_val = col_data.mean()
            min_val = col_data.min()
            max_val = col_data.max()
            
            print(f"📈 {col}:")
            print(f"   • Average: {avg_val:.2f}")
            print(f"   • Range: {min_val:.2f} to {max_val:.2f}")
            
            if len(col_data) > 1:
                trend = "increasing" if col_data.iloc[-1] > col_data.iloc[0] else "decreasing"
                print(f"   • Trend: {trend}")
                
        except Exception as e:
            print(f"   ⚠️ Could not analyze {col}: {e}")
    
    print("\n✅ Historical analysis completed!")

# Execute the analysis
create_historical_analysis()

🔍 STARTING HISTORICAL ANALYSIS
📊 Dataset contains: 115 records
📋 Available columns: ['id', 'type', 'title', 'text', 'full_text', 'author', 'score', 'upvote_ratio', 'num_comments', 'created_utc', 'created_datetime', 'url', 'permalink', 'post_id', 'is_self', 'is_submitter', 'over_18', 'spoiler', 'locked', 'user_references', 'subreddit_references', 'url_references', 'timestamp_received', 'spark_timestamp', 'processing_timestamp', 'batch_number', 'file_path', 'file_timestamp']
✅ Batch Tracker: Using 'batch_number' column
✅ Content Score: Using 'score' column
✅ Content Type: Using 'type' column
⚠️ Community: Column 'subreddit' not found
✅ User: Using 'author' column
✅ Timestamp: Using 'file_timestamp' column
✅ Identifier: Using 'id' column
📦 Found 23 different batches to analyze

🧮 CALCULATING STATISTICS...
✅ Generated 23 summary records

📊 CREATING VISUALIZATIONS...



📊 SUMMARY STATISTICS TABLE:


Unnamed: 0,batch_number,Id (Count),Score (Mean),Score (Median),Score (Std),Score (Min),Score (Max),Type (Posts),Author (Nunique)
0,1,13,2.69,1.0,3.25,0,11,6,11
1,2,20,9.4,4.0,12.84,0,43,9,20
2,3,18,133.94,2.0,370.78,0,1548,10,18
3,4,7,6.86,1.0,14.22,1,39,2,6
4,5,4,1.0,1.0,0.0,1,1,3,4
5,6,1,2.0,2.0,,2,2,1,1
6,7,3,1.0,1.0,0.0,1,1,3,3
7,8,5,1.0,1.0,0.0,1,1,2,5
8,9,5,1.0,1.0,0.0,1,1,3,5
9,10,1,1.0,1.0,,1,1,1,1



🔍 KEY INSIGHTS FROM ANALYSIS:
📦 Analyzed 23 different data batches
📈 batch_number:
   • Average: 12.00
   • Range: 1.00 to 23.00
   • Trend: increasing
📈 Id (Count):
   • Average: 5.00
   • Range: 1.00 to 20.00
   • Trend: decreasing
📈 Score (Mean):
   • Average: 7.52
   • Range: 1.00 to 133.94
   • Trend: decreasing
📈 Score (Median):
   • Average: 1.22
   • Range: 1.00 to 4.00
   • Trend: decreasing
📈 Score (Std):
   • Average: 23.59
   • Range: 0.00 to 370.78
   • Trend: decreasing

✅ Historical analysis completed!
