In [1]:
import os, time, pathlib, pprint, requests, json
from datetime import datetime, timedelta

EP = "http://browser:8004"  # Fixed port to 8004
SCRAPED = pathlib.Path("/storage/scraped_data")

def wait_for(job_id, every=3):
    print(f"⏳ Waiting for job {job_id}...")
    while True:
        rec = requests.get(f"{EP}/jobs/{job_id}").json()
        status = rec["status"]
        if status not in {"finished", "error"}:
            print(f"\r⏱️  {rec['status_with_elapsed']}", end="")
        else:
            print(f"\n✅ {status.upper()}")
            return rec
        time.sleep(every)

def submit(task, payload):
    print(f"🚀 Submitting {task} task...")
    print(f"📝 Payload: {json.dumps(payload, indent=2)}")
    r = requests.post(f"{EP}/jobs/{task}", json=payload)
    r.raise_for_status()
    jid = r.json()["job_id"]
    print(f"🆔 Job ID: {jid}")
    return wait_for(jid)

def analyze_twitter_results(result, test_name):
    """Comprehensive analysis function for Twitter scraping results."""
    print(f"\n" + "="*70)
    print(f"🐦 ANALYSIS: {test_name}")
    print("="*70)
    
    if result["status"] == "error":
        print(f"❌ FAILED: {result.get('error', 'Unknown error')}")
        return
    
    if "result" not in result:
        print(f"❌ No result data found")
        pprint.pp(result)
        return
        
    res = result["result"]
    metadata = res.get("search_metadata", {})
    data = res.get("data", [])
    
    # === BASIC INFO ===
    print(f"✅ STATUS: Task completed successfully")
    print(f"🎯 TARGET: @{metadata.get('target_username', 'N/A')}")
    print(f"📊 TOTAL EXTRACTED: {len(data)} items (found: {metadata.get('total_found', 'N/A')})")
    print(f"🔧 METHOD: {metadata.get('extraction_method', 'N/A')}")
    print(f"📈 SCRAPE LEVEL: {metadata.get('scrape_level', 'N/A')}")
    print(f"📈 SUCCESS RATE: {metadata.get('success_rate', 0):.1%}")
    
    if not data:
        print(f"❌ NO DATA EXTRACTED")
        return
    
    # Determine data structure based on first item
    first_item = data[0] if data else {}
    
    # Check if it's comprehensive user data (has profile + posts)
    if isinstance(first_item, dict) and 'profile' in first_item:
        profile_data = first_item
        print(f"\n📋 PROFILE DATA:")
        profile = profile_data.get('profile', {})
        print(f"   🏷️ Name: {profile.get('display_name', 'N/A')}")
        print(f"   📝 Bio: {profile.get('bio', 'N/A')[:100]}{'...' if len(profile.get('bio', '')) > 100 else ''}")
        print(f"   📊 Followers: {profile.get('followers_count', 'N/A')}")
        print(f"   👥 Following: {profile.get('following_count', 'N/A')}")
        print(f"   📝 Posts Count: {profile.get('posts_count', 'N/A')}")
        
        # Analyze different data types
        data_types = ['posts', 'likes', 'mentions', 'media', 'followers', 'following']
        total_items = 0
        
        print(f"\n📊 EXTRACTED DATA BREAKDOWN:")
        for data_type in data_types:
            items = profile_data.get(data_type, [])
            if items:
                total_items += len(items)
                print(f"   {get_emoji(data_type)} {data_type.title()}: {len(items)} items")
                
                # Specific analysis for posts/tweets
                if data_type == 'posts' and len(items) > 0:
                    analyze_tweets(items)
                    
                # Sample first item
                if len(items) > 0:
                    sample = items[0]
                    if isinstance(sample, dict):
                        sample_text = sample.get('text', sample.get('content', str(sample)))[:80]
                        print(f"      Sample: {sample_text}{'...' if len(str(sample)) > 80 else ''}")
        
        print(f"\n📈 TOTAL ITEMS: {total_items} across all data types")
        
    else:
        # Direct tweet/post list
        print(f"\n📝 POSTS/TWEETS ANALYSIS:")
        analyze_tweets(data)
    
    # === EXECUTION TIME ===
    completed_at = metadata.get('search_completed_at')
    if completed_at:
        print(f"\n⏱️  COMPLETED: {completed_at}")
    
    # === SAMPLE DATA ===
    if data:
        sample = data[0]
        print(f"\n📋 SAMPLE DATA STRUCTURE:")
        print(f"   📄 Type: {type(sample).__name__}")
        if isinstance(sample, dict):
            keys = list(sample.keys())[:10]
            print(f"   🔑 Keys: {keys}{'...' if len(sample.keys()) > 10 else ''}")

def analyze_tweets(tweets):
    """Analyze a list of tweets/posts."""
    if not tweets:
        return
        
    print(f"   📝 Total Tweets: {len(tweets)}")
    
    # Count tweets with different data types
    tweets_with_metrics = [t for t in tweets if isinstance(t, dict) and any(k in t for k in ['likes', 'retweets', 'replies'])]
    tweets_with_dates = [t for t in tweets if isinstance(t, dict) and t.get('date')]
    tweets_with_media = [t for t in tweets if isinstance(t, dict) and t.get('media')]
    
    print(f"   📊 With engagement metrics: {len(tweets_with_metrics)}/{len(tweets)}")
    print(f"   📅 With dates: {len(tweets_with_dates)}/{len(tweets)}")
    print(f"   🖼️ With media: {len(tweets_with_media)}/{len(tweets)}")
    
    # Analyze engagement if available
    if tweets_with_metrics:
        total_likes = sum(t.get('likes', 0) for t in tweets_with_metrics if isinstance(t.get('likes'), int))
        total_retweets = sum(t.get('retweets', 0) for t in tweets_with_metrics if isinstance(t.get('retweets'), int))
        avg_likes = total_likes / len(tweets_with_metrics) if tweets_with_metrics else 0
        avg_retweets = total_retweets / len(tweets_with_metrics) if tweets_with_metrics else 0
        
        print(f"   ❤️ Avg Likes: {avg_likes:.1f} (total: {total_likes})")
        print(f"   🔄 Avg Retweets: {avg_retweets:.1f} (total: {total_retweets})")
    
    # Show sample tweets
    sample_count = min(3, len(tweets))
    for i, tweet in enumerate(tweets[:sample_count]):
        if isinstance(tweet, dict):
            text = tweet.get('text', tweet.get('content', str(tweet)))
            date = tweet.get('date', 'No date')
            likes = tweet.get('likes', 'N/A')
            print(f"   🐦 Tweet {i+1}: {text[:100]}{'...' if len(text) > 100 else ''}")
            print(f"      📅 {date} | ❤️ {likes}")
        else:
            print(f"   🐦 Tweet {i+1}: {str(tweet)[:100]}{'...' if len(str(tweet)) > 100 else ''}")

def get_emoji(data_type):
    """Get emoji for data type."""
    emojis = {
        'posts': '📝',
        'likes': '❤️', 
        'mentions': '@️⃣',
        'media': '🖼️',
        'followers': '👥',
        'following': '➡️'
    }
    return emojis.get(data_type, '📊')

print("🐦 Twitter Analysis Notebook Initialized!")
print(f"📍 API Endpoint: {EP}")
print(f"📁 Storage: {SCRAPED}")

# Check storage accessibility
twitter_dir = SCRAPED / "twitter"
if twitter_dir.exists():
    recent_jobs = sorted([d.name for d in twitter_dir.iterdir() if d.is_dir()], reverse=True)[:3]
    print(f"✅ Storage accessible - Recent jobs: {recent_jobs}")
else:
    print(f"⚠️ Twitter data directory not found at {twitter_dir}")

# Test API connectivity
try:
    test_response = requests.get(f"{EP}/healthz", timeout=5)
    if test_response.status_code == 200:
        print(f"✅ API connectivity: Connected to browser service")
    else:
        print(f"⚠️ API connectivity: Unexpected response {test_response.status_code}")
except Exception as e:
    print(f"❌ API connectivity: Failed to connect - {e}")
    print(f"🔍 Check if browser service is running on port 8004")

🐦 Twitter Analysis Notebook Initialized!
📍 API Endpoint: http://browser:8004
📁 Storage: /storage/scraped_data
⚠️ Twitter data directory not found at /storage/scraped_data/twitter
✅ API connectivity: Connected to browser service


# 🧪 Twitter Scraper Testing & Validation

## Test popular accounts with different extraction modes

In [2]:
# Test accounts with different characteristics
test_accounts = {
    "naval": "High-quality tweets, philosophy",
    "elonmusk": "High activity, mixed content", 
    "paulg": "Startup advice, essays",
    "sama": "AI/tech commentary",
    "vitalikbuterin": "Crypto/blockchain content"
}

print("🎯 Available test accounts:")
for account, desc in test_accounts.items():
    print(f"   @{account}: {desc}")

def test_twitter_extraction(username, max_posts=20, test_name=None):
    """Test comprehensive Twitter extraction with engagement metrics."""
    if not test_name:
        test_name = f"@{username} Enhanced Extraction"
        
    payload = {
        "username": username,
        "scrape_posts": True,
        "max_posts": max_posts,
        "scrape_likes": True,
        "max_likes": 5,
        "scrape_mentions": True,
        "max_mentions": 3,
        "scrape_media": True,
        "max_media": 3,
        "scrape_level": 4  # Full extraction
    }
    
    print(f"\n🧪 Testing: {test_name}")
    result = submit("twitter", payload)
    analyze_twitter_results(result, test_name)
    return result

# Test enhanced extraction with engagement metrics
print("\n🚀 ENHANCED EXTRACTION TEST")
print("="*50)

enhanced_result = test_twitter_extraction("naval", max_posts=10, test_name="Naval - Enhanced with Engagement")

🎯 Available test accounts:
   @naval: High-quality tweets, philosophy
   @elonmusk: High activity, mixed content
   @paulg: Startup advice, essays
   @sama: AI/tech commentary
   @vitalikbuterin: Crypto/blockchain content

🚀 ENHANCED EXTRACTION TEST

🧪 Testing: Naval - Enhanced with Engagement
🚀 Submitting twitter task...
📝 Payload: {
  "username": "naval",
  "scrape_posts": true,
  "max_posts": 10,
  "scrape_likes": true,
  "max_likes": 5,
  "scrape_mentions": true,
  "max_mentions": 3,
  "scrape_media": true,
  "max_media": 3,
  "scrape_level": 4
}
🆔 Job ID: 3b71cf14a94f45abaf62acd5b2208d15
⏳ Waiting for job 3b71cf14a94f45abaf62acd5b2208d15...
⏱️  running 12s
✅ ERROR

🐦 ANALYSIS: Naval - Enhanced with Engagement
❌ FAILED: object of type 'bool' has no len()


## 🎯 Comprehensive Data Extraction Test

In [3]:
def test_comprehensive_extraction(username):
    """Test comprehensive extraction with all data types."""
    payload = {
        "username": username,
        "scrape_posts": True,
        "max_posts": 15,
        "scrape_likes": True,
        "max_likes": 10,
        "scrape_mentions": True,
        "max_mentions": 5,
        "scrape_media": True,
        "max_media": 5,
        "scrape_followers": False,  # Skip for speed
        "scrape_following": False,  # Skip for speed
        "level": 4
    }
    
    test_name = f"@{username} - Comprehensive Extraction"
    print(f"\n🎯 Testing: {test_name}")
    result = submit("twitter", payload)
    analyze_twitter_results(result, test_name)
    return result

print("\n🔬 COMPREHENSIVE EXTRACTION TEST")
print("="*50)
print("📊 Testing multiple data types extraction")

comprehensive_result = test_comprehensive_extraction("paulg")


🔬 COMPREHENSIVE EXTRACTION TEST
📊 Testing multiple data types extraction

🎯 Testing: @paulg - Comprehensive Extraction
🚀 Submitting twitter task...
📝 Payload: {
  "username": "paulg",
  "scrape_posts": true,
  "max_posts": 15,
  "scrape_likes": true,
  "max_likes": 10,
  "scrape_mentions": true,
  "max_mentions": 5,
  "scrape_media": true,
  "max_media": 5,
  "scrape_followers": false,
  "scrape_following": false,
  "level": 4
}
🆔 Job ID: fd9783df33ed423db447a4fd38bdb17b
⏳ Waiting for job fd9783df33ed423db447a4fd38bdb17b...
⏱️  running 9s
✅ ERROR

🐦 ANALYSIS: @paulg - Comprehensive Extraction
❌ FAILED: object of type 'bool' has no len()


## 📅 Date Filtering Performance Test

In [4]:
def test_date_filtering(username, date_range, max_posts=20):
    """Test date filtering functionality and performance."""
    payload = {
        "username": username,
        "scrape_posts": True,
        "max_posts": max_posts,
        "enable_date_filtering": True,
        "date_range": date_range,
        "stop_at_date_threshold": True,
        "level": 4
    }
    
    test_name = f"@{username} - Date Filter ({date_range})"
    print(f"\n📅 Testing: {test_name}")
    
    start_time = time.time()
    result = submit("twitter", payload)
    end_time = time.time()
    
    execution_time = end_time - start_time
    
    analyze_twitter_results(result, test_name)
    
    # Performance analysis
    if result["status"] == "finished":
        data_count = len(result["result"].get("data", []))
        print(f"\n⚡ PERFORMANCE ANALYSIS:")
        print(f"   ⏱️ Execution time: {execution_time:.1f} seconds")
        print(f"   📊 Items extracted: {data_count}")
        print(f"   🚀 Speed: {data_count/execution_time:.1f} items/second")
        
        # Expected performance improvement
        expected_improvement = {
            "today": "90-95%",
            "last_day": "90-95%",
            "last_week": "70-85%",
            "last_month": "50-70%"
        }
        improvement = expected_improvement.get(date_range, "varies")
        print(f"   📈 Expected improvement vs full scrape: {improvement} faster")
    
    return result

print("\n📅 DATE FILTERING PERFORMANCE TESTS")
print("="*50)
print("🎯 Testing performance improvements with date filtering")

# Test different date ranges
date_ranges = ["last_week", "last_day", "today"]
date_results = {}

for date_range in date_ranges:
    date_results[date_range] = test_date_filtering("sama", date_range, max_posts=25)
    time.sleep(2)  # Brief pause between tests


📅 DATE FILTERING PERFORMANCE TESTS
🎯 Testing performance improvements with date filtering

📅 Testing: @sama - Date Filter (last_week)
🚀 Submitting twitter task...
📝 Payload: {
  "username": "sama",
  "scrape_posts": true,
  "max_posts": 25,
  "enable_date_filtering": true,
  "date_range": "last_week",
  "stop_at_date_threshold": true,
  "level": 4
}
🆔 Job ID: 30554dd6046941a3ba9e0f4d4b30478b
⏳ Waiting for job 30554dd6046941a3ba9e0f4d4b30478b...
⏱️  running 12s
✅ ERROR

🐦 ANALYSIS: @sama - Date Filter (last_week)
❌ FAILED: object of type 'bool' has no len()

📅 Testing: @sama - Date Filter (last_day)
🚀 Submitting twitter task...
📝 Payload: {
  "username": "sama",
  "scrape_posts": true,
  "max_posts": 25,
  "enable_date_filtering": true,
  "date_range": "last_day",
  "stop_at_date_threshold": true,
  "level": 4
}
🆔 Job ID: 921238cf8cf74c778ed09e313e9875d8
⏳ Waiting for job 921238cf8cf74c778ed09e313e9875d8...
⏱️  running 15s
✅ ERROR

🐦 ANALYSIS: @sama - Date Filter (last_day)
❌ FAILED: o

## 🔬 Scrape Level Comparison

In [5]:
def test_scrape_level(username, level, max_posts=10):
    """Test specific scrape level."""
    payload = {
        "username": username,
        "scrape_posts": True,
        "max_posts": max_posts,
        "level": level,
        "scrape_level": level  # Both for compatibility
    }
    
    test_name = f"@{username} - Level {level}"
    print(f"\n🔬 Testing: {test_name}")
    result = submit("twitter", payload)
    analyze_twitter_results(result, test_name)
    return result

print("\n🔬 SCRAPE LEVEL COMPARISON")
print("="*50)
print("📊 Testing different extraction levels")

level_descriptions = {
    1: "Basic extraction",
    2: "Enhanced data", 
    3: "Full profile data",
    4: "Comprehensive extraction"
}

level_results = {}
test_username = "vitalikbuterin"

for level in [1, 2, 3, 4]:
    print(f"\n📈 Level {level}: {level_descriptions[level]}")
    level_results[level] = test_scrape_level(test_username, level, max_posts=8)
    time.sleep(1)

# Compare results across levels
print("\n" + "="*70)
print("📊 LEVEL COMPARISON SUMMARY")
print("="*70)

for level in [1, 2, 3, 4]:
    result = level_results.get(level, {})
    if result.get("status") == "finished":
        data = result["result"].get("data", [])
        method = result["result"].get("search_metadata", {}).get("extraction_method", "Unknown")
        print(f"Level {level}: {len(data)} items | {method}")
    else:
        print(f"Level {level}: ❌ Failed or incomplete")


🔬 SCRAPE LEVEL COMPARISON
📊 Testing different extraction levels

📈 Level 1: Basic extraction

🔬 Testing: @vitalikbuterin - Level 1
🚀 Submitting twitter task...
📝 Payload: {
  "username": "vitalikbuterin",
  "scrape_posts": true,
  "max_posts": 8,
  "level": 1,
  "scrape_level": 1
}
🆔 Job ID: f1d33ff152ff4623b7b52f1efb8d31e6
⏳ Waiting for job f1d33ff152ff4623b7b52f1efb8d31e6...
⏱️  running 12s
✅ ERROR

🐦 ANALYSIS: @vitalikbuterin - Level 1
❌ FAILED: object of type 'bool' has no len()

📈 Level 2: Enhanced data

🔬 Testing: @vitalikbuterin - Level 2
🚀 Submitting twitter task...
📝 Payload: {
  "username": "vitalikbuterin",
  "scrape_posts": true,
  "max_posts": 8,
  "level": 2,
  "scrape_level": 2
}
🆔 Job ID: 68278faa5e1f48f1ba8cb1803282b168
⏳ Waiting for job 68278faa5e1f48f1ba8cb1803282b168...
⏱️  running 12s
✅ ERROR

🐦 ANALYSIS: @vitalikbuterin - Level 2
❌ FAILED: object of type 'bool' has no len()

📈 Level 3: Full profile data

🔬 Testing: @vitalikbuterin - Level 3
🚀 Submitting twitter t

## 🎯 Batch Testing Multiple Accounts

In [6]:
def test_multiple_accounts():
    """Test extraction across multiple accounts to validate consistency."""
    test_accounts_subset = ["naval", "paulg", "sama"]
    results = {}
    
    print("🎯 BATCH TESTING MULTIPLE ACCOUNTS")
    print("="*50)
    
    for account in test_accounts_subset:
        payload = {
            "username": account,
            "scrape_posts": True,
            "max_posts": 5,  # Small for batch testing
            "enable_date_filtering": True,
            "date_range": "last_week",
            "level": 4
        }
        
        print(f"\n🧪 Testing @{account}...")
        start_time = time.time()
        result = submit("twitter", payload)
        execution_time = time.time() - start_time
        
        results[account] = {
            "result": result,
            "execution_time": execution_time
        }
        
        if result["status"] == "finished":
            data_count = len(result["result"].get("data", []))
            print(f"✅ @{account}: {data_count} items in {execution_time:.1f}s")
        else:
            print(f"❌ @{account}: Failed - {result.get('error', 'Unknown error')}")
        
        time.sleep(1)  # Brief pause between accounts
    
    # Summary analysis
    print("\n" + "="*50)
    print("📊 BATCH TESTING SUMMARY")
    print("="*50)
    
    successful_tests = 0
    total_items = 0
    total_time = 0
    
    for account, data in results.items():
        result = data["result"]
        exec_time = data["execution_time"]
        
        if result["status"] == "finished":
            successful_tests += 1
            items = len(result["result"].get("data", []))
            total_items += items
            total_time += exec_time
            
            print(f"✅ @{account}: {items} items | {exec_time:.1f}s | {items/exec_time:.1f} items/s")
        else:
            print(f"❌ @{account}: Failed")
    
    if successful_tests > 0:
        avg_time = total_time / successful_tests
        avg_items = total_items / successful_tests
        print(f"\n📈 AVERAGES:")
        print(f"   ⏱️ Time: {avg_time:.1f}s per account")
        print(f"   📊 Items: {avg_items:.1f} per account")
        print(f"   🚀 Speed: {total_items/total_time:.1f} items/second overall")
        print(f"   ✅ Success rate: {successful_tests}/{len(test_accounts_subset)} ({successful_tests/len(test_accounts_subset)*100:.0f}%)")
    
    return results

batch_results = test_multiple_accounts()

🎯 BATCH TESTING MULTIPLE ACCOUNTS

🧪 Testing @naval...
🚀 Submitting twitter task...
📝 Payload: {
  "username": "naval",
  "scrape_posts": true,
  "max_posts": 5,
  "enable_date_filtering": true,
  "date_range": "last_week",
  "level": 4
}
🆔 Job ID: 32e1fe29c29e44c4976d9273fe10af96
⏳ Waiting for job 32e1fe29c29e44c4976d9273fe10af96...
⏱️  running 42s
✅ FINISHED
✅ @naval: 0 items in 45.1s

🧪 Testing @paulg...
🚀 Submitting twitter task...
📝 Payload: {
  "username": "paulg",
  "scrape_posts": true,
  "max_posts": 5,
  "enable_date_filtering": true,
  "date_range": "last_week",
  "level": 4
}
🆔 Job ID: 79f91a4c000e475a9e9eed20d80a8fab
⏳ Waiting for job 79f91a4c000e475a9e9eed20d80a8fab...
⏱️  running 39s
✅ FINISHED
✅ @paulg: 0 items in 42.1s

🧪 Testing @sama...
🚀 Submitting twitter task...
📝 Payload: {
  "username": "sama",
  "scrape_posts": true,
  "max_posts": 5,
  "enable_date_filtering": true,
  "date_range": "last_week",
  "level": 4
}
🆔 Job ID: 9b529274502847b988b3db4167a4f0b1
⏳ Waitin

## 🔍 Data Quality Validation

In [7]:
def validate_extracted_data(result, test_name):
    """Validate quality and completeness of extracted data."""
    print(f"\n🔍 DATA VALIDATION: {test_name}")
    print("="*50)
    
    if result["status"] != "finished":
        print(f"❌ Cannot validate - extraction failed")
        return False
    
    data = result["result"].get("data", [])
    if not data:
        print(f"❌ No data to validate")
        return False
    
    validation_score = 0
    max_score = 0
    
    # Check if comprehensive extraction
    if isinstance(data[0], dict) and 'profile' in data[0]:
        profile_data = data[0]
        
        # Profile validation
        profile = profile_data.get('profile', {})
        max_score += 4
        
        if profile.get('display_name'):
            validation_score += 1
            print(f"✅ Profile name: {profile['display_name']}")
        else:
            print(f"❌ Missing profile name")
            
        if profile.get('username'):
            validation_score += 1
            print(f"✅ Username: @{profile['username']}")
        else:
            print(f"❌ Missing username")
            
        if profile.get('bio'):
            validation_score += 1
            print(f"✅ Bio: {len(profile['bio'])} chars")
        else:
            print(f"⚠️ No bio found")
            
        if profile.get('followers_count') is not None:
            validation_score += 1
            print(f"✅ Followers count: {profile['followers_count']}")
        else:
            print(f"❌ Missing followers count")
        
        # Posts validation
        posts = profile_data.get('posts', [])
        if posts:
            max_score += 3
            print(f"\n📝 POSTS VALIDATION ({len(posts)} posts):")
            
            posts_with_text = [p for p in posts if isinstance(p, dict) and p.get('text')]
            if posts_with_text:
                validation_score += 1
                avg_length = sum(len(p['text']) for p in posts_with_text) / len(posts_with_text)
                print(f"✅ Text content: {len(posts_with_text)}/{len(posts)} posts, avg {avg_length:.0f} chars")
            else:
                print(f"❌ No text content found in posts")
            
            posts_with_dates = [p for p in posts if isinstance(p, dict) and p.get('date')]
            if posts_with_dates:
                validation_score += 1
                print(f"✅ Dates: {len(posts_with_dates)}/{len(posts)} posts have dates")
            else:
                print(f"⚠️ No dates found in posts")
            
            posts_with_metrics = [p for p in posts if isinstance(p, dict) and any(k in p for k in ['likes', 'retweets'])]
            if posts_with_metrics:
                validation_score += 1
                print(f"✅ Engagement metrics: {len(posts_with_metrics)}/{len(posts)} posts")
            else:
                print(f"⚠️ No engagement metrics found")
                
    else:
        # Direct posts validation
        max_score = 3
        posts = data
        print(f"📝 DIRECT POSTS VALIDATION ({len(posts)} posts):")
        
        posts_with_text = [p for p in posts if isinstance(p, dict) and p.get('text')]
        if posts_with_text:
            validation_score += 1
            print(f"✅ Text content: {len(posts_with_text)}/{len(posts)} posts")
        
        posts_with_dates = [p for p in posts if isinstance(p, dict) and p.get('date')]
        if posts_with_dates:
            validation_score += 1
            print(f"✅ Dates: {len(posts_with_dates)}/{len(posts)} posts")
        
        if len(posts) >= 5:
            validation_score += 1
            print(f"✅ Sufficient data: {len(posts)} posts extracted")
    
    # Calculate validation score
    score_percentage = (validation_score / max_score) * 100 if max_score > 0 else 0
    
    print(f"\n📊 VALIDATION SCORE: {validation_score}/{max_score} ({score_percentage:.0f}%)")
    
    if score_percentage >= 80:
        print(f"✅ EXCELLENT data quality")
    elif score_percentage >= 60:
        print(f"✅ GOOD data quality")
    elif score_percentage >= 40:
        print(f"⚠️ FAIR data quality - some issues detected")
    else:
        print(f"❌ POOR data quality - significant issues")
    
    return score_percentage >= 60

# Validate recent results
print("🔍 DATA QUALITY VALIDATION")
print("="*50)

if 'comprehensive_result' in locals():
    validate_extracted_data(comprehensive_result, "Comprehensive Extraction")

if 'basic_result' in locals():
    validate_extracted_data(basic_result, "Basic Extraction")

🔍 DATA QUALITY VALIDATION

🔍 DATA VALIDATION: Comprehensive Extraction
❌ Cannot validate - extraction failed


## 📊 Final Testing Summary

In [8]:
print("📊 TWITTER SCRAPER TESTING SUMMARY")
print("="*70)

# Count successful tests
total_tests = 0
successful_tests = 0
failed_tests = 0

test_results = []

# Check all test results
if 'basic_result' in locals():
    total_tests += 1
    if basic_result.get('status') == 'finished':
        successful_tests += 1
        test_results.append(("✅", "Basic Extraction", "PASSED"))
    else:
        failed_tests += 1
        test_results.append(("❌", "Basic Extraction", "FAILED"))

if 'comprehensive_result' in locals():
    total_tests += 1
    if comprehensive_result.get('status') == 'finished':
        successful_tests += 1
        test_results.append(("✅", "Comprehensive Extraction", "PASSED"))
    else:
        failed_tests += 1
        test_results.append(("❌", "Comprehensive Extraction", "FAILED"))

if 'date_results' in locals():
    for date_range, result in date_results.items():
        total_tests += 1
        if result.get('status') == 'finished':
            successful_tests += 1
            test_results.append(("✅", f"Date Filter ({date_range})", "PASSED"))
        else:
            failed_tests += 1
            test_results.append(("❌", f"Date Filter ({date_range})", "FAILED"))

if 'level_results' in locals():
    for level, result in level_results.items():
        total_tests += 1
        if result.get('status') == 'finished':
            successful_tests += 1
            test_results.append(("✅", f"Scrape Level {level}", "PASSED"))
        else:
            failed_tests += 1
            test_results.append(("❌", f"Scrape Level {level}", "FAILED"))

if 'batch_results' in locals():
    for account, data in batch_results.items():
        total_tests += 1
        if data['result'].get('status') == 'finished':
            successful_tests += 1
            test_results.append(("✅", f"Batch Test (@{account})", "PASSED"))
        else:
            failed_tests += 1
            test_results.append(("❌", f"Batch Test (@{account})", "FAILED"))

# Display results
print(f"📈 OVERALL RESULTS:")
print(f"   Total Tests: {total_tests}")
print(f"   Successful: {successful_tests}")
print(f"   Failed: {failed_tests}")

if total_tests > 0:
    success_rate = (successful_tests / total_tests) * 100
    print(f"   Success Rate: {success_rate:.1f}%")
    
    print(f"\n📋 DETAILED RESULTS:")
    for emoji, test_name, status in test_results:
        print(f"   {emoji} {test_name}: {status}")
    
    print(f"\n🎯 OVERALL ASSESSMENT:")
    if success_rate >= 90:
        print(f"🎉 EXCELLENT - Twitter scraper is working perfectly!")
    elif success_rate >= 75:
        print(f"✅ GOOD - Twitter scraper is working well with minor issues")
    elif success_rate >= 50:
        print(f"⚠️ FAIR - Twitter scraper has some issues that need attention")
    else:
        print(f"❌ POOR - Twitter scraper needs significant fixes")
else:
    print("⚠️ No tests were run")

print(f"\n🔗 Data files can be found in: {SCRAPED}/twitter/")
print(f"🎉 Testing completed!")

📊 TWITTER SCRAPER TESTING SUMMARY
📈 OVERALL RESULTS:
   Total Tests: 11
   Successful: 4
   Failed: 7
   Success Rate: 36.4%

📋 DETAILED RESULTS:
   ❌ Comprehensive Extraction: FAILED
   ❌ Date Filter (last_week): FAILED
   ❌ Date Filter (last_day): FAILED
   ❌ Date Filter (today): FAILED
   ❌ Scrape Level 1: FAILED
   ❌ Scrape Level 2: FAILED
   ❌ Scrape Level 3: FAILED
   ✅ Scrape Level 4: PASSED
   ✅ Batch Test (@naval): PASSED
   ✅ Batch Test (@paulg): PASSED
   ✅ Batch Test (@sama): PASSED

🎯 OVERALL ASSESSMENT:
❌ POOR - Twitter scraper needs significant fixes

🔗 Data files can be found in: /storage/scraped_data/twitter/
🎉 Testing completed!
