# Scraped Data Analysis

This notebook demonstrates how to analyze data scraped using the web scraper framework.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime
import numpy as np

# Plot appearance: seaborn-flavoured matplotlib style sheet with the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Pandas display: no cap on column count or line width; cell text capped at 100 chars
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

## Load Scraped Data

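Load the most recently modified JSON file from the `../data/raw` directory. If no JSON file is found, a small synthetic sample dataset is generated instead so the rest of the notebook can still run.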

In [None]:
# Define data directories
data_dir = Path("../data")
raw_data_dir = data_dir / "raw"
processed_data_dir = data_dir / "processed"

# Fixed seed so the fallback sample data (and every chart derived from it)
# is identical on each Restart & Run All.
SAMPLE_SEED = 42


def _make_sample_record(i, rng):
    """Build one synthetic tweet-like record for demonstration purposes.

    Args:
        i: Zero-based index of the record; drives the id, text and timestamp.
        rng: numpy random Generator used for the engagement counts and language.

    Returns:
        A dict with the same keys the analysis cells below look for.
    """
    text = f"This is sample tweet {i} with #hashtag"
    return {
        "id": f"sample_{i}",
        "text": text,
        # Day cycles through 15-24 and hour through 10-21, so timestamps are always valid
        "created_at": f"2024-01-{15+i%10:02d}T{10+i%12:02d}:30:00Z",
        "like_count": int(rng.integers(0, 100)),
        "retweet_count": int(rng.integers(0, 50)),
        "reply_count": int(rng.integers(0, 20)),
        "lang": str(rng.choice(["en", "es", "fr"], p=[0.7, 0.2, 0.1])),
        # Every third record carries a second hashtag
        "hashtags": ["#hashtag", "#sample"] if i % 3 == 0 else ["#hashtag"],
        "text_length": len(text),
    }


# Find the latest JSON file (by modification time)
json_files = list(raw_data_dir.glob("*.json"))
if json_files:
    latest_file = max(json_files, key=lambda x: x.stat().st_mtime)
    print(f"Loading data from: {latest_file}")

    with open(latest_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    print(f"Loaded {len(df)} records")
    print(f"Columns: {list(df.columns)}")
else:
    print("No JSON files found in the raw data directory.")
    print("Please run the scraper first to generate data.")
    # Create reproducible sample data for demonstration
    rng = np.random.default_rng(SAMPLE_SEED)
    sample_data = [_make_sample_record(i, rng) for i in range(100)]
    df = pd.DataFrame(sample_data)
    print("Created sample data for demonstration")

# Display basic information
df.head()

## Data Overview and Cleaning

In [None]:
# Basic data information: shape, column dtypes and per-column null counts
print("Dataset Shape:", df.shape)
print("\nData Types:")
print(df.dtypes)
print("\nMissing Values:")
print(df.isnull().sum())

# Convert date columns if they exist, and derive the calendar fields
# (date, hour, weekday name) that the temporal analysis groups by
if 'created_at' in df.columns:
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['date'] = df['created_at'].dt.date
    df['hour'] = df['created_at'].dt.hour
    df['day_of_week'] = df['created_at'].dt.day_name()

print("\nDataset after date processing:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

## Engagement Metrics Analysis

In [None]:
# Engagement metrics: derive a combined score, summarize it, and plot each distribution
base_metrics = ['like_count', 'retweet_count', 'reply_count']
if set(base_metrics).issubset(df.columns):
    # Combined score is the plain sum of likes, retweets and replies
    df['total_engagement'] = df['like_count'] + df['retweet_count'] + df['reply_count']

    engagement_cols = ['like_count', 'retweet_count', 'reply_count', 'total_engagement']

    # Summary table for the three raw metrics and the combined score
    print("Engagement Statistics:")
    print(df[engagement_cols].describe())

    # One histogram per metric on a 2x2 grid, filled row by row
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Engagement Metrics Distribution', fontsize=16)

    for ax, metric in zip(axes.flat, engagement_cols):
        pretty_name = metric.replace('_', ' ').title()
        df[metric].hist(bins=20, ax=ax, alpha=0.7)
        ax.set_title(f'{pretty_name} Distribution')
        ax.set_xlabel(pretty_name)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## Temporal Analysis

In [None]:
if 'created_at' in df.columns:
    def _post_counts(key):
        """Count rows per distinct value of `key`, as a frame with a 'post_count' column."""
        return df.groupby(key).size().reset_index(name='post_count')

    # Post volume per calendar date, per hour of day, and per weekday name
    daily_posts = _post_counts('date')
    hourly_posts = _post_counts('hour')
    weekly_posts = _post_counts('day_of_week')

    # Weekday names would otherwise sort alphabetically; impose Monday-to-Sunday order
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekly_posts['day_of_week'] = pd.Categorical(weekly_posts['day_of_week'], categories=day_order, ordered=True)
    weekly_posts = weekly_posts.sort_values('day_of_week')

    fig, axes = plt.subplots(1, 3, figsize=(20, 6))
    ax_daily, ax_hourly, ax_weekly = axes

    # Left panel: daily timeline
    ax_daily.plot(daily_posts['date'], daily_posts['post_count'], marker='o')
    ax_daily.set(title='Posts Over Time', xlabel='Date', ylabel='Number of Posts')
    ax_daily.tick_params(axis='x', rotation=45)

    # Middle panel: distribution over the hours of the day
    ax_hourly.bar(hourly_posts['hour'], hourly_posts['post_count'])
    ax_hourly.set(title='Posts by Hour of Day', xlabel='Hour', ylabel='Number of Posts')

    # Right panel: distribution over the days of the week
    ax_weekly.bar(weekly_posts['day_of_week'], weekly_posts['post_count'])
    ax_weekly.set(title='Posts by Day of Week', xlabel='Day of Week', ylabel='Number of Posts')
    ax_weekly.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## Language and Content Analysis

In [None]:
# Language mix: the same counts shown as shares (pie) and as absolute numbers (bar)
if 'lang' in df.columns:
    lang_counts = df['lang'].value_counts()

    fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: share of posts per language
    ax_pie.pie(lang_counts.values, labels=lang_counts.index, autopct='%1.1f%%')
    ax_pie.set_title('Language Distribution')

    # Right panel: number of posts per language
    lang_counts.plot(kind='bar', ax=ax_bar)
    ax_bar.set_title('Posts by Language')
    ax_bar.set_xlabel('Language')
    ax_bar.set_ylabel('Number of Posts')
    plt.setp(ax_bar.get_xticklabels(), rotation=45)

    plt.tight_layout()
    plt.show()

# Text length: histogram and box plot side by side, followed by mean and median
if 'text_length' in df.columns:
    text_lengths = df['text_length']

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

    text_lengths.hist(bins=30, alpha=0.7, ax=ax_hist)
    ax_hist.set_title('Text Length Distribution')
    ax_hist.set_xlabel('Text Length (characters)')
    ax_hist.set_ylabel('Frequency')

    df.boxplot(column='text_length', ax=ax_box)
    ax_box.set_title('Text Length Box Plot')
    ax_box.set_ylabel('Text Length (characters)')

    plt.tight_layout()
    plt.show()

    print(f"Average text length: {text_lengths.mean():.1f} characters")
    print(f"Median text length: {text_lengths.median():.1f} characters")

## Hashtag Analysis

In [None]:
if 'hashtags' in df.columns:
    # Flatten the per-post hashtag lists; entries that are not lists are skipped
    all_hashtags = [
        tag
        for tags in df['hashtags']
        if isinstance(tags, list)
        for tag in tags
    ]

    # Occurrences per hashtag, most frequent first
    hashtag_counts = pd.Series(all_hashtags).value_counts()

    print(f"Total unique hashtags: {len(hashtag_counts)}")
    print(f"Total hashtag mentions: {len(all_hashtags)}")

    # Horizontal bar chart of the most frequent hashtags (at most 15)
    top_hashtags = hashtag_counts.head(15)
    bar_positions = range(len(top_hashtags))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(bar_positions, top_hashtags.values)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_hashtags.index)
    ax.set_xlabel('Frequency')
    ax.set_title('Top 15 Hashtags')
    ax.invert_yaxis()  # most frequent hashtag at the top

    plt.tight_layout()
    plt.show()

    def _tag_count(tags):
        """Number of hashtags on one post; anything that is not a list counts as 0."""
        return len(tags) if isinstance(tags, list) else 0

    # Per-post hashtag count, stored on df for later cells
    df['hashtag_count'] = df['hashtags'].apply(_tag_count)

    fig, ax = plt.subplots(figsize=(10, 4))
    posts_per_count = df['hashtag_count'].value_counts().sort_index()
    posts_per_count.plot(kind='bar', ax=ax)
    ax.set_title('Distribution of Number of Hashtags per Post')
    ax.set_xlabel('Number of Hashtags')
    ax.set_ylabel('Number of Posts')
    plt.show()

    print(f"Average hashtags per post: {df['hashtag_count'].mean():.2f}")

## Correlation Analysis

In [None]:
# Correlate every pair of numeric columns currently present on df
numeric_cols = df.select_dtypes(include=[np.number]).columns

if len(numeric_cols) > 1:
    correlation_matrix = df[numeric_cols].corr()

    # Annotated heatmap; the diverging colormap is centred on zero correlation
    heatmap_options = dict(annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f')
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
    ax.set_title('Correlation Matrix of Numeric Variables')
    plt.tight_layout()
    plt.show()

    # List each pair once (upper triangle only) when |r| exceeds 0.5
    print("Strong correlations (|r| > 0.5):")
    labels = correlation_matrix.columns
    for i, first in enumerate(labels):
        for j in range(i + 1, len(labels)):
            r = correlation_matrix.iloc[i, j]
            if abs(r) > 0.5:
                print(f"{first} <-> {labels[j]}: {r:.3f}")

## Advanced Analytics and Insights

In [None]:
# Average engagement by text length
if all(col in df.columns for col in ['text_length', 'total_engagement']):
    # Split the observed length range into five equal-width bins
    df['text_length_bin'] = pd.cut(df['text_length'],
                                  bins=5,
                                  labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'])

    # The grouper is categorical, so pass `observed` explicitly: recent pandas
    # releases warn when the default is relied upon. observed=False keeps all
    # five categories on the chart, including bins that contain no posts.
    engagement_by_length = df.groupby('text_length_bin', observed=False)['total_engagement'].mean()

    plt.figure(figsize=(10, 6))
    engagement_by_length.plot(kind='bar')
    plt.title('Average Engagement by Text Length')
    plt.xlabel('Text Length Category')
    plt.ylabel('Average Total Engagement')
    plt.xticks(rotation=45)
    plt.show()

# Average engagement by hashtag count
if all(col in df.columns for col in ['hashtag_count', 'total_engagement']):
    engagement_by_hashtags = df.groupby('hashtag_count')['total_engagement'].mean()

    plt.figure(figsize=(10, 6))
    engagement_by_hashtags.plot(kind='bar')
    plt.title('Average Engagement by Number of Hashtags')
    plt.xlabel('Number of Hashtags')
    plt.ylabel('Average Total Engagement')
    plt.show()

# Average engagement by language, highest first
if all(col in df.columns for col in ['lang', 'total_engagement']):
    engagement_by_lang = df.groupby('lang')['total_engagement'].mean().sort_values(ascending=False)

    plt.figure(figsize=(10, 6))
    engagement_by_lang.plot(kind='bar')
    plt.title('Average Engagement by Language')
    plt.xlabel('Language')
    plt.ylabel('Average Total Engagement')
    plt.xticks(rotation=45)
    plt.show()

## Export Analysis Results

In [None]:
# Create summary statistics.
# Each entry falls back to None / {} when the column it needs is absent, so
# the export also works on datasets that lack some fields.

def _format_day(timestamp):
    """Return `timestamp` as 'YYYY-MM-DD', or None when it is missing (NaT).

    min()/max() of a datetime column with no valid values yield NaT, and
    calling strftime on NaT raises, hence the guard.
    """
    return timestamp.strftime('%Y-%m-%d') if pd.notna(timestamp) else None


has_dates = 'created_at' in df.columns
has_text_length = 'text_length' in df.columns

analysis_summary = {
    'total_records': len(df),
    'date_range': {
        'start': _format_day(df['created_at'].min()) if has_dates else None,
        'end': _format_day(df['created_at'].max()) if has_dates else None
    },
    'languages': df['lang'].value_counts().to_dict() if 'lang' in df.columns else {},
    'average_engagement': df['total_engagement'].mean() if 'total_engagement' in df.columns else None,
    # hashtag_counts is defined by the hashtag analysis cell whenever the
    # 'hashtags' column exists, which is why the same condition guards it here
    'top_hashtags': hashtag_counts.head(10).to_dict() if 'hashtags' in df.columns else {},
    'text_stats': {
        'avg_length': df['text_length'].mean() if has_text_length else None,
        'median_length': df['text_length'].median() if has_text_length else None
    }
}

# Save analysis summary under a timestamped name so earlier exports are kept.
# The processed directory may not exist yet (e.g. when running on sample
# data), so create it first instead of letting open() fail.
processed_data_dir.mkdir(parents=True, exist_ok=True)
summary_file = processed_data_dir / f"analysis_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(summary_file, 'w', encoding='utf-8') as f:
    # default=str stringifies any value the json module cannot encode natively
    json.dump(analysis_summary, f, indent=2, default=str)

print(f"Analysis summary saved to: {summary_file}")
print("\nAnalysis Summary:")
for key, value in analysis_summary.items():
    print(f"{key}: {value}")

## Insights and Recommendations

The analysis above is meant to surface insights in the following areas:

1. **Temporal Patterns**: Look for peak posting times and days to optimize content timing
2. **Engagement Drivers**: Identify what content characteristics lead to higher engagement
3. **Language Distribution**: Understand your audience's language preferences
4. **Hashtag Strategy**: Use top-performing hashtags and optimal hashtag counts
5. **Content Length**: Find the sweet spot for text length that maximizes engagement

This analysis framework can be extended with:
- Sentiment analysis
- Topic modeling
- User network analysis
- Predictive modeling for engagement
- Anomaly detection for viral content