# Stream Graph - Flow and Temporal Evolution

**Use Case**: Show how multiple categories change over time, emphasizing flow and continuity through a stacked-area visualization with an organic appearance.

This notebook demonstrates how to create effective stream graphs for visualizing flowing data changes over time.


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from matplotlib.patches import Polygon
from scipy.interpolate import interp1d
from scipy.ndimage import gaussian_filter1d
import warnings
warnings.filterwarnings('ignore')

# Fix the legacy global NumPy seed so the np.random.* draws in the
# data-generation cell are reproducible on a fresh kernel
np.random.seed(42)

# Plot appearance: whitegrid matplotlib style plus the Set3 seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set3")

print("Stream graph visualization libraries loaded!")


In [None]:
# Create sample datasets for stream graphs
# 1. Social Media Platform Usage Over Time
years = np.arange(2010, 2024)
n_years = len(years)

# Synthetic adoption-curve parameters per platform:
#   peak_year    - last year that still follows the logistic growth curve
#   max_users    - ceiling of the logistic curve and starting level of the post-peak phase
#   growth_rate  - steepness of the logistic curve
#   decline_rate - per-year multiplier applied after the peak (<1 shrinks, >1 grows)
platforms = {
    'Facebook': dict(peak_year=2016, max_users=45, growth_rate=0.8, decline_rate=0.95),
    'Instagram': dict(peak_year=2020, max_users=35, growth_rate=1.2, decline_rate=0.98),
    'Twitter': dict(peak_year=2018, max_users=25, growth_rate=0.6, decline_rate=0.92),
    'TikTok': dict(peak_year=2022, max_users=40, growth_rate=2.0, decline_rate=1.0),
    'LinkedIn': dict(peak_year=2021, max_users=20, growth_rate=0.4, decline_rate=1.02),
    'YouTube': dict(peak_year=2019, max_users=30, growth_rate=0.5, decline_rate=1.01),
}

# Long-format records: one row per (year, platform)
social_media_data = []
for year in years:
    elapsed = year - 2010
    for platform, params in platforms.items():
        since_peak = year - params['peak_year']

        if since_peak > 0:
            # Post-peak phase: compound the yearly multiplier from max_users
            usage = params['max_users'] * (params['decline_rate'] ** since_peak)
        else:
            # Growth phase: logistic curve whose midpoint sits 5 years after 2010
            usage = params['max_users'] * (1 / (1 + np.exp(-params['growth_rate'] * (elapsed - 5))))

        # Gaussian noise with a standard deviation of 10% of the level, floored at zero
        usage = max(0, usage + np.random.normal(0, usage * 0.1))

        social_media_data.append({'Year': year, 'Platform': platform, 'Users': usage})

social_df = pd.DataFrame(social_media_data)

# 2. Energy Source Mix Over Time
# Per-source parameters:
#   start_pct / end_pct - raw share in the first / last year (before normalisation)
#   volatility          - amplitude of the shared sine cycle and scale of the noise
#   peak_year           - carried along but not read by the generator below
energy_sources = {
    'Coal': dict(start_pct=40, end_pct=15, peak_year=2012, volatility=2),
    'Natural Gas': dict(start_pct=25, end_pct=35, peak_year=2020, volatility=3),
    'Nuclear': dict(start_pct=20, end_pct=18, peak_year=2015, volatility=1),
    'Hydroelectric': dict(start_pct=8, end_pct=7, peak_year=2016, volatility=1.5),
    'Wind': dict(start_pct=2, end_pct=15, peak_year=2023, volatility=2.5),
    'Solar': dict(start_pct=1, end_pct=8, peak_year=2023, volatility=3),
    'Other Renewables': dict(start_pct=4, end_pct=2, peak_year=2014, volatility=1),
}

energy_data = []
for year in years:
    # Both terms depend only on the year, so compute them once per year
    progress = (year - years[0]) / (years[-1] - years[0])
    wave = np.sin((year - years[0]) * 0.5)

    # Raw (un-normalised) shares for this year
    year_data = {}
    for source, params in energy_sources.items():
        # Linear transition from start_pct to end_pct
        base_pct = params['start_pct'] + (params['end_pct'] - params['start_pct']) * progress
        cycle_effect = wave * params['volatility']
        noise = np.random.normal(0, params['volatility'] * 0.3)

        # Floor at 0.1 so no source disappears before normalisation
        year_data[source] = max(0.1, base_pct + cycle_effect + noise)

    total_percentage = sum(year_data.values())

    # Rescale so the sources of each year sum to 100%
    for source in energy_sources:
        year_data[source] = (year_data[source] / total_percentage) * 100
        energy_data.append({'Year': year, 'Source': source, 'Percentage': year_data[source]})

energy_df = pd.DataFrame(energy_data)

# 3. Music Genre Popularity (Spotify-like streaming data)
genres = ['Pop', 'Hip-Hop', 'Rock', 'Electronic', 'Country', 'Jazz', 'Classical', 'R&B', 'Folk', 'Reggae']

# Month-start frequency: both endpoints parse to the first day of their month,
# so 'MS' yields all 48 months from 2020-01 through 2023-12. A month-end
# frequency would stop at 2023-11-30 and silently drop December 2023.
months = pd.date_range('2020-01', '2023-12', freq='MS')

music_data = []
for month in months:
    month_num = month.month
    year = month.year

    # Multiplicative seasonal factor per genre (a sinusoid with a 12-month period)
    seasonal_factors = {
        'Pop': 1 + 0.2 * np.sin(2 * np.pi * month_num / 12),  # Peaks in March
        'Hip-Hop': 1 + 0.15 * np.cos(2 * np.pi * month_num / 12),  # Peaks in December
        'Rock': 1 + 0.1 * np.sin(2 * np.pi * month_num / 12 + np.pi/4),  # Peaks around January/February
        'Electronic': 1 + 0.3 * np.sin(2 * np.pi * month_num / 12 + np.pi/2),  # Peaks in December
        'Country': 1 + 0.2 * np.cos(2 * np.pi * month_num / 12 + np.pi/3),  # Peaks in October
        'Jazz': 1 + 0.1 * np.cos(2 * np.pi * month_num / 12),  # Peaks in December
        'Classical': 1 + 0.15 * np.cos(2 * np.pi * month_num / 12 + np.pi/6),  # Peaks in November
        'R&B': 1 + 0.1 * np.sin(2 * np.pi * month_num / 12),  # Peaks in March
        'Folk': 1 + 0.2 * np.sin(2 * np.pi * month_num / 12 + np.pi),  # Peaks in September
        'Reggae': 1 + 0.25 * np.sin(2 * np.pi * month_num / 12)  # Peaks in March
    }

    # Base popularity levels with a linear per-year trend relative to 2020
    base_popularity = {
        'Pop': 25 - (year - 2020) * 0.5,  # Slight decline
        'Hip-Hop': 20 + (year - 2020) * 1.0,  # Growing
        'Rock': 15 - (year - 2020) * 0.3,  # Declining
        'Electronic': 12 + (year - 2020) * 0.8,  # Growing
        'Country': 10 + (year - 2020) * 0.2,  # Roughly stable
        'Jazz': 5 - (year - 2020) * 0.1,  # Slight decline
        'Classical': 4 - (year - 2020) * 0.1,  # Slight decline
        'R&B': 6 + (year - 2020) * 0.3,  # Growing
        'Folk': 2 + (year - 2020) * 0.1,  # Slight growth
        'Reggae': 3 + (year - 2020) * 0.0  # Flat
    }

    month_total = 0
    month_data = {}

    for genre in genres:
        # Trend x seasonality plus unit-variance Gaussian noise
        popularity = (base_popularity[genre] * seasonal_factors[genre] +
                     np.random.normal(0, 1))
        popularity = max(0.5, popularity)  # Floor so every genre stays visible
        month_data[genre] = popularity
        month_total += popularity

    # Normalize so the genres of each month sum to 100%
    for genre in genres:
        normalized_pct = (month_data[genre] / month_total) * 100
        music_data.append({
            'Month': month,
            'Genre': genre,
            'Popularity': normalized_pct
        })

music_df = pd.DataFrame(music_data)

# 4. Economic Sector Employment Share
sectors = ['Manufacturing', 'Services', 'Agriculture', 'Technology', 'Healthcare', 'Education', 'Retail', 'Construction']
economic_years = np.arange(2000, 2024)

# (share in 2000, total change by 2023) for each sector, before cycle effects
sector_trends = {
    'Manufacturing': (25, -8),   # Declining
    'Services': (30, 5),         # Growing
    'Agriculture': (8, -3),      # Declining
    'Technology': (5, 12),       # Rapidly growing
    'Healthcare': (8, 4),        # Growing
    'Education': (7, 1),         # Slowly growing
    'Retail': (12, -2),          # Declining
    'Construction': (5, -1),     # Slightly declining
}

# How strongly each sector reacts to the shared boom/bust cycle
cycle_sensitivity = {
    'Manufacturing': 1.5,
    'Services': 0.8,
    'Agriculture': 0.3,
    'Technology': 1.2,
    'Healthcare': 0.4,
    'Education': 0.2,
    'Retail': 1.0,
    'Construction': 2.0
}

sector_data = []
for year in economic_years:
    # 0.0 in 2000, 1.0 in 2023
    year_progress = (year - 2000) / 23

    # Linear trend per sector
    base_shares = {name: start + change * year_progress
                   for name, (start, change) in sector_trends.items()}

    # Shared sinusoidal economic cycle with an 8-year period
    cycle_phase = 2 * np.pi * (year - 2000) / 8
    cycle_strength = 2 * np.sin(cycle_phase)

    # Trend + sector-specific cycle response + noise, floored at 0.5
    year_shares = {}
    for sector, base_share in base_shares.items():
        adjusted_share = (base_share +
                          cycle_strength * cycle_sensitivity[sector] +
                          np.random.normal(0, 0.5))
        year_shares[sector] = max(0.5, adjusted_share)

    year_total = sum(year_shares.values())

    # Rescale so the sectors of each year sum to 100%
    sector_data.extend(
        {'Year': year, 'Sector': sector, 'Employment_Share': (year_shares[sector] / year_total) * 100}
        for sector in sectors
    )

employment_df = pd.DataFrame(sector_data)

# 5. Website Traffic Sources
# Month-start frequency: both endpoints parse to the first day of their month,
# so 'MS' yields all 36 months from 2021-01 through 2023-12. A month-end
# frequency would stop at 2023-11-30 and silently drop December 2023.
traffic_months = pd.date_range('2021-01', '2023-12', freq='MS')
traffic_sources = ['Organic Search', 'Paid Search', 'Social Media', 'Direct', 'Email', 'Referral']

traffic_data = []
for month in traffic_months:
    month_num = month.month
    # Fractional years elapsed since January 2021
    year_progress = (month.year - 2021) + (month.month - 1) / 12

    # Base traffic shares with linear trends
    base_traffic = {
        'Organic Search': 40 - 2 * year_progress,  # Slightly declining
        'Paid Search': 20 + 3 * year_progress,  # Growing investment
        'Social Media': 15 + 5 * year_progress,  # Growing significantly
        'Direct': 15 - 1 * year_progress,  # Slightly declining
        'Email': 7 - 0.5 * year_progress,  # Declining
        'Referral': 3 + 1 * year_progress  # Growing
    }

    # Multiplicative seasonal factor per source (a sinusoid with a 12-month period)
    seasonal_effects = {
        'Organic Search': 1 + 0.1 * np.cos(2 * np.pi * month_num / 12),  # Peaks in December
        'Paid Search': 1 + 0.15 * np.sin(2 * np.pi * month_num / 12 + np.pi/2),  # Peaks in December (Q4 campaigns)
        'Social Media': 1 + 0.2 * np.sin(2 * np.pi * month_num / 12),  # Peaks in March
        'Direct': 1 + 0.05 * np.cos(2 * np.pi * month_num / 12),  # Peaks in December
        'Email': 1 + 0.1 * np.cos(2 * np.pi * month_num / 12 + np.pi/4),  # Peaks around October/November
        'Referral': 1 + 0.08 * np.sin(2 * np.pi * month_num / 12)  # Peaks in March
    }

    month_total = 0
    month_traffic = {}

    for source, base_pct in base_traffic.items():
        # Trend x seasonality plus unit-variance Gaussian noise
        traffic_pct = (base_pct * seasonal_effects[source] +
                      np.random.normal(0, 1))
        traffic_pct = max(1, traffic_pct)  # Floor at 1 before normalisation
        month_traffic[source] = traffic_pct
        month_total += traffic_pct

    # Normalize so the sources of each month sum to 100%
    for source in traffic_sources:
        normalized_pct = (month_traffic[source] / month_total) * 100
        traffic_data.append({
            'Month': month,
            'Source': source,
            'Percentage': normalized_pct
        })

traffic_df = pd.DataFrame(traffic_data)

# Report how many long-format rows each synthetic dataset contains
print("Sample stream graph datasets created:")
for title, frame, grain in (
    ("Social Media Usage", social_df, "platform-year"),
    ("Energy Sources", energy_df, "source-year"),
    ("Music Genres", music_df, "genre-month"),
    ("Employment Sectors", employment_df, "sector-year"),
    ("Website Traffic", traffic_df, "source-month"),
):
    print(f"{title}: {len(frame)} {grain} combinations")

# Peek at the first rows of one dataset
print("\nSample Social Media Data:")
print(social_df.head(3))


In [None]:
# Create basic stream graphs: a 2x2 grid, one dataset per panel
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Stream Graph Visualizations - Flow and Temporal Evolution', fontsize=16, fontweight='bold')

# 1. Social Media Platform Usage Stream Graph
ax1 = axes[0, 0]

# Wide layout: one row per year, one column per platform
social_pivot = social_df.pivot(index='Year', columns='Platform', values='Users').fillna(0)

x = social_pivot.index

# One Set3 colour per platform
colors = plt.cm.Set3(np.linspace(0, 1, len(social_pivot.columns)))

# Stack the lightly smoothed layers on top of each other from a zero baseline
y_stack = np.zeros(len(x))
for platform, layer_color in zip(social_pivot.columns, colors):
    y_smooth = gaussian_filter1d(social_pivot[platform].values, sigma=0.8)
    layer_top = y_stack + y_smooth

    ax1.fill_between(x, y_stack, layer_top,
                     alpha=0.8, color=layer_color, label=platform, linewidth=0)
    y_stack = layer_top  # the next layer starts where this one ends

ax1.set_title('Social Media Platform Usage Evolution\n(Million Users)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Year')
ax1.set_ylabel('Users (Millions)')
# Legend sits just outside the right edge of the axes
ax1.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax1.grid(True, alpha=0.3)

# 2. Energy Source Mix Stream Graph  
ax2 = axes[0, 1]

# Wide layout: one row per year, one column per energy source
energy_pivot = energy_df.pivot(index='Year', columns='Source', values='Percentage').fillna(0)

# Centered stream (ThemeRiver style): start half the yearly total below zero
# so the finished stack straddles the y = 0 line
x = energy_pivot.index
baseline = np.zeros(len(x))
total_values = energy_pivot.sum(axis=1)
y_stack = -total_values / 2

energy_colors = plt.cm.tab10(np.linspace(0, 1, len(energy_pivot.columns)))

# Explicit stacking order; the colour index follows the position in this list
energy_order = ['Coal', 'Natural Gas', 'Nuclear', 'Hydroelectric', 'Wind', 'Solar', 'Other Renewables']
for i, source in enumerate(energy_order):
    if source not in energy_pivot.columns:
        continue

    y_smooth = gaussian_filter1d(energy_pivot[source].values, sigma=0.6)
    layer_top = y_stack + y_smooth

    ax2.fill_between(x, y_stack, layer_top,
                     alpha=0.8, color=energy_colors[i], label=source, linewidth=0)
    y_stack = layer_top

ax2.set_title('Energy Source Mix Evolution\n(Percentage Share)', fontsize=12, fontweight='bold')
ax2.set_xlabel('Year')
ax2.set_ylabel('Percentage of Total Energy')
ax2.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax2.grid(True, alpha=0.3)
# Mark the centre line of the stream
ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)

# 3. Music Genre Popularity Stream Graph
ax3 = axes[1, 0]

# Approximate "months since the Unix epoch" (nanoseconds -> seconds -> 30-day months).
# The plot below does not read this column; it positions months by row number.
# Cast explicitly to int64: plain `int` is a 32-bit integer on some platforms,
# and pandas refuses to convert datetimes to a 32-bit integer.
music_df['Month_Numeric'] = pd.to_datetime(music_df['Month']).astype('int64') / 10**9 / (30*24*3600)

# Wide layout: one row per month, one column per genre
music_pivot = music_df.pivot(index='Month', columns='Genre', values='Popularity').fillna(0)

# Plot against integer positions and label every 6th month
x = np.arange(len(music_pivot.index))
x_labels = [month.strftime('%Y-%m') for month in music_pivot.index[::6]]  # Every 6 months
x_tick_positions = np.arange(0, len(music_pivot.index), 6)

# Create stream with a hand-picked ordering for visual appeal
genre_order = ['Pop', 'Hip-Hop', 'Electronic', 'Rock', 'Country', 'R&B', 'Jazz', 'Classical', 'Folk', 'Reggae']
y_stack = np.zeros(len(x))

music_colors = plt.cm.Spectral(np.linspace(0, 1, len(genre_order)))

for i, genre in enumerate(genre_order):
    if genre in music_pivot.columns:
        y_values = music_pivot[genre].values
        y_smooth = gaussian_filter1d(y_values, sigma=1.0)

        ax3.fill_between(x, y_stack, y_stack + y_smooth,
                        alpha=0.8, color=music_colors[i], label=genre, linewidth=0)
        y_stack += y_smooth  # the next layer starts where this one ends

ax3.set_title('Music Genre Popularity Streams\n(Monthly Streaming Share)', fontsize=12, fontweight='bold')
ax3.set_xlabel('Time Period')
ax3.set_ylabel('Popularity Share (%)')
ax3.set_xticks(x_tick_positions)
ax3.set_xticklabels(x_labels, rotation=45)
ax3.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax3.grid(True, alpha=0.3)

# 4. Employment Sector Stream Graph
ax4 = axes[1, 1]

# Wide layout: one row per year, one column per sector
employment_pivot = employment_df.pivot(index='Year', columns='Sector', values='Employment_Share').fillna(0)

x = employment_pivot.index
sector_colors = plt.cm.tab20(np.linspace(0, 1, len(employment_pivot.columns)))

# Largest average share at the bottom of the stack for a clearer visual hierarchy
sector_means = employment_pivot.mean().sort_values(ascending=False)
ordered_sectors = sector_means.index

y_stack = np.zeros(len(x))
for sector, layer_color in zip(ordered_sectors, sector_colors):
    y_smooth = gaussian_filter1d(employment_pivot[sector].values, sigma=0.5)
    layer_top = y_stack + y_smooth

    ax4.fill_between(x, y_stack, layer_top,
                     alpha=0.8, color=layer_color, label=sector, linewidth=0)
    y_stack = layer_top

ax4.set_title('Economic Sector Employment Share\n(Percentage of Workforce)', fontsize=12, fontweight='bold')
ax4.set_xlabel('Year')
ax4.set_ylabel('Employment Share (%)')
ax4.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax4.grid(True, alpha=0.3)

# Finalise and render the 2x2 figure
plt.tight_layout()
plt.show()


In [None]:
# Advanced stream graph techniques: a second 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Advanced Stream Graph Techniques', fontsize=16, fontweight='bold')

# 1. Symmetric Stream Graph (ThemeRiver style)
ax1 = axes[0, 0]

# Wide layout: one row per month, one column per traffic source
traffic_pivot = traffic_df.pivot(index='Month', columns='Source', values='Percentage').fillna(0)

# Plot against integer positions and label every 6th month
x = np.arange(len(traffic_pivot.index))
x_labels = [month.strftime('%Y-%m') for month in traffic_pivot.index[::6]]
x_tick_positions = np.arange(0, len(traffic_pivot.index), 6)

# Outer envelope of the finished stream: it spans [baseline, -baseline]
total_height = traffic_pivot.sum(axis=1)
baseline = -total_height / 2

# Create symmetric stream
traffic_colors = plt.cm.viridis(np.linspace(0, 1, len(traffic_pivot.columns)))

# Both edges start on the centre line and grow outwards. They must be two
# separate arrays: if they shared storage, the downward and upward updates
# would cancel and every layer would be drawn over the previous one.
y_bottom = np.zeros(len(x))
y_top = np.zeros(len(x))

for i, source in enumerate(traffic_pivot.columns):
    y_values = traffic_pivot[source].values
    y_smooth = gaussian_filter1d(y_values, sigma=1.2)

    # Split the layer symmetrically: half below, half above the centre
    half_values = y_smooth / 2

    # Bottom half (negative side); unlabeled so the legend lists each source once
    ax1.fill_between(x, y_bottom, y_bottom - half_values,
                    alpha=0.8, color=traffic_colors[i], linewidth=0)

    # Top half (positive side)
    ax1.fill_between(x, y_top, y_top + half_values,
                    alpha=0.8, color=traffic_colors[i], label=source, linewidth=0)

    # Rebind instead of updating in place so the two edges stay independent
    y_bottom = y_bottom - half_values
    y_top = y_top + half_values

ax1.set_title('Website Traffic Sources\n(Symmetric ThemeRiver Layout)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Time Period')
ax1.set_ylabel('Traffic Share (%)')
ax1.set_xticks(x_tick_positions)
ax1.set_xticklabels(x_labels, rotation=45)
ax1.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
# Centre line of the symmetric layout
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.5)
ax1.grid(True, alpha=0.3)

# 2. Stream Graph with Trend Emphasis
ax2 = axes[0, 1]

# Wide layout: one row per year, one column per energy source
energy_pivot = energy_df.pivot(index='Year', columns='Source', values='Percentage').fillna(0)

x = energy_pivot.index
y_stack = np.zeros(len(x))

# Source families that receive different visual treatment
renewable_sources = ['Wind', 'Solar', 'Hydroelectric', 'Other Renewables']
fossil_sources = ['Coal', 'Natural Gas']
other_sources = ['Nuclear']

for source in energy_pivot.columns:
    y_smooth = gaussian_filter1d(energy_pivot[source].values, sigma=0.6)

    if source in renewable_sources:
        # Shades of green, darker the later the source appears in the list
        rank = renewable_sources.index(source)
        color = plt.cm.Greens(0.4 + 0.6 * (rank / len(renewable_sources)))
        alpha, linewidth, edgecolor = 0.9, 1, 'darkgreen'
    elif source in fossil_sources:
        # Shades of red for fossil fuels
        rank = fossil_sources.index(source)
        color = plt.cm.Reds(0.4 + 0.6 * (rank / len(fossil_sources)))
        alpha, linewidth, edgecolor = 0.8, 1, 'darkred'
    else:
        # Everything else (nuclear) gets a neutral colour
        color = 'lightblue'
        alpha, linewidth, edgecolor = 0.7, 1, 'navy'

    layer_top = y_stack + y_smooth
    ax2.fill_between(x, y_stack, layer_top,
                     alpha=alpha, color=color, label=source,
                     linewidth=linewidth, edgecolor=edgecolor)
    y_stack = layer_top

ax2.set_title('Energy Transition Visualization\n(Renewables vs Fossil Fuels)', fontsize=12, fontweight='bold')
ax2.set_xlabel('Year')
ax2.set_ylabel('Energy Share (%)')
ax2.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax2.grid(True, alpha=0.3)

# Trend annotations at fixed data coordinates: (text, arrow tip, text position, colour)
trend_notes = [
    ('Renewable Growth', (2020, 75), (2018, 85), 'green'),
    ('Coal Decline', (2020, 20), (2016, 10), 'red'),
]
for note_text, arrow_tip, text_pos, tone in trend_notes:
    ax2.annotate(note_text, xy=arrow_tip, xytext=text_pos,
                 arrowprops=dict(arrowstyle='->', color=tone, lw=2),
                 fontsize=10, color=tone, fontweight='bold')

# 3. Multi-Scale Stream Graph
ax3 = axes[1, 0]

# Wide layout: one row per year, one column per platform
social_pivot = social_df.pivot(index='Year', columns='Platform', values='Users').fillna(0)

# Index every platform to its own first-year value (first year = 100)
first_year_users = social_pivot.iloc[0]
social_normalized = social_pivot.div(first_year_users, axis=1) * 100

x = social_normalized.index
n_platforms = len(social_normalized.columns)
y_stack = np.zeros(len(x))

for i, platform in enumerate(social_normalized.columns):
    y_values = social_normalized[platform].values
    y_smooth = gaussian_filter1d(y_values, sigma=0.8)

    # Relative change between the first and last year of the unsmoothed index
    growth_rate = (y_values[-1] - y_values[0]) / y_values[0]
    # Opacity grows with the size of the change, capped at 0.9
    alpha = min(0.9, 0.4 + abs(growth_rate) / 3)

    color = plt.cm.tab10(i / n_platforms)

    layer_top = y_stack + y_smooth
    ax3.fill_between(x, y_stack, layer_top,
                     alpha=alpha, color=color, label=f'{platform} ({growth_rate:+.1%})',
                     linewidth=1, edgecolor='white')
    y_stack = layer_top

ax3.set_title('Social Media Platform Growth\n(Relative to 2010 Baseline)', fontsize=12, fontweight='bold')
ax3.set_xlabel('Year')
ax3.set_ylabel('Relative Growth (2010 = 100)')
ax3.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax3.grid(True, alpha=0.3)

# 4. Stream Graph with Peak Highlighting
ax4 = axes[1, 1]

# Wide layout: one row per year, one column per sector
employment_pivot = employment_df.pivot(index='Year', columns='Sector', values='Employment_Share').fillna(0)

x = employment_pivot.index
y_stack = np.zeros(len(x))

# Year and value of each sector's maximum (taken from the unsmoothed data)
sector_peaks = {
    sector: {'year': employment_pivot[sector].idxmax(),
             'value': employment_pivot[sector].max()}
    for sector in employment_pivot.columns
}

sector_colors = plt.cm.tab20(np.linspace(0, 1, len(employment_pivot.columns)))

for sector, layer_color in zip(employment_pivot.columns, sector_colors):
    y_smooth = gaussian_filter1d(employment_pivot[sector].values, sigma=0.5)
    layer_top = y_stack + y_smooth

    # Base stream
    ax4.fill_between(x, y_stack, layer_top,
                     alpha=0.7, color=layer_color, label=sector, linewidth=0)

    # Window of up to 2 years either side of the peak, clipped to the data range
    peak_idx = x.get_loc(sector_peaks[sector]['year'])
    window = slice(max(0, peak_idx - 2), min(len(x), peak_idx + 3))

    # Redraw the window more opaquely, with a white outline, on top of the base layer
    ax4.fill_between(x[window], y_stack[window], layer_top[window],
                     alpha=0.9, color=layer_color, linewidth=2,
                     edgecolor='white')

    y_stack = layer_top

ax4.set_title('Employment Sector Peaks\n(Peak Periods Highlighted)', fontsize=12, fontweight='bold')
ax4.set_xlabel('Year')
ax4.set_ylabel('Employment Share (%)')
ax4.legend(loc='upper left', bbox_to_anchor=(1.02, 1))
ax4.grid(True, alpha=0.3)

# Finalise and render the 2x2 figure
plt.tight_layout()
plt.show()


In [None]:
# Interactive stream graphs (Plotly structure)
# This cell only prints code templates; nothing below is executed.
print("Interactive Stream Graphs (Plotly):")
print("=" * 50)

print("\n1. Basic Interactive Stream Graph")
print("Code structure:")
# Each layer is drawn as two traces: an invisible upper edge, then the lower
# edge filled up to it with 'tonexty'. The first layer uses 'tonexty' as well:
# its lower edge is the all-zero line, so 'tozeroy' would fill nothing.
print("""
import plotly.graph_objects as go

# Prepare data in wide format
df_pivot = data.pivot(index='time', columns='category', values='value')

fig = go.Figure()

# Create cumulative data for stacking
x = df_pivot.index
cumulative = np.zeros(len(x))

for column in df_pivot.columns:
    y_values = df_pivot[column].values
    
    # Upper edge of this layer (invisible line, no fill)
    fig.add_trace(go.Scatter(
        x=x,
        y=cumulative + y_values,
        fill=None,
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        name=column + '_top'
    ))
    
    # Lower edge of this layer, filled up to the upper edge added just above
    fig.add_trace(go.Scatter(
        x=x,
        y=cumulative,
        fill='tonexty',
        mode='lines',
        line=dict(width=0),
        name=column,
        hovertemplate="<b>%{fullData.name}</b><br>" +
                      "Time: %{x}<br>" +
                      "Value: %{customdata}<br>" +
                      "<extra></extra>",
        customdata=y_values
    ))
    
    cumulative += y_values

fig.update_layout(
    title="Interactive Stream Graph",
    xaxis_title="Time",
    yaxis_title="Value",
    hovermode='x unified'
)

fig.show()
""")

print("\n2. Symmetric Stream Graph (ThemeRiver)")
print("Code structure:")
# Printed template only; nothing below is executed. The template keeps the two
# cumulative edges in separate arrays, gives each half its own anchor trace so
# 'tonexty' fills only that half, and colours both halves of a category alike.
print("""
import plotly.express as px

x = df_pivot.index
colors = px.colors.qualitative.Plotly

fig = go.Figure()

# Both edges start on the centre line (y = 0) and grow outwards.
# Use two independent arrays so updating one never changes the other.
cumulative_bottom = np.zeros(len(x))
cumulative_top = np.zeros(len(x))

for i, column in enumerate(df_pivot.columns):
    y_values = df_pivot[column].values
    half_values = y_values / 2
    color = colors[i % len(colors)]
    
    # Bottom half: invisible anchor at the current lower edge ...
    fig.add_trace(go.Scatter(
        x=x,
        y=cumulative_bottom,
        fill=None,
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip'
    ))
    
    # ... then fill from the new lower edge back to the anchor
    fig.add_trace(go.Scatter(
        x=x,
        y=cumulative_bottom - half_values,
        fill='tonexty',
        fillcolor=color,
        mode='lines',
        line=dict(width=0),
        name=column,
        legendgroup=column,
        customdata=y_values
    ))
    
    # Top half (same data, mirrored): invisible anchor at the current upper edge ...
    fig.add_trace(go.Scatter(
        x=x,
        y=cumulative_top,
        fill=None,
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip'
    ))
    
    # ... then fill from the new upper edge back to the anchor
    fig.add_trace(go.Scatter(
        x=x,
        y=cumulative_top + half_values,
        fill='tonexty',
        fillcolor=color,
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        legendgroup=column,
        customdata=y_values
    ))
    
    cumulative_bottom = cumulative_bottom - half_values
    cumulative_top = cumulative_top + half_values

fig.update_layout(
    title="Symmetric Stream Graph (ThemeRiver)",
    xaxis_title="Time",
    yaxis_title="Value",
    shapes=[dict(type="line", x0=x.min(), y0=0, x1=x.max(), y1=0,
                 line=dict(color="black", width=1, dash="dash"))]
)

fig.show()
""")

print("\n3. Animated Stream Graph")
print("Code structure:")
# Printed template only; nothing below is executed. The Pause button animates
# to [None] in "immediate" mode so it interrupts the running animation instead
# of being queued behind it.
print("""
# Create animation frames
frames = []
time_points = sorted(data['time'].unique())

for i, time_point in enumerate(time_points):
    # Get data up to current time point
    current_data = data[data['time'] <= time_point]
    df_current = current_data.pivot(index='time', columns='category', values='value')
    
    frame_traces = []
    cumulative = np.zeros(len(df_current.index))
    
    for column in df_current.columns:
        y_values = df_current[column].fillna(0).values
        
        frame_traces.append(go.Scatter(
            x=df_current.index,
            y=cumulative + y_values,
            fill='tonexty' if column != df_current.columns[0] else 'tozeroy',
            mode='lines',
            line=dict(width=0),
            name=column
        ))
        
        cumulative += y_values
    
    frames.append(go.Frame(data=frame_traces, name=str(time_point)))

# Create figure with first frame
fig = go.Figure(data=frames[0].data, frames=frames)

# Add animation controls
fig.update_layout(
    updatemenus=[
        dict(type="buttons",
             buttons=[dict(label="Play", method="animate", args=[None]),
                      dict(label="Pause", method="animate",
                           args=[[None], dict(frame=dict(duration=0, redraw=False),
                                              mode="immediate")])])
    ],
    sliders=[dict(steps=[dict(args=[[f.name]], label=f.name, method="animate")
                        for f in frames])]
)

fig.show()
""")

# Template text for the two-row (absolute vs. percentage) stacked figure.
# It is printed verbatim below; nothing in it is executed here.
multi_layer_template = """
from plotly.subplots import make_subplots

# Create subplot with secondary y-axis for different scales
fig = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    subplot_titles=("Absolute Values", "Percentage Share"),
    vertical_spacing=0.1
)

# Top plot: Absolute values
cumulative_abs = np.zeros(len(x))
for i, column in enumerate(df_pivot.columns):
    y_abs = df_pivot[column].values
    
    fig.add_trace(go.Scatter(
        x=x, y=cumulative_abs + y_abs,
        fill='tonexty' if i > 0 else 'tozeroy',
        mode='lines', line=dict(width=0),
        name=column, legendgroup=column,
        customdata=y_abs
    ), row=1, col=1)
    
    cumulative_abs += y_abs

# Bottom plot: Percentage share
df_pct = df_pivot.div(df_pivot.sum(axis=1), axis=0) * 100
cumulative_pct = np.zeros(len(x))

for i, column in enumerate(df_pct.columns):
    y_pct = df_pct[column].values
    
    fig.add_trace(go.Scatter(
        x=x, y=cumulative_pct + y_pct,
        fill='tonexty' if i > 0 else 'tozeroy',
        mode='lines', line=dict(width=0),
        name=column, legendgroup=column,
        showlegend=False,
        customdata=y_pct
    ), row=2, col=1)
    
    cumulative_pct += y_pct

# Add crossfilter behavior
fig.update_layout(
    title="Multi-Scale Stream Analysis",
    hovermode='x unified',
    height=800
)

fig.show()
"""

# Heading, sub-heading and template, one per line
print("\n4. Multi-Layer Stream with Brushing",
      "Code structure:",
      multi_layer_template,
      sep="\n")


In [None]:
# Statistical analysis of stream graph data
print("Stream Graph Statistical Analysis:")
print("=" * 50)

# 1. Trend Analysis for Social Media Platforms
print("1. SOCIAL MEDIA PLATFORM TREND ANALYSIS:")

# Wide table: one row per Year, one column per Platform; missing cells become 0.
social_pivot = social_df.pivot(index='Year', columns='Platform', values='Users').fillna(0)


def _growth_since_first_activity(series):
    """Measure growth from the first strictly positive value to the last value.

    Using the first positive observation as the baseline keeps a platform that
    starts at 0 (not yet launched) from being reported as "+0.0%" growth, and
    avoids dividing by a zero baseline.

    Args:
        series: numeric pandas Series ordered in time.

    Returns:
        Tuple ``(total_growth_pct, cagr_pct, baseline_label)``. ``baseline_label``
        is the index label of the baseline observation, or ``None`` (with both
        percentages equal to 0.0) when the series has no positive value. The
        CAGR is 0.0 when the baseline is also the last observation.
    """
    values = series.to_numpy(dtype=float)
    active = np.flatnonzero(values > 0)
    if active.size == 0:
        return 0.0, 0.0, None

    start = int(active[0])
    baseline, final = values[start], values[-1]
    periods = len(values) - 1 - start  # number of compounding steps

    total_growth_pct = (final - baseline) / baseline * 100
    cagr_pct = ((final / baseline) ** (1 / periods) - 1) * 100 if periods > 0 else 0.0
    return total_growth_pct, cagr_pct, series.index[start]


print("   Platform Growth Analysis:")
for platform in social_pivot.columns:
    platform_data = social_pivot[platform]

    peak_users = platform_data.max()
    peak_year = platform_data.idxmax()

    # Growth rates, measured from the first year with a positive user count
    total_growth, cagr, baseline_year = _growth_since_first_activity(platform_data)
    # Only annotate the output when the baseline is not the first row
    baseline_note = (
        f" (since {baseline_year})"
        if baseline_year is not None and baseline_year != platform_data.index[0]
        else ""
    )

    # Volatility (coefficient of variation); 0.0 when the mean is not positive
    mean_users = platform_data.mean()
    volatility = (platform_data.std() / mean_users) * 100 if mean_users > 0 else 0.0

    # Trend classification
    if total_growth > 100:
        trend = "High Growth"
    elif total_growth > 20:
        trend = "Moderate Growth"
    elif total_growth > -10:
        trend = "Stable"
    else:
        trend = "Declining"

    print(f"\n     {platform}:")
    print(f"       Total Growth: {total_growth:+.1f}%{baseline_note}")
    print(f"       CAGR: {cagr:+.1f}%{baseline_note}")
    print(f"       Peak: {peak_users:.1f}M users ({peak_year})")
    print(f"       Volatility: {volatility:.1f}%")
    print(f"       Trend: {trend}")

# Platform market share evolution
print("\n   Market Share Evolution:")
social_totals = social_pivot.sum(axis=1)
social_share = social_pivot.div(social_totals, axis=0) * 100

# Market concentration (Herfindahl-Hirschman Index): sum of squared percentage
# shares per year. Kept as a {year: hhi} dict. A year whose total is 0 has
# undefined (NaN) shares, which the pandas sum skips, giving an HHI of 0.
hhi_by_year = (social_share ** 2).sum(axis=1).to_dict()

initial_hhi = hhi_by_year[social_share.index[0]]
final_hhi = hhi_by_year[social_share.index[-1]]
market_concentration_change = (
    ((final_hhi - initial_hhi) / initial_hhi) * 100 if initial_hhi > 0 else 0.0
)

concentration_level = "High" if final_hhi > 2500 else "Moderate" if final_hhi > 1500 else "Low"

print(f"     Market Concentration (HHI): {final_hhi:.0f} ({concentration_level})")
print(f"     Concentration Change: {market_concentration_change:+.1f}%")

# 2. Energy Transition Analysis
print("\n2. ENERGY TRANSITION ANALYSIS:")

# Wide table: one row per Year, one column per Source; missing cells become 0.
energy_pivot = energy_df.pivot(index='Year', columns='Source', values='Percentage').fillna(0)

# Categorize energy sources
renewable_sources = ['Wind', 'Solar', 'Hydroelectric', 'Other Renewables']
fossil_sources = ['Coal', 'Natural Gas']
nuclear_sources = ['Nuclear']

# Calculate category totals (per-year sum of the member columns)
renewables_total = energy_pivot[renewable_sources].sum(axis=1)
fossils_total = energy_pivot[fossil_sources].sum(axis=1)
nuclear_total = energy_pivot[nuclear_sources].sum(axis=1)


def _relative_change_and_cagr(category_total):
    """Return ``(relative_change_pct, cagr_pct)`` between the first and last rows.

    Both values are NaN when they are undefined: the first value is not
    positive (division by zero) or there is only one row (no compounding
    period). NaN compares False against any threshold, so callers naturally
    fall through to their "no trend" branch.
    """
    first = category_total.iloc[0]
    last = category_total.iloc[-1]
    periods = len(category_total) - 1
    if first <= 0 or periods <= 0:
        return float('nan'), float('nan')

    relative_change = (last - first) / first * 100
    cagr = ((last / first) ** (1 / periods) - 1) * 100
    return relative_change, cagr


print("   Energy Source Category Analysis:")

# Renewable energy growth
renewables_growth, renewables_cagr = _relative_change_and_cagr(renewables_total)

print("     Renewable Energy:")
print(f"       Share Growth: {renewables_growth:+.1f}%")
print(f"       CAGR: {renewables_cagr:+.1f}%")
print(f"       Current Share: {renewables_total.iloc[-1]:.1f}%")

# Fossil fuel decline
fossils_decline, fossils_cagr = _relative_change_and_cagr(fossils_total)

print("     Fossil Fuels:")
print(f"       Share Change: {fossils_decline:+.1f}%")
print(f"       CAGR: {fossils_cagr:+.1f}%")
print(f"       Current Share: {fossils_total.iloc[-1]:.1f}%")

# Transition speed analysis
# Not read again in this cell; kept in case a later cell relies on it.
transition_years = len(energy_pivot.index)

# Despite the name, this holds the first index label (year) at which the
# renewable share exceeds 50, or None if that never happens.
majority_rows = renewables_total[renewables_total > 50]
years_to_majority_renewables = majority_rows.index[0] if len(majority_rows) else None

if years_to_majority_renewables is not None:
    print(f"     Renewable Majority Achieved: {years_to_majority_renewables}")
else:
    # Project when renewables might reach majority by compounding the CAGR
    if renewables_cagr > 0:
        years_needed = np.log(50 / renewables_total.iloc[-1]) / np.log(1 + renewables_cagr / 100)
        projected_year = renewables_total.index[-1] + years_needed
        print(f"     Projected Renewable Majority: ~{projected_year:.0f}")
    else:
        print("     Renewable Majority: Not projected with current trends")

# 3. Music Genre Seasonality Analysis
print("\n3. MUSIC GENRE SEASONALITY ANALYSIS:")

# Calendar month (1-12) parsed from the 'Month' column; stored on the frame.
music_df['Month_Num'] = pd.to_datetime(music_df['Month']).dt.month

# Display names indexed by (month number - 1)
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

print("   Seasonal Patterns by Genre:")
for genre in genres[:6]:  # First six entries of `genres`, for brevity
    genre_data = music_df[music_df['Genre'] == genre]

    # Mean popularity per calendar month; the index holds month numbers.
    monthly_avg = genre_data.groupby('Month_Num')['Popularity'].mean()

    print(f"\n     {genre}:")
    if monthly_avg.empty:
        # idxmax/idxmin raise on an empty series, so report and move on
        print("       No data available")
        continue

    # Peak and trough are read from the series itself, not via iloc[month-1]:
    # the index holds month numbers (labels), which match positions only when
    # all twelve months are present.
    peak_month = monthly_avg.idxmax()
    trough_month = monthly_avg.idxmin()
    peak_value = monthly_avg.max()
    trough_value = monthly_avg.min()

    # Calculate seasonal variation
    seasonal_range = peak_value - trough_value
    seasonal_coefficient = (monthly_avg.std() / monthly_avg.mean()) * 100

    seasonality_level = "High" if seasonal_coefficient > 15 else "Moderate" if seasonal_coefficient > 8 else "Low"

    print(f"       Seasonal Variation: {seasonal_range:.1f}%")
    print(f"       Seasonality: {seasonality_level} (CV: {seasonal_coefficient:.1f}%)")
    print(f"       Peak Month: {month_names[peak_month-1]} ({peak_value:.1f}%)")
    print(f"       Trough Month: {month_names[trough_month-1]} ({trough_value:.1f}%)")

# 4. Employment Sector Stability Analysis
print("\n4. EMPLOYMENT SECTOR STABILITY ANALYSIS:")

# Wide table: one row per Year, one column per Sector; missing cells become 0.
employment_pivot = (
    employment_df
    .pivot(index='Year', columns='Sector', values='Employment_Share')
    .fillna(0)
)

# Every column shares the same row count, so the time axis and the simulated
# economic cycle (a sine wave with a period of 8 rows) are built once.
n_periods = len(employment_pivot.index)
time_steps = range(n_periods)
economic_cycle = np.sin(2 * np.pi * np.arange(n_periods) / 8)

# Upper bounds (exclusive) on the coefficient of variation for each label;
# anything that matches none of them is "Volatile".
stability_bands = ((10, "Very Stable"), (20, "Stable"), (30, "Moderate"))

print("   Sector Stability Metrics:")
for sector, sector_data in employment_pivot.items():
    # Stability metrics
    mean_share = sector_data.mean()
    volatility = sector_data.std()
    coefficient_variation = (volatility / mean_share) * 100

    # Linear trend and sensitivity to the simulated cycle (Pearson correlations)
    correlation_with_time = np.corrcoef(time_steps, sector_data)[0, 1]
    cycle_correlation = np.corrcoef(economic_cycle, sector_data)[0, 1]

    # Classifications
    stability = next(
        (label for limit, label in stability_bands if coefficient_variation < limit),
        "Volatile",
    )

    trend_magnitude = abs(correlation_with_time)
    trend_strength = (
        "Strong" if trend_magnitude > 0.7
        else "Moderate" if trend_magnitude > 0.4
        else "Weak"
    )

    if correlation_with_time > 0:
        trend_direction = "Growing"
    elif correlation_with_time < 0:
        trend_direction = "Declining"
    else:
        trend_direction = "Stable"

    report_lines = [
        f"\n     {sector}:",
        f"       Mean Share: {mean_share:.1f}%",
        f"       Stability: {stability} (CV: {coefficient_variation:.1f}%)",
        f"       Trend: {trend_strength} {trend_direction}",
        f"       Cycle Sensitivity: {abs(cycle_correlation):.2f}",
    ]
    print("\n".join(report_lines))

# 5. Stream Graph Design Effectiveness Analysis
print("\n5. STREAM GRAPH DESIGN ANALYSIS:")

print("   Data Characteristics Assessment:")


def _dataset_profile(time_points, categories, value_range, suitability, clarity):
    """Bundle one dataset's descriptive properties into a dict."""
    return {
        'temporal_span': time_points,
        'categories': categories,
        'value_range': value_range,
        'stacking_suitability': suitability,
        'trend_clarity': clarity,
    }


def _print_bullets(heading, marker, items):
    """Print a heading followed by one indented, marker-prefixed line per item."""
    print(heading)
    for item in items:
        print(f"   {marker} {item}")


# Sizes are measured from the pivot tables / frames built earlier in this
# notebook; suitability and clarity ratings are hand-assigned labels.
datasets = {
    'Social Media Usage': _dataset_profile(
        len(social_pivot.index),
        len(social_pivot.columns),
        social_pivot.max().max() - social_pivot.min().min(),
        'Excellent',
        'High',
    ),
    'Energy Sources': _dataset_profile(
        len(energy_pivot.index),
        len(energy_pivot.columns),
        100,  # Percentages
        'Perfect',
        'Very High',
    ),
    'Music Genres': _dataset_profile(
        len(music_df['Month'].unique()),
        len(genres),
        100,  # Percentages
        'Good',
        'Moderate',
    ),
    'Employment Sectors': _dataset_profile(
        len(employment_pivot.index),
        len(employment_pivot.columns),
        100,  # Percentages
        'Excellent',
        'High',
    ),
}

for dataset_name, profile in datasets.items():
    summary_lines = [
        f"\n     {dataset_name}:",
        f"       Time Points: {profile['temporal_span']}",
        f"       Categories: {profile['categories']}",
        f"       Stacking Suitability: {profile['stacking_suitability']}",
        f"       Trend Clarity: {profile['trend_clarity']}",
    ]
    print("\n".join(summary_lines))

_print_bullets("\n   Stream Graph Best Practices:", "✓", [
    "Use for 3-15 categories (too few = underutilized, too many = cluttered)",
    "Ideal for percentage/proportion data that sum to 100%",
    "Minimum 10-20 time points for smooth curves",
    "Apply smoothing (Gaussian filter) for organic appearance",
    "Order categories by size or importance for visual hierarchy",
    "Use symmetric layout (ThemeRiver) for balanced comparison",
    "Consider interactive features for detailed exploration",
])

_print_bullets("\nWhen to Use Stream Graphs:", "•", [
    "Market share evolution over time",
    "Budget allocation changes",
    "Population demographics shifts",
    "Resource consumption patterns",
    "Technology adoption curves",
    "Genre/category popularity trends",
])

_print_bullets("\nAlternatives to Consider:", "•", [
    "Stacked area charts for simpler rectangular appearance",
    "Sankey diagrams for flow between specific time points",
    "Alluvial plots for categorical flow visualization",
    "Multi-line charts when absolute values matter more than proportions",
    "Animated bar charts for dramatic category changes",
])
