# Ridgeline Plot - Distribution Comparison Across Groups

**Use Case**: Compare distributions across multiple groups, show overlapping density curves, reveal patterns in grouped continuous data

This notebook demonstrates how to create effective ridgeline plots for comparing distributions across different categories or time periods.


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff
from scipy import stats
from scipy.stats import gaussian_kde
import warnings
warnings.filterwarnings('ignore')

# Set style for better-looking plots.
# The 'seaborn-v0_8-*' style names only exist in Matplotlib >= 3.6; older
# releases ship the same style under the un-prefixed name, so fall back to
# that instead of raising when the notebook runs on an older install.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
sns.set_palette("viridis")

# Set random seed for reproducibility (all sample data below draws from the
# global NumPy random state)
np.random.seed(42)

print("Ridgeline plot visualization libraries loaded!")


In [None]:
# Create sample datasets for ridgeline plots
# 1. Temperature Distributions by Month
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
n_days = 30  # simulated readings per month

temperature_data = []
for month_num, month in enumerate(months, start=1):
    # Position of the month on a one-year cycle (January -> 0)
    phase = 2 * np.pi * (month_num - 1) / 12

    # Mean follows a sine wave through the year; the spread follows a cosine,
    # so it is widest at the start of the year and narrowest mid-year
    seasonal_mean = 10 + 20 * np.sin(phase - np.pi / 2)
    seasonal_std = 8 + 3 * np.cos(phase)

    readings = np.random.normal(seasonal_mean, seasonal_std, n_days)

    # One record per simulated reading
    temperature_data.extend(
        {'Month': month, 'Month_Num': month_num, 'Temperature': reading}
        for reading in readings
    )

temp_df = pd.DataFrame(temperature_data)

# 2. Exam Scores by Subject
n_students = 150

# Skew-normal parameters per subject (different difficulty levels and score
# distributions). Defined once, outside the sampling loop; the subject list is
# derived from it so the two can never drift apart.
subject_params = {
    'Mathematics': {'mean': 72, 'std': 15, 'skew': -0.3},
    'Science': {'mean': 78, 'std': 12, 'skew': -0.1},
    'English': {'mean': 82, 'std': 10, 'skew': -0.5},
    'History': {'mean': 75, 'std': 13, 'skew': 0.1},
    'Art': {'mean': 85, 'std': 8, 'skew': -0.2},
    'Physical Education': {'mean': 88, 'std': 7, 'skew': -0.8}
}
subjects = list(subject_params)

exam_data = []
for subject, params in subject_params.items():
    # Generate skewed normal distribution
    scores = stats.skewnorm.rvs(params['skew'], loc=params['mean'],
                                scale=params['std'], size=n_students)
    scores = np.clip(scores, 0, 100)  # Clamp to 0-100 range

    exam_data.extend({'Subject': subject, 'Score': score} for score in scores)

exam_df = pd.DataFrame(exam_data)

# 3. Income Distribution by Age Group
age_groups = ['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
n_people_per_group = 200

# Each age group is a mixture of log-normal components given as
# (log-mean, log-sigma, share of the group).
peak_earning_mixture = [(11.2, 0.6, 0.7),   # bulk of earners
                        (12.2, 0.5, 0.3)]   # some very high earners
income_components = {
    '18-25': [(10.5, 0.8, 1.0)],    # young adults: lower income, right-skewed
    '26-35': [(11.0, 0.7, 1.0)],    # career growth: moderate income and spread
    '36-45': peak_earning_mixture,  # peak earning years: bimodal
    '46-55': peak_earning_mixture,
    '56-65': [(11.1, 0.9, 1.0)],    # pre-retirement: still high, more variable
    '65+': [(10.3, 0.5, 1.0)],      # retirement: lower, pension-based income
}

income_data = []
for age_group in age_groups:
    # Components are sampled in listed order, then joined into one array
    component_draws = [
        np.random.lognormal(log_mean, log_sigma, int(n_people_per_group * share))
        for log_mean, log_sigma, share in income_components[age_group]
    ]
    incomes = np.concatenate(component_draws)

    income_data.extend({'Age_Group': age_group, 'Income': income}
                       for income in incomes)

income_df = pd.DataFrame(income_data)

# 4. Response Time by Website Page Type
page_types = ['Homepage', 'Product Page', 'Checkout', 'Search Results', 'User Profile', 'Contact']
n_sessions = 300

# Each page type is a mixture of gamma components given as
# (shape, scale, share of sessions).
response_components = {
    'Homepage': [(2, 0.2, 1.0)],                        # fast, optimized
    'Product Page': [(3, 0.3, 0.9), (8, 0.5, 0.1)],     # moderate, plus a slow tail
    'Checkout': [(2.5, 0.25, 0.6), (6, 0.4, 0.4)],      # bimodal: cached vs new users
    'Search Results': [(4, 0.35, 1.0)],                 # varies with query complexity
    'User Profile': [(2.2, 0.3, 1.0)],                  # generally fast
    'Contact': [(3.5, 0.3, 1.0)],                       # moderate speed
}

response_data = []
for page_type in page_types:
    # Components are sampled in listed order, then joined into one array
    component_draws = [
        np.random.gamma(shape, scale, int(n_sessions * share))
        for shape, scale, share in response_components[page_type]
    ]
    response_times = np.concatenate(component_draws)

    response_data.extend({'Page_Type': page_type, 'Response_Time': rt}
                         for rt in response_times)

response_df = pd.DataFrame(response_data)

# 5. Product Rating Distributions by Category
categories = ['Electronics', 'Books', 'Clothing', 'Home & Garden', 'Sports', 'Toys', 'Health']
n_reviews = 400

# One sampler per category. Samplers are only invoked inside the loop below,
# so random draws happen in category order.
# Beta draws are scaled by 5 and therefore fall in the 0-5 range; only the
# Clothing ratings are explicitly clipped to 1-5.
rating_samplers = {
    # Bimodal: a large high-rating cluster plus a smaller low-rating cluster
    'Electronics': lambda: np.concatenate([
        np.random.beta(8, 2, int(n_reviews * 0.7)) * 5,
        np.random.beta(2, 5, int(n_reviews * 0.3)) * 5,
    ]),
    # Generally positive, tail towards low ratings
    'Books': lambda: np.random.beta(6, 2, n_reviews) * 5,
    # More variable, normal-ish distribution
    'Clothing': lambda: np.clip(np.random.normal(3.5, 1.2, n_reviews), 1, 5),
    # Mass concentrated at the high end, tail towards low ratings
    'Home & Garden': lambda: np.random.beta(7, 2.5, n_reviews) * 5,
    # Broader spread, still leaning high
    'Sports': lambda: np.random.beta(5, 3, n_reviews) * 5,
    # Very positive
    'Toys': lambda: np.random.beta(9, 2, n_reviews) * 5,
    # Conservative ratings centered near 3
    'Health': lambda: np.random.beta(4, 3, n_reviews) * 5,
}

rating_data = []
for category in categories:
    ratings = rating_samplers[category]()

    rating_data.extend({'Category': category, 'Rating': rating}
                       for rating in ratings)

rating_df = pd.DataFrame(rating_data)

# 6. Sales Performance by Quarter (Multiple Years)
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
years = [2020, 2021, 2022, 2023]

# Loop-invariant settings, defined once instead of on every inner iteration
seasonal_multipliers = [0.9, 1.1, 0.8, 1.3]  # one per quarter; Q4 holiday boost
n_sales = 80  # individual sales figures per quarter

sales_data = []
for year in years:
    # Year-over-year growth relative to the first year in `years`
    base_sales = 100 + (year - years[0]) * 15

    for quarter, multiplier in zip(quarters, seasonal_multipliers):
        quarterly_base = base_sales * multiplier

        # Gamma with scale = mean / shape keeps the mean at quarterly_base;
        # Q4 uses shape 3, the other quarters shape 2.5.
        shape = 3 if quarter == 'Q4' else 2.5
        individual_sales = np.random.gamma(shape, quarterly_base / shape, n_sales)

        sales_data.extend(
            {
                'Year': year,
                'Quarter': quarter,
                'Period': f'{year} {quarter}',
                'Sales': sale,
            }
            for sale in individual_sales
        )

sales_df = pd.DataFrame(sales_data)

# Report how many rows each generated dataset contains
print("Sample ridgeline plot datasets created:")
dataset_summaries = [
    ("Temperature Data", temp_df, "daily temperature readings"),
    ("Exam Scores", exam_df, "student scores across subjects"),
    ("Income Distribution", income_df, "income records by age group"),
    ("Response Times", response_df, "web page response measurements"),
    ("Product Ratings", rating_df, "customer ratings by category"),
    ("Sales Performance", sales_df, "quarterly sales figures"),
]
for title, frame, description in dataset_summaries:
    print(f"{title}: {len(frame)} {description}")

# Display sample data
print("\nSample Temperature Data:")
print(temp_df.head(3))


In [None]:
# Create basic ridgeline plots
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Ridgeline Plot Visualizations - Distribution Comparison', fontsize=16, fontweight='bold')

# 1. Temperature Distributions by Month
ax1 = axes[0, 0]

month_spacing = 1.5
# Raw KDE values for temperatures with a spread of several degrees are tiny
# compared with the 1.5 ridge spacing, so plotting them directly gives flat
# lines. Each ridge is therefore normalised to a fixed peak height (the same
# approach as the sales time-series panel). Ridge heights are consequently
# NOT comparable between months - only the shapes and positions are.
ridge_height = 0.8 * month_spacing
colors = plt.cm.coolwarm(np.linspace(0, 1, len(months)))

for i, month in enumerate(months):
    y_offset = i * month_spacing  # baseline of this month's ridge
    month_data = temp_df[temp_df['Month'] == month]['Temperature']

    # Calculate kernel density, padded 5 degrees beyond the observed range
    kde = gaussian_kde(month_data)
    x_range = np.linspace(month_data.min() - 5, month_data.max() + 5, 200)
    density = kde(x_range)

    # Normalise to the fixed ridge height and lift onto the baseline
    density_scaled = density / density.max() * ridge_height + y_offset

    # Fill area under curve
    ax1.fill_between(x_range, y_offset, density_scaled,
                     alpha=0.7, color=colors[i], label=month)

    # Add outline
    ax1.plot(x_range, density_scaled, color='white', linewidth=1.5, alpha=0.8)

    # Add month label at the mode, half-way up the ridge
    ax1.text(x_range[np.argmax(density)], y_offset + ridge_height * 0.5,
             month, fontsize=10, ha='center', va='center', fontweight='bold')

ax1.set_title('Temperature Distributions by Month\n(°C)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Temperature (°C)')
ax1.set_ylabel('Month')
ax1.set_yticks([i * month_spacing + 0.4 for i in range(len(months))])
ax1.set_yticklabels(months)
ax1.grid(True, alpha=0.3)

# 2. Exam Scores by Subject
ax2 = axes[0, 1]

subject_spacing = 1.2
density_scale = 15  # Scale factor for visibility
subject_colors = plt.cm.viridis(np.linspace(0, 1, len(subjects)))
x_range = np.linspace(0, 100, 200)  # same score grid for every subject

for i, subject in enumerate(subjects):
    y_offset = i * subject_spacing  # baseline of this subject's ridge
    subject_data = exam_df[exam_df['Subject'] == subject]['Score']

    # Calculate kernel density
    kde = gaussian_kde(subject_data)
    density = kde(x_range)

    # Scale density
    density_scaled = density * density_scale + y_offset
    peak_height = density.max() * density_scale

    # Fill area
    ax2.fill_between(x_range, y_offset, density_scaled,
                     alpha=0.8, color=subject_colors[i])

    # Add outline
    ax2.plot(x_range, density_scaled, color='white', linewidth=1)

    # Add subject label with mean score, half-way up the ridge
    mean_score = subject_data.mean()
    ax2.text(5, y_offset + peak_height * 0.5,
             f'{subject} (μ={mean_score:.1f})',
             fontsize=9, ha='left', va='center', fontweight='bold')

    # Mark mean with a vertical dashed line from the baseline to peak height.
    # Drawn in data coordinates: axvline's ymin/ymax are fractions of the
    # axes height, which do not line up with data values once autoscale
    # margins are applied to the y-axis.
    ax2.plot([mean_score, mean_score], [y_offset, y_offset + peak_height],
             color='red', linestyle='--', alpha=0.7, linewidth=1.5)

ax2.set_title('Exam Score Distributions by Subject\n(0-100 Scale)', fontsize=12, fontweight='bold')
ax2.set_xlabel('Score')
ax2.set_ylabel('Subject')
ax2.set_yticks([i * subject_spacing + 0.4 for i in range(len(subjects))])
ax2.set_yticklabels(subjects)
ax2.set_xlim(0, 100)
ax2.grid(True, alpha=0.3)

# 3. Income Distribution by Age Group
ax3 = axes[1, 0]

age_spacing = 1.0
age_colors = plt.cm.plasma(np.linspace(0, 1, len(age_groups)))

y_offset = 0  # baseline of the current ridge
for age_group, ridge_color in zip(age_groups, age_colors):
    age_data = income_df.loc[income_df['Age_Group'] == age_group, 'Income']

    # The density is estimated on log-income and evaluated over the observed
    # log range, then the grid is mapped back to dollars for plotting
    log_income = np.log(age_data)
    kde = gaussian_kde(log_income)
    x_range = np.linspace(log_income.min(), log_income.max(), 200)
    density = kde(x_range)
    x_range_dollars = np.exp(x_range)

    density_scaled = density * 0.3 + y_offset

    # Filled ridge with a white outline
    ax3.fill_between(x_range_dollars, y_offset, density_scaled,
                     alpha=0.8, color=ridge_color)
    ax3.plot(x_range_dollars, density_scaled, color='white', linewidth=1)

    # Label with the group's median income, placed just right of the ridge's
    # left edge and half-way up its scaled peak
    median_income = age_data.median()
    label_x = x_range_dollars.min() * 1.1
    label_y = y_offset + density.max() * 0.15
    ax3.text(label_x, label_y,
             f'{age_group} (${median_income:,.0f})',
             fontsize=9, ha='left', va='center', fontweight='bold')

    y_offset += age_spacing

ax3.set_title('Income Distribution by Age Group\n(Annual Income)', fontsize=12, fontweight='bold')
ax3.set_xlabel('Income ($)')
ax3.set_ylabel('Age Group')
ax3.set_xscale('log')
ax3.set_yticks([i * age_spacing + 0.15 for i in range(len(age_groups))])
ax3.set_yticklabels(age_groups)
ax3.grid(True, alpha=0.3)

# Format x-axis labels
income_ticks = [20000, 50000, 100000, 200000, 500000]
ax3.set_xticks(income_ticks)
ax3.set_xticklabels(['$20K', '$50K', '$100K', '$200K', '$500K'])

# 4. Response Time by Page Type
ax4 = axes[1, 1]

page_spacing = 0.8
page_colors = plt.cm.Set2(np.linspace(0, 1, len(page_types)))

y_offset = 0  # baseline of the current ridge
for page_type, ridge_color in zip(page_types, page_colors):
    page_data = response_df.loc[response_df['Page_Type'] == page_type, 'Response_Time']

    # Density from zero up to 10% beyond the slowest observed response
    kde = gaussian_kde(page_data)
    x_range = np.linspace(0, page_data.max() * 1.1, 200)
    density = kde(x_range)

    density_scaled = density * 1.5 + y_offset

    # Filled ridge with a white outline
    ax4.fill_between(x_range, y_offset, density_scaled,
                     alpha=0.8, color=ridge_color)
    ax4.plot(x_range, density_scaled, color='white', linewidth=1)

    # Boxed label carrying the 95th-percentile response time
    p95 = np.percentile(page_data, 95)
    label_x = x_range.max() * 0.7
    label_y = y_offset + density.max() * 0.75
    ax4.text(label_x, label_y,
             f'{page_type}\n(95th: {p95:.2f}s)',
             fontsize=9, ha='center', va='center', fontweight='bold',
             bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.8))

    y_offset += page_spacing

ax4.set_title('Response Time Distributions by Page Type\n(Seconds)', fontsize=12, fontweight='bold')
ax4.set_xlabel('Response Time (seconds)')
ax4.set_ylabel('Page Type')
ax4.set_yticks([i * page_spacing + 0.4 for i in range(len(page_types))])
ax4.set_yticklabels(page_types)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Advanced ridgeline plot techniques
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Advanced Ridgeline Plot Techniques', fontsize=16, fontweight='bold')

# 1. Ridgeline with Statistical Overlays
ax1 = axes[0, 0]

rating_spacing = 0.8
density_scale = 1.2
rating_min, rating_max = 1, 5  # displayed rating range (also the x-limits)
category_colors = plt.cm.tab10(np.linspace(0, 1, len(categories)))
x_range = np.linspace(rating_min, rating_max, 200)

for i, category in enumerate(categories):
    y_offset = i * rating_spacing  # baseline of this category's ridge
    category_data = rating_df[rating_df['Category'] == category]['Rating']

    # Calculate kernel density
    kde = gaussian_kde(category_data)
    density = kde(x_range)

    # Scale density
    density_scaled = density * density_scale + y_offset

    # Fill area
    ax1.fill_between(x_range, y_offset, density_scaled,
                     alpha=0.7, color=category_colors[i])

    # Summary statistics and the ridge height at each of them
    mean_rating = category_data.mean()
    median_rating = category_data.median()
    q25, q75 = np.percentile(category_data, [25, 75])
    mean_density, median_density, q25_density, q75_density = (
        kde([mean_rating, median_rating, q25, q75]) * density_scale + y_offset
    )

    # Mean line
    ax1.plot([mean_rating, mean_rating], [y_offset, mean_density],
             color='red', linewidth=3, alpha=0.8)
    ax1.scatter([mean_rating], [mean_density], color='red', s=50, zorder=5)

    # Median line
    ax1.plot([median_rating, median_rating], [y_offset, median_density],
             color='blue', linewidth=2, alpha=0.8, linestyle='--')

    # Quartiles: segment joining the curve at the 25th and 75th percentiles
    ax1.plot([q25, q75], [q25_density, q75_density],
             color='green', linewidth=4, alpha=0.6)

    # Add category label with stats. Anchored just inside the left x-limit:
    # an anchor left of rating_min would draw the box outside the axes, on
    # top of the y tick labels (text is not clipped to the axes by default).
    ax1.text(rating_min + 0.05, y_offset + density.max() * 0.6,
             f'{category}\nμ={mean_rating:.2f}, σ={category_data.std():.2f}',
             fontsize=9, ha='left', va='center', fontweight='bold',
             bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.9))

ax1.set_title('Product Ratings with Statistical Overlays\n(1-5 Star Scale)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Rating')
ax1.set_ylabel('Product Category')
ax1.set_xlim(rating_min, rating_max)
ax1.set_yticks([i * rating_spacing + 0.4 for i in range(len(categories))])
ax1.set_yticklabels(categories)
ax1.grid(True, alpha=0.3)

# Add legend for statistical elements (proxy artists, since the overlays
# themselves are drawn without labels)
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], color='red', lw=3, label='Mean'),
                   Line2D([0], [0], color='blue', lw=2, linestyle='--', label='Median'),
                   Line2D([0], [0], color='green', lw=4, alpha=0.6, label='IQR')]
ax1.legend(handles=legend_elements, loc='upper right')

# 2. Time Series Ridgeline
ax2 = axes[0, 1]

# Group sales data by year-quarter; sort once and reuse the order for the
# ridges, the trend arrows and the tick labels
sales_periods = sales_df['Period'].unique()
ordered_periods = sorted(sales_periods)
period_spacing = 0.6
# Per-period palette kept for reference; the ridges below are coloured by year
period_colors = plt.cm.viridis(np.linspace(0, 1, len(sales_periods)))

# Year range used to map each year onto the colormap's 0-1 interval
first_year = sales_df['Year'].min()
year_span = max(sales_df['Year'].max() - first_year, 1)  # avoid 0/0 for one year

prev_median = None  # median of the previous period, for the trend arrow
for i, period in enumerate(ordered_periods):
    y_offset = i * period_spacing  # baseline of this period's ridge
    period_data = sales_df[sales_df['Period'] == period]['Sales']

    # Calculate kernel density
    kde = gaussian_kde(period_data)
    x_range = np.linspace(0, period_data.max() * 1.2, 200)
    density = kde(x_range)

    # Normalise every ridge to the same peak height
    density_scaled = density / density.max() * 0.5 + y_offset

    # Fill area with color based on year
    year = int(period.split()[0])
    color_intensity = (year - first_year) / year_span

    ax2.fill_between(x_range, y_offset, density_scaled,
                     alpha=0.8, color=plt.cm.plasma(color_intensity))

    # Add outline
    ax2.plot(x_range, density_scaled, color='white', linewidth=1)

    # Add period label
    ax2.text(x_range.max() * 0.8, y_offset + 0.25,
             period, fontsize=9, ha='center', va='center', fontweight='bold')

    # Show trend with arrow: median compared with the preceding period
    curr_median = period_data.median()
    if prev_median is not None:
        rising = curr_median > prev_median
        ax2.text(x_range.max() * 0.9, y_offset + 0.1,
                 '↗' if rising else '↘', fontsize=12, ha='center', va='center',
                 color='green' if rising else 'red', fontweight='bold')
    prev_median = curr_median

ax2.set_title('Sales Performance Over Time\n(Quarterly Distributions)', fontsize=12, fontweight='bold')
ax2.set_xlabel('Sales Amount ($)')
ax2.set_ylabel('Time Period')
ax2.set_yticks([i * period_spacing + 0.25 for i in range(len(ordered_periods))])
ax2.set_yticklabels(ordered_periods)
ax2.grid(True, alpha=0.3)

# 3. Comparative Ridgeline with Overlap Analysis
ax3 = axes[1, 0]

# Compare response times with overlap highlighting
selected_pages = ['Homepage', 'Product Page', 'Checkout']
comparison_spacing = 1.0
comparison_colors = ['blue', 'orange', 'red']
common_x = np.linspace(0, 4, 400)  # shared grid for every ridge

# np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists
integrate_trapezoid = getattr(np, 'trapezoid', None) or np.trapz

overlap_pairs = []  # (x grid, offset ridge curve, page name) per ridge

for i, page_type in enumerate(selected_pages):
    y_offset = i * comparison_spacing  # baseline of this page's ridge
    page_data = response_df[response_df['Page_Type'] == page_type]['Response_Time']

    # Calculate kernel density
    kde = gaussian_kde(page_data)
    density = kde(common_x)

    # Scale density
    density_scaled = density * 2 + y_offset

    # Fill area
    ax3.fill_between(common_x, y_offset, density_scaled,
                     alpha=0.6, color=comparison_colors[i], label=page_type)

    # Store density for overlap calculation
    overlap_pairs.append((common_x, density_scaled, page_type))

# Calculate and highlight overlaps between every pair of ridges
for i, (_, y1, name1) in enumerate(overlap_pairs):
    for j in range(i + 1, len(overlap_pairs)):
        _, y2, name2 = overlap_pairs[j]

        # Two stacked ridges can only overlap above the higher of their two
        # baselines (ridge j's, since j > i) and below the lower curve.
        lower = j * comparison_spacing
        upper = np.minimum(y1, y2)

        # Clip at zero so stretches without overlap contribute nothing,
        # rather than cancelling real overlap with negative area
        overlap_height = np.clip(upper - lower, 0, None)
        overlap_area = integrate_trapezoid(overlap_height, common_x)

        if overlap_area > 0.1:  # Threshold for significant overlap
            ax3.fill_between(common_x, lower, upper,
                             where=overlap_height > 0,
                             alpha=0.8, color='yellow',
                             label=f'{name1}-{name2} Overlap' if i == 0 and j == 1 else "")

ax3.set_title('Response Time Overlap Analysis\n(Selected Page Types)', fontsize=12, fontweight='bold')
ax3.set_xlabel('Response Time (seconds)')
ax3.set_ylabel('Page Type')
ax3.set_yticks([i * comparison_spacing + 1 for i in range(len(selected_pages))])
ax3.set_yticklabels(selected_pages)
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Ridgeline with Quantile Bands
ax4 = axes[1, 1]

# Show income distributions with quantile markers
selected_ages = ['26-35', '36-45', '46-55', '56-65']
quantile_spacing = 1.2
quantile_colors = plt.cm.coolwarm(np.linspace(0, 1, len(selected_ages)))

# Marker styling, one entry per percentile (outer: red, quartiles: orange,
# median: green)
quantiles = [10, 25, 50, 75, 90]
quantile_colors_inner = ['red', 'orange', 'green', 'orange', 'red']
quantile_alphas = [0.8, 0.6, 1.0, 0.6, 0.8]

y_offset = 0  # baseline of the current ridge
for age_group, ridge_color in zip(selected_ages, quantile_colors):
    age_data = income_df.loc[income_df['Age_Group'] == age_group, 'Income']
    log_income = np.log(age_data)

    # Density is estimated on log-income; the grid is mapped back to dollars
    kde = gaussian_kde(log_income)
    x_range = np.linspace(log_income.min(), log_income.max(), 200)
    density = kde(x_range)
    x_range_dollars = np.exp(x_range)

    density_scaled = density * 0.5 + y_offset

    # Main distribution
    ax4.fill_between(x_range_dollars, y_offset, density_scaled,
                     alpha=0.7, color=ridge_color, label=age_group)

    # Percentile values and the ridge height at each of them
    quantile_values = np.percentile(age_data, quantiles)
    marker_heights = [kde(np.log(q_val))[0] * 0.5 + y_offset
                      for q_val in quantile_values]

    # One dot on the curve per percentile
    for q_val, q_height, q_color, q_alpha in zip(
            quantile_values, marker_heights, quantile_colors_inner, quantile_alphas):
        ax4.scatter([q_val], [q_height], color=q_color, s=30, alpha=q_alpha, zorder=5)

    # Thick segment joining the curve at the 25th and 75th percentiles (IQR)
    ax4.plot([quantile_values[1], quantile_values[3]],
             [marker_heights[1], marker_heights[3]],
             color='black', linewidth=5, alpha=0.8)

    # Boxed age-group label, centred on the median income
    median_income = quantile_values[2]
    ax4.text(median_income, y_offset + density.max() * 0.25,
             age_group, fontsize=10, ha='center', va='center', fontweight='bold',
             bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.9))

    y_offset += quantile_spacing

ax4.set_title('Income Quantile Analysis by Age\n(Working Age Groups)', fontsize=12, fontweight='bold')
ax4.set_xlabel('Income ($)')
ax4.set_ylabel('Age Group')
ax4.set_xscale('log')
ax4.set_yticks([i * quantile_spacing + 0.25 for i in range(len(selected_ages))])
ax4.set_yticklabels(selected_ages)
ax4.grid(True, alpha=0.3)

# Format x-axis
ax4.set_xticks([30000, 50000, 100000, 200000, 400000])
ax4.set_xticklabels(['$30K', '$50K', '$100K', '$200K', '$400K'])

plt.tight_layout()
plt.show()


In [None]:
# Interactive ridgeline plots (Plotly structure)
# The snippets below are printed as reference templates, not executed; `df`
# stands for a long-format frame with 'category' and 'value' columns.
print("Interactive Ridgeline Plots (Plotly):")
print("=" * 50)

print("\n1. Basic Interactive Ridgeline")
print("Code structure:")
# Template notes: each ridge's baseline is added BEFORE its curve because
# fill='tonexty' fills towards the trace added immediately before it; the
# template also defines the `colors` it indexes into.
print("""
import numpy as np
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

fig = go.Figure()

categories = df['category'].unique()
y_spacing = 1.0
# One 'r, g, b' string per ridge; cycled when there are more categories
colors = ['68, 1, 84', '59, 82, 139', '33, 145, 140', '94, 201, 98', '253, 231, 37']

for i, category in enumerate(categories):
    category_data = df[df['category'] == category]['value']

    # Calculate KDE
    kde = gaussian_kde(category_data)
    x_range = np.linspace(category_data.min(), category_data.max(), 200)
    density = kde(x_range)

    # Scale and offset density
    y_base = i * y_spacing
    y_values = density * 0.8 + y_base

    # Add baseline first: the ridge below fills down to it
    fig.add_trace(go.Scatter(
        x=x_range,
        y=[y_base] * len(x_range),
        mode='lines',
        line=dict(width=1, color='white'),
        showlegend=False,
        hoverinfo='skip'
    ))

    # Add filled area ('tonexty' fills to the previous trace: the baseline)
    fig.add_trace(go.Scatter(
        x=x_range,
        y=y_values,
        fill='tonexty',
        mode='lines',
        name=category,
        line=dict(width=0),
        fillcolor=f'rgba({colors[i % len(colors)]}, 0.7)',
        hovertemplate="<b>%{fullData.name}</b><br>" +
                      "Value: %{x}<br>" +
                      "Density: %{customdata}<br>" +
                      "<extra></extra>",
        customdata=density
    ))

fig.update_layout(
    title="Interactive Ridgeline Plot",
    xaxis_title="Value",
    yaxis_title="Category",
    yaxis=dict(
        tickvals=[i * y_spacing + 0.4 for i in range(len(categories))],
        ticktext=categories
    ),
    hovermode='x unified'
)

fig.show()
""")

print("\n2. Ridgeline with Statistical Annotations")
print("Code structure:")
# Template note: every ridge gets its own invisible baseline trace, added just
# before the curve, because fill='tonexty' fills towards the previously added
# trace (without it, ridge i would fill down to ridge i-1's curve).
print("""
fig = go.Figure()

for i, category in enumerate(categories):
    category_data = df[df['category'] == category]['value']

    # KDE calculation
    kde = gaussian_kde(category_data)
    x_range = np.linspace(category_data.min(), category_data.max(), 200)
    density = kde(x_range)

    y_base = i * y_spacing
    y_values = density * 0.8 + y_base

    # Baseline for this ridge (fill target for the trace below)
    fig.add_trace(go.Scatter(
        x=x_range,
        y=[y_base] * len(x_range),
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip'
    ))

    # Main distribution
    fig.add_trace(go.Scatter(
        x=x_range,
        y=y_values,
        fill='tonexty',
        mode='lines',
        name=category,
        line=dict(width=0)
    ))

    # Add statistical markers
    mean_val = category_data.mean()
    median_val = category_data.median()

    mean_density = kde(mean_val)[0] * 0.8 + y_base
    median_density = kde(median_val)[0] * 0.8 + y_base

    # Mean marker
    fig.add_trace(go.Scatter(
        x=[mean_val],
        y=[mean_density],
        mode='markers',
        marker=dict(symbol='line-ns', size=15, color='red'),
        name=f'{category} Mean',
        showlegend=False,
        hovertemplate="Mean: %{x:.2f}<extra></extra>"
    ))

    # Median marker
    fig.add_trace(go.Scatter(
        x=[median_val],
        y=[median_density],
        mode='markers',
        marker=dict(symbol='line-ns', size=12, color='blue'),
        name=f'{category} Median',
        showlegend=False,
        hovertemplate="Median: %{x:.2f}<extra></extra>"
    ))

    # Add annotation
    fig.add_annotation(
        x=mean_val,
        y=mean_density + 0.1,
        text=f"μ={mean_val:.1f}<br>σ={category_data.std():.1f}",
        showarrow=False,
        font=dict(size=10),
        bgcolor="white",
        bordercolor="black",
        borderwidth=1
    )

fig.update_layout(
    title="Ridgeline with Statistical Overlays",
    showlegend=True
)

fig.show()
""")

print("\n3. Animated Ridgeline (Time Evolution)")
print("Code structure:")
# Template notes: the shared x-range bounds are defined inside the snippet,
# categories with fewer than two observations are skipped (a KDE cannot be
# built from a single point), and each ridge fills to its own baseline trace.
print("""
# Prepare data for animation
time_periods = sorted(df['time_period'].unique())
# Shared x-range so ridges stay aligned from frame to frame
overall_min, overall_max = df['value'].min(), df['value'].max()
frames = []

for t in time_periods:
    frame_data = []
    period_data = df[df['time_period'] == t]

    for i, category in enumerate(categories):
        cat_data = period_data[period_data['category'] == category]['value']

        # A KDE needs at least two observations
        if len(cat_data) > 1:
            kde = gaussian_kde(cat_data)
            x_range = np.linspace(overall_min, overall_max, 200)
            density = kde(x_range)

            y_base = i * y_spacing
            y_values = density * 0.8 + y_base

            # Baseline for this ridge (fill target for the trace below)
            frame_data.append(go.Scatter(
                x=x_range,
                y=[y_base] * len(x_range),
                mode='lines',
                line=dict(width=0),
                showlegend=False,
                hoverinfo='skip'
            ))

            frame_data.append(go.Scatter(
                x=x_range,
                y=y_values,
                fill='tonexty',
                mode='lines',
                name=category,
                line=dict(width=0)
            ))

    frames.append(go.Frame(data=frame_data, name=str(t)))

# Create initial figure
fig = go.Figure(data=frames[0].data, frames=frames)

# Add animation controls
fig.update_layout(
    updatemenus=[
        dict(type="buttons",
             buttons=[dict(label="Play", method="animate", args=[None]),
                      dict(label="Pause", method="animate", args=[[None]])])
    ],
    sliders=[dict(
        steps=[dict(args=[[f.name]], label=f.name, method="animate")
               for f in frames],
        active=0
    )],
    title="Animated Ridgeline Evolution"
)

fig.show()
""")

print("\n4. Interactive Comparison with Brushing")
print("Code structure:")
# Template note: each ridge is preceded by its own baseline trace in the same
# subplot, because fill='tonexty' fills towards the previously added trace.
print("""
from plotly.subplots import make_subplots

# Create subplots for comparison
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Distribution Comparison", "Summary Statistics"),
    column_widths=[0.7, 0.3]
)

# Left plot: Ridgeline
for i, category in enumerate(categories):
    category_data = df[df['category'] == category]['value']

    kde = gaussian_kde(category_data)
    x_range = np.linspace(category_data.min(), category_data.max(), 200)
    density = kde(x_range)

    y_base = i * y_spacing
    y_values = density * 0.8 + y_base

    # Baseline for this ridge (fill target for the trace below)
    fig.add_trace(go.Scatter(
        x=x_range,
        y=[y_base] * len(x_range),
        mode='lines',
        line=dict(width=0),
        showlegend=False,
        hoverinfo='skip',
        legendgroup=category
    ), row=1, col=1)

    fig.add_trace(go.Scatter(
        x=x_range,
        y=y_values,
        fill='tonexty',
        mode='lines',
        name=category,
        line=dict(width=0),
        legendgroup=category
    ), row=1, col=1)

# Right plot: Box plots for comparison
fig.add_trace(go.Box(
    y=df['category'],
    x=df['value'],
    orientation='h',
    name="Distribution Summary",
    showlegend=False
), row=1, col=2)

# Add selection capability
fig.update_layout(
    title="Interactive Distribution Comparison",
    dragmode='select',
    hovermode='x unified'
)

# Add callback for selection (pseudo-code)
# fig.data[0].on_selection(update_statistics_panel)

fig.show()
""")


In [None]:
# Statistical analysis of ridgeline plot data: banner, rule, then the heading
# of the first section
print("Ridgeline Plot Statistical Analysis:", "=" * 50, sep="\n")

# 1. Distribution Shape Analysis
print("1. DISTRIBUTION SHAPE ANALYSIS:")

from scipy import stats

def analyze_distribution_shape(data, name):
    """Summarise the centre, spread and shape of a single distribution.

    Args:
        data: pandas Series of numeric observations. ``mean``, ``median``,
            ``std`` and ``sample`` are called on it, and a Shapiro-Wilk
            test is run on (a sample of) its values.
        name: Label of the distribution. Accepted so call sites stay
            self-describing; it does not influence the result.

    Returns:
        dict with the keys ``mean``, ``median``, ``std``, ``skewness``,
        ``kurtosis`` (as returned by ``scipy.stats.kurtosis`` with its
        default settings), ``skew_desc``, ``kurt_desc``, ``is_normal``
        (True when the Shapiro-Wilk p-value exceeds 0.05) and
        ``shapiro_p``.
    """
    # Basic statistics
    mean = data.mean()
    median = data.median()
    std = data.std()

    # Shape statistics
    skewness = stats.skew(data)
    kurtosis = stats.kurtosis(data)

    # Normality test. At most 5000 observations are passed to Shapiro-Wilk;
    # SciPy documents that p-values may be inaccurate beyond that size.
    shapiro_stat, shapiro_p = stats.shapiro(data.sample(min(5000, len(data))))

    # Classify skewness with symmetric, inclusive thresholds. Testing the two
    # skewed cases explicitly means a value of exactly 0.5 is reported as
    # right-skewed (it used to fall through to the left-skewed branch) and an
    # undefined (NaN) skewness is no longer labelled as left-skewed.
    if skewness >= 0.5:
        skew_desc = "Right-skewed (positive)"
    elif skewness <= -0.5:
        skew_desc = "Left-skewed (negative)"
    else:
        skew_desc = "Approximately symmetric"

    # Classify tail weight from the kurtosis value
    if kurtosis > 1:
        kurt_desc = "Heavy-tailed (leptokurtic)"
    elif kurtosis < -1:
        kurt_desc = "Light-tailed (platykurtic)"
    else:
        kurt_desc = "Normal-like tails (mesokurtic)"

    # Conventional 5% significance level for rejecting normality
    is_normal = shapiro_p > 0.05

    return {
        'mean': mean, 'median': median, 'std': std,
        'skewness': skewness, 'kurtosis': kurtosis,
        'skew_desc': skew_desc, 'kurt_desc': kurt_desc,
        'is_normal': is_normal, 'shapiro_p': shapiro_p
    }

# Per-subject shape report: one summary block for every exam subject.
print("   Subject Score Distribution Analysis:")
for subject in subjects:
    subject_data = exam_df.loc[exam_df['Subject'] == subject, 'Score']
    analysis = analyze_distribution_shape(subject_data, subject)
    normal_flag = 'Yes' if analysis['is_normal'] else 'No'

    # Assemble the block first, then emit it with a single print call.
    report_lines = [
        f"\n     {subject}:",
        f"       Mean: {analysis['mean']:.1f}, Median: {analysis['median']:.1f}",
        f"       Shape: {analysis['skew_desc']}",
        f"       Tails: {analysis['kurt_desc']}",
        f"       Skewness: {analysis['skewness']:.3f}",
        f"       Kurtosis: {analysis['kurtosis']:.3f}",
        f"       Normal?: {normal_flag} (p={analysis['shapiro_p']:.4f})",
    ]
    print("\n".join(report_lines))

# 2. Group Comparison Analysis
print("\n2. GROUP COMPARISON ANALYSIS:")

# Compare exam scores across subjects using statistical tests
print("   Statistical Comparisons (Exam Scores):")


def _subject_scores(subject_name):
    """Return the 'Score' column of exam_df restricted to one subject."""
    return exam_df[exam_df['Subject'] == subject_name]['Score']


# One array of scores per subject, in the order given by `subjects`
subject_groups = [_subject_scores(subject).values for subject in subjects]

# One-way ANOVA: do the subject means differ?
anova_stat, anova_p = stats.f_oneway(*subject_groups)
anova_verdict = 'Significant differences' if anova_p < 0.05 else 'No significant differences'

print("     ANOVA Test (Equal Means):")
print(f"       F-statistic: {anova_stat:.3f}")
print(f"       p-value: {anova_p:.6f}")
print(f"       Result: {anova_verdict} between subjects")

# Kruskal-Wallis: rank-based (non-parametric) counterpart of the ANOVA
kw_stat, kw_p = stats.kruskal(*subject_groups)
kw_verdict = 'Significant differences' if kw_p < 0.05 else 'No significant differences'

print("\n     Kruskal-Wallis Test (Equal Distributions):")
print(f"       H-statistic: {kw_stat:.3f}")
print(f"       p-value: {kw_p:.6f}")
print(f"       Result: {kw_verdict} in distributions")

# Pairwise comparisons (selected pairs)
comparison_pairs = [('Mathematics', 'Science'), ('English', 'Art'), ('Mathematics', 'Art')]

print("\n     Pairwise Comparisons (Mann-Whitney U):")
for subj1, subj2 in comparison_pairs:
    data1, data2 = _subject_scores(subj1), _subject_scores(subj2)

    u_stat, u_p = stats.mannwhitneyu(data1, data2, alternative='two-sided')
    # Simple effect size: U divided by the number of cross-group pairs
    effect_size = u_stat / (len(data1) * len(data2))
    mean_diff = data1.mean() - data2.mean()
    significance = 'Yes' if u_p < 0.05 else 'No'

    print("\n".join([
        f"       {subj1} vs {subj2}:",
        f"         Mean difference: {mean_diff:+.1f} points",
        f"         p-value: {u_p:.4f}",
        f"         Effect size: {effect_size:.3f}",
        f"         Significance: {significance}",
    ]))

# 3. Age Group Income Analysis
print("\n3. AGE GROUP INCOME ANALYSIS:")


def gini_coefficient(x):
    """Calculate Gini coefficient for income inequality.

    Args:
        x: One-dimensional array-like of incomes (non-empty, with a
            non-zero total).

    Returns:
        The value ``(n + 1 - 2 * sum(cumsum) / total) / n`` computed on the
        sorted incomes; for non-negative incomes this is 0 when everyone
        earns the same and grows as income becomes more concentrated.
    """
    sorted_x = np.sort(x)
    n = len(x)
    cumsum = np.cumsum(sorted_x)
    return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n


# High-earner cut-off: 90th percentile of ALL incomes. Measuring each group
# against its own 90th percentile would yield ~10% for every group by
# construction, which says nothing about how the groups differ.
top_10_threshold = np.percentile(income_df['Income'], 90)

print("   Income Distribution Characteristics:")
for age_group in age_groups:
    age_data = income_df[income_df['Age_Group'] == age_group]['Income']

    # Income statistics (using robust statistics due to skewness)
    median_income = age_data.median()
    q25, q75 = np.percentile(age_data, [25, 75])
    iqr = q75 - q25

    # Gini coefficient (income inequality measure)
    gini = gini_coefficient(age_data)

    # Share of this age group at or above the overall top-10% cut-off
    high_earners_pct = (age_data >= top_10_threshold).mean() * 100

    print(f"\n     {age_group}:")
    print(f"       Median Income: ${median_income:,.0f}")
    print(f"       IQR: ${q25:,.0f} - ${q75:,.0f} (spread: ${iqr:,.0f})")
    print(f"       Income Inequality (Gini): {gini:.3f}")
    print(f"       High Earners (>90th percentile): {high_earners_pct:.1f}%")

# Cross-age group comparison
print("\n   Age Group Income Comparison:")

# Median income per age group, listed from youngest to oldest so that the
# list position can serve as an ordinal "age rank".
age_order = ['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
age_medians = [
    income_df.loc[income_df['Age_Group'] == age_group, 'Income'].median()
    for age_group in age_order
]

# Spearman correlation with age order
age_ranks = list(range(len(age_order)))
correlation, corr_p = stats.spearmanr(age_ranks, age_medians)

# Report the direction of a significant trend. A significant negative
# correlation must not be reported as "No significant".
if corr_p < 0.05 and correlation > 0:
    trend = 'Significant positive'
elif corr_p < 0.05 and correlation < 0:
    trend = 'Significant negative'
else:
    trend = 'No significant'

print(f"     Age-Income Correlation: {correlation:.3f} (p={corr_p:.4f})")
print(f"     Trend: {trend} correlation with age")

# Find peak earning age group
peak_age_idx = np.argmax(age_medians)
peak_age_group = age_order[peak_age_idx]
peak_income = age_medians[peak_age_idx]

print(f"     Peak Earning Group: {peak_age_group} (${peak_income:,.0f} median)")

# 4. Response Time Performance Analysis
print("\n4. RESPONSE TIME PERFORMANCE ANALYSIS:")

print("   Page Performance Metrics:")
for page_type in page_types:
    page_data = response_df.loc[response_df['Page_Type'] == page_type, 'Response_Time']

    # Performance percentiles, addressable by percentile instead of position
    percentiles = [50, 75, 90, 95, 99]
    perf_values = np.percentile(page_data, percentiles)
    by_percentile = dict(zip(percentiles, perf_values))

    # SLA analysis (assuming 2 second SLA)
    sla_threshold = 2.0
    sla_compliance = (page_data <= sla_threshold).mean() * 100

    # Performance classification: first upper bound that the 95th percentile
    # stays below wins; anything at or above 3.0 seconds is "Poor".
    p95 = by_percentile[95]
    perf_class = "Poor"
    for upper_bound, label in ((1.0, "Excellent"), (2.0, "Good"), (3.0, "Fair")):
        if p95 < upper_bound:
            perf_class = label
            break

    print("\n".join([
        f"\n     {page_type}:",
        f"       Median: {by_percentile[50]:.3f}s",
        f"       75th percentile: {by_percentile[75]:.3f}s",
        f"       95th percentile: {by_percentile[95]:.3f}s",
        f"       99th percentile: {by_percentile[99]:.3f}s",
        f"       SLA Compliance (<2s): {sla_compliance:.1f}%",
        f"       Performance Class: {perf_class}",
    ]))

# Identify worst performing pages
print("\n   Performance Ranking (by 95th percentile):")
page_performance = sorted(
    (
        (
            page_type,
            np.percentile(
                response_df[response_df['Page_Type'] == page_type]['Response_Time'], 95
            ),
        )
        for page_type in page_types
    ),
    key=lambda entry: entry[1],  # ascending = better
)

for rank, (page_type, p95) in enumerate(page_performance, 1):
    print(f"     {rank}. {page_type}: {p95:.3f}s")

# 5. Seasonal Temperature Variation Analysis
print("\n5. SEASONAL TEMPERATURE ANALYSIS:")

print("   Monthly Temperature Characteristics:")
# Per-month mean / std / min / max of the temperature samples
monthly_stats = temp_df.groupby('Month')['Temperature'].agg(['mean', 'std', 'min', 'max'])

# Calculate seasonal metrics
winter_months = ['Dec', 'Jan', 'Feb']
spring_months = ['Mar', 'Apr', 'May']
summer_months = ['Jun', 'Jul', 'Aug']
fall_months = ['Sep', 'Oct', 'Nov']

seasonal_temps = {}
for season, months_list in [('Winter', winter_months), ('Spring', spring_months),
                           ('Summer', summer_months), ('Fall', fall_months)]:
    season_data = temp_df[temp_df['Month'].isin(months_list)]['Temperature']
    seasonal_temps[season] = {
        'mean': season_data.mean(),
        'std': season_data.std(),
        'range': season_data.max() - season_data.min()
    }

print("\n   Seasonal Summary:")
# The loop variable is deliberately NOT named `stats`: that name is bound to
# the scipy.stats module in this notebook, and rebinding it to a dict would
# break every later `stats.<function>` call in the same session.
for season, season_summary in seasonal_temps.items():
    print(f"     {season}: {season_summary['mean']:.1f}°C ± {season_summary['std']:.1f}°C (range: {season_summary['range']:.1f}°C)")

# Temperature variability analysis
annual_range = temp_df['Temperature'].max() - temp_df['Temperature'].min()
most_variable_month = monthly_stats['std'].idxmax()
least_variable_month = monthly_stats['std'].idxmin()

print("\n   Temperature Variability:")
print(f"     Annual Range: {annual_range:.1f}°C")
print(f"     Most Variable Month: {most_variable_month} ({monthly_stats.loc[most_variable_month, 'std']:.1f}°C std)")
print(f"     Least Variable Month: {least_variable_month} ({monthly_stats.loc[least_variable_month, 'std']:.1f}°C std)")

# 6. Ridgeline Plot Design Effectiveness
print("\n6. RIDGELINE PLOT DESIGN ANALYSIS:")

print("   Dataset Suitability for Ridgeline Visualization:")

# Each dataset is described by one row; the field names are attached
# afterwards so the qualitative ratings read like a table.
suitability_fields = ('groups', 'overlap_level', 'pattern_clarity',
                      'comparison_value', 'recommendation')
suitability_rows = [
    ('Temperature by Month',
     (len(months), 'Moderate', 'High', 'Excellent',
      'Ideal - clear seasonal patterns')),
    ('Exam Scores by Subject',
     (len(subjects), 'High', 'Moderate', 'Good',
      'Good - shows performance differences')),
    ('Income by Age Group',
     (len(age_groups), 'Low', 'Very High', 'Excellent',
      'Excellent - distinct patterns by age')),
    ('Response Time by Page',
     (len(page_types), 'Moderate', 'High', 'Good',
      'Good - performance comparison')),
]

datasets_analysis = {
    title: dict(zip(suitability_fields, ratings))
    for title, ratings in suitability_rows
}

# One indented summary block per dataset, in table order
for dataset, analysis in datasets_analysis.items():
    print("\n".join([
        f"\n     {dataset}:",
        f"       Groups: {analysis['groups']}",
        f"       Overlap: {analysis['overlap_level']}",
        f"       Pattern Clarity: {analysis['pattern_clarity']}",
        f"       Comparison Value: {analysis['comparison_value']}",
        f"       Recommendation: {analysis['recommendation']}",
    ]))

# Closing guidance, organised as (heading, bullet lines) sections that are
# printed in order: best practices, use cases, then alternatives.
guidance_sections = (
    ("\n   Ridgeline Plot Best Practices:", (
        "   ✓ Use 3-12 groups (too few = wasted space, too many = cluttered)",
        "   ✓ Ideal when distributions have different shapes or centers",
        "   ✓ Best for continuous data with sufficient sample sizes (n>30)",
        "   ✓ Apply consistent vertical spacing for visual balance",
        "   ✓ Order groups logically (time, magnitude, importance)",
        "   ✓ Use consistent color schemes within groups",
        "   ✓ Add statistical overlays (mean, median) when appropriate",
    )),
    ("\nWhen to Use Ridgeline Plots:", (
        "   • Comparing distributions across categories",
        "   • Time series of distributions (monthly, yearly)",
        "   • Performance metrics across different systems",
        "   • Survey responses by demographic groups",
        "   • A/B test result distributions",
        "   • Environmental measurements across locations",
    )),
    ("\nAlternatives to Consider:", (
        "   • Box plots for quartile focus",
        "   • Violin plots for symmetric distribution display",
        "   • Histogram grids for discrete bin analysis",
        "   • Heat maps for distribution matrices",
        "   • Strip plots for individual observation focus",
    )),
)

for heading, bullets in guidance_sections:
    print(heading)
    print(*bullets, sep="\n")
