# Box Plot/Violin Plot - Compare Distributions

**Use Case**: Compare distributions across groups (salaries by department, prices by brand)

This notebook demonstrates how to create effective box plots and violin plots for comparing distributions across different categories.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from matplotlib/seaborn; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style for better-looking plots.
# Matplotlib 3.6 renamed its bundled seaborn styles with a "seaborn-v0_8-"
# prefix, while older releases only ship the unprefixed name. Use whichever
# one is installed; if neither is, fall through to the modern name so the
# usual "style not found" error is still raised.
_darkgrid_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next(
    (name for name in _darkgrid_candidates if name in plt.style.available),
    _darkgrid_candidates[0],
))
sns.set_palette("husl")

# Set random seed for reproducibility
np.random.seed(42)

In [None]:
# Synthetic salary data: one normally distributed sample of 50 salaries per
# department, centred on the department's base salary with a standard
# deviation of 15% of that base.
departments = ['Sales', 'Engineering', 'Marketing', 'HR', 'Finance']
dept_base_salary = {'Sales': 60000, 'Engineering': 80000, 'Marketing': 55000,
                    'HR': 50000, 'Finance': 70000}

salary_data, dept_labels = [], []
for dept_name in departments:
    centre = dept_base_salary[dept_name]
    salary_data += list(np.random.normal(centre, centre * 0.15, 50))
    dept_labels += [dept_name] * 50

df_salaries = pd.DataFrame({'Department': dept_labels, 'Salary': salary_data})

# Synthetic price data: 100 log-normally distributed prices per brand, with
# the brand's base price as the median (log-scale sigma of 0.3).
brands = ['Premium', 'Standard', 'Budget', 'Luxury']
brand_base_price = {'Premium': 150, 'Standard': 80, 'Budget': 40, 'Luxury': 300}

price_data, brand_labels = [], []
for brand_name in brands:
    draws = np.random.lognormal(np.log(brand_base_price[brand_name]), 0.3, 100)
    price_data += list(draws)
    brand_labels += [brand_name] * 100

df_prices = pd.DataFrame({'Brand': brand_labels, 'Price': price_data})

# 2x2 grid of panels; the bottom row (ax3, ax4) is drawn further down.
fig, axes_grid = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes_grid

# Shared title styling for the panels drawn here
title_font = dict(fontsize=14, fontweight='bold')

# Top-left: plain box plot of salaries per department
sns.boxplot(x='Department', y='Salary', data=df_salaries, ax=ax1)
ax1.set_title('Salary Distribution by Department', **title_font)
ax1.set(xlabel='Department', ylabel='Salary ($)')
ax1.tick_params(axis='x', rotation=45)

# Top-right: violin plot of prices per brand
sns.violinplot(x='Brand', y='Price', data=df_prices, ax=ax2)
ax2.set_title('Price Distribution by Brand', **title_font)
ax2.set(xlabel='Brand', ylabel='Price ($)')
ax2.tick_params(axis='x', rotation=45)

# Box plot with individual points
# The strip plot below already draws every observation, so the box plot's own
# outlier markers are switched off; otherwise each outlier would appear twice
# (once as a flier on the box's centre line, once as a jittered red point).
sns.boxplot(data=df_salaries, x='Department', y='Salary', ax=ax3,
            showfliers=False)
sns.stripplot(data=df_salaries, x='Department', y='Salary', ax=ax3,
              color='red', alpha=0.5, size=3)
ax3.set_title('Salary Distribution with Individual Points', fontsize=14, fontweight='bold')
ax3.set_xlabel('Department')
ax3.set_ylabel('Salary ($)')
ax3.tick_params(axis='x', rotation=45)

# Split violin plot
# Build one record per synthetic employee: 25 male and 25 female salaries per
# department, drawn from normal distributions around the department base.
gender_base_salary = {'Sales': 60000, 'Engineering': 80000, 'Marketing': 55000,
                      'HR': 50000, 'Finance': 70000}

gender_data = []
for dept in departments:
    base_salary = gender_base_salary[dept]
    # Add slight gender pay gap for demonstration: means sit 5% above / below
    # the base. The male sample is drawn first, then the female sample.
    male_salaries = np.random.normal(base_salary * 1.05, base_salary * 0.15, 25)
    female_salaries = np.random.normal(base_salary * 0.95, base_salary * 0.15, 25)

    for gender, draws in (('Male', male_salaries), ('Female', female_salaries)):
        gender_data += [{'Department': dept, 'Salary': amount, 'Gender': gender}
                        for amount in draws]

df_gender = pd.DataFrame(gender_data)

# Bottom-right panel: each violin is split into its two gender halves
sns.violinplot(x='Department', y='Salary', hue='Gender', data=df_gender,
               split=True, ax=ax4)
ax4.set_title('Salary Distribution by Department and Gender', fontsize=14, fontweight='bold')
ax4.set(xlabel='Department', ylabel='Salary ($)')
ax4.tick_params(axis='x', rotation=45)

# Finalise and render the whole 2x2 figure
plt.tight_layout()
plt.show()

In [None]:
# Statistical comparison between groups
# The same set of descriptive statistics is computed for both datasets,
# rounded to two decimals.
summary_stats = ['count', 'mean', 'median', 'std', 'min', 'max']
dept_summary = df_salaries.groupby('Department')['Salary'].agg(summary_stats).round(2)
brand_summary = df_prices.groupby('Brand')['Price'].agg(summary_stats).round(2)

divider = "=" * 50

print("Statistical Summary by Department:")
print(divider)
print(dept_summary)

print("\nStatistical Summary by Brand:")
print(divider)
print(brand_summary)

In [None]:
# One-way ANOVA: tests whether mean salary differs between departments
from scipy.stats import f_oneway

# One array of salaries per department, in the order given by `departments`
dept_groups = [
    df_salaries.loc[df_salaries['Department'] == dept, 'Salary'].to_numpy()
    for dept in departments
]

# Each department's array is passed to f_oneway as a separate sample
f_stat, p_value = f_oneway(*dept_groups)

print("One-way ANOVA Results for Salary by Department:")
print(f"F-statistic: {f_stat:.4f}")
print(f"p-value: {p_value:.6f}")

# Report the outcome at the 5% significance level
result_line = (
    "Result: There are statistically significant differences between departments."
    if p_value < 0.05
    else "Result: No statistically significant differences between departments."
)
print(result_line)

# Post-hoc test (pairwise comparisons)
# Two safeguards make these comparisons valid as a post-hoc procedure:
#   * Welch's t-test (equal_var=False) does not assume the two departments
#     share a variance.
#   * Bonferroni adjustment multiplies each p-value by the number of
#     comparisons (capped at 1) so that running many tests does not inflate
#     the family-wise false-positive rate.
from scipy.stats import ttest_ind

print("\nPairwise t-tests between departments:")
print("(Welch's t-test, Bonferroni-adjusted p-values)")
print("=" * 40)

# Every unordered pair of departments, in the order they appear in `departments`
dept_pairs = [(first, second)
              for i, first in enumerate(departments)
              for second in departments[i+1:]]
n_comparisons = len(dept_pairs)

for dept1, dept2 in dept_pairs:
    group1 = df_salaries.loc[df_salaries['Department'] == dept1, 'Salary']
    group2 = df_salaries.loc[df_salaries['Department'] == dept2, 'Salary']
    t_stat, p_val = ttest_ind(group1, group2, equal_var=False)
    p_adj = min(p_val * n_comparisons, 1.0)
    # Significance stars are based on the adjusted p-value
    significance = "***" if p_adj < 0.001 else "**" if p_adj < 0.01 else "*" if p_adj < 0.05 else "ns"
    print(f"{dept1} vs {dept2}: t = {t_stat:.3f}, p = {p_val:.6f}, "
          f"p_adj = {p_adj:.6f} {significance}")

In [None]:
# Advanced visualization: Box plots with custom styling
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Custom box plot with notches and different colors
# Tick labels are applied after plotting instead of through boxplot's `labels`
# keyword: Matplotlib 3.9 deprecated `labels` in favour of `tick_labels`, and
# set_xticklabels works on both sides of that rename.
box_plot = ax1.boxplot([df_salaries[df_salaries['Department'] == dept]['Salary'].values
                        for dept in departments],
                       notch=True,  # Shows confidence interval around median
                       patch_artist=True,  # Allows coloring
                       showmeans=True)  # Shows mean as well as median
ax1.set_xticklabels(departments)

# Color the boxes (zip stops at the shorter of boxes / colors)
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightyellow', 'lightpink']
for patch, color in zip(box_plot['boxes'], colors):
    patch.set_facecolor(color)

ax1.set_title('Salary Distribution by Department\n(with notches and means)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Department')
ax1.set_ylabel('Salary ($)')
ax1.tick_params(axis='x', rotation=45)
ax1.grid(True, alpha=0.3)

# Horizontal box plot: the category goes on y and the numeric value on x
sns.boxplot(data=df_prices, y='Brand', x='Price', orient='h', ax=ax2)
ax2.set_title('Price Distribution by Brand (Horizontal)', fontsize=14, fontweight='bold')
ax2.set_ylabel('Brand')
ax2.set_xlabel('Price ($)')

plt.tight_layout()
plt.show()