# Bar/Column Chart - Categorical Comparison

**Use Case**: Compare categories (sales by region, product ratings, survey responses)

This notebook demonstrates how to create effective bar charts for comparing categorical data.

In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency, f_oneway
warnings.filterwarnings('ignore')

# Fix NumPy's global RNG state so randomly generated sample data repeats across runs
np.random.seed(seed=42)

# Plot appearance: apply the dark-grid style sheet first, then set the default
# color cycle to the "husl" palette (order matters: both touch the color cycle)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

In [None]:
# Basic bar charts: one vertical (sales by region) and one horizontal (product ratings).
# Sample data is hard-coded for illustration.
regions = ['North', 'South', 'East', 'West', 'Central']
sales = [450000, 380000, 520000, 410000, 390000]
products = ['Product A', 'Product B', 'Product C', 'Product D', 'Product E']
ratings = [4.5, 3.8, 4.2, 4.7, 3.9]

# One row, two panels: vertical bars on the left, horizontal bars on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Vertical bar chart
bars1 = ax1.bar(regions, sales, color='steelblue', edgecolor='navy', linewidth=1.5)
ax1.set_title('Sales by Region', fontsize=14, fontweight='bold')
ax1.set_xlabel('Region')
ax1.set_ylabel('Sales ($)')
ax1.grid(True, axis='y', alpha=0.3)

# Value labels on top of each bar. Axes.bar_label (matplotlib >= 3.4) anchors the
# text to the bar edge, so no per-bar x/y arithmetic is needed.
ax1.bar_label(bars1, labels=[f'${amount:,.0f}' for amount in sales], fontsize=10)

# Horizontal bar chart
bars2 = ax2.barh(products, ratings, color='coral', edgecolor='darkred', linewidth=1.5)
ax2.set_title('Product Ratings', fontsize=14, fontweight='bold')
ax2.set_xlabel('Rating (out of 5)')
ax2.set_ylabel('Product')
ax2.set_xlim(0, 5)  # ratings are on a 0-5 scale, so show the full scale
ax2.grid(True, axis='x', alpha=0.3)

# Value labels to the right of each bar; the text comes from the bar's own width,
# so it cannot drift out of sync with the plotted data. `padding` is in points.
ax2.bar_label(bars2, fmt='%.1f', padding=3, fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Advanced bar chart examples: grouped bars, error bars, sorted bars, percentage bars.
# Sample data for the different scenarios (hard-coded for illustration).
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
revenue_2023 = [85000, 92000, 88000, 95000, 105000, 98000]
revenue_2024 = [89000, 96000, 94000, 101000, 110000, 108000]

categories = ['Category A', 'Category B', 'Category C', 'Category D']
values = [23, 45, 56, 78]
errors = [3, 5, 4, 6]  # Error bars

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Grouped bar chart comparison: two bars per month, offset half a bar width
# to either side of the tick position
x = np.arange(len(months))
width = 0.35

ax1.bar(x - width/2, revenue_2023, width, label='2023', color='lightblue', edgecolor='blue')
ax1.bar(x + width/2, revenue_2024, width, label='2024', color='lightcoral', edgecolor='red')
ax1.set_title('Monthly Revenue Comparison', fontsize=14, fontweight='bold')
ax1.set_xlabel('Month')
ax1.set_ylabel('Revenue ($)')
ax1.set_xticks(x)
ax1.set_xticklabels(months)
ax1.legend()
ax1.grid(True, axis='y', alpha=0.3)

# Bar chart with error bars
ax2.bar(categories, values, yerr=errors, capsize=5,
        color='gold', edgecolor='orange', linewidth=2)
ax2.set_title('Categories with Error Bars', fontsize=14, fontweight='bold')
ax2.set_xlabel('Category')
ax2.set_ylabel('Value')
ax2.grid(True, axis='y', alpha=0.3)

# Survey data shared by the two bottom panels
df_survey = pd.DataFrame({
    'Response': ['Excellent', 'Good', 'Fair', 'Poor', 'Very Poor'],
    'Count': [145, 230, 85, 25, 15]
})

# Colors are keyed by response label rather than by bar position, so a response
# keeps the same color in both panels no matter how the rows are sorted.
response_colors = {
    'Excellent': 'darkgreen',
    'Good': 'green',
    'Fair': 'yellow',
    'Poor': 'orange',
    'Very Poor': 'red',
}

# Sorted bar chart (ascending, so the largest bar ends up at the top of the barh)
df_sorted = df_survey.sort_values('Count', ascending=True)
colors = df_sorted['Response'].map(response_colors).tolist()
ax3.barh(df_sorted['Response'], df_sorted['Count'], color=colors)
ax3.set_title('Survey Responses (Sorted)', fontsize=14, fontweight='bold')
ax3.set_xlabel('Number of Responses')
ax3.grid(True, axis='x', alpha=0.3)

# Percentage bar chart: each response's share of all responses
total = df_survey['Count'].sum()
percentages = (df_survey['Count'] / total * 100).tolist()

bars = ax4.bar(df_survey['Response'], percentages,
               color=df_survey['Response'].map(response_colors).tolist())
ax4.set_title('Survey Responses (Percentage)', fontsize=14, fontweight='bold')
ax4.set_xlabel('Response')
ax4.set_ylabel('Percentage (%)')
ax4.tick_params(axis='x', rotation=45)
ax4.grid(True, axis='y', alpha=0.3)

# Percentage labels above each bar (padding is in points)
ax4.bar_label(bars, fmt='%.1f%%', padding=2, fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Seaborn bar plots with statistical enhancements:
# a mean-with-confidence-interval bar plot and a count plot.

# Re-seed so this cell produces the same synthetic data every time it is run
np.random.seed(42)

# Synthetic performance scores: 50 normally distributed scores per department
# (standard deviation 10) around a department-specific mean
departments = ['Sales', 'Marketing', 'Engineering', 'HR']
base_scores = {'Sales': 75, 'Marketing': 80, 'Engineering': 85, 'HR': 70}
data = [
    {'Department': dept, 'Performance_Score': score}
    for dept in departments
    for score in np.random.normal(base_scores[dept], 10, 50)
]

df = pd.DataFrame(data)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Seaborn bar plot with confidence intervals.
# `errorbar=('ci', 95)` is the replacement for the deprecated `ci=95` argument
# (requires seaborn >= 0.12).
# NOTE(review): recent seaborn releases also warn when `palette` is given without
# `hue`; the replacement needs seaborn >= 0.13 - revisit once the minimum
# supported seaborn version is decided.
sns.barplot(data=df, x='Department', y='Performance_Score', ax=ax1,
            errorbar=('ci', 95), capsize=0.1, palette='viridis')
ax1.set_title('Performance Score by Department\n(with 95% CI)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Department')
ax1.set_ylabel('Performance Score')
ax1.tick_params(axis='x', rotation=45)

# Count plot
# Synthetic categorical data: 1000 survey responses drawn with the given
# probabilities in a single vectorized call
responses = ['Very Satisfied', 'Satisfied', 'Neutral', 'Dissatisfied', 'Very Dissatisfied']
weights = [0.3, 0.4, 0.15, 0.1, 0.05]
satisfaction_data = np.random.choice(responses, size=1000, p=weights).tolist()

df_satisfaction = pd.DataFrame({'Satisfaction': satisfaction_data})

# `order=responses` keeps the bars in best-to-worst order instead of order of appearance
sns.countplot(data=df_satisfaction, y='Satisfaction', ax=ax2,
              order=responses,
              palette='RdYlGn_r')
ax2.set_title('Customer Satisfaction Distribution', fontsize=14, fontweight='bold')
ax2.set_xlabel('Count')
ax2.set_ylabel('Satisfaction Level')

plt.tight_layout()
plt.show()

In [None]:
# Statistical analysis and insights: prints text summaries of the data plotted
# in the earlier cells (regions/sales, products/ratings, and the department
# performance frame `df`), followed by a one-way ANOVA across departments.
print("Bar Chart Analysis Summary:")
print("=" * 40)

# Regional sales analysis
sales_df = pd.DataFrame({'Region': regions, 'Sales': sales})
sales_col = sales_df['Sales']
best_region = sales_df.loc[sales_col.idxmax()]
worst_region = sales_df.loc[sales_col.idxmin()]
print("Sales by Region:")
print(f"  Total Sales: ${sales_col.sum():,}")
print(f"  Average Sales: ${sales_col.mean():,.0f}")
print(f"  Best Region: {best_region['Region']} (${best_region['Sales']:,})")
print(f"  Worst Region: {worst_region['Region']} (${worst_region['Sales']:,})")
print(f"  Sales Range: ${sales_col.max() - sales_col.min():,}")
# ddof=0 (population standard deviation) matches NumPy's np.std default;
# pandas would otherwise default to the sample standard deviation (ddof=1)
print(f"  Coefficient of Variation: {sales_col.std(ddof=0) / sales_col.mean():.3f}")

# Product ratings analysis
ratings_df = pd.DataFrame({'Product': products, 'Rating': ratings})
rating_col = ratings_df['Rating']
best_product = ratings_df.loc[rating_col.idxmax()]
worst_product = ratings_df.loc[rating_col.idxmin()]
print("\nProduct Ratings:")
print(f"  Average Rating: {rating_col.mean():.2f}/5.0")
print(f"  Best Product: {best_product['Product']} ({best_product['Rating']:.1f}/5.0)")
print(f"  Worst Product: {worst_product['Product']} ({worst_product['Rating']:.1f}/5.0)")
print(f"  Rating Spread: {rating_col.max() - rating_col.min():.1f} points")

# Performance score analysis: group once, then reuse the groups for both the
# ANOVA input and the per-department summary. sort=False keeps first-seen order.
scores_by_dept = df.groupby('Department', sort=False)['Performance_Score']
dept_groups = [scores_by_dept.get_group(dept).values for dept in departments]

# Statistical significance test: one-way ANOVA across the department score groups
f_stat, p_value = f_oneway(*dept_groups)

print("\nDepartment Performance Analysis:")
for dept in departments:
    dept_scores = scores_by_dept.get_group(dept)
    print(f"  {dept}: Mean = {dept_scores.mean():.1f}, Std = {dept_scores.std():.1f}")

print("\nANOVA Test Results:")
print(f"  F-statistic: {f_stat:.4f}")
# Three significant digits: fixed-point formatting would show a very small
# p-value as 0.000000, which reads as "exactly zero"
print(f"  p-value: {p_value:.3g}")
significance = "significant" if p_value < 0.05 else "not significant"
print(f"  Result: Differences between departments are {significance}")