# Bubble Chart - Multi-dimensional Scatter Plot

**Use Case**: Display relationships between three or more variables (market analysis, performance metrics, risk assessment)

This notebook demonstrates how to create effective bubble charts for visualizing multi-dimensional relationships where bubble size represents a third dimension.


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from matplotlib.patches import Circle
import warnings
warnings.filterwarnings('ignore')

# Set style for better-looking plots.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# fall back to the legacy name so the notebook also runs on older releases.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
sns.set_palette("Set2")

# Set random seed for reproducibility
np.random.seed(42)

print("Bubble chart visualization libraries loaded!")


In [None]:
# Create sample datasets for bubble charts
# 1. Company Performance Analysis
companies = [
    'TechCorp', 'DataSoft', 'CloudInc', 'AIVentures', 'CyberSys',
    'InfoTech', 'DigitalPro', 'NetSolutions', 'CodeCraft', 'ByteWorks',
    'SoftEngine', 'DataFlow', 'TechPioneer', 'InnovateLab', 'FutureTech',
]

# Re-seed here so this dataset is reproducible regardless of what ran before
np.random.seed(42)

company_data = []
for company_name in companies:
    # NOTE: the sequence of random draws below determines the generated
    # values for the fixed seed, so the statements must stay in this order.
    revenue = np.random.uniform(10, 500)                          # revenue in millions
    growth_rate = np.random.uniform(-5, 25) + revenue / 500 * 10  # growth nudged up with size
    profit_margin = np.random.uniform(5, 30) - revenue / 1000 * 5  # margin nudged down with size
    employees = revenue * np.random.uniform(8, 25)                # head-count scales with revenue

    # Sector adjustments are keyed on substrings of the company name
    if any(tag in company_name for tag in ('AI', 'Data')):
        growth_rate += np.random.uniform(2, 8)
        profit_margin += np.random.uniform(-3, 5)
    elif any(tag in company_name for tag in ('Cloud', 'Cyber')):
        profit_margin += np.random.uniform(2, 8)

    market_cap = revenue * np.random.uniform(3, 12)        # multiple of revenue
    rd_spending = revenue * np.random.uniform(0.05, 0.25)  # 5-25% of revenue

    company_data.append({
        'Company': company_name,
        'Revenue_M': revenue,
        'Growth_Rate': growth_rate,
        'Profit_Margin': profit_margin,
        'Employees': int(employees),
        'Market_Cap_B': market_cap,
        'R_D_Spending': rd_spending,
    })

company_df = pd.DataFrame(company_data)

# 2. Country Development Indicators
countries = ['USA', 'China', 'Japan', 'Germany', 'UK', 'France', 'India', 'Italy',
             'Brazil', 'Canada', 'Russia', 'South Korea', 'Spain', 'Australia', 'Mexico',
             'Indonesia', 'Netherlands', 'Saudi Arabia', 'Turkey', 'Taiwan']

# Cut-offs on the simulated GDP figure that select the population range a
# country draws from. The generator below tops out at 25000 * 1.2 = 30000,
# so both cut-offs must sit below that value for every branch to be reachable.
HIGH_GDP_THRESHOLD = 20000
MID_GDP_THRESHOLD = 10000

country_data = []
for country in countries:
    # Base economic indicators, chosen by development tier
    if country in ['USA', 'China', 'Japan', 'Germany']:
        gdp_base = np.random.uniform(3000, 25000)
        hdi_base = np.random.uniform(0.85, 0.95)
    elif country in ['UK', 'France', 'Italy', 'Canada', 'South Korea', 'Australia']:
        gdp_base = np.random.uniform(1500, 4000)
        hdi_base = np.random.uniform(0.8, 0.9)
    elif country in ['Brazil', 'Russia', 'Mexico', 'Turkey']:
        gdp_base = np.random.uniform(500, 2000)
        hdi_base = np.random.uniform(0.7, 0.85)
    else:
        gdp_base = np.random.uniform(200, 1500)
        hdi_base = np.random.uniform(0.6, 0.8)

    # Add some noise and correlations
    gdp_per_capita = gdp_base * np.random.uniform(0.8, 1.2)
    hdi = hdi_base * np.random.uniform(0.95, 1.05)
    co2_emissions = gdp_per_capita * np.random.uniform(0.3, 2.0) / 1000  # Rough correlation

    # Population (inverse relationship with GDP per capita): the higher the
    # GDP figure, the smaller the range the population is drawn from.
    if gdp_per_capita > HIGH_GDP_THRESHOLD:
        population = np.random.uniform(5, 100)
    elif gdp_per_capita > MID_GDP_THRESHOLD:
        population = np.random.uniform(20, 300)
    else:
        population = np.random.uniform(50, 1400)

    country_data.append({
        'Country': country,
        'GDP_per_Capita': gdp_per_capita,
        'HDI': hdi,
        'CO2_Emissions_per_Capita': co2_emissions,
        'Population_M': population,
        'Life_Expectancy': 60 + hdi * 25 + np.random.uniform(-3, 3),
        'Education_Years': hdi * 15 + np.random.uniform(-2, 2)
    })

country_df = pd.DataFrame(country_data)

# 3. Product Portfolio Analysis
products = ['Product ' + chr(ord('A') + offset) for offset in range(15)]  # Products A through O

# Product archetypes by list position:
# (first index NOT in the tier, price range, quality range, volume range)
product_tiers = (
    (3, (800, 2000), (8, 10), (100, 500)),                # premium
    (8, (200, 800), (6, 8.5), (500, 2000)),               # mid-market
    (len(products), (50, 300), (4, 7), (1000, 5000)),     # budget
)

product_data = []
for position, product_name in enumerate(products):
    price_range, quality_range, volume_range = next(
        tier[1:] for tier in product_tiers if position < tier[0]
    )

    # NOTE: the order of the random draws fixes the values produced for a
    # given seed: price, quality, volume, satisfaction factor, profit margin.
    price = np.random.uniform(*price_range)
    quality = np.random.uniform(*quality_range)
    volume = np.random.uniform(*volume_range)
    satisfaction = quality * np.random.uniform(0.8, 1.1)
    margin = np.random.uniform(10, 40)

    product_data.append({
        'Product': product_name,
        'Price': price,
        'Quality_Score': quality,
        'Sales_Volume': int(volume),
        'Revenue': price * volume,
        'Customer_Satisfaction': satisfaction,
        'Market_Share': volume / 50000 * 100,  # rough share of a 50,000-unit market
        'Profit_Margin': margin,
    })

product_df = pd.DataFrame(product_data)

# 4. Investment Portfolio
investments = [
    'Tech Stocks', 'Healthcare', 'Energy', 'Real Estate', 'Bonds',
    'Commodities', 'Crypto', 'Emerging Markets', 'Blue Chips', 'Growth Stocks',
    'Value Stocks', 'REITs', 'Foreign Exchange', 'Derivatives', 'Cash',
]

# Asset classes simulated with a non-default risk/return profile
speculative_assets = ('Crypto', 'Derivatives', 'Emerging Markets')
defensive_assets = ('Bonds', 'Cash', 'REITs')
# Asset classes simulated with a non-default liquidity range
most_liquid_assets = ('Cash', 'Blue Chips')
least_liquid_assets = ('Real Estate', 'Derivatives')

investment_data = []
for asset in investments:
    # (risk range, return-per-unit-of-risk range, allocation range)
    if asset in speculative_assets:
        risk_range, payoff_range, weight_range = (15, 35), (0.8, 1.5), (2, 8)
    elif asset in defensive_assets:
        risk_range, payoff_range, weight_range = (2, 10), (0.5, 1.2), (8, 20)
    else:
        risk_range, payoff_range, weight_range = (8, 20), (0.7, 1.3), (5, 15)

    if asset in most_liquid_assets:
        liquidity_range = (8, 10)
    elif asset in least_liquid_assets:
        liquidity_range = (2, 5)
    else:
        liquidity_range = (4, 8)

    # NOTE: the order of the random draws fixes the values produced for a
    # given seed: risk, return factor, allocation, liquidity, market value
    # factor, expense ratio.
    risk = np.random.uniform(*risk_range)
    expected_return = risk * np.random.uniform(*payoff_range)
    allocation = np.random.uniform(*weight_range)
    liquidity = np.random.uniform(*liquidity_range)
    market_value = allocation * np.random.uniform(0.8, 2.0)
    expense_ratio = np.random.uniform(0.1, 2.5)

    investment_data.append({
        'Investment': asset,
        'Expected_Return': expected_return,
        'Risk_Level': risk,
        'Portfolio_Allocation': allocation,
        'Liquidity_Score': liquidity,
        'Market_Value_B': market_value,
        'Expense_Ratio': expense_ratio,
    })

investment_df = pd.DataFrame(investment_data)

# Report how many rows each of the four synthetic datasets contains
dataset_summaries = (
    ("Company Performance", company_df, "companies with financial metrics"),
    ("Country Development", country_df, "countries with development indicators"),
    ("Product Portfolio", product_df, "products with market metrics"),
    ("Investment Analysis", investment_df, "investment types with risk-return profiles"),
)

print("Sample datasets for bubble charts created:")
for summary_title, summary_frame, summary_text in dataset_summaries:
    print(f"{summary_title}: {len(summary_frame)} {summary_text}")


In [None]:
# Create basic bubble charts: a 2x2 grid with one panel per sample dataset
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 16))
fig.suptitle('Bubble Chart Visualizations - Multi-dimensional Analysis',
             fontsize=16, fontweight='bold')

# 1. Company Performance Bubble Chart
# Encoding: x = revenue, y = growth rate, bubble size = employees, colour = profit margin
ax1 = axes[0, 0]
company_bubble_sizes = company_df['Employees'] / 5  # scale head-counts down to marker sizes
scatter1 = ax1.scatter(
    company_df['Revenue_M'],
    company_df['Growth_Rate'],
    s=company_bubble_sizes,
    c=company_df['Profit_Margin'],
    cmap='RdYlGn',
    alpha=0.7,
    edgecolors='black',
    linewidth=0.5,
)

ax1.set(xlabel='Revenue ($ Millions)', ylabel='Growth Rate (%)')
ax1.set_title('Company Performance Analysis\n(Size=Employees, Color=Profit Margin)',
              fontsize=12, fontweight='bold')

# Colour scale for the profit-margin encoding
cbar1 = fig.colorbar(scatter1, ax=ax1)
cbar1.set_label('Profit Margin (%)')

# Name the three highest-revenue companies next to their bubbles
top_companies = company_df.nlargest(3, 'Revenue_M')
for company_name, revenue_m, growth_pct in zip(top_companies['Company'],
                                               top_companies['Revenue_M'],
                                               top_companies['Growth_Rate']):
    ax1.annotate(company_name, (revenue_m, growth_pct),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

# 2. Country Development Bubble Chart
ax2 = axes[0, 1]

# X: GDP per Capita, Y: HDI, Size: Population, Color: CO2 Emissions
scatter2 = ax2.scatter(country_df['GDP_per_Capita'], country_df['HDI'],
                      s=country_df['Population_M']*2,  # Scale bubble size
                      c=country_df['CO2_Emissions_per_Capita'],
                      cmap='Reds', alpha=0.7, edgecolors='black', linewidth=0.5)

ax2.set_xlabel('GDP per Capita ($)')
ax2.set_ylabel('Human Development Index')
ax2.set_title('Country Development Indicators\n(Size=Population, Color=CO2 Emissions)', 
              fontsize=12, fontweight='bold')

# Add colorbar
cbar2 = plt.colorbar(scatter2, ax=ax2)
cbar2.set_label('CO2 Emissions per Capita')

# Add labels for major countries
major_countries = ['USA', 'China', 'India', 'Germany', 'Japan']
for country in major_countries:
    if country in country_df['Country'].values:
        row = country_df[country_df['Country'] == country].iloc[0]
        ax2.annotate(country, (row['GDP_per_Capita'], row['HDI']),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)

# 3. Product Portfolio Bubble Chart
ax3 = axes[1, 0]

# X: Price, Y: Quality Score, Size: Sales Volume, Color: Customer Satisfaction
scatter3 = ax3.scatter(product_df['Price'], product_df['Quality_Score'],
                      s=product_df['Sales_Volume']/10,  # Scale bubble size
                      c=product_df['Customer_Satisfaction'],
                      cmap='viridis', alpha=0.7, edgecolors='black', linewidth=0.5)

ax3.set_xlabel('Price ($)')
ax3.set_ylabel('Quality Score')
ax3.set_title('Product Portfolio Analysis\n(Size=Sales Volume, Color=Customer Satisfaction)', 
              fontsize=12, fontweight='bold')

# Add colorbar
cbar3 = plt.colorbar(scatter3, ax=ax3)
cbar3.set_label('Customer Satisfaction')

# Add quadrant lines
ax3.axhline(y=product_df['Quality_Score'].median(), color='gray', linestyle='--', alpha=0.5)
ax3.axvline(x=product_df['Price'].median(), color='gray', linestyle='--', alpha=0.5)

# Add quadrant labels
ax3.text(product_df['Price'].max()*0.8, product_df['Quality_Score'].max()*0.95, 
         'Premium', fontsize=10, fontweight='bold')
ax3.text(product_df['Price'].min()*1.1, product_df['Quality_Score'].max()*0.95, 
         'High Value', fontsize=10, fontweight='bold')
ax3.text(product_df['Price'].max()*0.8, product_df['Quality_Score'].min()*1.1, 
         'Overpriced', fontsize=10, fontweight='bold')
ax3.text(product_df['Price'].min()*1.1, product_df['Quality_Score'].min()*1.1, 
         'Budget', fontsize=10, fontweight='bold')

# 4. Investment Risk-Return Bubble Chart
# Encoding: x = risk level, y = expected return, bubble size = allocation, colour = liquidity
ax4 = axes[1, 1]
risk_levels = investment_df['Risk_Level']

scatter4 = ax4.scatter(
    risk_levels,
    investment_df['Expected_Return'],
    s=investment_df['Portfolio_Allocation'] * 20,  # scale allocations up to marker sizes
    c=investment_df['Liquidity_Score'],
    cmap='Blues',
    alpha=0.7,
    edgecolors='black',
    linewidth=0.5,
)

ax4.set(xlabel='Risk Level', ylabel='Expected Return (%)')
ax4.set_title('Investment Risk-Return Analysis\n(Size=Portfolio Allocation, Color=Liquidity)',
              fontsize=12, fontweight='bold')

# Colour scale for the liquidity encoding
cbar4 = ax4.figure.colorbar(scatter4, ax=ax4)
cbar4.set_label('Liquidity Score')

# Illustrative reference curve, 3 * sqrt(risk), drawn across the observed risk range
risk_range = np.linspace(risk_levels.min(), risk_levels.max(), 100)
efficient_return = np.sqrt(risk_range) * 3
ax4.plot(risk_range, efficient_return, 'r--', alpha=0.7, linewidth=2, label='Efficient Frontier')
ax4.legend()

# Label the riskiest and the least risky investment
high_risk = investment_df.loc[risk_levels.idxmax()]
low_risk = investment_df.loc[risk_levels.idxmin()]
for extreme in (high_risk, low_risk):
    ax4.annotate(extreme['Investment'],
                 (extreme['Risk_Level'], extreme['Expected_Return']),
                 xytext=(5, 5), textcoords='offset points', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
# Advanced bubble chart techniques
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Advanced Bubble Chart Techniques', fontsize=16, fontweight='bold')

# 1. Animated/Time Series Bubble Chart (simulated)
ax1 = axes[0, 0]

# Each period is a scaled, noisy copy of the base company data drawn in its own colour
time_periods = ['2020', '2021', '2022', '2023']
colors = ['lightblue', 'orange', 'lightgreen', 'pink']

# Coordinates actually plotted for each period, kept so the trend arrows
# below can join bubbles that are really on the chart.
period_positions = []

for i, period in enumerate(time_periods):
    # Simulate growth over time
    growth_factor = 1 + i * 0.15
    size_factor = 1 + i * 0.1

    x_data = company_df['Revenue_M'] * growth_factor + np.random.normal(0, 10, len(company_df))
    y_data = company_df['Growth_Rate'] - i * 2 + np.random.normal(0, 2, len(company_df))  # Growth slowing
    sizes = company_df['Employees'] * size_factor / 8
    period_positions.append((x_data, y_data))

    ax1.scatter(x_data, y_data, s=sizes, alpha=0.6,
                color=colors[i], label=period, edgecolors='black', linewidth=0.5)

ax1.set_xlabel('Revenue ($ Millions)')
ax1.set_ylabel('Growth Rate (%)')
ax1.set_title('Company Evolution Over Time\n(Different colors represent different years)',
              fontsize=12, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Trend arrows for the first few companies: each arrow runs from the company's
# bubble in the first period to its bubble in the last period.
first_x, first_y = period_positions[0]
last_x, last_y = period_positions[-1]
for i in range(min(3, len(company_df))):
    ax1.annotate('', xy=(last_x.iloc[i], last_y.iloc[i]),
                 xytext=(first_x.iloc[i], first_y.iloc[i]),
                 arrowprops=dict(arrowstyle='->', color='red', lw=2, alpha=0.7))

# 2. Categorical Bubble Chart with Grouping
ax2 = axes[0, 1]

# Group countries by region
country_regions = {
    'North America': ['USA', 'Canada', 'Mexico'],
    'Europe': ['Germany', 'UK', 'France', 'Italy', 'Spain', 'Netherlands'],
    'Asia': ['China', 'Japan', 'South Korea', 'India', 'Indonesia', 'Taiwan'],
    'Others': ['Brazil', 'Russia', 'Australia', 'Saudi Arabia', 'Turkey']
}

region_colors = {'North America': 'red', 'Europe': 'blue', 'Asia': 'green', 'Others': 'orange'}

# One scatter call per region so each region gets its own colour and legend entry.
# The loop variable is deliberately not named `countries`: that name holds the
# full country list used to build country_df and must not be overwritten here.
for region, member_countries in country_regions.items():
    region_data = country_df[country_df['Country'].isin(member_countries)]
    if region_data.empty:
        continue  # nothing to draw for a region with no matching rows
    ax2.scatter(region_data['GDP_per_Capita'], region_data['Life_Expectancy'],
                s=region_data['Population_M'] * 3, alpha=0.7,
                color=region_colors[region], label=region,
                edgecolors='black', linewidth=0.5)

ax2.set_xlabel('GDP per Capita ($)')
ax2.set_ylabel('Life Expectancy (years)')
ax2.set_title('Life Expectancy vs GDP by Region\n(Size=Population)',
              fontsize=12, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Multi-bubble Chart with Confidence Intervals
ax3 = axes[1, 0]

# Attach illustrative uncertainty columns to the product data:
# 10% of the price and 15% of the quality score
product_df['Price_Error'] = product_df['Price'].mul(0.1)
product_df['Quality_Error'] = product_df['Quality_Score'].mul(0.15)

price_values = product_df['Price']
quality_values = product_df['Quality_Score']

# Bubbles: size follows sales volume, colour follows revenue
main_scatter = ax3.scatter(
    price_values,
    quality_values,
    s=product_df['Sales_Volume'] / 8,
    c=product_df['Revenue'],
    cmap='plasma',
    alpha=0.7,
    edgecolors='black',
    linewidth=0.5,
)

# Error bars only (fmt='none' suppresses markers and connecting lines)
ax3.errorbar(
    price_values,
    quality_values,
    xerr=product_df['Price_Error'],
    yerr=product_df['Quality_Error'],
    fmt='none',
    ecolor='gray',
    alpha=0.5,
    capsize=3,
)

ax3.set(xlabel='Price ($)', ylabel='Quality Score')
ax3.set_title('Product Analysis with Uncertainty\n(Error bars show confidence intervals)',
              fontsize=12, fontweight='bold')

# Colour scale for the revenue encoding
cbar = ax3.figure.colorbar(main_scatter, ax=ax3)
cbar.set_label('Revenue ($)')

# 4. Bubble Chart with Similarity Links
from itertools import combinations

ax4 = axes[1, 1]

# Two investments are linked when BOTH their risk level and their expected
# return differ by less than this many points.
similarity_threshold = 5

# Plot main bubbles
scatter_main = ax4.scatter(investment_df['Risk_Level'], investment_df['Expected_Return'],
                           s=investment_df['Portfolio_Allocation'] * 25,
                           c=investment_df['Liquidity_Score'], cmap='coolwarm',
                           alpha=0.8, edgecolors='black', linewidth=1)

# Draw a faint line between every pair of similar investments. The
# coordinates are pulled out once as arrays so the pair loop does not have
# to rebuild a row object for every comparison.
risk_values = investment_df['Risk_Level'].to_numpy()
return_values = investment_df['Expected_Return'].to_numpy()
for first, second in combinations(range(len(investment_df)), 2):
    risk_diff = abs(risk_values[first] - risk_values[second])
    return_diff = abs(return_values[first] - return_values[second])
    if risk_diff < similarity_threshold and return_diff < similarity_threshold:
        ax4.plot([risk_values[first], risk_values[second]],
                 [return_values[first], return_values[second]],
                 color='gray', alpha=0.3, linewidth=1)

ax4.set_xlabel('Risk Level')
ax4.set_ylabel('Expected Return (%)')
ax4.set_title('Investment Clusters\n(Lines connect similar investments)',
              fontsize=12, fontweight='bold')

# Add colorbar
cbar4 = plt.colorbar(scatter_main, ax=ax4)
cbar4.set_label('Liquidity Score')

# Label every bubble with the first 8 characters of the investment name
for investment_name, risk, expected in zip(investment_df['Investment'],
                                           risk_values, return_values):
    ax4.annotate(investment_name[:8], (risk, expected),
                 xytext=(2, 2), textcoords='offset points',
                 fontsize=7, alpha=0.8)

plt.tight_layout()
plt.show()


In [None]:
# Statistical analysis of bubble chart data
from itertools import combinations

from scipy.stats import pearsonr, spearmanr

print("Bubble Chart Statistical Analysis:")
print("=" * 50)

# 1. Company Performance Analysis
print("1. COMPANY PERFORMANCE METRICS:")

# Pairwise Pearson correlation for every unordered pair of metrics
# (the upper triangle of the correlation matrix), with significance stars.
metrics = ['Revenue_M', 'Growth_Rate', 'Profit_Margin', 'Employees', 'Market_Cap_B']
significance_levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))

print("   Correlation Matrix (Pearson r):")
for metric1, metric2 in combinations(metrics, 2):
    corr, p_value = pearsonr(company_df[metric1], company_df[metric2])
    # First (strictest) level the p-value passes; no stars if it passes none
    significance = next((stars for cutoff, stars in significance_levels if p_value < cutoff), "")
    print(f"     {metric1} ↔ {metric2}: {corr:.3f}{significance}")

# Company clustering analysis
print("\n   Company Performance Clusters:")
# Simple 2x2 split: each company is above or below the median revenue and
# above or below the median growth rate.
high_revenue = company_df['Revenue_M'] > company_df['Revenue_M'].median()
high_growth = company_df['Growth_Rate'] > company_df['Growth_Rate'].median()

clusters = {
    'Stars (High Revenue, High Growth)': company_df[high_revenue & high_growth],
    'Cash Cows (High Revenue, Low Growth)': company_df[high_revenue & ~high_growth],
    'Question Marks (Low Revenue, High Growth)': company_df[~high_revenue & high_growth],
    'Dogs (Low Revenue, Low Growth)': company_df[~high_revenue & ~high_growth]
}

for cluster_name, cluster_df in clusters.items():
    if cluster_df.empty:
        continue  # idxmax() below would fail on an empty cluster
    avg_profit = cluster_df['Profit_Margin'].mean()
    print(f"     {cluster_name}: {len(cluster_df)} companies, avg profit margin: {avg_profit:.1f}%")
    top_company = cluster_df.loc[cluster_df['Revenue_M'].idxmax()]
    print(f"       Top performer: {top_company['Company']} (${top_company['Revenue_M']:.0f}M revenue)")

# 2. Country Development Analysis
print("\n2. COUNTRY DEVELOPMENT PATTERNS:")

gdp_series = country_df['GDP_per_Capita']
hdi_series = country_df['HDI']
co2_series = country_df['CO2_Emissions_per_Capita']

# Linear association between the GDP figure and HDI
gdp_hdi_corr, gdp_hdi_p = pearsonr(gdp_series, hdi_series)
print(f"   GDP per Capita ↔ HDI correlation: {gdp_hdi_corr:.3f} (p={gdp_hdi_p:.4f})")

# Linear association between the GDP figure and CO2 emissions
gdp_co2_corr, gdp_co2_p = pearsonr(gdp_series, co2_series)
print(f"   GDP per Capita ↔ CO2 Emissions correlation: {gdp_co2_corr:.3f} (p={gdp_co2_p:.4f})")

# Efficiency: HDI achieved per unit of CO2 emitted, stored as a new column
country_df['HDI_CO2_Ratio'] = hdi_series.div(co2_series)
most_efficient = country_df.nlargest(3, 'HDI_CO2_Ratio')
print("   Most Efficient Countries (HDI per unit CO2):")
for efficient_name, efficiency in zip(most_efficient['Country'], most_efficient['HDI_CO2_Ratio']):
    print(f"     {efficient_name}: {efficiency:.2f} HDI/CO2")

# Rank-based association between population and HDI
pop_hdi_corr, pop_hdi_p = spearmanr(country_df['Population_M'], hdi_series)
print(f"   Population ↔ HDI correlation (Spearman): {pop_hdi_corr:.3f} (p={pop_hdi_p:.4f})")

# 3. Product Portfolio Analysis
print("\n3. PRODUCT PORTFOLIO INSIGHTS:")

# Price-Quality relationship
price_quality_corr, pq_p = pearsonr(product_df['Price'], product_df['Quality_Score'])
print(f"   Price ↔ Quality correlation: {price_quality_corr:.3f} (p={pq_p:.4f})")

# Volume-Price relationship (demand curve)
price_volume_corr, pv_p = pearsonr(product_df['Price'], product_df['Sales_Volume'])
print(f"   Price ↔ Sales Volume correlation: {price_volume_corr:.3f} (p={pv_p:.4f})")

# Product segmentation: three equal-width price bands. Because the bands are
# equal-width rather than equal-count, a band can contain no products at all.
product_df['Price_Segment'] = pd.cut(product_df['Price'], bins=3, labels=['Budget', 'Mid-Market', 'Premium'])

# observed=False is spelled out so that every band is reported, including an
# empty one, independent of the pandas default for categorical groupers.
segment_groups = product_df.groupby('Price_Segment', observed=False)
segment_sizes = segment_groups.size()
segment_analysis = segment_groups.agg({
    'Quality_Score': 'mean',
    'Sales_Volume': 'sum',
    'Customer_Satisfaction': 'mean',
    'Revenue': 'sum'
}).round(2)

print("   Product Segment Analysis:")
for segment in segment_analysis.index:
    print(f"     {segment}:")
    if segment_sizes.loc[segment] == 0:
        # The means of an empty band are NaN; say so instead of printing "nan"
        print("       (no products in this price band)")
        continue
    data = segment_analysis.loc[segment]
    print(f"       Avg Quality: {data['Quality_Score']:.2f}")
    print(f"       Total Volume: {data['Sales_Volume']:,.0f}")
    print(f"       Avg Satisfaction: {data['Customer_Satisfaction']:.2f}")
    print(f"       Total Revenue: ${data['Revenue']:,.0f}")

# 4. Investment Analysis
print("\n4. INVESTMENT RISK-RETURN ANALYSIS:")

risk_series = investment_df['Risk_Level']

# How strongly expected return tracks risk
risk_return_corr, rr_p = pearsonr(risk_series, investment_df['Expected_Return'])
print(f"   Risk ↔ Expected Return correlation: {risk_return_corr:.3f} (p={rr_p:.4f})")

# Simplified Sharpe ratio: return in excess of a 2% risk-free rate, per unit of risk
risk_free_rate = 2
excess_return = investment_df['Expected_Return'].sub(risk_free_rate)
investment_df['Sharpe_Ratio'] = excess_return.div(risk_series)

best_sharpe = investment_df.nlargest(3, 'Sharpe_Ratio')
print("   Best Risk-Adjusted Returns (Sharpe Ratio):")
for sharpe_name, sharpe_value in zip(best_sharpe['Investment'], best_sharpe['Sharpe_Ratio']):
    print(f"     {sharpe_name}: {sharpe_value:.3f}")

# Diversification: overall allocation and the positions above 10%
print("   Portfolio Diversification:")
total_allocation = investment_df['Portfolio_Allocation'].sum()
print(f"     Total allocation: {total_allocation:.1f}%")

high_allocation = investment_df[investment_df['Portfolio_Allocation'] > 10]
if len(high_allocation) > 0:
    print("     High-concentration investments (>10%):")
    for heavy_name, heavy_weight in zip(high_allocation['Investment'],
                                        high_allocation['Portfolio_Allocation']):
        print(f"       {heavy_name}: {heavy_weight:.1f}%")

# Risk-liquidity trade-off
risk_liquidity_corr, rl_p = pearsonr(risk_series, investment_df['Liquidity_Score'])
print(f"   Risk ↔ Liquidity correlation: {risk_liquidity_corr:.3f} (p={rl_p:.4f})")

# 5. Bubble Chart Design Effectiveness
print("\n5. BUBBLE CHART DESIGN ANALYSIS:")

datasets = {
    'Companies': company_df,
    'Countries': country_df,
    'Products': product_df,
    'Investments': investment_df
}

# Column that drives the bubble size for each dataset; any dataset not
# listed here falls back to 'Portfolio_Allocation'.
size_columns = {
    'Companies': 'Employees',
    'Countries': 'Population_M',
    'Products': 'Sales_Volume',
}

print("   Dataset Characteristics for Bubble Charts:")
for name, df in datasets.items():
    n_points = len(df)

    # Spread between the largest and smallest bubble-size value
    size_col = size_columns.get(name, 'Portfolio_Allocation')
    size_values = df[size_col]
    size_ratio = size_values.max() / size_values.min()

    # Rate the dataset by point count and by how extreme the size spread is
    if n_points <= 20 and size_ratio <= 100:
        suitability = "Excellent"
    elif n_points <= 50 and size_ratio <= 1000:
        suitability = "Good"
    else:
        suitability = "Challenging"

    print(f"     {name}: {n_points} points, size ratio {size_ratio:.1f}:1 ({suitability})")

# Closing guidance, printed as two bulleted lists
best_practices = (
    "Use 3-4 dimensions maximum (x, y, size, color)",
    "Ensure bubble sizes are visually distinguishable",
    "Use color to represent categorical or continuous variables",
    "Include legends for size and color scales",
    "Avoid overlapping bubbles when possible",
    "Use transparency for overlapping data points",
    "Label important or outlier points",
    "Consider log scales for highly skewed data",
)
use_cases = (
    "Multi-dimensional data exploration",
    "Correlation analysis with additional variables",
    "Portfolio/performance analysis",
    "Market research and competitive analysis",
    "Risk assessment visualization",
    "Scientific data with multiple measurements",
)

print("\nBubble Chart Best Practices:")
for practice in best_practices:
    print(f"✓ {practice}")

print("\nWhen to Use Bubble Charts:")
for use_case in use_cases:
    print(f"• {use_case}")
