# Scatter Plot - Relationship Between Variables

**Use Case**: Show the relationship between two continuous variables (e.g., height vs. weight, price vs. quality)

This notebook demonstrates how to create effective scatter plots for analyzing relationships between continuous variables.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set style for better-looking plots
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Fall back to the legacy name when the new one is not registered, so the
# notebook also runs on older Matplotlib releases instead of raising.
PLOT_STYLE = 'seaborn-v0_8-darkgrid'
if PLOT_STYLE not in plt.style.available:
    PLOT_STYLE = 'seaborn-darkgrid'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

# Set random seed for reproducibility
np.random.seed(42)

In [None]:
# Synthetic data: two noisy linear relationships with n_points observations each.
# The order of the np.random calls below determines the values drawn.
n_points = 200

# Ice cream sales grow with temperature: 20 + 2.5 * temperature + N(0, 10)
temperature = np.random.uniform(low=15, high=35, size=n_points)
sales_noise = np.random.normal(loc=0, scale=10, size=n_points)
sales = 20 + 2.5 * temperature + sales_noise

# Price grows with quality, with more scatter: 10 + 8 * quality + N(0, 15)
quality = np.random.uniform(low=1, high=10, size=n_points)
price_noise = np.random.normal(loc=0, scale=15, size=n_points)
# Floor at 5 so the noise can never produce a non-positive price
price = np.maximum(10 + 8 * quality + price_noise, 5)

# One 2x2 figure holds all four examples of this cell
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# Panel 1 (top-left): plain scatter, no extra encodings
panel_title_style = dict(fontsize=14, fontweight='bold')
ax1.scatter(temperature, sales, color='red', alpha=0.6)
ax1.set_title('Temperature vs Ice Cream Sales', **panel_title_style)
ax1.set_xlabel('Temperature (°C)')
ax1.set_ylabel('Sales ($)')
ax1.grid(True, alpha=0.3)

# Scatter with regression line
ax2.scatter(quality, price, alpha=0.6, color='green')
# Least-squares straight-line fit; z holds [slope, intercept] and p evaluates it
z = np.polyfit(quality, price, 1)
p = np.poly1d(z)
# Draw the fit between the two x-extremes only. `quality` is unsorted, so
# plotting p(quality) against it makes the path double back over itself and
# the overlapping segments smear the dash pattern of the "r--" line.
x_fit = np.array([quality.min(), quality.max()])
ax2.plot(x_fit, p(x_fit), "r--", alpha=0.8, linewidth=2)
ax2.set_title('Product Quality vs Price', fontsize=14, fontweight='bold')
ax2.set_xlabel('Quality Score')
ax2.set_ylabel('Price ($)')
ax2.grid(True, alpha=0.3)

# Panel 3 (bottom-left): a third variable shown through marker area and colour
# Store size is drawn at random and adds to sales on top of the temperature effect
store_size = np.random.uniform(50, 200, n_points)
sales_adjusted = sales + 0.5 * store_size

# The same array drives both the marker area (s) and the colour mapping (c)
scatter = ax3.scatter(
    temperature,
    sales_adjusted,
    s=store_size,
    c=store_size,
    cmap='viridis',
    alpha=0.6,
)
ax3.set_title(
    'Temperature vs Ice Cream Sales\n(Size & Color = Store Size)',
    fontsize=14,
    fontweight='bold',
)
ax3.set_xlabel('Temperature (°C)')
ax3.set_ylabel('Sales ($)')
ax3.grid(True, alpha=0.3)

# Colour bar attached to this panel, labelled with the encoded variable
cbar = plt.colorbar(scatter, ax=ax3)
cbar.set_label('Store Size', rotation=270, labelpad=15)

# Panel 4 (bottom-right): one scatter call per category, so each group gets
# its own colour and its own legend entry
categories = np.random.choice(['Category A', 'Category B', 'Category C'], n_points)
colors = {'Category A': 'red', 'Category B': 'blue', 'Category C': 'green'}

# The dict keeps insertion order, so groups are drawn A, B, C
for category, category_color in colors.items():
    mask = categories == category
    ax4.scatter(
        quality[mask],
        price[mask],
        color=category_color,
        alpha=0.6,
        label=category,
    )

ax4.set_title('Quality vs Price by Category', fontsize=14, fontweight='bold')
ax4.set_xlabel('Quality Score')
ax4.set_ylabel('Price ($)')
ax4.legend()
ax4.grid(True, alpha=0.3)

# Finalise the whole 2x2 figure
plt.tight_layout()
plt.show()

In [None]:
# Dataset for the Seaborn examples: height, weight, age and gender per person
# Re-seeding here makes this cell produce the same data on every run
np.random.seed(42)
n = 300

# Draw order (height, weight noise, age, gender) determines the generated values
height = np.random.normal(loc=170, scale=10, size=n)  # cm
weight_noise = np.random.normal(loc=0, scale=8, size=n)
weight = 0.8 * height - 80 + weight_noise  # kg, linear in height plus noise
age = np.random.randint(low=18, high=70, size=n)  # upper bound is exclusive
gender = np.random.choice(['Male', 'Female'], size=n)

# Assemble the columns into one tidy frame for the plotting calls
df = pd.DataFrame(dict(Height=height, Weight=weight, Age=age, Gender=gender))

# 2x2 grid: every panel shows Height vs Weight, each with a different encoding
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
(ax1, ax2), (ax3, ax4) = axes

# Arguments shared by every Seaborn call and every panel title
height_weight = dict(data=df, x='Height', y='Weight')
title_style = dict(fontsize=14, fontweight='bold')

# Top-left: raw points, then a regression overlay (regplot draws no points here)
sns.scatterplot(ax=ax1, **height_weight)
sns.regplot(ax=ax1, scatter=False, color='red', **height_weight)
ax1.set_title('Height vs Weight with Regression', **title_style)

# Top-right: colour by category
sns.scatterplot(ax=ax2, hue='Gender', **height_weight)
ax2.set_title('Height vs Weight by Gender', **title_style)

# Bottom-left: marker size by a numeric column
sns.scatterplot(ax=ax3, size='Age', alpha=0.7, **height_weight)
ax3.set_title('Height vs Weight (Size = Age)', **title_style)

# Bottom-right: both encodings at once
sns.scatterplot(ax=ax4, hue='Gender', size='Age', alpha=0.7, **height_weight)
ax4.set_title('Height vs Weight by Gender and Age', **title_style)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis and statistics
# Pearson r and its p-value are computed once per variable pair and reused by
# all three report sections below.
r_temp_sales, p_temp_sales = stats.pearsonr(temperature, sales)
r_qual_price, p_qual_price = stats.pearsonr(quality, price)
r_height_weight, p_height_weight = stats.pearsonr(df['Height'], df['Weight'])

correlation_results = [
    ('Temperature vs Sales', r_temp_sales, p_temp_sales),
    ('Quality vs Price', r_qual_price, p_qual_price),
    ('Height vs Weight', r_height_weight, p_height_weight),
]

# Loop variables are named r_value / p_value so they do not overwrite the
# regression polynomial `p` created in the plotting cell.
print("Correlation Analysis:")
for name, r_value, p_value in correlation_results:
    print(f"{name}: r = {r_value:.3f}")

# Statistical significance
# Scientific notation keeps very small p-values readable; a fixed six-decimal
# format would print them all as 0.000000.
print("\nStatistical Significance (p-values):")
for name, r_value, p_value in correlation_results:
    print(f"{name}: p = {p_value:.3e}")

# Interpretation
# Strength bands on |r|: > 0.7 strong, > 0.5 moderate, otherwise weak.
# Significance uses the conventional 0.05 cut-off on the p-value.
print("\nInterpretation:")
for name, r_value, p_value in correlation_results:
    if abs(r_value) > 0.7:
        strength = "strong"
    elif abs(r_value) > 0.5:
        strength = "moderate"
    else:
        strength = "weak"
    direction = "positive" if r_value > 0 else "negative"
    significance = "significant" if p_value < 0.05 else "not significant"
    print(f"{name}: {strength} {direction} correlation ({significance})")