# Pair Plot - Multivariate Data Exploration

**Use Case**: Explore relationships in multivariate data (correlation analysis, feature exploration)

This notebook demonstrates how to create effective pair plots for exploring relationships between multiple variables simultaneously.


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, load_wine, make_classification
from scipy import stats
import warnings
# Keep the rendered notebook free of library warning noise.
# NOTE(review): this silences every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Set style for better-looking plots.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*";
# older releases only know the un-prefixed name, so use whichever exists and
# fall back to the default style instead of failing on a cosmetic setting.
_style_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_style_name = next((name for name in _style_candidates if name in plt.style.available), None)
if _style_name is not None:
    plt.style.use(_style_name)
sns.set_palette("husl")

# Set random seed for reproducibility
np.random.seed(42)


In [None]:
# Load and prepare sample datasets
# Each scikit-learn bunch becomes a DataFrame of its feature columns plus one
# extra column holding the class name (looked up from the integer target).

# 1. Classic iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    species=np.take(iris.target_names, iris.target)
)

# 2. Wine dataset
wine = load_wine()
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names).assign(
    wine_class=np.take(wine.target_names, wine.target)
)

# 3. Custom dataset - Customer analytics
# Re-seed so this synthetic dataset is the same regardless of what ran before.
np.random.seed(42)
n_customers = 300

# Generate correlated customer features.
# The order of the random draws (age, income, spending, satisfaction) fixes
# the generated values, so it must not be rearranged.
age = np.random.normal(40, 15, n_customers)
age = np.clip(age, 18, 80)

# Income rises with age, plus noise; floored at 20000
income = 30000 + age * 800 + np.random.normal(0, 10000, n_customers)
income = np.maximum(income, 20000)

# Spending is about 30% of income, plus noise; floored at 1000
spending = income * 0.3 + np.random.normal(0, 5000, n_customers)
spending = np.maximum(spending, 1000)

# Satisfaction grows with the spending/income ratio; kept on a 1-5 scale
satisfaction = 3.5 + (spending / income) * 2 + np.random.normal(0, 0.5, n_customers)
satisfaction = np.clip(satisfaction, 1, 5)

# Add customer segments.
# Vectorised rule table: conditions are tested in order and the first match
# wins, so 'Premium' takes precedence over 'Standard'; anything else is 'Budget'.
segments = np.select(
    [(income > 70000) & (satisfaction > 4), income > 50000],
    ['Premium', 'Standard'],
    default='Budget',
).tolist()

customer_df = pd.DataFrame({
    'age': age,
    'income': income,
    'spending': spending,
    'satisfaction': satisfaction,
    'segment': segments
})

# Report the size of every dataset built above (the wine frame included).
# In each frame the last column is the class/segment label, so it is left
# out of the feature count.
print("Datasets prepared:")
for dataset_label, frame in (("Iris", iris_df), ("Wine", wine_df), ("Customer", customer_df)):
    n_samples, n_columns = frame.shape
    print(f"{dataset_label} dataset: {n_samples} samples, {n_columns - 1} features")


In [None]:
# Basic pair plots
# sns.pairplot is a figure-level function: it creates its own figure and grid
# of axes and returns the grid object. Creating a figure or subplot beforehand
# would only leave an extra, empty figure in the output, so titles are set on
# the figure owned by the returned grid instead.

# 1. Simple pair plot without categories
# Keep only the four numeric measurement columns (leaves out the 'species' label)
iris_subset = iris_df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']]
g = sns.pairplot(iris_subset, diag_kind='hist', plot_kws={'alpha': 0.6})
# y slightly above 1 places the title just over the top row of panels
g.fig.suptitle('Iris Dataset - Basic Pair Plot', y=1.02, fontsize=14, fontweight='bold')
plt.show()

# 2. Pair plot with categorical coloring
g = sns.pairplot(iris_df, hue='species', diag_kind='kde',
                 plot_kws={'alpha': 0.7}, diag_kws={'alpha': 0.7})
g.fig.suptitle('Iris Dataset - Colored by Species', y=1.02, fontsize=14, fontweight='bold')
plt.show()


In [None]:
# Advanced pair plot techniques
# 1. Customer analytics pair plot
# All grid options are collected in one place: points coloured by segment,
# KDE curves on the diagonal, semi-transparent markers of size 50.
customer_pairplot_options = dict(
    hue='segment',
    diag_kind='kde',
    plot_kws=dict(alpha=0.6, s=50),
    diag_kws=dict(alpha=0.7),
)
g = sns.pairplot(customer_df, **customer_pairplot_options)

# Title sits just above the top row of panels
g.fig.suptitle(
    'Customer Analytics - Pair Plot by Segment',
    y=1.02,
    fontsize=14,
    fontweight='bold',
)

# Add correlation coefficients to the plots
def corrfunc(x, y, **kws):
    """Annotate the current axes with the Pearson r and p-value of x vs. y.

    On a grid built with ``hue``, seaborn invokes a mapped function once per
    hue level on the same axes. Each call therefore writes its own line,
    placed below any annotation already present on the panel, rather than
    printing every result at the same position.

    Parameters
    ----------
    x, y : array-like
        The two variables for the current panel (or for one hue level of it).
    **kws
        Extra keywords forwarded by the grid. ``label`` and ``color`` are used
        when present to tag and colour the annotation; everything else is
        ignored.
    """
    if len(x) < 2:
        # pearsonr needs at least two observations; skip empty hue levels
        return

    r, p = stats.pearsonr(x, y)
    ax = plt.gca()

    label = kws.get("label")
    if label is None:
        text = f"r = {r:.2f}\np = {p:.3f}"
    else:
        # One compact line per hue level so several fit in one panel
        text = f"{label}: r = {r:.2f}, p = {p:.3f}"

    # Texts already on this panel determine the vertical slot for this one
    slot = len(ax.texts)
    ax.annotate(text,
                xy=(0.05, 0.95 - 0.13 * slot), xycoords=ax.transAxes,
                va="top", fontsize=8, color=kws.get("color", "black"),
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

g.map_lower(corrfunc)
plt.show()


In [None]:
# Custom pair plot with regression lines
# Hand-built grid for the customer data: per-segment histograms on the
# diagonal, per-segment scatter plots with one overall least-squares line and
# the overall Pearson r everywhere else.
variables = ['age', 'income', 'spending', 'satisfaction']
colors = {'Premium': 'red', 'Standard': 'blue', 'Budget': 'green'}
n_vars = len(variables)

# Size the grid from the variable list so every variable gets a row and a
# column (a fixed 3x3 grid would silently leave the fourth variable out).
fig, axes = plt.subplots(n_vars, n_vars, figsize=(4 * n_vars, 4 * n_vars), squeeze=False)
fig.suptitle('Customer Analytics - Detailed Pair Plot with Regression', fontsize=16, fontweight='bold')

# Split once; sort=False keeps segments in order of first appearance
segment_groups = list(customer_df.groupby('segment', sort=False))

for i, var1 in enumerate(variables):        # row    -> y variable
    for j, var2 in enumerate(variables):    # column -> x variable
        ax = axes[i, j]

        if i == j:  # Diagonal - show distribution
            for segment, segment_data in segment_groups:
                ax.hist(segment_data[var1], alpha=0.6, label=segment,
                        color=colors[segment], bins=20)
            ax.set_xlabel(var1)
            ax.set_ylabel('Frequency')
            if i == 0:
                # One legend is enough: colours are shared by every panel
                ax.legend()
            continue

        # Off-diagonal - show scatter with regression
        for segment, segment_data in segment_groups:
            ax.scatter(segment_data[var2], segment_data[var1],
                       alpha=0.6, color=colors[segment], label=segment, s=30)

        # Add regression line for all data
        x_data = customer_df[var2]
        y_data = customer_df[var1]
        slope, intercept = np.polyfit(x_data, y_data, 1)
        # Draw the line between the two x extremes only; plotting it through
        # the unsorted sample points would retrace the same segment repeatedly.
        x_line = np.array([x_data.min(), x_data.max()])
        ax.plot(x_line, slope * x_line + intercept, "k--", alpha=0.8, linewidth=1)

        # Calculate and display correlation
        r, _ = stats.pearsonr(x_data, y_data)
        ax.text(0.05, 0.95, f'r={r:.2f}', transform=ax.transAxes, va='top',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

        ax.set_xlabel(var2)
        ax.set_ylabel(var1)

# Reserve a strip at the top so the figure title does not overlap the first row
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


In [None]:
# Statistical analysis of pair relationships
# Banner first, then the pairwise correlations between all numeric customer
# columns (the text 'segment' column is excluded by the dtype filter).
print("Pair Plot Statistical Analysis:", "=" * 50, sep="\n")

# Correlation matrix for customer data
customer_numeric = customer_df.select_dtypes(include="number")
correlation_matrix = customer_numeric.corr()
print("Customer Data Correlation Matrix:", correlation_matrix.round(3), sep="\n")

# Find strongest correlations
print("\nStrongest Correlations:")

# (exclusive lower bound on |r|, label), checked from strongest to weakest.
# The 0.7 and 0.4 cut-offs match the interpretation guidelines printed at the
# end of this cell, so a value is never called "strong" here and "moderate" there.
STRENGTH_LEVELS = ((0.8, "very strong"), (0.7, "strong"), (0.4, "moderate"))


def describe_strength(corr_value):
    """Return the qualitative strength label for a correlation coefficient."""
    magnitude = abs(corr_value)
    for lower_bound, strength_label in STRENGTH_LEVELS:
        if magnitude > lower_bound:
            return strength_label
    return "weak"


# Walk the strict upper triangle so each unordered pair appears exactly once
# and the diagonal (a variable with itself) is skipped.
column_names = correlation_matrix.columns
row_idx, col_idx = np.triu_indices(len(column_names), k=1)
correlations = [
    (column_names[i], column_names[j], correlation_matrix.iloc[i, j])
    for i, j in zip(row_idx, col_idx)
]

# Sort by absolute correlation strength
correlations.sort(key=lambda pair: abs(pair[2]), reverse=True)
for var1, var2, corr in correlations[:5]:  # Top 5 correlations
    strength = describe_strength(corr)
    direction = "positive" if corr > 0 else "negative"
    print(f"  {var1} ↔ {var2}: {corr:.3f} ({strength} {direction})")

# Segment analysis
print("\nCustomer Segment Analysis:")
segment_columns = ['age', 'income', 'spending', 'satisfaction']

# Rounded mean/std table per segment (not used by the report below)
segment_stats = customer_df.groupby('segment')[segment_columns].agg(['mean', 'std']).round(2)

# One pass over the groups; sort=False keeps the segments in order of first
# appearance in the data rather than alphabetical order.
for segment, segment_data in customer_df.groupby('segment', sort=False):
    mean = segment_data[segment_columns].mean()
    std = segment_data[segment_columns].std()
    print(f"\n{segment} Segment (n={len(segment_data)}):")
    print(f"  Age: {mean['age']:.1f} ± {std['age']:.1f}")
    # Dollar amounts all use a thousands separator, the spread included
    print(f"  Income: ${mean['income']:,.0f} ± ${std['income']:,.0f}")
    print(f"  Spending: ${mean['spending']:,.0f} ± ${std['spending']:,.0f}")
    print(f"  Satisfaction: {mean['satisfaction']:.2f} ± {std['satisfaction']:.2f}")

# ANOVA tests to check if segments differ significantly
# Uses f_oneway through the scipy.stats module imported at the top of the
# notebook, so no extra import is needed in the middle of the analysis.
print("\nANOVA Tests (differences between segments):")

# (exclusive upper bound on p, marker), checked from most to least significant
SIGNIFICANCE_LEVELS = ((0.001, "***"), (0.01, "**"), (0.05, "*"))

for variable in ['age', 'income', 'spending', 'satisfaction']:
    # One sample per segment actually present in the data; no segment names
    # are hard-coded, so a segment with no customers is simply not tested.
    samples = [group[variable] for _, group in customer_df.groupby('segment')]

    f_stat, p_value = stats.f_oneway(*samples)
    significance = next(
        (marker for cutoff, marker in SIGNIFICANCE_LEVELS if p_value < cutoff),
        "ns",  # not significant at the 5% level
    )
    print(f"  {variable}: F={f_stat:.2f}, p={p_value:.6f} {significance}")

# Closing summary: what the plots above show and how to read correlation values
print("\nPair Plot Insights:")
pair_plot_insights = [
    "Pair plots reveal multivariate relationships simultaneously",
    "Diagonal shows individual variable distributions",
    "Off-diagonal shows pairwise relationships",
    "Color coding by categories reveals group differences",
    "Correlation coefficients quantify relationship strength",
    "Regression lines show linear trends",
]
for insight in pair_plot_insights:
    print(f"✓ {insight}")

print("\nInterpretation Guidelines:")
interpretation_guidelines = [
    "Strong correlations (|r| > 0.7) suggest potential redundancy",
    "Moderate correlations (0.4 < |r| < 0.7) indicate meaningful relationships",
    # A small r only rules out a strong *linear* link; the variables can still
    # be dependent, so the guideline no longer claims independence.
    "Weak correlations (|r| < 0.4) suggest little linear association (not necessarily independence)",
    "Non-linear patterns may not be captured by correlation coefficients",
    "Group separation in scatter plots indicates discriminative features",
]
for guideline in interpretation_guidelines:
    print(f"• {guideline}")
