# Parallel Coordinates Plot - Multivariate Data Analysis

**Use Case**: Visualize high-dimensional data, compare multiple variables simultaneously, identify patterns and clusters

This notebook demonstrates how to create effective parallel coordinates plots for analyzing relationships across multiple continuous variables.


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import load_iris, load_wine
from pandas.plotting import parallel_coordinates
import warnings
warnings.filterwarnings('ignore')

# Set style for better-looking plots.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only know the un-prefixed name, so fall back to it instead of
# failing on the very first cell.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = 'seaborn-whitegrid'
plt.style.use(_whitegrid_style)
sns.set_palette("Set2")

# Set random seed for reproducibility
np.random.seed(42)

print("Parallel coordinates plot visualization libraries loaded!")


In [None]:
# Create sample datasets for parallel coordinates
# 1. Student Performance Dataset
# Every student gets a latent "base ability"; each subject score is that
# ability times a weight plus Gaussian noise, clamped to the 0-100 range.
np.random.seed(42)

n_students = 200

# (column name, weight applied to base ability, noise standard deviation)
subject_specs = [
    ('Math_Score', 1.0, 8),
    ('Science_Score', 0.9, 10),
    ('English_Score', 0.7, 12),
    ('History_Score', 0.6, 15),
]

# Minimum average score for each label, checked from best to worst
level_cutoffs = [(85, 'Excellent'), (70, 'Good'), (55, 'Average')]

student_data = []

for student_idx in range(n_students):
    base_ability = np.random.normal(75, 15)  # Base academic ability

    # Draw the subject scores in a fixed order so the random stream is stable
    subject_scores = {}
    for column, weight, noise_sd in subject_specs:
        raw_score = base_ability * weight + np.random.normal(0, noise_sd)
        subject_scores[column] = max(0, min(100, raw_score))

    # Other metrics
    study_hours = max(0, np.random.normal(25, 8))  # Hours per week
    attendance = max(50, min(100, base_ability * 0.3 + np.random.normal(70, 10)))
    extracurricular = np.random.randint(0, 8)  # Number of activities

    # Performance level is the first cut-off the four-subject average reaches
    avg_score = sum(subject_scores.values()) / 4
    performance_level = next(
        (label for cutoff, label in level_cutoffs if avg_score >= cutoff),
        'Below Average',
    )

    record = {'Student_ID': f'S{student_idx + 1:03d}'}
    record.update({column: round(score, 1) for column, score in subject_scores.items()})
    record['Study_Hours_Weekly'] = round(study_hours, 1)
    record['Attendance_Percent'] = round(attendance, 1)
    record['Extracurricular_Count'] = extracurricular
    record['Performance_Level'] = performance_level
    student_data.append(record)

student_df = pd.DataFrame(student_data)

# 2. Employee Performance Dataset
# Skills are noisy multiples of a latent base performance that scales with
# seniority; salary adds a seniority-dependent base pay on top of it.
n_employees = 150

employee_data = []
departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Finance']
experience_levels = ['Junior', 'Mid', 'Senior', 'Lead']

# Lookup tables (constant for every employee, so built once outside the loop)
experience_multipliers = {'Junior': 0.7, 'Mid': 1.0, 'Senior': 1.3, 'Lead': 1.5}
department_bonuses = {'Engineering': 0.5, 'Sales': 0.3, 'Marketing': 0.2, 'HR': 0.1, 'Finance': 0.4}
base_salaries = {'Junior': 45000, 'Mid': 65000, 'Senior': 85000, 'Lead': 110000}

# (column name, weight applied to base performance, noise standard deviation)
soft_skill_specs = [
    ('Communication', 0.8, 1.2),
    ('Leadership', 0.6, 1.5),
    ('Problem_Solving', 0.9, 1),
    ('Productivity', 1.1, 0.8),
]

for employee_idx in range(n_employees):
    dept = np.random.choice(departments)
    exp_level = np.random.choice(experience_levels)

    # Base performance varies by experience
    base_performance = np.random.normal(7, 1.5) * experience_multipliers[exp_level]

    # Technical skill is the only one that receives the department bonus
    technical_raw = base_performance + department_bonuses[dept] + np.random.normal(0, 1)
    skill_scores = {'Technical_Skills': max(1, min(10, technical_raw))}
    for column, weight, noise_sd in soft_skill_specs:
        skill_raw = base_performance * weight + np.random.normal(0, noise_sd)
        skill_scores[column] = max(1, min(10, skill_raw))

    # Salary based on performance and experience
    salary = (base_performance * 8000 +
              base_salaries[exp_level] +
              np.random.normal(0, 5000))

    record = {
        'Employee_ID': f'E{employee_idx + 1:03d}',
        'Department': dept,
        'Experience_Level': exp_level,
    }
    record.update({column: round(score, 1) for column, score in skill_scores.items()})
    record['Salary'] = int(salary)
    employee_data.append(record)

employee_df = pd.DataFrame(employee_data)

# 3. Car Specifications Dataset
# Each body type has its own (mean, std) for horsepower, MPG and price; the
# fuel type then multiplies MPG and adds a price premium.
n_cars = 180

car_data = []
car_types = ['Sedan', 'SUV', 'Hatchback', 'Coupe', 'Truck']
fuel_types = ['Gasoline', 'Hybrid', 'Electric', 'Diesel']

# Base specification distributions per car type: (mean, standard deviation)
type_profiles = {
    'Sedan': {'hp': (200, 40), 'mpg': (28, 5), 'price': (35000, 8000)},
    'SUV': {'hp': (280, 50), 'mpg': (22, 4), 'price': (45000, 12000)},
    'Hatchback': {'hp': (150, 30), 'mpg': (32, 6), 'price': (25000, 5000)},
    'Coupe': {'hp': (320, 80), 'mpg': (20, 4), 'price': (50000, 15000)},
    'Truck': {'hp': (350, 60), 'mpg': (18, 3), 'price': (40000, 10000)},
}

# Fuel type adjustments: (MPG multiplier, price premium in USD)
fuel_adjustments = {
    'Gasoline': (1.0, 0),
    'Hybrid': (1.5, 3000),
    'Electric': (3.0, 8000),  # MPGe
    'Diesel': (1.2, 2000),
}

for car_idx in range(n_cars):
    car_type = np.random.choice(car_types)
    fuel_type = np.random.choice(fuel_types)

    # Draw order (hp, mpg, price) is kept fixed so the random stream is stable
    profile = type_profiles[car_type]
    base_hp = np.random.normal(*profile['hp'])
    base_mpg = np.random.normal(*profile['mpg'])
    base_price = np.random.normal(*profile['price'])

    mpg_bonus, price_bonus = fuel_adjustments[fuel_type]

    # Final specs: add per-car noise, then enforce plausible lower bounds
    horsepower = max(100, base_hp + np.random.normal(0, 20))
    mpg = max(10, base_mpg * mpg_bonus + np.random.normal(0, 3))
    price = max(15000, base_price + price_bonus + np.random.normal(0, 3000))

    # Other specs
    engine_size = max(1.0, horsepower / 100 + np.random.normal(0, 0.3))
    safety_rating = np.random.uniform(3.5, 5.0)

    car_data.append(dict(
        Car_ID=f'C{car_idx + 1:03d}',
        Car_Type=car_type,
        Fuel_Type=fuel_type,
        Horsepower=round(horsepower),
        MPG=round(mpg, 1),
        Engine_Size_L=round(engine_size, 1),
        Safety_Rating=round(safety_rating, 1),
        Price_USD=int(price),
    ))

car_df = pd.DataFrame(car_data)

# 4. Wine Quality Dataset (using sklearn wine dataset as base)
wine_sklearn = load_wine()
wine_features = pd.DataFrame(wine_sklearn.data, columns=wine_sklearn.feature_names)

# The integer target is kept as 'quality_class'; 'quality_name' is its label
quality_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
wine_features['quality_class'] = wine_sklearn.target
wine_features['quality_name'] = wine_features['quality_class'].map(quality_labels)

# Select key features for visualization and give them presentation names
# (dict order defines the column order of wine_df)
wine_column_names = {
    'alcohol': 'Alcohol',
    'total_phenols': 'Phenols',
    'flavanoids': 'Flavanoids',
    'color_intensity': 'Color_Intensity',
    'od280/od315_of_diluted_wines': 'OD_Ratio',
    'proline': 'Proline',
    'quality_name': 'Quality',
}
wine_df = wine_features[list(wine_column_names)].rename(columns=wine_column_names)

# 5. City Livability Index
# A latent per-city base score (centred on a regional baseline) drives every
# factor; cost of living moves against it, the other factors move with it.
n_cities = 120

city_data = []
regions = ['North America', 'Europe', 'Asia Pacific', 'Latin America', 'Middle East & Africa']

# Regional baseline differences
regional_baselines = {
    'North America': 75,
    'Europe': 80,
    'Asia Pacific': 70,
    'Latin America': 65,
    'Middle East & Africa': 60,
}

# (column name, lower bound, weight applied to base score, noise standard deviation)
positive_factor_specs = [
    ('Safety', 20, 0.8, 12),
    ('Healthcare', 40, 0.9, 10),
    ('Education', 50, 0.7, 15),
    ('Infrastructure', 30, 0.8, 12),
    ('Environment', 20, 0.6, 18),
    ('Culture', 40, 0.5, 20),
]

for city_idx in range(n_cities):
    region = np.random.choice(regions)
    base_score = np.random.normal(regional_baselines[region], 10)

    # Inverse correlation: better cities are drawn as cheaper on this scale
    cost_of_living = max(30, min(100, 100 - base_score * 0.5 + np.random.normal(0, 15)))

    # Correlated livability factors, drawn in a fixed order
    factor_scores = {}
    for column, floor, weight, noise_sd in positive_factor_specs:
        factor_raw = base_score * weight + np.random.normal(0, noise_sd)
        factor_scores[column] = max(floor, min(100, factor_raw))

    # Overall livability score: plain mean of the six positive factors
    overall = sum(factor_scores.values()) / 6

    record = {
        'City_ID': f'City_{city_idx + 1:03d}',
        'Region': region,
        'Cost_of_Living': round(cost_of_living, 1),
    }
    record.update({column: round(score, 1) for column, score in factor_scores.items()})
    record['Overall_Score'] = round(overall, 1)
    city_data.append(record)

city_df = pd.DataFrame(city_data)

# Summarise the generated datasets.
# The counts are derived from the frames (numeric columns only) rather than
# hard-coded, so the messages cannot drift from the data: the previous text
# claimed 6 employee skills and 6 car specs where the frames hold 5 of each.
n_student_metrics = student_df.select_dtypes(include='number').shape[1]
n_employee_skills = employee_df.select_dtypes(include='number').shape[1] - 1  # exclude Salary
n_car_specs = car_df.select_dtypes(include='number').shape[1]
n_wine_properties = wine_df.select_dtypes(include='number').shape[1]
n_city_factors = city_df.select_dtypes(include='number').shape[1]

print("Sample parallel coordinates datasets created:")
print(f"Student Performance: {len(student_df)} students with {n_student_metrics} metrics")
print(f"Employee Performance: {len(employee_df)} employees with {n_employee_skills} skills + salary")
print(f"Car Specifications: {len(car_df)} cars with {n_car_specs} technical specs")
print(f"Wine Quality: {len(wine_df)} wines with {n_wine_properties} chemical properties")
print(f"City Livability: {len(city_df)} cities with {n_city_factors} livability factors")

# Display sample data
print("\nSample Student Data:")
print(student_df.head(3))


In [None]:
# Create basic parallel coordinates plots
# Data for the four panels is prepared first, then one loop draws and styles
# every panel identically.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Parallel Coordinates Visualizations - Multivariate Analysis', fontsize=16, fontweight='bold')
ax1, ax2, ax3, ax4 = axes[0, 0], axes[0, 1], axes[1, 0], axes[1, 1]

# 1. Student Performance Analysis: numeric metrics plus the class label
student_numeric = student_df[['Math_Score', 'Science_Score', 'English_Score', 'History_Score',
                              'Study_Hours_Weekly', 'Attendance_Percent', 'Extracurricular_Count']].copy()
student_with_class = student_numeric.assign(Performance_Level=student_df['Performance_Level'])

# 2. Employee Skills Analysis
employee_numeric = employee_df[['Technical_Skills', 'Communication', 'Leadership',
                                'Problem_Solving', 'Productivity']].copy()
employee_with_dept = employee_numeric.assign(Department=employee_df['Department'])

# 3. Car Specifications: price is min-max scaled to 0-100 so it does not
#    dwarf the other axes; the remaining columns keep their raw units
car_numeric = car_df[['Horsepower', 'MPG', 'Engine_Size_L', 'Safety_Rating']].copy()
price_usd = car_df['Price_USD']
car_numeric['Price_Scaled'] = (price_usd - price_usd.min()) / (price_usd.max() - price_usd.min()) * 100
car_with_type = car_numeric.assign(Car_Type=car_df['Car_Type'])

# 4. Wine Quality Analysis
wine_for_plot = wine_df.copy()

# (axes, frame, class column, panel title)
basic_panels = [
    (ax1, student_with_class, 'Performance_Level', 'Student Performance Metrics'),
    (ax2, employee_with_dept, 'Department', 'Employee Skills by Department'),
    (ax3, car_with_type, 'Car_Type', 'Car Specifications by Type'),
    (ax4, wine_for_plot, 'Quality', 'Wine Chemical Properties by Quality'),
]

for panel_ax, panel_frame, class_column, panel_title in basic_panels:
    # Use pandas parallel_coordinates
    parallel_coordinates(panel_frame, class_column, ax=panel_ax, alpha=0.6)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.tick_params(axis='x', rotation=45)
    # Legend sits outside the axes so it does not cover the lines
    panel_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
# Advanced parallel coordinates techniques
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Advanced Parallel Coordinates Techniques', fontsize=16, fontweight='bold')

# 1. Normalized Parallel Coordinates with Brushing Effect
ax1 = axes[0, 0]

city_numeric = city_df[['Cost_of_Living', 'Safety', 'Healthcare', 'Education',
                       'Infrastructure', 'Environment', 'Culture']].copy()

# Min-max normalize every factor to the 0-1 range so all axes share one scale
scaler = MinMaxScaler()
city_normalized = pd.DataFrame(scaler.fit_transform(city_numeric),
                              columns=city_numeric.columns)
city_normalized['Region'] = city_df['Region']

# Create custom parallel coordinates with highlighting
regions = city_normalized['Region'].unique()
colors = plt.cm.Set1(np.linspace(0, 1, len(regions)))

axis_positions = range(len(city_numeric.columns))
factor_values = city_normalized[city_numeric.columns].to_numpy()

# Background context: every city exactly once, in faint grey, underneath the
# coloured lines. Drawing it inside the per-region loop would repeat each city
# once per *other* region and let later grey passes paint over colours that
# were already drawn.
for values in factor_values:
    ax1.plot(axis_positions, values,
             color='lightgray', alpha=0.1, linewidth=0.5, zorder=1)

# Foreground: one colour per region
for i, region in enumerate(regions):
    region_mask = (city_normalized['Region'] == region).to_numpy()
    for line_no, values in enumerate(factor_values[region_mask]):
        # Only the first line of a region carries the label, so the legend
        # shows one entry per region (empty labels are skipped by legend()).
        ax1.plot(axis_positions, values,
                 color=colors[i], alpha=0.7, linewidth=1.5, zorder=2,
                 label=region if line_no == 0 else "")

ax1.set_xticks(range(len(city_numeric.columns)))
ax1.set_xticklabels(city_numeric.columns, rotation=45, ha='right')
ax1.set_ylabel('Normalized Score (0-1)')
ax1.set_title('City Livability Factors by Region\n(With Background Context)', fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Percentile-based Parallel Coordinates
ax2 = axes[0, 1]

# Convert student scores to percentiles: each score is replaced by its
# within-column percentile rank (0-100), which puts all subjects on one scale
percentile_subjects = ['Math_Score', 'Science_Score', 'English_Score', 'History_Score']
student_percentiles = student_df[percentile_subjects].rank(pct=True) * 100
student_percentiles['Performance_Level'] = student_df['Performance_Level']

# Plot with percentile interpretation
parallel_coordinates(student_percentiles, 'Performance_Level', ax=ax2, alpha=0.6)
ax2.set_title('Student Performance (Percentile Rankings)', fontweight='bold')
ax2.set_ylabel('Percentile Rank')
ax2.tick_params(axis='x', rotation=45)
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Add percentile reference lines (quartiles), labelled just left of the first axis
for quartile in (25, 50, 75):
    ax2.axhline(y=quartile, color='gray', linestyle='--', alpha=0.5)
    ax2.text(-0.1, quartile, f'{quartile}th', ha='right', va='center', fontsize=8)

# 3. Multi-scale Parallel Coordinates
ax3 = axes[1, 0]

# Employee data with different scales: skills are on 1-10, salary is in dollars
employee_multi = employee_df[['Technical_Skills', 'Communication', 'Leadership',
                              'Problem_Solving', 'Productivity']].copy()

# Add salary scaled to 1-10 range to match other metrics
# (min-max to 0-1, stretched by 9 and shifted by 1)
salary = employee_df['Salary']
salary_low, salary_high = salary.min(), salary.max()
employee_multi['Salary_Scaled'] = ((salary - salary_low) / (salary_high - salary_low) * 9) + 1

employee_multi['Experience_Level'] = employee_df['Experience_Level']

parallel_coordinates(employee_multi, 'Experience_Level', ax=ax3, alpha=0.6)
ax3.set_title('Employee Metrics by Experience Level\n(Salary Scaled to 1-10)', fontweight='bold')
ax3.tick_params(axis='x', rotation=45)
ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 4. Correlation-based Parallel Coordinates
ax4 = axes[1, 1]

# Show high-correlation vs low-correlation variables
wine_corr = wine_df[['Alcohol', 'Phenols', 'Flavanoids', 'Color_Intensity', 'OD_Ratio', 'Proline']].copy()

# Absolute correlation of every variable with Alcohol, strongest first
# (Alcohol itself is included and leads with 1.0)
correlations = wine_corr.corr()['Alcohol'].abs().sort_values(ascending=False)

print("Variable correlations with Alcohol:")
print("\n".join(f"  {var}: {corr:.3f}" for var, corr in correlations.items()))

# Reorder columns by correlation strength so related axes end up adjacent
ordered_cols = correlations.index.tolist()
wine_ordered = wine_corr.loc[:, ordered_cols].assign(Quality=wine_df['Quality'])

parallel_coordinates(wine_ordered, 'Quality', ax=ax4, alpha=0.6)
ax4.set_title('Wine Properties (Ordered by Correlation)\nAlcohol → Most Correlated', fontweight='bold')
ax4.tick_params(axis='x', rotation=45)
ax4.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
# Interactive parallel coordinates examples (Plotly structure)
# The snippets below are printed as reference templates only; they are not
# executed, and names such as `df`, `numeric_columns` or `color_values` are
# placeholders for the reader's own data.
plotly_examples = [
    ("1. Basic Interactive Parallel Coordinates", """
fig = go.Figure(data=
    go.Parcoords(
        line=dict(color=df['target_column'],
                 colorscale='Viridis',
                 showscale=True),
        dimensions=list([
            dict(range=[df[col].min(), df[col].max()],
                 constraintrange=[df[col].quantile(0.2), df[col].quantile(0.8)],
                 label=col, values=df[col])
            for col in numeric_columns
        ])
    )
)

fig.update_layout(title='Interactive Parallel Coordinates Plot')
fig.show()
"""),
    ("2. Parallel Coordinates with Brushing and Linking", """
# Create figure with brushing capability
fig = go.Figure(data=
    go.Parcoords(
        line=dict(color=df['category_col'],
                 colorscale='Set1',
                 showscale=True,
                 colorbar=dict(title="Category")),
        dimensions=[
            dict(range=[min_val, max_val],
                 constraintrange=[filter_min, filter_max],  # Enable brushing
                 label=label, 
                 values=values,
                 tickvals=tick_positions,
                 ticktext=tick_labels) 
            for each dimension
        ]
    )
)

# Add selection and filtering capabilities
fig.update_layout(
    title="Brushable Parallel Coordinates",
    font=dict(size=12)
)

fig.show()
"""),
    ("3. Multi-dimensional Comparison Dashboard", """
# Create subplots for different views
from plotly.subplots import make_subplots

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('All Data', 'Filtered View', 'Correlation Matrix', 'Distribution'),
    specs=[[{"type": "parcoords"}, {"type": "parcoords"}],
           [{"type": "heatmap"}, {"type": "histogram"}]]
)

# Add parallel coordinates plots
fig.add_trace(go.Parcoords(...), row=1, col=1)
fig.add_trace(go.Parcoords(...), row=1, col=2)

# Add correlation heatmap
fig.add_trace(go.Heatmap(...), row=2, col=1)

# Add distribution plot
fig.add_trace(go.Histogram(...), row=2, col=2)

fig.show()
"""),
    ("4. Categorical and Continuous Mixed Dimensions", """
# Handle mixed data types
dimensions = []

for col in df.columns:
    if df[col].dtype in ['object', 'category']:
        # Categorical dimension
        unique_vals = df[col].unique()
        dimensions.append(dict(
            label=col,
            values=df[col].map({v: i for i, v in enumerate(unique_vals)}),
            tickvals=list(range(len(unique_vals))),
            ticktext=unique_vals
        ))
    else:
        # Continuous dimension
        dimensions.append(dict(
            label=col,
            values=df[col],
            range=[df[col].min(), df[col].max()]
        ))

fig = go.Figure(data=go.Parcoords(
    line=dict(color=color_values),
    dimensions=dimensions
))

fig.show()
"""),
]

print("Interactive Parallel Coordinates (Plotly):")
print("=" * 50)

# Each example prints as: blank line + heading, a fixed caption, the snippet
for example_heading, example_snippet in plotly_examples:
    print(f"\n{example_heading}")
    print("Code structure:")
    print(example_snippet)


In [None]:
# Clustering analysis with parallel coordinates
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

print("Clustering Analysis with Parallel Coordinates:")
print("=" * 50)

# 1. K-means clustering on student data
print("1. STUDENT PERFORMANCE CLUSTERING:")

# Prepare student data for clustering
student_score_cols = ['Math_Score', 'Science_Score', 'English_Score', 'History_Score']
student_cluster_data = student_df[student_score_cols +
                                  ['Study_Hours_Weekly', 'Attendance_Percent']].copy()

# Standardize the data so every feature contributes equally to the distances
scaler = StandardScaler()
student_scaled = scaler.fit_transform(student_cluster_data)

# Perform k-means clustering. n_init is pinned because its default changed
# between scikit-learn releases, which would otherwise change the clusters.
kmeans_student = KMeans(n_clusters=4, n_init=10, random_state=42)
student_clusters = kmeans_student.fit_predict(student_scaled)

# K-means label ids are arbitrary, so a fixed id -> name table can attach
# 'High Achievers' to the weakest group. Names are derived from the
# (standardized) centroids instead:
#   highest mean score            -> High Achievers
#   lowest mean score             -> Struggling Students
#   of the two middle clusters, the one whose subject scores differ most
#   from each other               -> Subject Specialists, the other -> Balanced
student_centers = pd.DataFrame(kmeans_student.cluster_centers_,
                               columns=student_cluster_data.columns)
student_score_centers = student_centers[student_score_cols]
student_ids_by_level = (student_score_centers.mean(axis=1)
                        .sort_values(ascending=False).index.tolist())
high_id, low_id = student_ids_by_level[0], student_ids_by_level[-1]
middle_ids = student_ids_by_level[1:-1]
specialist_id = student_score_centers.loc[middle_ids].std(axis=1).idxmax()
balanced_id = next(cid for cid in middle_ids if cid != specialist_id)
student_cluster_names = {
    int(high_id): 'High Achievers',
    int(balanced_id): 'Balanced Students',
    int(low_id): 'Struggling Students',
    int(specialist_id): 'Subject Specialists',
}

student_cluster_data['Cluster'] = student_clusters
student_cluster_data['Cluster_Name'] = student_cluster_data['Cluster'].map(student_cluster_names)

# Analyze cluster characteristics
print("   Cluster Analysis:")
for cluster_id in range(4):
    cluster_data = student_cluster_data[student_cluster_data['Cluster'] == cluster_id]
    cluster_name = student_cluster_names[cluster_id]
    print(f"\n   {cluster_name} (n={len(cluster_data)}):")

    for col in student_score_cols:
        mean_score = cluster_data[col].mean()
        print(f"     {col}: {mean_score:.1f}")

    print(f"     Study Hours: {cluster_data['Study_Hours_Weekly'].mean():.1f}")
    print(f"     Attendance: {cluster_data['Attendance_Percent'].mean():.1f}%")

# 2. Employee clustering
print("\n2. EMPLOYEE SKILLS CLUSTERING:")

employee_skill_cols = ['Technical_Skills', 'Communication', 'Leadership',
                       'Problem_Solving', 'Productivity']
employee_cluster_data = employee_df[employee_skill_cols].copy()

# A dedicated scaler keeps this step independent of scalers fitted elsewhere
employee_scaled = StandardScaler().fit_transform(employee_cluster_data)
# n_init is pinned because its default changed between scikit-learn releases
kmeans_employee = KMeans(n_clusters=3, n_init=10, random_state=42)
employee_clusters = kmeans_employee.fit_predict(employee_scaled)

employee_cluster_data['Cluster'] = employee_clusters
employee_cluster_data['Department'] = employee_df['Department']
employee_cluster_data['Experience_Level'] = employee_df['Experience_Level']

# K-means label ids are arbitrary, so names are derived from the centroids.
# For each cluster, compare every standardized skill with the cluster's own
# average level: the cluster where technical skill stands out most becomes
# 'Technical Specialists', of the rest the one where leadership stands out
# most becomes 'Leadership Focused', and the remaining one is 'Well-Rounded'.
employee_centers = pd.DataFrame(kmeans_employee.cluster_centers_, columns=employee_skill_cols)
relative_strength = employee_centers.sub(employee_centers.mean(axis=1), axis=0)
technical_id = int(relative_strength['Technical_Skills'].idxmax())
leadership_id = int(relative_strength['Leadership'].drop(index=technical_id).idxmax())
well_rounded_id = next(cid for cid in range(3) if cid not in (technical_id, leadership_id))

cluster_names = {
    technical_id: 'Technical Specialists',
    well_rounded_id: 'Well-Rounded',
    leadership_id: 'Leadership Focused',
}
employee_cluster_data['Cluster_Name'] = employee_cluster_data['Cluster'].map(cluster_names)

print("   Employee Cluster Characteristics:")
for cluster_id in range(3):
    cluster_data = employee_cluster_data[employee_cluster_data['Cluster'] == cluster_id]
    cluster_name = cluster_names[cluster_id]
    print(f"\n   {cluster_name} (n={len(cluster_data)}):")

    for skill in employee_skill_cols:
        mean_skill = cluster_data[skill].mean()
        print(f"     {skill}: {mean_skill:.1f}")

    # Department distribution: the two most common departments in the cluster
    dept_dist = cluster_data['Department'].value_counts()
    print(f"     Top Departments: {dept_dist.head(2).to_dict()}")

# Visualize clusters
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.suptitle('Clustering Analysis with Parallel Coordinates', fontsize=16, fontweight='bold')

# 1. Student clusters: features used for clustering plus the cluster label
ax1 = axes[0, 0]
student_plot_columns = ['Math_Score', 'Science_Score', 'English_Score',
                        'History_Score', 'Study_Hours_Weekly', 'Attendance_Percent',
                        'Cluster_Name']
student_for_plot = student_cluster_data.loc[:, student_plot_columns].copy()

# 2. Employee clusters
ax2 = axes[0, 1]
employee_plot_columns = ['Technical_Skills', 'Communication', 'Leadership',
                         'Problem_Solving', 'Productivity', 'Cluster_Name']
employee_for_plot = employee_cluster_data.loc[:, employee_plot_columns].copy()

# Both panels are drawn and styled the same way
for panel_ax, panel_frame, panel_title in [
    (ax1, student_for_plot, 'Student Performance Clusters'),
    (ax2, employee_for_plot, 'Employee Skills Clusters'),
]:
    parallel_coordinates(panel_frame, 'Cluster_Name', ax=panel_ax, alpha=0.6)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 3. City clustering
ax3 = axes[1, 0]

city_factor_cols = ['Safety', 'Healthcare', 'Education', 'Infrastructure',
                    'Environment', 'Culture']
city_cluster_data = city_df[city_factor_cols].copy()
# A dedicated scaler keeps this step independent of scalers fitted elsewhere
city_scaled = StandardScaler().fit_transform(city_cluster_data)
# n_init is pinned because its default changed between scikit-learn releases
kmeans_city = KMeans(n_clusters=3, n_init=10, random_state=42)
city_clusters = kmeans_city.fit_predict(city_scaled)

# K-means label ids are arbitrary, so rank the clusters by the mean of their
# standardized centroid and name them from best to worst.
city_centers = pd.DataFrame(kmeans_city.cluster_centers_, columns=city_factor_cols)
city_ids_by_level = city_centers.mean(axis=1).sort_values(ascending=False).index
city_cluster_names = {
    int(cluster_id): name
    for cluster_id, name in zip(city_ids_by_level,
                                ['Developed Cities', 'Emerging Cities', 'Developing Cities'])
}

city_cluster_data['Cluster'] = city_clusters
city_cluster_data['Cluster_Name'] = city_cluster_data['Cluster'].map(city_cluster_names)

# Drop the integer cluster id before plotting: parallel_coordinates draws
# every non-class column as an axis, so leaving it in adds a meaningless
# 0-2 'Cluster' axis next to the 20-100 livability scores.
city_for_plot = city_cluster_data.drop(columns='Cluster')
parallel_coordinates(city_for_plot, 'Cluster_Name', ax=ax3, alpha=0.6)
ax3.set_title('City Livability Clusters', fontweight='bold')
ax3.tick_params(axis='x', rotation=45)
ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 4. Cluster comparison matrix
ax4 = axes[1, 1]

# Create a cluster comparison for students: mean subject score per cluster
cluster_means = student_cluster_data.groupby('Cluster_Name')[['Math_Score', 'Science_Score',
                                                             'English_Score', 'History_Score']].mean()

# Normalize each column to 0-1 for the colour scale. A column whose cluster
# means are all equal has zero range; dividing by 1 instead maps it to 0
# rather than producing NaN cells in the heatmap.
column_range = (cluster_means.max() - cluster_means.min()).replace(0, 1)
cluster_means_norm = (cluster_means - cluster_means.min()) / column_range

im = ax4.imshow(cluster_means_norm.values, cmap='RdYlBu_r', aspect='auto')
ax4.set_xticks(range(len(cluster_means.columns)))
ax4.set_xticklabels(cluster_means.columns, rotation=45, ha='right')
ax4.set_yticks(range(len(cluster_means.index)))
ax4.set_yticklabels(cluster_means.index)
ax4.set_title('Student Cluster Profiles\n(Normalized Scores)', fontweight='bold')

# Add text annotations: colours show the normalized value, the text shows
# the raw mean score so the cells stay interpretable
for (row_pos, col_pos), mean_value in np.ndenumerate(cluster_means.to_numpy()):
    ax4.text(col_pos, row_pos, f'{mean_value:.1f}',
             ha="center", va="center", color="black", fontweight='bold')

plt.colorbar(im, ax=ax4, shrink=0.6)

plt.tight_layout()
plt.show()


In [None]:
# Statistical analysis of parallel coordinates data
print("Parallel Coordinates Statistical Analysis:")
print("=" * 50)

# 1. Correlation Analysis
print("1. CORRELATION ANALYSIS:")

# Numeric views of each dataset; reused by later analysis steps in this cell
datasets_for_corr = {
    'Student Performance': student_df[['Math_Score', 'Science_Score', 'English_Score', 'History_Score',
                                       'Study_Hours_Weekly', 'Attendance_Percent']],
    'Employee Skills': employee_df[['Technical_Skills', 'Communication', 'Leadership',
                                    'Problem_Solving', 'Productivity']],
    'Wine Properties': wine_df[['Alcohol', 'Phenols', 'Flavanoids', 'Color_Intensity', 'OD_Ratio', 'Proline']],
    'City Livability': city_df[['Safety', 'Healthcare', 'Education', 'Infrastructure', 'Environment', 'Culture']]
}

for dataset_name, data in datasets_for_corr.items():
    print(f"\n   {dataset_name} Correlations:")
    corr_matrix = data.corr()
    variable_names = corr_matrix.columns

    # Every unordered pair once: the strict upper triangle skips the diagonal
    # (self-correlations) and the mirrored lower half
    upper_rows, upper_cols = np.triu_indices(len(variable_names), k=1)

    # Tuples of (var1, var2, |r|, r), strongest absolute correlation first
    corr_pairs = sorted(
        (
            (variable_names[i], variable_names[j],
             abs(corr_matrix.iloc[i, j]), corr_matrix.iloc[i, j])
            for i, j in zip(upper_rows, upper_cols)
        ),
        key=lambda pair: pair[2],
        reverse=True,
    )

    for strength_label, selected_pairs in (('Strongest', corr_pairs[:3]),
                                           ('Weakest', corr_pairs[-3:])):
        print(f"     {strength_label} correlations:")
        for var1, var2, abs_corr, corr in selected_pairs:
            print(f"       {var1} ↔ {var2}: {corr:+.3f}")

# 2. Dimensionality and Complexity Analysis
print("\n2. DIMENSIONALITY ANALYSIS:")

# Imported once, ahead of the loop, instead of on every iteration
from sklearn.decomposition import PCA

for dataset_name, data in datasets_for_corr.items():
    n_vars = len(data.columns)
    n_observations = len(data)

    # Calculate effective dimensionality using PCA on standardized features,
    # so that no variable dominates purely because of its units
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    pca = PCA()
    pca.fit(scaled_data)

    # Find number of components for 95% variance: argmax returns the first
    # position where the cumulative ratio reaches the threshold
    cumsum_var = np.cumsum(pca.explained_variance_ratio_)
    n_components_95 = np.argmax(cumsum_var >= 0.95) + 1

    # Report the leading components, capped at however many exist so a
    # dataset with fewer than three variables does not raise IndexError
    n_leading = min(3, len(cumsum_var))

    print(f"\n   {dataset_name}:")
    print(f"     Total Variables: {n_vars}")
    print(f"     Observations: {n_observations}")
    print(f"     Ratio (obs/vars): {n_observations/n_vars:.1f}")
    print(f"     Components for 95% variance: {n_components_95}")
    print(f"     First {n_leading} PC variance: {cumsum_var[n_leading - 1]:.1%}")

# 3. Pattern Detection Analysis
print("\n3. PATTERN DETECTION ANALYSIS:")

# Analyze student performance patterns on a separate copy, so columns added
# during the analysis never end up on student_df itself
student_analysis = student_df.copy(deep=True)

# Define performance categories
def categorize_performance(row):
    """Label a student by how consistent their four subject scores are.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Math_Score', 'Science_Score', 'English_Score'
        and 'History_Score' (for example a DataFrame row or a dict).

    Returns
    -------
    str
        'Consistent High' / 'Consistent Medium' / 'Consistent Low' when the
        population standard deviation of the four scores is below 5 (split
        at a mean of 80 and 60), otherwise 'Highly Variable' when the gap
        between best and worst score exceeds 20, else 'Moderately Variable'.
    """
    subject_scores = [row[subject] for subject in
                      ('Math_Score', 'Science_Score', 'English_Score', 'History_Score')]
    mean_score = np.mean(subject_scores)
    score_spread = np.std(subject_scores)

    # Variable performance: classify by the best-to-worst gap.
    # Written as `not (< 5)` so the threshold test is the same comparison
    # as the consistent case, including for a NaN spread.
    if not score_spread < 5:
        score_gap = max(subject_scores) - min(subject_scores)
        return 'Highly Variable' if score_gap > 20 else 'Moderately Variable'

    # Consistent performance: classify by the average level
    if mean_score >= 80:
        return 'Consistent High'
    if mean_score >= 60:
        return 'Consistent Medium'
    return 'Consistent Low'

# Label every student with a consistency pattern (row-wise apply)
student_analysis['Performance_Pattern'] = student_analysis.apply(categorize_performance, axis=1)

print("   Student Performance Patterns:")
pattern_counts = student_analysis['Performance_Pattern'].value_counts()
n_analysed = len(student_analysis)
for pattern, count in pattern_counts.items():
    percentage = (count / n_analysed) * 100
    print(f"     {pattern}: {count} students ({percentage:.1f}%)")

# Analyze relationship between patterns and study habits,
# listed in the same most-common-first order as above
print("\n   Pattern vs Study Habits:")
for pattern in pattern_counts.index:
    has_pattern = student_analysis['Performance_Pattern'] == pattern
    avg_study = student_analysis.loc[has_pattern, 'Study_Hours_Weekly'].mean()
    avg_attendance = student_analysis.loc[has_pattern, 'Attendance_Percent'].mean()
    print(f"     {pattern}: {avg_study:.1f} hrs/week, {avg_attendance:.1f}% attendance")

# 4. Outlier Detection in Multivariate Space
print("\n4. OUTLIER DETECTION:")

from sklearn.ensemble import IsolationForest

datasets_for_outliers = {
    'Students': student_df[['Math_Score', 'Science_Score', 'English_Score', 'History_Score']],
    'Employees': employee_df[['Technical_Skills', 'Communication', 'Leadership', 'Problem_Solving']],
    'Cars': car_df[['Horsepower', 'MPG', 'Engine_Size_L', 'Safety_Rating']],
    'Cities': city_df[['Safety', 'Healthcare', 'Education', 'Infrastructure']]
}

for dataset_name, data in datasets_for_outliers.items():
    # Detect outliers using Isolation Forest; fit_predict labels each row
    # -1 (outlier) or 1 (inlier). contamination=0.1 sets the share of rows
    # the model is asked to flag.
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    outliers = iso_forest.fit_predict(data)

    is_outlier = outliers == -1
    n_outliers = np.sum(is_outlier)
    outlier_percentage = (n_outliers / len(data)) * 100

    print(f"\n   {dataset_name} Outliers:")
    print(f"     Detected outliers: {n_outliers} ({outlier_percentage:.1f}%)")

    if n_outliers > 0:
        # Analyze outlier characteristics: per-column mean of the flagged
        # rows against the mean of the remaining rows
        outlier_data = data[is_outlier]
        normal_data = data[outliers == 1]

        print("     Outlier characteristics (vs normal):")
        for col in data.columns:
            outlier_mean = outlier_data[col].mean()
            normal_mean = normal_data[col].mean()
            diff = outlier_mean - normal_mean
            print(f"       {col}: {outlier_mean:.1f} vs {normal_mean:.1f} ({diff:+.1f})")

# 5. Visualization Effectiveness Analysis
print("\n5. VISUALIZATION EFFECTIVENESS:")

print("   Parallel Coordinates Suitability Analysis:")

for dataset_name, data in datasets_for_corr.items():
    n_obs, n_vars = data.shape

    # Calculate visualization complexity score: mean absolute correlation
    # over the off-diagonal cells (the diagonal contributes exactly 1 per
    # variable, hence the subtraction)
    corr_matrix = data.corr().abs()
    n_corr_vars = len(corr_matrix)
    avg_correlation = (corr_matrix.sum().sum() - n_corr_vars) / (n_corr_vars * (n_corr_vars - 1))

    # Assess data characteristics
    complexity = (
        "Simple" if n_vars <= 5
        else "Moderate" if n_vars <= 10
        else "Complex"
    )
    correlation_level = (
        "High" if avg_correlation > 0.7
        else "Moderate" if avg_correlation > 0.4
        else "Low"
    )

    # Effectiveness recommendation: needs both a manageable number of axes
    # and enough shared structure between them
    effectiveness = (
        "Highly Effective" if n_vars <= 8 and avg_correlation > 0.3
        else "Moderately Effective" if n_vars <= 12 and avg_correlation > 0.2
        else "Consider Alternatives"
    )

    print(f"\n   {dataset_name}:")
    print(f"     Variables: {n_vars} ({complexity})")
    print(f"     Avg Correlation: {avg_correlation:.3f} ({correlation_level})")
    print(f"     Effectiveness: {effectiveness}")

# Closing guidance, printed as three bulleted sections:
# (heading, bullet character, entries)
closing_sections = [
    ("\n6. BEST PRACTICES FOR PARALLEL COORDINATES:", "✓", [
        "Optimal for 3-12 continuous variables",
        "Most effective with moderate correlations (0.3-0.7)",
        "Normalize/standardize variables for comparison",
        "Order variables by correlation or logical flow",
        "Use color to encode categories or clusters",
        "Consider brushing for interactive exploration",
        "Limit line opacity to reduce overplotting",
    ]),
    ("\nWhen to Use Parallel Coordinates:", "•", [
        "Multivariate data exploration",
        "Identifying patterns and clusters",
        "Comparing groups across multiple dimensions",
        "Quality control and anomaly detection",
        "Feature selection and correlation analysis",
    ]),
    ("\nAlternatives to Consider:", "•", [
        "Scatterplot matrices for pairwise relationships",
        "Radar/spider charts for profile comparison",
        "Heatmaps for correlation visualization",
        "PCA plots for dimensionality reduction",
        "t-SNE/UMAP for non-linear structure",
    ]),
]

for section_heading, bullet, entries in closing_sections:
    print(section_heading)
    for entry in entries:
        print(f"   {bullet} {entry}")
