In [1]:
import os

# Directory that holds the dataset CSV files. The POSTURE_DATASET_DIR
# environment variable overrides the original machine-specific default, so the
# notebook can run on another machine without editing this cell.
dataset_dir = os.environ.get(
    "POSTURE_DATASET_DIR",
    'C:/Users/Reddy/Documents/Projects/posture correction/dataset',
)

# Set the working directory to where the dataset files are located.
# os.chdir raises an OSError subclass (e.g. FileNotFoundError) if it is missing.
os.chdir(dataset_dir)

# Print the current working directory to confirm it worked
print(f"Current working directory: {os.getcwd()}")


Current working directory: C:\Users\Reddy\Documents\Projects\posture correction\dataset


In [2]:
import os
from pathlib import Path

import pandas as pd

# Single source of truth for the dataset location (previously repeated in
# every read_csv call). POSTURE_DATASET_DIR overrides the original default.
DATA_DIR = Path(os.environ.get(
    "POSTURE_DATASET_DIR",
    "C:/Users/Reddy/Documents/Projects/posture correction/dataset",
))

# Load the five CSV files that make up the dataset.
try:
    df_3d_distances = pd.read_csv(DATA_DIR / "3d_distances.csv")
    df_angles = pd.read_csv(DATA_DIR / "angles.csv")
    df_labels = pd.read_csv(DATA_DIR / "labels.csv")
    df_landmarks = pd.read_csv(DATA_DIR / "landmarks.csv")
    df_xyz_distances = pd.read_csv(DATA_DIR / "xyz_distances.csv")

    print("All datasets loaded successfully!")
    print(f"Landmarks shape: {df_landmarks.shape}")
    print(f"Labels shape: {df_labels.shape}")
    print(f"Angles shape: {df_angles.shape}")
    print(f"3D Distances shape: {df_3d_distances.shape}")
    print(f"XYZ Distances shape: {df_xyz_distances.shape}")

except FileNotFoundError as e:
    # Report which file is missing, then re-raise so later cells do not run
    # against undefined frames.
    print(f"Error: A file was not found. Please double-check the path. Error: {e}")
    raise


All datasets loaded successfully!
Landmarks shape: (1372, 100)
Labels shape: (1372, 2)
Angles shape: (1372, 8)
3D Distances shape: (1372, 17)
XYZ Distances shape: (1372, 49)


In [3]:
# ============== DATASET OVERVIEW ==============
# Section banner: the title framed by two 50-character rules.
print(
    "=" * 50,
    "           POSTURE CORRECTION DATASET OVERVIEW",
    "=" * 50,
    sep="\n",
)

def display_dataset_overview(df, dataset_name):
    """Print a plain-text overview of one DataFrame.

    Prints, in order: a heading containing ``dataset_name``, the shape, one
    line per column with its dtype, the first five rows, the columns that
    contain nulls (or ``None`` when there are none), the deep memory usage in
    MB, and a closing 70-character rule.

    Args:
        df: DataFrame to describe.
        dataset_name: Text shown in the heading.

    Returns:
        None. Everything is written to stdout.
    """
    rule = "=" * 20
    print(f"\n{rule} {dataset_name} {rule}")

    # Dataset shape
    print(f"Shape: {df.shape}")

    # Data types, one column per line
    print("\nData types:")
    for column_name, column_dtype in df.dtypes.items():
        print(f"{column_name:<35} {column_dtype}")

    # First 5 rows
    print("\nFirst 5 rows:")
    print(df.head().to_string())

    # Null values: list only the columns that actually have gaps
    missing_per_column = df.isnull().sum()
    if missing_per_column.sum() > 0:
        print("\nNull values:")
        columns_with_gaps = missing_per_column[missing_per_column > 0]
        for column_name, missing in columns_with_gaps.items():
            print(f"{column_name:<35} {missing}")
    else:
        print("\nNull values: None")

    # deep=True also counts the payload of object columns, not just pointers
    total_bytes = df.memory_usage(deep=True).sum()
    print(f"\nMemory usage: {total_bytes / 1024**2:.2f} MB")
    print("-" * 70)

# Pair each loaded frame with the heading used for its overview, then print
# the overviews in that order.
datasets = list(zip(
    (df_landmarks, df_labels, df_angles, df_3d_distances, df_xyz_distances),
    (
        "LANDMARKS DATASET",
        "LABELS DATASET",
        "ANGLES DATASET",
        "3D DISTANCES DATASET",
        "XYZ DISTANCES DATASET",
    ),
))

for df, name in datasets:
    display_dataset_overview(df, name)


           POSTURE CORRECTION DATASET OVERVIEW

Shape: (1372, 100)

Data types:
pose_id                             int64
x_nose                              float64
y_nose                              float64
z_nose                              float64
x_left_eye_inner                    float64
y_left_eye_inner                    float64
z_left_eye_inner                    float64
x_left_eye                          float64
y_left_eye                          float64
z_left_eye                          float64
x_left_eye_outer                    float64
y_left_eye_outer                    float64
z_left_eye_outer                    float64
x_right_eye_inner                   float64
y_right_eye_inner                   float64
z_right_eye_inner                   float64
x_right_eye                         float64
y_right_eye                         float64
z_right_eye                         float64
x_right_eye_outer                   float64
y_right_eye_outer                   float6

In [4]:
# ============== DATASET RELATIONSHIPS ==============
print("\n" + "=" * 50, "           DATASET RELATIONSHIPS & MERGING", "=" * 50, sep="\n")

# (label, frame) pairs in reporting order; shared by the two listings below.
labelled_frames = [
    ("Landmarks", df_landmarks),
    ("Labels", df_labels),
    ("Angles", df_angles),
    ("3D Distances", df_3d_distances),
    ("XYZ Distances", df_xyz_distances),
]

# Row count per dataset; "label:" is padded to a 15-character column.
print("\nDataset Sizes:")
for name, df in labelled_frames:
    print(f"{name + ':':<15}{len(df)} rows")

# Smallest and largest pose_id plus the number of distinct IDs per dataset.
print("\nPose ID Ranges:")
for name, df in labelled_frames:
    pose_ids = df['pose_id'].unique()
    lowest, highest = min(pose_ids), max(pose_ids)
    print(f"{name:<15}: {lowest} to {highest} ({len(pose_ids)} unique IDs)")

# Samples per pose label, most frequent first (value_counts ordering).
print("\nClass Distribution:")
class_counts = df_labels['pose'].value_counts()
for pose_class, count in class_counts.items():
    percentage = (count / len(df_labels)) * 100
    print(f"{pose_class:<25}: {count:4d} samples ({percentage:5.1f}%)")

# Demonstrate merging all datasets
print(f"\nMerging all datasets on 'pose_id'...")
# validate='one_to_one' makes pandas raise MergeError if pose_id is duplicated
# on either side, instead of silently multiplying rows.
merged_df = (
    df_landmarks
    .merge(df_labels, on='pose_id', validate='one_to_one')
    .merge(df_angles, on='pose_id', validate='one_to_one')
    .merge(df_3d_distances, on='pose_id', validate='one_to_one')
    .merge(df_xyz_distances, on='pose_id', validate='one_to_one')
)

# The default inner join drops any pose_id missing from one of the frames;
# surface that instead of letting the sample count shrink unnoticed.
dropped_rows = len(df_landmarks) - len(merged_df)
if dropped_rows:
    print(f"Warning: {dropped_rows} landmark rows had no match in every dataset and were dropped")

print(f"Final merged dataset shape: {merged_df.shape}")
print(f"Total features after merging: {merged_df.shape[1] - 1} (excluding pose_id)")

# Show column breakdown: number of non-key columns contributed by each frame
landmarks_cols = sum(col != 'pose_id' for col in df_landmarks.columns)
angles_cols = sum(col != 'pose_id' for col in df_angles.columns)
distances_3d_cols = sum(col != 'pose_id' for col in df_3d_distances.columns)
distances_xyz_cols = sum(col != 'pose_id' for col in df_xyz_distances.columns)

print(f"\nFeature Breakdown:")
print(f"Landmark coordinates: {landmarks_cols} features")
print(f"Joint angles:         {angles_cols} features")
print(f"3D distances:         {distances_3d_cols} features")
print(f"XYZ distances:        {distances_xyz_cols} features")
print(f"Target labels:        1 feature (pose)")
print(f"Total:                {landmarks_cols + angles_cols + distances_3d_cols + distances_xyz_cols + 1} features")



           DATASET RELATIONSHIPS & MERGING

Dataset Sizes:
Landmarks:     1372 rows
Labels:        1372 rows
Angles:        1372 rows
3D Distances:  1372 rows
XYZ Distances: 1372 rows

Pose ID Ranges:
Landmarks      : 0 to 1371 (1372 unique IDs)
Labels         : 0 to 1371 (1372 unique IDs)
Angles         : 0 to 1371 (1372 unique IDs)
3D Distances   : 0 to 1371 (1372 unique IDs)
XYZ Distances  : 0 to 1371 (1372 unique IDs)

Class Distribution:
jumping_jacks_down       :  189 samples ( 13.8%)
jumping_jacks_up         :  181 samples ( 13.2%)
pullups_down             :  154 samples ( 11.2%)
pushups_up               :  144 samples ( 10.5%)
squats_up                :  139 samples ( 10.1%)
pullups_up               :  135 samples (  9.8%)
squats_down              :  127 samples (  9.3%)
pushups_down             :  102 samples (  7.4%)
situp_down               :  102 samples (  7.4%)
situp_up                 :   99 samples (  7.2%)

Merging all datasets on 'pose_id'...
Final merged dataset sha

In [None]:
# ============== VISUALIZATION SETUP ==============
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices.
warnings.filterwarnings('ignore')

# Plot styling: default matplotlib style, "husl" seaborn palette, and a
# larger default figure size and font.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("=" * 50, "           STATISTICAL VISUALIZATIONS", "=" * 50, sep="\n")

# Rebuild the merged frame only when an earlier cell has not already made it.
try:
    merged_df
except NameError:
    merged_df = df_landmarks.merge(df_labels, on='pose_id')
    merged_df = merged_df.merge(df_angles, on='pose_id')
    merged_df = merged_df.merge(df_3d_distances, on='pose_id')
    merged_df = merged_df.merge(df_xyz_distances, on='pose_id')

print(f"Visualization dataset ready: {merged_df.shape}")
print("Starting comprehensive statistical visualization analysis...")


In [None]:
# ============== 1. CLASS DISTRIBUTION ANALYSIS ==============
print("\n1. POSE CLASS DISTRIBUTION ANALYSIS")
print("-" * 50)

# Inputs shared by all four panels.
class_counts = df_labels['pose'].value_counts()
total_samples = len(df_labels)
class_positions = range(len(class_counts))
colors = sns.color_palette("husl", len(class_counts))

# 2x2 grid: counts, proportions, balance, text summary.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Pose Classification Dataset Analysis', fontsize=16, fontweight='bold')
(counts_ax, pie_ax), (balance_ax, stats_ax) = axes

# 1.1 Bar chart of class counts
counts_ax.bar(class_positions, class_counts.values, color=colors)
counts_ax.set_title('Pose Class Distribution (Counts)', fontweight='bold')
counts_ax.set_xlabel('Pose Classes')
counts_ax.set_ylabel('Number of Samples')
counts_ax.set_xticks(class_positions)
counts_ax.set_xticklabels(class_counts.index, rotation=45, ha='right')

# Value label just above each bar (offset of 10 in data units)
for position, n_samples in enumerate(class_counts.values):
    counts_ax.text(position, n_samples + 10, str(n_samples),
                   ha='center', va='bottom', fontweight='bold')

# 1.2 Pie chart of class proportions
pie_ax.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
           colors=colors, startangle=90)
pie_ax.set_title('Pose Class Distribution (Proportions)', fontweight='bold')

# 1.3 Class balance analysis (horizontal bars with count and share)
balance_ax.barh(class_positions, class_counts.values, color=colors)
balance_ax.set_title('Class Balance Analysis', fontweight='bold')
balance_ax.set_xlabel('Sample Count')
balance_ax.set_ylabel('Pose Classes')
balance_ax.set_yticks(class_positions)
balance_ax.set_yticklabels(class_counts.index)

for position, n_samples in enumerate(class_counts.values):
    percentage = (n_samples / total_samples) * 100
    balance_ax.text(n_samples + 10, position, f'{n_samples} ({percentage:.1f}%)',
                    va='center', fontweight='bold')

# 1.4 Sample distribution statistics. value_counts sorts descending, so the
# first entry is the most common class and the last the least common.
stats_text = "\n".join([
    "Dataset Statistics:",
    f"• Total Samples: {total_samples:,}",
    f"• Number of Classes: {len(class_counts)}",
    f"• Most Common: {class_counts.index[0]} ({class_counts.iloc[0]} samples)",
    f"• Least Common: {class_counts.index[-1]} ({class_counts.iloc[-1]} samples)",
    f"• Balance Ratio: {class_counts.iloc[0]/class_counts.iloc[-1]:.2f}:1",
    f"• Std Deviation: {class_counts.std():.1f}",
])

stats_ax.text(0.1, 0.7, stats_text, transform=stats_ax.transAxes, fontsize=11,
              verticalalignment='top',
              bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))
stats_ax.set_title('Dataset Statistics', fontweight='bold')
stats_ax.axis('off')

plt.tight_layout()
plt.show()

# Text version of the class distribution: one line per pose class.
print("\nClass Distribution Summary:")
for pose_class, count in class_counts.items():
    percentage = (count / total_samples) * 100
    print(f"{pose_class:<25}: {count:4d} samples ({percentage:5.1f}%)")

# Grade the imbalance by the largest-to-smallest class ratio. The first
# threshold the ratio does not exceed decides the verdict.
print("\nClass Balance Assessment:")
balance_ratio = class_counts.max() / class_counts.min()
balance_verdicts = (
    (1.5, "✓ Well-balanced dataset"),
    (3.0, "⚠ Moderately imbalanced dataset"),
)
print(next(
    (verdict for limit, verdict in balance_verdicts if balance_ratio <= limit),
    "⚠ Highly imbalanced dataset - consider balancing techniques",
))


In [None]:
# ============== 2. FEATURE DISTRIBUTION ANALYSIS ==============
print("\n2. FEATURE DISTRIBUTION ANALYSIS")
print("-" * 50)


def _leading_features(frame, limit):
    """Return the first `limit` column names of `frame`, skipping 'pose_id'."""
    return [name for name in frame.columns if name != 'pose_id'][:limit]


# Small sample of features from each dataset type; later plots use these
# lists rather than every column.
landmark_features = _leading_features(df_landmarks, 6)
angle_features = _leading_features(df_angles, 4)
distance_3d_features = _leading_features(df_3d_distances, 4)
distance_xyz_features = _leading_features(df_xyz_distances, 6)

# 2.1 Landmark Coordinates Distribution
print("\n2.1 Landmark Coordinates Distribution")
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Landmark Coordinates Distribution Analysis', fontsize=16, fontweight='bold')

for i, feature in enumerate(landmark_features):
    # Fill the 2x3 grid row by row.
    panel = axes[divmod(i, 3)]
    values = merged_df[feature]

    # Histogram
    panel.hist(values, bins=30, alpha=0.7, color=sns.color_palette("husl")[i], edgecolor='black')
    panel.set_title(f'{feature}', fontweight='bold')
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')

    # Mark the mean (red dashed) and one standard deviation either side
    # (orange dotted); only the upper line carries the legend entry.
    mean_val = values.mean()
    std_val = values.std()
    panel.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
    panel.axvline(mean_val + std_val, color='orange', linestyle=':', linewidth=1, label=f'±1 STD')
    panel.axvline(mean_val - std_val, color='orange', linestyle=':', linewidth=1)
    panel.legend(fontsize=8)

plt.tight_layout()
plt.show()

# 2.2 Joint Angles Distribution
print("\n2.2 Joint Angles Distribution")
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Joint Angles Distribution Analysis', fontsize=16, fontweight='bold')

for i, feature in enumerate(angle_features):
    # Fill the 2x2 grid row by row.
    panel = axes[divmod(i, 2)]

    # Violin plot with a narrow white box plot drawn on top of it
    sns.violinplot(data=merged_df, y=feature, ax=panel, color=sns.color_palette("husl")[i])
    sns.boxplot(data=merged_df, y=feature, ax=panel, width=0.3, color='white')

    panel.set_title(f'{feature}', fontweight='bold')
    panel.set_ylabel('Angle (degrees)')

    # Annotate the panel with the median and interquartile range
    values = merged_df[feature]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    median = values.median()
    panel.text(0.02, 0.98, f'Median: {median:.1f}°\nIQR: {q3-q1:.1f}°',
               transform=panel.transAxes, fontsize=9, verticalalignment='top',
               bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

plt.tight_layout()
plt.show()

# 2.3 Distance Features Distribution
print("\n2.3 Distance Features Distribution")
# Imported once, before the loop, instead of on every iteration.
from scipy import stats

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('3D Distance Features Distribution', fontsize=16, fontweight='bold')

for i, feature in enumerate(distance_3d_features):
    row, col = i // 2, i % 2
    # Select the column once; drop NaN so gaussian_kde gets finite input.
    values = merged_df[feature].dropna()

    # Density-normalised histogram so it shares a y-scale with the KDE curve
    axes[row, col].hist(values, bins=25, alpha=0.6, color=sns.color_palette("husl")[i], density=True, edgecolor='black')

    # KDE overlay evaluated on 100 evenly spaced points across the observed range
    x_vals = np.linspace(values.min(), values.max(), 100)
    kde = stats.gaussian_kde(values)
    axes[row, col].plot(x_vals, kde(x_vals), 'r-', linewidth=2, label='KDE')

    axes[row, col].set_title(f'{feature}', fontweight='bold')
    axes[row, col].set_xlabel('Distance')
    axes[row, col].set_ylabel('Density')
    axes[row, col].legend()

plt.tight_layout()
plt.show()

# NOTE(review): these bullet points are fixed text, not computed from the data.
print("\n".join([
    "\nFeature Distribution Summary:",
    "• Landmark features show varying distributions based on body part positions",
    "• Joint angles typically range from 0° to 180° with pose-specific patterns",
    "• Distance features show right-skewed distributions (common in distance metrics)",
    "• Most features show continuous distributions suitable for ML algorithms",
]))


In [None]:
# ============== 3. CORRELATION ANALYSIS ==============
print("\n3. CORRELATION ANALYSIS")
print("-" * 50)

# 3.1 Angle Features Correlation
print("\n3.1 Joint Angles Correlation Matrix")
angle_cols = [name for name in df_angles.columns if name != 'pose_id']
angle_corr = merged_df[angle_cols].corr()

fig, axes = plt.subplots(1, 2, figsize=(20, 8))
heatmap_ax, hist_ax = axes

# Left panel: annotated correlation heatmap centred on zero
sns.heatmap(angle_corr, annot=True, cmap='RdBu_r', center=0,
            square=True, ax=heatmap_ax, cbar_kws={'label': 'Correlation Coefficient'})
heatmap_ax.set_title('Joint Angles Correlation Matrix', fontweight='bold', fontsize=14)
heatmap_ax.set_xlabel('Joint Angles')
heatmap_ax.set_ylabel('Joint Angles')

# Right panel: histogram of the pairwise coefficients. k=1 keeps only the
# entries above the diagonal, so each pair is counted once and the
# self-correlations are excluded.
upper_triangle = np.triu_indices_from(angle_corr.values, k=1)
corr_values = angle_corr.values[upper_triangle]
mean_corr = corr_values.mean()

hist_ax.hist(corr_values, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.set_title('Distribution of Correlation Coefficients', fontweight='bold', fontsize=14)
hist_ax.set_xlabel('Correlation Coefficient')
hist_ax.set_ylabel('Frequency')
hist_ax.axvline(mean_corr, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_corr:.3f}')
hist_ax.legend()

plt.tight_layout()
plt.show()

# 3.2 Distance Features Correlation Sample
# Correlates the sampled 3D-distance columns with the first four sampled
# XYZ-distance columns. Both feature lists are defined outside this block.
print("\n3.2 Distance Features Correlation Sample")
distance_sample_cols = distance_3d_features + distance_xyz_features[:4]
distance_corr = merged_df[distance_sample_cols].corr()

# The calls below rely on pyplot's implicit "current figure": the heatmap,
# title and tick settings all apply to the figure created here.
plt.figure(figsize=(12, 10))
# True on and above the diagonal; passed to heatmap as `mask`, so only the
# entries below the diagonal are drawn.
mask = np.triu(np.ones_like(distance_corr, dtype=bool))
# NOTE(review): 'viridis' is a sequential colormap used here with center=0;
# the other correlation heatmaps in this notebook use diverging maps.
sns.heatmap(distance_corr, mask=mask, annot=True, cmap='viridis', center=0,
            square=True, cbar_kws={'label': 'Correlation Coefficient'})
plt.title('Distance Features Correlation Matrix (Sample)', fontweight='bold', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# 3.3 Cross-feature type correlation analysis
print("\n3.3 Cross-Feature Type Correlation")
# Three representative features from each feature type
sample_features = {
    'Angles': angle_features[:3],
    'Landmarks': landmark_features[:3],
    'Distances': distance_3d_features[:3]
}

# Flatten the groups into one column list, with short "<Type>_<n>" labels
# (numbered from 1 within each type) in the same order.
all_sample_features = [
    feature
    for group in sample_features.values()
    for feature in group
]
feature_labels = [
    f"{feat_type}_{position}"
    for feat_type, group in sample_features.items()
    for position in range(1, len(group) + 1)
]

# Correlation matrix relabelled with the short names on both axes
cross_corr = merged_df[all_sample_features].corr()
cross_corr.index = feature_labels
cross_corr.columns = feature_labels

plt.figure(figsize=(12, 10))
sns.heatmap(cross_corr, annot=True, cmap='RdYlBu_r', center=0,
            square=True, cbar_kws={'label': 'Correlation Coefficient'})
plt.title('Cross-Feature Type Correlation Analysis', fontweight='bold', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Print correlation insights
print(f"\nCorrelation Analysis Summary:")

# Every unordered pair of angle columns whose |r| exceeds 0.7, as
# (column_a, column_b, r). NaN coefficients never pass the comparison.
n_angle_cols = len(angle_corr.columns)
high_corr_pairs = [
    (angle_corr.columns[i], angle_corr.columns[j], angle_corr.iloc[i, j])
    for i in range(n_angle_cols)
    for j in range(i + 1, n_angle_cols)
    if abs(angle_corr.iloc[i, j]) > 0.7
]

print(f"• High correlations (|r| > 0.7) found in angle features: {len(high_corr_pairs)} pairs")
for first_col, second_col, corr_val in high_corr_pairs:
    print(f"    - {first_col} / {second_col}: r = {corr_val:+.3f}")
print(f"• Average correlation among angle features: {corr_values.mean():.3f}")

# Only draw the redundancy conclusion when the data supports it; it used to
# be printed even when no pair crossed the threshold.
if high_corr_pairs:
    print(f"• This suggests some redundancy in joint angle measurements")
    print(f"• Feature selection or PCA might be beneficial for dimensionality reduction")
else:
    print(f"• No strongly correlated angle pairs found - the angle features appear largely non-redundant")


In [None]:
# ============== 4. DATASET COMPARISON ANALYSIS ==============
print("\n4. DATASET COMPARISON ANALYSIS")
print("-" * 50)

# 4.1 Feature Range Comparison Across Datasets
print("\n4.1 Feature Value Ranges by Dataset Type")

# (display name, frame, unit label) for each feature dataset; the sections
# that follow iterate over this list as well.
datasets_info = [
    ('Landmarks', df_landmarks, 'Coordinate Values'),
    ('Angles', df_angles, 'Degrees'),
    ('3D Distances', df_3d_distances, 'Distance Units'),
    ('XYZ Distances', df_xyz_distances, 'Distance Units')
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Feature Value Ranges Comparison Across Datasets', fontsize=16, fontweight='bold')
axes = axes.flatten()

for idx, (name, df, unit) in enumerate(datasets_info):
    # Every column except the key; only the first 10 are drawn for visibility
    numerical_cols = [col for col in df.columns if col != 'pose_id']
    shown_cols = numerical_cols[:10]

    # All values of the displayed columns, flattened for the overall stats
    all_values = df[shown_cols].to_numpy().ravel()

    # One box per displayed feature. Tick labels are set afterwards with the
    # full column names: the previous last-token labels were ambiguous
    # (x_nose, y_nose and z_nose all became "nose"), and boxplot's `labels`
    # keyword is deprecated in Matplotlib 3.9.
    axes[idx].boxplot([df[col].values for col in shown_cols])
    axes[idx].set_xticklabels(shown_cols, ha='right', fontsize=8)
    axes[idx].set_title(f'{name} Features ({unit})', fontweight='bold')
    axes[idx].set_ylabel(f'Values ({unit})')
    axes[idx].tick_params(axis='x', rotation=45)

    # Mean and standard deviation over the displayed columns only
    overall_mean = np.mean(all_values)
    overall_std = np.std(all_values)
    axes[idx].text(0.02, 0.98, f'Mean: {overall_mean:.2f}\nStd: {overall_std:.2f}',
                  transform=axes[idx].transAxes, fontsize=9, verticalalignment='top',
                  bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

plt.tight_layout()
plt.show()

# 4.2 Feature Scale Comparison
print("\n4.2 Feature Scale Distribution")
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Feature Scale Distribution Across Dataset Types', fontsize=16, fontweight='bold')
axes = axes.flatten()

# Per-dataset list of feature ranges and the matching dataset names
scale_data = []
scale_labels = []

for idx, (name, df, unit) in enumerate(datasets_info):
    numerical_cols = [col for col in df.columns if col != 'pose_id']

    # Range (max - min) of every feature in this dataset
    ranges = [df[col].max() - df[col].min() for col in numerical_cols]

    scale_data.append(ranges)
    scale_labels.append(name)

    # Histogram of the ranges: how many features span which width
    panel = axes[idx]
    panel.hist(ranges, bins=15, alpha=0.7, color=sns.color_palette("husl")[idx], edgecolor='black')
    panel.set_title(f'{name} Feature Ranges', fontweight='bold')
    panel.set_xlabel(f'Range ({unit})')
    panel.set_ylabel('Number of Features')

    # Mark the mean range
    mean_range = np.mean(ranges)
    panel.axvline(mean_range, color='red', linestyle='--', linewidth=2,
                  label=f'Mean: {mean_range:.1f}')
    panel.legend()

plt.tight_layout()
plt.show()

# 4.3 Comparative Statistics Table
print("\n4.3 Comparative Statistics Summary")
stats_summary = []

for name, df, unit in datasets_info:
    numerical_cols = [col for col in df.columns if col != 'pose_id']

    # All feature values of this dataset as one flat array
    all_values = df[numerical_cols].to_numpy().ravel()
    value_min = np.min(all_values)
    value_max = np.max(all_values)

    # Named `dataset_stats`, not `stats`: the notebook imports the
    # scipy.stats module under the name `stats`, and reusing that name here
    # replaced the module with a dict for any code run afterwards.
    dataset_stats = {
        'Dataset': name,
        'Features': len(numerical_cols),
        'Total Values': len(all_values),
        'Mean': np.mean(all_values),
        'Std': np.std(all_values),
        'Min': value_min,
        'Max': value_max,
        'Range': value_max - value_min,
        'Unit': unit
    }
    stats_summary.append(dataset_stats)

# One row per dataset, rounded to two decimals for display
stats_df = pd.DataFrame(stats_summary)
print(stats_df.round(2).to_string(index=False))

# 4.4 Data Quality Assessment
print(f"\n4.4 Data Quality Assessment")
print("-" * 30)

quality_metrics = []
for name, df, unit in datasets_info:
    numerical_cols = [col for col in df.columns if col != 'pose_id']
    feature_frame = df[numerical_cols]

    # Missing cells across all feature columns
    missing_count = int(feature_frame.isnull().sum().sum())

    # Infinite cells (+inf or -inf) across all feature columns
    inf_count = int(np.isinf(feature_frame.to_numpy(dtype=float)).sum())

    # Constant columns: at most one distinct non-null value. Counting distinct
    # values avoids comparing a floating-point variance to exactly 0.
    zero_var_count = int((feature_frame.nunique(dropna=True) <= 1).sum())

    # Share of features that are not constant; guarded for a frame that has
    # no feature columns at all.
    if numerical_cols:
        quality_score = f"{((len(numerical_cols) - zero_var_count) / len(numerical_cols) * 100):.1f}%"
    else:
        quality_score = "n/a"

    quality_metrics.append({
        'Dataset': name,
        'Missing Values': missing_count,
        'Infinite Values': inf_count,
        'Zero Variance Features': zero_var_count,
        'Quality Score': quality_score
    })

quality_df = pd.DataFrame(quality_metrics)
print(quality_df.to_string(index=False))

total_missing = sum(metrics['Missing Values'] for metrics in quality_metrics)

# NOTE(review): apart from the completeness line, these bullets are fixed text.
print(f"\nDataset Comparison Summary:")
print(f"• Landmark coordinates have the highest variability (largest ranges)")
print(f"• Angle features are bounded (0-180°) with moderate variability")
print(f"• Distance features show right-skewed distributions")
# State completeness only when the counts above confirm it.
if total_missing == 0:
    print(f"• All datasets are complete with no missing values")
else:
    print(f"• {total_missing:,} missing values found across the datasets - handle them before modelling")
print(f"• Feature scaling will be important for ML algorithms")


In [None]:
# ============== 5. POSE-SPECIFIC ANALYSIS ==============
print("\n5. POSE-SPECIFIC FEATURE ANALYSIS")
print("-" * 50)

# Unique pose labels, in order of first appearance in merged_df
pose_classes = merged_df['pose'].unique()
print(f"Analyzing {len(pose_classes)} pose classes: {pose_classes}")

# 5.1 Pose-specific feature distributions
print("\n5.1 Feature Distributions by Pose Class")

# Two sampled features per feature type; the ANOVA section reuses this dict
key_features = {
    'Angle Features': angle_features[:2],
    'Distance Features': distance_3d_features[:2]
}

for feature_type, features in key_features.items():
    # squeeze=False always yields a 2-D array of Axes, so a single-feature
    # group needs no special casing; take the only row.
    fig, axes = plt.subplots(1, len(features), figsize=(15, 6), squeeze=False)
    axes = axes[0]

    fig.suptitle(f'{feature_type} Distribution by Pose Class', fontsize=16, fontweight='bold')

    for idx, feature in enumerate(features):
        # One array of feature values per pose class, in pose_classes order
        pose_data = [merged_df.loc[merged_df['pose'] == pose, feature].values for pose in pose_classes]

        # Tick labels are set after drawing rather than through boxplot's
        # `labels` keyword, which is deprecated in Matplotlib 3.9.
        axes[idx].boxplot(pose_data, patch_artist=True,
                         boxprops=dict(facecolor='lightblue', alpha=0.7))
        axes[idx].set_xticklabels(pose_classes)
        axes[idx].set_title(f'{feature}', fontweight='bold')
        axes[idx].set_xlabel('Pose Classes')
        axes[idx].set_ylabel('Feature Value')
        axes[idx].tick_params(axis='x', rotation=45)

        # Print each median at its box (boxes sit at x = 1..n)
        medians = [np.median(data) for data in pose_data]
        for i, median in enumerate(medians):
            axes[idx].text(i+1, median, f'{median:.1f}', ha='center', va='bottom',
                          bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.8))

    plt.tight_layout()
    plt.show()

# 5.2 Statistical significance between poses
print("\n5.2 Statistical Differences Between Poses")
from scipy import stats

# (p-value cutoff, marker) pairs, strictest first; the first cutoff the
# p-value is below decides the marker.
significance_marks = ((0.001, '***'), (0.01, '**'), (0.05, '*'))

# One result dict per tested feature
anova_results = []

for feature_type, features in key_features.items():
    print(f"\n{feature_type} ANOVA Results:")
    print("-" * 30)

    for feature in features:
        # Feature values split by pose class
        pose_groups = [
            merged_df.loc[merged_df['pose'] == pose, feature].values
            for pose in pose_classes
        ]

        # One-way ANOVA across all pose classes
        f_stat, p_value = stats.f_oneway(*pose_groups)

        is_significant = 'Yes' if p_value < 0.05 else 'No'
        anova_results.append({
            'Feature Type': feature_type,
            'Feature': feature,
            'F-statistic': f_stat,
            'p-value': p_value,
            'Significant': is_significant
        })

        marker = next((mark for cutoff, mark in significance_marks if p_value < cutoff), '')
        print(f"{feature:<35}: F={f_stat:.3f}, p={p_value:.6f} {marker}")

# 5.3 Pose discrimination visualization
print("\n5.3 Pose Class Discrimination Analysis")

# Pick the two most discriminative features: among results with p < 0.001,
# rank by p-value and break ties (e.g. several p-values underflowing to 0.0)
# by the larger F-statistic. Previously the first two qualifying results were
# taken in insertion order, whatever their strength.
ranked_results = sorted(
    (result for result in anova_results if result['p-value'] < 0.001),
    key=lambda result: (result['p-value'], -result['F-statistic']),
)
discriminative_features = [result['Feature'] for result in ranked_results[:2]]

# The scatter needs two axes, so it is skipped when fewer than two qualify
if len(discriminative_features) >= 2:
    plt.figure(figsize=(12, 8))

    # One colour and one scatter series per pose class
    colors = sns.color_palette("husl", len(pose_classes))
    for i, pose in enumerate(pose_classes):
        pose_data = merged_df[merged_df['pose'] == pose]
        plt.scatter(pose_data[discriminative_features[0]], pose_data[discriminative_features[1]],
                   c=[colors[i]], label=pose, alpha=0.7, s=50)

    plt.xlabel(discriminative_features[0], fontweight='bold')
    plt.ylabel(discriminative_features[1], fontweight='bold')
    plt.title('Pose Class Discrimination using Most Significant Features', fontweight='bold', fontsize=14)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# 5.4 Feature importance for pose classification
print("\n5.4 Feature Importance Summary")
# ANOVA results as a table, smallest p-value first
anova_df = (
    pd.DataFrame(anova_results)
    .sort_values('p-value')
    .reset_index(drop=True)
)

print("Top 10 Most Discriminative Features:")
report_columns = ['Feature', 'F-statistic', 'p-value', 'Significant']
top_rows = anova_df.head(10)
print(top_rows[report_columns].to_string(index=False))

# 5.5 Pose class separability heatmap
print("\n5.5 Pose Class Separability Matrix")


def _cohens_d(group1, group2):
    """Absolute Cohen's d between two 1-D samples, using the pooled sample std.

    The pooled variance weights each group's *sample* variance (ddof=1) by its
    degrees of freedom. Returns NaN when either group has fewer than two
    values. When the pooled variance is zero, returns 0.0 for equal means and
    inf otherwise.
    """
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return np.nan
    pooled_var = ((n1 - 1) * np.var(group1, ddof=1) + (n2 - 1) * np.var(group2, ddof=1)) / (n1 + n2 - 2)
    mean_gap = abs(np.mean(group1) - np.mean(group2))
    if pooled_var == 0:
        return 0.0 if mean_gap == 0 else np.inf
    return mean_gap / np.sqrt(pooled_var)


# Calculate pairwise separability using the first discriminative feature
if discriminative_features:
    # Values of that feature per pose class, extracted once
    top_feature = discriminative_features[0]
    class_values = [
        merged_df.loc[merged_df['pose'] == pose, top_feature].values
        for pose in pose_classes
    ]

    # Symmetric matrix with a zero diagonal: compute each pair once, mirror it
    n_classes = len(pose_classes)
    separability_matrix = np.zeros((n_classes, n_classes))
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            cohens_d = _cohens_d(class_values[i], class_values[j])
            separability_matrix[i, j] = cohens_d
            separability_matrix[j, i] = cohens_d

    # The figure is created inside the guard so that no empty figure is left
    # open when there is nothing to plot.
    plt.figure(figsize=(10, 8))
    sns.heatmap(separability_matrix, annot=True, cmap='YlOrRd',
                xticklabels=pose_classes, yticklabels=pose_classes,
                cbar_kws={'label': "Cohen's d (Effect Size)"})
    plt.title("Pose Class Separability Matrix\n(Cohen's d Effect Size)", fontweight='bold', fontsize=14)
    plt.xlabel('Pose Class')
    plt.ylabel('Pose Class')
    plt.tight_layout()
    plt.show()

# Closing summary: how many of the tested features differ significantly
# (p < 0.05) between poses, followed by fixed-text takeaways.
n_significant = sum(1 for result in anova_results if result['p-value'] < 0.05)
print("\n".join([
    "\nPose-Specific Analysis Summary:",
    f"• {n_significant} out of {len(anova_results)} features show significant differences between poses",
    "• Most discriminative features can be used for effective pose classification",
    "• Some pose pairs may be more difficult to distinguish than others",
    "• Feature selection based on statistical significance will improve model performance",
]))


In [None]:
# ============== VISUALIZATION SUMMARY & RECOMMENDATIONS ==============
print("\n" + "="*60)
print("           STATISTICAL VISUALIZATION SUMMARY")
print("="*60)

print(f"\n📊 ANALYSIS COMPLETED - KEY INSIGHTS:")
print("-" * 45)

# Real feature counts per dataset (every column except the pose_id key).
# The previous version reported the lengths of the small sampled lists used
# for plotting (6 / 4 / 10) as if they were the dataset's feature counts.
n_landmark_features = sum(col != 'pose_id' for col in df_landmarks.columns)
n_angle_features = sum(col != 'pose_id' for col in df_angles.columns)
n_distance_features = (
    sum(col != 'pose_id' for col in df_3d_distances.columns)
    + sum(col != 'pose_id' for col in df_xyz_distances.columns)
)

# Derived from merged_df directly so this cell does not depend on
# `pose_classes` from the pose-specific analysis cell having been run.
n_pose_classes = merged_df['pose'].nunique()

# Measured completeness instead of a hard-coded "100% complete"
total_missing = int(merged_df.isnull().sum().sum())

print(f"\n1. DATASET COMPOSITION:")
print(f"   • Total samples: {len(merged_df):,}")
print(f"   • Feature types: Landmarks ({n_landmark_features}), Angles ({n_angle_features}), Distances ({n_distance_features})")
print(f"   • Pose classes: {n_pose_classes} different poses")
if total_missing == 0:
    print(f"   • Data quality: 100% complete, no missing values")
else:
    completeness = 100 * (1 - total_missing / merged_df.size)
    print(f"   • Data quality: {completeness:.1f}% complete, {total_missing:,} missing values")

print(f"\n2. CLASS DISTRIBUTION:")
# value_counts sorts descending: first entry is the most common pose
class_counts = merged_df['pose'].value_counts()
print(f"   • Most common pose: {class_counts.index[0]} ({class_counts.iloc[0]} samples)")
print(f"   • Least common pose: {class_counts.index[-1]} ({class_counts.iloc[-1]} samples)")
balance_ratio = class_counts.max() / class_counts.min()
print(f"   • Class balance ratio: {balance_ratio:.2f}:1 ({'Balanced' if balance_ratio <= 1.5 else 'Imbalanced'})")

# NOTE(review): the bullets in this section are fixed text, not computed here.
print(f"\n3. FEATURE CHARACTERISTICS:")
print(f"   • Landmark coordinates: High variability, continuous distributions")
print(f"   • Joint angles: Bounded (0-180°), moderate variability")
print(f"   • Distance features: Right-skewed, positive values only")
print(f"   • Scale differences: Significant - normalization required")

# Section 4 is printed only when the ANOVA cell has been run
if 'anova_results' in globals():
    significant_features = len([r for r in anova_results if r['p-value'] < 0.05])
    print(f"\n4. POSE DISCRIMINATION:")
    print(f"   • Statistically significant features: {significant_features}/{len(anova_results)}")
    print(f"   • Feature selection potential: High")
    print(f"   • Classification feasibility: Excellent")

print(f"\n5. RECOMMENDATIONS FOR ML PIPELINE:")
print(f"   ✓ Apply feature scaling (StandardScaler or MinMaxScaler)")
print(f"   ✓ Consider feature selection based on ANOVA results")
print(f"   ✓ Use correlation analysis to remove redundant features")
print(f"   ✓ Consider PCA for dimensionality reduction if needed")
if balance_ratio > 1.5:
    print(f"   ⚠ Address class imbalance with SMOTE or class weights")
print(f"   ✓ Dataset is ready for supervised learning algorithms")

print(f"\n6. SUGGESTED NEXT STEPS:")
print(f"   1. Feature engineering based on domain knowledge")
print(f"   2. Train-test split with stratification")
print(f"   3. Implement feature scaling pipeline")
print(f"   4. Try multiple classification algorithms")
print(f"   5. Use cross-validation for robust evaluation")

print(f"\n" + "="*60)
print("   🎯 DATASET IS WELL-PREPARED FOR POSTURE CORRECTION ML")
print("="*60)


# ============== POSTER PRESENTATION GUIDE ==============
# Prints a speaker's guide for the poster: a framed title per section,
# followed by that section's graph names, suggested script and key points.
# The guide text is static; bracketed items such as [X] are placeholders the
# presenter fills in from the analysis output.

GUIDE_RULE_WIDTH = 70  # number of "=" characters in the rules framing each title


def print_guide_banner(*title_lines, width=GUIDE_RULE_WIDTH):
    """Print the given title lines framed by a rule of ``=`` above and below.

    Args:
        *title_lines: Lines printed verbatim, one per output line, between
            the two rules.
        width: Number of ``=`` characters in each rule.
    """
    rule = "=" * width
    print(rule, *title_lines, rule, sep="\n")


# Each entry is (title_lines, body). `body` is printed right after the banner;
# None means the banner stands alone. Bodies deliberately start and end with a
# newline so every section is surrounded by blank lines in the output.
POSTER_GUIDE_SECTIONS = [
    (
        (
            "           📊 POSTER PRESENTATION GUIDE",
            "       POSTURE CORRECTION DATASET ANALYSIS",
        ),
        """
🎯 OVERVIEW:
This presentation showcases a comprehensive statistical analysis of a posture 
correction dataset containing 1,372 samples with 170+ features across multiple 
data modalities for pose classification and correction tasks.
""",
    ),
    (
        ("📈 VISUALIZATION 1: POSE CLASS DISTRIBUTION ANALYSIS",),
        """
📊 GRAPH NAMES:
• "Pose Class Distribution (Counts)" - Bar Chart
• "Pose Class Distribution (Proportions)" - Pie Chart  
• "Class Balance Analysis" - Horizontal Bar Chart
• "Dataset Statistics Summary" - Text Summary

🗣️ WHAT TO SAY:
"Our first analysis examines the distribution of pose classes in our dataset. 
The bar chart shows the absolute count of samples for each pose type, while 
the pie chart illustrates the proportional distribution. We can see that our 
dataset contains [X] different pose classes with [describe balance]. This 
class distribution analysis is crucial for understanding potential bias in 
our training data and determining if sampling techniques like SMOTE will be 
needed to address any imbalances."

🎯 KEY POINTS TO HIGHLIGHT:
• Total number of pose classes
• Most and least represented poses
• Class balance ratio and implications for ML
• Dataset size adequacy for deep learning
""",
    ),
    (
        ("📊 VISUALIZATION 2: FEATURE DISTRIBUTION ANALYSIS",),
        """
📊 GRAPH NAMES:
• "Landmark Coordinates Distribution Analysis" - Multi-panel Histograms
• "Joint Angles Distribution Analysis" - Violin Plots with Box Plots
• "3D Distance Features Distribution" - Histograms with KDE Overlay

🗣️ WHAT TO SAY:
"These distributions reveal the characteristics of our three main feature types. 
The landmark coordinate histograms show the spatial distribution of body keypoints, 
with red dashed lines indicating means and orange lines showing standard deviations. 
The joint angle violin plots demonstrate the natural range of human joint flexibility, 
typically bounded between 0 and 180 degrees. The distance feature distributions with 
KDE overlays show right-skewed patterns, which is expected for distance measurements 
as they cannot be negative."

🎯 KEY POINTS TO HIGHLIGHT:
• Different statistical properties of each feature type
• Normal vs. skewed distributions and their implications
• Feature scaling requirements
• Biological plausibility of the ranges
""",
    ),
    # The script below follows the RdBu_r colormap named under "COLOR SCHEMES
    # USED" in this guide: as a reversed red-blue map it draws high (positive)
    # correlations in red and low (negative) correlations in blue.
    (
        ("🔥 VISUALIZATION 3: CORRELATION ANALYSIS",),
        """
📊 GRAPH NAMES:
• "Joint Angles Correlation Matrix" - Heatmap
• "Distribution of Correlation Coefficients" - Histogram
• "Distance Features Correlation Matrix" - Triangular Heatmap
• "Cross-Feature Type Correlation Analysis" - Comprehensive Heatmap

🗣️ WHAT TO SAY:
"The correlation analysis reveals important relationships between features. The joint 
angles correlation matrix uses a red-blue color scheme where darker reds indicate 
strong positive correlations and darker blues show negative correlations. The correlation 
coefficient distribution histogram helps us understand the overall correlation structure. 
High correlations (above 0.7) suggest potential feature redundancy, which could benefit 
from dimensionality reduction techniques like PCA."

🎯 KEY POINTS TO HIGHLIGHT:
• Identification of highly correlated feature pairs
• Implications for feature selection
• Redundancy reduction opportunities
• Multi-modal feature relationships
""",
    ),
    (
        ("📏 VISUALIZATION 4: DATASET COMPARISON ANALYSIS",),
        """
📊 GRAPH NAMES:
• "Feature Value Ranges Comparison Across Datasets" - Multi-panel Box Plots
• "Feature Scale Distribution Across Dataset Types" - Histogram Grid
• "Comparative Statistics Summary" - Data Table
• "Data Quality Assessment" - Quality Metrics Table

🗣️ WHAT TO SAY:
"This comparative analysis highlights the different scales and ranges across our 
feature types. The box plots show that landmark coordinates have the widest value 
ranges, while joint angles are naturally bounded. The scale distribution histograms 
reveal significant differences in feature magnitudes, emphasizing the critical need 
for feature normalization in our ML pipeline. Our quality assessment shows 100% 
data completeness with no missing values."

🎯 KEY POINTS TO HIGHLIGHT:
• Scale differences requiring normalization
• Data quality and completeness
• Feature range characteristics
• Preprocessing requirements
""",
    ),
    (
        ("🎯 VISUALIZATION 5: POSE-SPECIFIC ANALYSIS (MOST IMPORTANT)",),
        """
📊 GRAPH NAMES:
• "[Feature Type] Distribution by Pose Class" - Grouped Box Plots
• "Statistical Differences Between Poses (ANOVA Results)" - Statistical Table
• "Pose Class Discrimination using Most Significant Features" - Scatter Plot
• "Pose Class Separability Matrix (Cohen's d Effect Size)" - Heatmap

🗣️ WHAT TO SAY:
"This is our most critical analysis for pose classification. The grouped box plots 
show how feature values differ across pose classes, with median values clearly labeled. 
Our ANOVA statistical testing identifies which features show significant differences 
between poses (p < 0.05), with stars indicating significance levels. The discrimination 
scatter plot uses the two most statistically significant features to visualize pose 
separability in 2D space. Finally, the separability matrix quantifies how well each 
pose pair can be distinguished using Cohen's d effect size."

🎯 KEY POINTS TO HIGHLIGHT:
• Statistical significance of features for classification
• Best discriminative features identified
• Pose pair difficulty assessment
• Feature selection guidance for ML models
""",
    ),
    (
        ("📋 PRESENTATION STRUCTURE",),
        """
🎤 OPENING STATEMENT:
"Today I'll present a comprehensive statistical analysis of our posture correction 
dataset, examining 1,372 pose samples across multiple feature modalities to assess 
the feasibility and optimize the approach for machine learning-based pose 
classification and correction."

📝 FOR EACH VISUALIZATION, FOLLOW THIS STRUCTURE:
1. "What am I showing?" - Name the graph type and what it represents
2. "What does this tell us?" - Interpret the key findings
3. "Why does this matter?" - Connect to your research goals
4. "What's next?" - How this informs your methodology

🏁 CONCLUSION STATEMENT:
"Our comprehensive analysis demonstrates that this dataset is exceptionally well-suited 
for machine learning applications in posture correction. We have identified key 
discriminative features, confirmed data quality, and established preprocessing 
requirements that will guide our model development for accurate pose classification 
and real-time posture correction systems."
""",
    ),
    (
        ("🎨 VISUAL PRESENTATION TIPS",),
        """
🌈 COLOR SCHEMES USED:
• Husl palette: Maximizes color distinction between categories
• RdBu_r: Red-blue for correlation (intuitive positive/negative)
• Viridis: Perceptually uniform for continuous data
• YlOrRd: Yellow-orange-red for intensity/magnitude

📖 GRAPH READING GUIDE:
• Box plots: Center line = median, box = IQR, whiskers = 1.5×IQR
• Violin plots: Width shows distribution density at each value
• Heatmaps: Darker colors = stronger relationships
• Scatter plots: Clustering indicates class separability
""",
    ),
    (
        ("📝 Q&A PREPARATION",),
        """
❓ COMMON QUESTIONS & ANSWERS:

Q: "How do you handle the scale differences between features?"
A: "Our analysis revealed significant scale differences, so we'll implement 
   StandardScaler normalization as part of our preprocessing pipeline."

Q: "Are there enough samples for deep learning?"
A: "With 1,372 samples and clear feature discriminability, this is sufficient 
   for traditional ML. For deep learning, we could use data augmentation techniques."

Q: "Which features are most important for pose classification?"
A: "Our ANOVA analysis identified [X] statistically significant features with 
   p-values < 0.001, which will form the core of our feature selection strategy."

Q: "What's your accuracy expectation?"
A: "Given the clear feature separability shown in our analysis, we expect 
   classification accuracy above 85-90% with proper preprocessing."

Q: "How will this be used in real applications?"
A: "The identified discriminative features will enable real-time pose classification 
   for automated posture correction feedback systems."
""",
    ),
    # Closing banner only; there is no body text after it.
    (("✅ PRESENTATION GUIDE COMPLETE",), None),
]

# Emit the guide in order: banner first, then the section body when present.
for section_titles, section_body in POSTER_GUIDE_SECTIONS:
    print_guide_banner(*section_titles)
    if section_body is not None:
        print(section_body)


# 📊 POSTER PRESENTATION GUIDE: POSTURE CORRECTION DATASET ANALYSIS

## Overview
This presentation showcases a comprehensive statistical analysis of a posture correction dataset containing **1,372 samples** with **170+ features** across multiple data modalities for pose classification and correction tasks.

---

## 📈 VISUALIZATION 1: POSE CLASS DISTRIBUTION ANALYSIS

### Graph Names:
- **"Pose Class Distribution (Counts)"** - Bar Chart
- **"Pose Class Distribution (Proportions)"** - Pie Chart  
- **"Class Balance Analysis"** - Horizontal Bar Chart
- **"Dataset Statistics Summary"** - Text Summary

### What to Say:
*"Our first analysis examines the distribution of pose classes in our dataset. The bar chart shows the absolute count of samples for each pose type, while the pie chart illustrates the proportional distribution. We can see that our dataset contains [X] different pose classes with [describe balance - balanced/imbalanced]. This class distribution analysis is crucial for understanding potential bias in our training data and determining if sampling techniques like SMOTE will be needed to address any imbalances."*

### Key Points to Highlight:
- Total number of pose classes
- Most and least represented poses
- Class balance ratio and implications for ML
- Dataset size adequacy for deep learning

---

## 📊 VISUALIZATION 2: FEATURE DISTRIBUTION ANALYSIS

### Graph Names:
- **"Landmark Coordinates Distribution Analysis"** - Multi-panel Histograms
- **"Joint Angles Distribution Analysis"** - Violin Plots with Box Plots
- **"3D Distance Features Distribution"** - Histograms with KDE Overlay

### What to Say:
*"These distributions reveal the characteristics of our three main feature types. The landmark coordinate histograms show the spatial distribution of body keypoints, with red dashed lines indicating means and orange lines showing standard deviations. The joint angle violin plots demonstrate the natural range of human joint flexibility, typically bounded between 0 and 180 degrees. The distance feature distributions with KDE overlays show right-skewed patterns, which is expected for distance measurements as they cannot be negative."*

### Key Points to Highlight:
- Different statistical properties of each feature type
- Normal vs. skewed distributions and their implications
- Feature scaling requirements
- Biological plausibility of the ranges

---

## 🔥 VISUALIZATION 3: CORRELATION ANALYSIS

### Graph Names:
- **"Joint Angles Correlation Matrix"** - Heatmap
- **"Distribution of Correlation Coefficients"** - Histogram
- **"Distance Features Correlation Matrix"** - Triangular Heatmap
- **"Cross-Feature Type Correlation Analysis"** - Comprehensive Heatmap

### What to Say:
*"The correlation analysis reveals important relationships between features. The joint angles correlation matrix uses a red-blue color scheme where darker reds indicate strong positive correlations and darker blues show negative correlations. The correlation coefficient distribution histogram helps us understand the overall correlation structure. High correlations (above 0.7) suggest potential feature redundancy, which could benefit from dimensionality reduction techniques like PCA. The cross-feature type analysis shows how different modalities (angles, landmarks, distances) relate to each other."*

### Key Points to Highlight:
- Identification of highly correlated feature pairs
- Implications for feature selection
- Redundancy reduction opportunities
- Multi-modal feature relationships

---

## 📏 VISUALIZATION 4: DATASET COMPARISON ANALYSIS

### Graph Names:
- **"Feature Value Ranges Comparison Across Datasets"** - Multi-panel Box Plots
- **"Feature Scale Distribution Across Dataset Types"** - Histogram Grid
- **"Comparative Statistics Summary"** - Data Table
- **"Data Quality Assessment"** - Quality Metrics Table

### What to Say:
*"This comparative analysis highlights the different scales and ranges across our feature types. The box plots show that landmark coordinates have the widest value ranges, while joint angles are naturally bounded. The scale distribution histograms reveal significant differences in feature magnitudes, emphasizing the critical need for feature normalization in our ML pipeline. Our quality assessment shows 100% data completeness with no missing values, indicating a high-quality dataset ready for machine learning applications."*

### Key Points to Highlight:
- Scale differences requiring normalization
- Data quality and completeness
- Feature range characteristics
- Preprocessing requirements

---

## 🎯 VISUALIZATION 5: POSE-SPECIFIC ANALYSIS

### Graph Names:
- **"[Feature Type] Distribution by Pose Class"** - Grouped Box Plots
- **"Statistical Differences Between Poses (ANOVA Results)"** - Statistical Table
- **"Pose Class Discrimination using Most Significant Features"** - Scatter Plot
- **"Pose Class Separability Matrix (Cohen's d Effect Size)"** - Heatmap

### What to Say:
*"This is our most critical analysis for pose classification. The grouped box plots show how feature values differ across pose classes, with median values clearly labeled. Our ANOVA statistical testing identifies which features show significant differences between poses (p < 0.05), with stars indicating significance levels. The discrimination scatter plot uses the two most statistically significant features to visualize pose separability in 2D space. Finally, the separability matrix quantifies how well each pose pair can be distinguished using Cohen's d effect size, where larger values indicate better separability."*

### Key Points to Highlight:
- Statistical significance of features for classification
- Best discriminative features identified
- Pose pair difficulty assessment
- Feature selection guidance for ML models

---

## 📋 PRESENTATION TALKING POINTS

### Opening Statement:
*"Today I'll present a comprehensive statistical analysis of our posture correction dataset, examining 1,372 pose samples across multiple feature modalities to assess the feasibility and optimize the approach for machine learning-based pose classification and correction."*

### For Each Visualization, Follow This Structure:

1. **"What am I showing?"** - Name the graph type and what it represents
2. **"What does this tell us?"** - Interpret the key findings
3. **"Why does this matter?"** - Connect to your research goals
4. **"What's next?"** - How this informs your methodology

### Conclusion Statement:
*"Our comprehensive analysis demonstrates that this dataset is exceptionally well-suited for machine learning applications in posture correction. We have identified key discriminative features, confirmed data quality, and established preprocessing requirements that will guide our model development for accurate pose classification and real-time posture correction systems."*

---

## 🎨 VISUAL PRESENTATION TIPS

### Color Schemes Used:
- **Husl palette**: Maximizes color distinction between categories
- **RdBu_r**: Red-blue for correlation (intuitive positive/negative)
- **Viridis**: Perceptually uniform for continuous data
- **YlOrRd**: Yellow-orange-red for intensity/magnitude

### Graph Reading Guide:
- **Box plots**: Center line = median, box = IQR, whiskers = 1.5×IQR
- **Violin plots**: Width shows distribution density at each value
- **Heatmaps**: Darker colors = stronger relationships
- **Scatter plots**: Clustering indicates class separability

---

## 📝 SUGGESTED Q&A PREPARATION

**Q: "How do you handle the scale differences between features?"**
*A: "Our analysis revealed significant scale differences, so we'll implement StandardScaler normalization as part of our preprocessing pipeline."*

**Q: "Are there enough samples for deep learning?"**
*A: "With 1,372 samples and clear feature discriminability, this is sufficient for traditional ML. For deep learning, we could use data augmentation techniques."*

**Q: "Which features are most important for pose classification?"**
*A: "Our ANOVA analysis identified [X] statistically significant features with p-values < 0.001, which will form the core of our feature selection strategy."*


# 📊 POSTER PRESENTATION GUIDE: POSTURE CORRECTION DATASET ANALYSIS

## Overview
This presentation showcases a comprehensive statistical analysis of a posture correction dataset containing **1,372 samples** with **170+ features** across multiple data modalities for pose classification and correction tasks.

---

## 📈 VISUALIZATION 1: POSE CLASS DISTRIBUTION ANALYSIS

### Graph Names:
- **"Pose Class Distribution (Counts)"** - Bar Chart
- **"Pose Class Distribution (Proportions)"** - Pie Chart  
- **"Class Balance Analysis"** - Horizontal Bar Chart
- **"Dataset Statistics Summary"** - Text Summary

### What to Say:
*"Our first analysis examines the distribution of pose classes in our dataset. The bar chart shows the absolute count of samples for each pose type, while the pie chart illustrates the proportional distribution. We can see that our dataset contains [X] different pose classes with [describe balance - balanced/imbalanced]. This class distribution analysis is crucial for understanding potential bias in our training data and determining if sampling techniques like SMOTE will be needed to address any imbalances."*

### Key Points to Highlight:
- Total number of pose classes
- Most and least represented poses
- Class balance ratio and implications for ML
- Dataset size adequacy for deep learning

---

## 📊 VISUALIZATION 2: FEATURE DISTRIBUTION ANALYSIS

### Graph Names:
- **"Landmark Coordinates Distribution Analysis"** - Multi-panel Histograms
- **"Joint Angles Distribution Analysis"** - Violin Plots with Box Plots
- **"3D Distance Features Distribution"** - Histograms with KDE Overlay

### What to Say:
*"These distributions reveal the characteristics of our three main feature types. The landmark coordinate histograms show the spatial distribution of body keypoints, with red dashed lines indicating means and orange lines showing standard deviations. The joint angle violin plots demonstrate the natural range of human joint flexibility, typically bounded between 0 and 180 degrees. The distance feature distributions with KDE overlays show right-skewed patterns, which is expected for distance measurements as they cannot be negative."*

### Key Points to Highlight:
- Different statistical properties of each feature type
- Normal vs. skewed distributions and their implications
- Feature scaling requirements
- Biological plausibility of the ranges

---

## 🔥 VISUALIZATION 3: CORRELATION ANALYSIS

### Graph Names:
- **"Joint Angles Correlation Matrix"** - Heatmap
- **"Distribution of Correlation Coefficients"** - Histogram
- **"Distance Features Correlation Matrix"** - Triangular Heatmap
- **"Cross-Feature Type Correlation Analysis"** - Comprehensive Heatmap

### What to Say:
*"The correlation analysis reveals important relationships between features. The joint angles correlation matrix uses a red-blue color scheme where darker reds indicate strong positive correlations and darker blues show negative correlations. The correlation coefficient distribution histogram helps us understand the overall correlation structure. High correlations (above 0.7) suggest potential feature redundancy, which could benefit from dimensionality reduction techniques like PCA. The cross-feature type analysis shows how different modalities (angles, landmarks, distances) relate to each other."*

### Key Points to Highlight:
- Identification of highly correlated feature pairs
- Implications for feature selection
- Redundancy reduction opportunities
- Multi-modal feature relationships

---

## 📏 VISUALIZATION 4: DATASET COMPARISON ANALYSIS

### Graph Names:
- **"Feature Value Ranges Comparison Across Datasets"** - Multi-panel Box Plots
- **"Feature Scale Distribution Across Dataset Types"** - Histogram Grid
- **"Comparative Statistics Summary"** - Data Table
- **"Data Quality Assessment"** - Quality Metrics Table

### What to Say:
*"This comparative analysis highlights the different scales and ranges across our feature types. The box plots show that landmark coordinates have the widest value ranges, while joint angles are naturally bounded. The scale distribution histograms reveal significant differences in feature magnitudes, emphasizing the critical need for feature normalization in our ML pipeline. Our quality assessment shows 100% data completeness with no missing values, indicating a high-quality dataset ready for machine learning applications."*

### Key Points to Highlight:
- Scale differences requiring normalization
- Data quality and completeness
- Feature range characteristics
- Preprocessing requirements

---

## 🎯 VISUALIZATION 5: POSE-SPECIFIC ANALYSIS

### Graph Names:
- **"[Feature Type] Distribution by Pose Class"** - Grouped Box Plots
- **"Statistical Differences Between Poses (ANOVA Results)"** - Statistical Table
- **"Pose Class Discrimination using Most Significant Features"** - Scatter Plot
- **"Pose Class Separability Matrix (Cohen's d Effect Size)"** - Heatmap

### What to Say:
*"This is our most critical analysis for pose classification. The grouped box plots show how feature values differ across pose classes, with median values clearly labeled. Our ANOVA statistical testing identifies which features show significant differences between poses (p < 0.05), with stars indicating significance levels. The discrimination scatter plot uses the two most statistically significant features to visualize pose separability in 2D space. Finally, the separability matrix quantifies how well each pose pair can be distinguished using Cohen's d effect size, where larger values indicate better separability."*

### Key Points to Highlight:
- Statistical significance of features for classification
- Best discriminative features identified
- Pose pair difficulty assessment
- Feature selection guidance for ML models

---

## 📋 PRESENTATION TALKING POINTS

### Opening Statement:
*"Today I'll present a comprehensive statistical analysis of our posture correction dataset, examining 1,372 pose samples across multiple feature modalities to assess the feasibility and optimize the approach for machine learning-based pose classification and correction."*

### For Each Visualization, Follow This Structure:

1. **"What am I showing?"** - Name the graph type and what it represents
2. **"What does this tell us?"** - Interpret the key findings
3. **"Why does this matter?"** - Connect to your research goals
4. **"What's next?"** - How this informs your methodology

### Conclusion Statement:
*"Our comprehensive analysis demonstrates that this dataset is exceptionally well-suited for machine learning applications in posture correction. We have identified key discriminative features, confirmed data quality, and established preprocessing requirements that will guide our model development for accurate pose classification and real-time posture correction systems."*

---

## 🎨 VISUAL PRESENTATION TIPS

### Color Schemes Used:
- **Husl palette**: Maximizes color distinction between categories
- **RdBu_r**: Red-blue for correlation (intuitive positive/negative)
- **Viridis**: Perceptually uniform for continuous data
- **YlOrRd**: Yellow-orange-red for intensity/magnitude

### Graph Reading Guide:
- **Box plots**: Center line = median, box = IQR, whiskers = 1.5×IQR
- **Violin plots**: Width shows distribution density at each value
- **Heatmaps**: Darker colors = stronger relationships
- **Scatter plots**: Clustering indicates class separability

---

## 📝 SUGGESTED Q&A PREPARATION

**Q: "How do you handle the scale differences between features?"**
*A: "Our analysis revealed significant scale differences, so we'll implement StandardScaler normalization as part of our preprocessing pipeline."*

**Q: "Are there enough samples for deep learning?"**
*A: "With 1,372 samples and clear feature discriminability, this is sufficient for traditional ML. For deep learning, we could use data augmentation techniques."*

**Q: "Which features are most important for pose classification?"**
*A: "Our ANOVA analysis identified [X] statistically significant features with p-values < 0.001, which will form the core of our feature selection strategy."*


# 📊 POSTER PRESENTATION GUIDE: POSTURE CORRECTION DATASET ANALYSIS

## Overview
This presentation showcases a comprehensive statistical analysis of a posture correction dataset containing **1,372 samples** with **170+ features** across multiple data modalities for pose classification and correction tasks.

---

## 📈 VISUALIZATION 1: POSE CLASS DISTRIBUTION ANALYSIS

### Graph Names:
- **"Pose Class Distribution (Counts)"** - Bar Chart
- **"Pose Class Distribution (Proportions)"** - Pie Chart  
- **"Class Balance Analysis"** - Horizontal Bar Chart
- **"Dataset Statistics Summary"** - Text Summary

### What to Say:
*"Our first analysis examines the distribution of pose classes in our dataset. The bar chart shows the absolute count of samples for each pose type, while the pie chart illustrates the proportional distribution. We can see that our dataset contains [X] different pose classes with [describe balance - balanced/imbalanced]. This class distribution analysis is crucial for understanding potential bias in our training data and determining if sampling techniques like SMOTE will be needed to address any imbalances."*

### Key Points to Highlight:
- Total number of pose classes
- Most and least represented poses
- Class balance ratio and implications for ML
- Dataset size adequacy for deep learning

---

## 📊 VISUALIZATION 2: FEATURE DISTRIBUTION ANALYSIS

### Graph Names:
- **"Landmark Coordinates Distribution Analysis"** - Multi-panel Histograms
- **"Joint Angles Distribution Analysis"** - Violin Plots with Box Plots
- **"3D Distance Features Distribution"** - Histograms with KDE Overlay

### What to Say:
*"These distributions reveal the characteristics of our three main feature types. The landmark coordinate histograms show the spatial distribution of body keypoints, with red dashed lines indicating means and orange lines showing standard deviations. The joint angle violin plots demonstrate the natural range of human joint flexibility, typically bounded between 0 and 180 degrees. The distance feature distributions with KDE overlays show right-skewed patterns, which is expected for distance measurements as they cannot be negative."*

### Key Points to Highlight:
- Different statistical properties of each feature type
- Normal vs. skewed distributions and their implications
- Feature scaling requirements
- Biological plausibility of the ranges

---

## 🔥 VISUALIZATION 3: CORRELATION ANALYSIS

### Graph Names:
- **"Joint Angles Correlation Matrix"** - Heatmap
- **"Distribution of Correlation Coefficients"** - Histogram
- **"Distance Features Correlation Matrix"** - Triangular Heatmap
- **"Cross-Feature Type Correlation Analysis"** - Comprehensive Heatmap

### What to Say:
*"The correlation analysis reveals important relationships between features. The joint angles correlation matrix uses a red-blue color scheme where darker reds indicate strong positive correlations and darker blues show negative correlations. The correlation coefficient distribution histogram helps us understand the overall correlation structure. High correlations (above 0.7) suggest potential feature redundancy, which could benefit from dimensionality reduction techniques like PCA. The cross-feature type analysis shows how different modalities (angles, landmarks, distances) relate to each other."*

### Key Points to Highlight:
- Identification of highly correlated feature pairs
- Implications for feature selection
- Redundancy reduction opportunities
- Multi-modal feature relationships

---

## 📏 VISUALIZATION 4: DATASET COMPARISON ANALYSIS

### Graph Names:
- **"Feature Value Ranges Comparison Across Datasets"** - Multi-panel Box Plots
- **"Feature Scale Distribution Across Dataset Types"** - Histogram Grid
- **"Comparative Statistics Summary"** - Data Table
- **"Data Quality Assessment"** - Quality Metrics Table

### What to Say:
*"This comparative analysis highlights the different scales and ranges across our feature types. The box plots show that landmark coordinates have the widest value ranges, while joint angles are naturally bounded. The scale distribution histograms reveal significant differences in feature magnitudes, emphasizing the critical need for feature normalization in our ML pipeline. Our quality assessment shows 100% data completeness with no missing values, indicating a high-quality dataset ready for machine learning applications."*

### Key Points to Highlight:
- Scale differences requiring normalization
- Data quality and completeness
- Feature range characteristics
- Preprocessing requirements

---

## 🎯 VISUALIZATION 5: POSE-SPECIFIC ANALYSIS

### Graph Names:
- **"[Feature Type] Distribution by Pose Class"** - Grouped Box Plots
- **"Statistical Differences Between Poses (ANOVA Results)"** - Statistical Table
- **"Pose Class Discrimination using Most Significant Features"** - Scatter Plot
- **"Pose Class Separability Matrix (Cohen's d Effect Size)"** - Heatmap

### What to Say:
*"This is our most critical analysis for pose classification. The grouped box plots show how feature values differ across pose classes, with median values clearly labeled. Our ANOVA statistical testing identifies which features show significant differences between poses (p < 0.05), with stars indicating significance levels. The discrimination scatter plot uses the two most statistically significant features to visualize pose separability in 2D space. Finally, the separability matrix quantifies how well each pose pair can be distinguished using Cohen's d effect size, where larger values indicate better separability."*

### Key Points to Highlight:
- Statistical significance of features for classification
- Best discriminative features identified
- Pose pair difficulty assessment
- Feature selection guidance for ML models

---

## 📋 PRESENTATION TALKING POINTS

### Opening Statement:
*"Today I'll present a comprehensive statistical analysis of our posture correction dataset, examining 1,372 pose samples across multiple feature modalities to assess the feasibility and optimize the approach for machine learning-based pose classification and correction."*

### For Each Visualization, Follow This Structure:

1. **"What am I showing?"** - Name the graph type and what it represents
2. **"What does this tell us?"** - Interpret the key findings
3. **"Why does this matter?"** - Connect to your research goals
4. **"What's next?"** - How this informs your methodology

### Conclusion Statement:
*"Our comprehensive analysis demonstrates that this dataset is exceptionally well-suited for machine learning applications in posture correction. We have identified key discriminative features, confirmed data quality, and established preprocessing requirements that will guide our model development for accurate pose classification and real-time posture correction systems."*

---

## 🎨 VISUAL PRESENTATION TIPS

### Color Schemes Used:
- **Husl palette**: Maximizes color distinction between categories
- **RdBu_r**: Red-blue for correlation (intuitive positive/negative)
- **Viridis**: Perceptually uniform for continuous data
- **YlOrRd**: Yellow-orange-red for intensity/magnitude

### Graph Reading Guide:
- **Box plots**: Center line = median, box = IQR, whiskers = 1.5×IQR
- **Violin plots**: Width shows distribution density at each value
- **Heatmaps**: Darker colors = stronger relationships
- **Scatter plots**: Clustering indicates class separability

---

## 📝 SUGGESTED Q&A PREPARATION

**Q: "How do you handle the scale differences between features?"**
*A: "Our analysis revealed significant scale differences, so we'll implement StandardScaler normalization as part of our preprocessing pipeline."*

**Q: "Are there enough samples for deep learning?"**
*A: "With 1,372 samples and clear feature discriminability, this is sufficient for traditional ML. For deep learning, we could use data augmentation techniques."*

**Q: "Which features are most important for pose classification?"**
*A: "Our ANOVA analysis identified [X] statistically significant features with p-values < 0.001, which will form the core of our feature selection strategy."*


# 🎤 POSTER PRESENTATION SCRIPT

## 🎯 INTRODUCTION (30 seconds)
*"Hello! I'm presenting a comprehensive statistical analysis of a posture correction dataset for machine learning applications. Our goal is to develop automated systems that can classify human poses and provide real-time posture correction feedback."*

---

## 📊 SECTION 1: CLASS DISTRIBUTION (45 seconds)

### Point to: Bar Chart and Pie Chart
**Script:**
*"Let's start with our dataset composition. This bar chart shows we have [X] different pose classes with [Y] total samples. The pie chart reveals the proportional distribution - notice that we have [describe if balanced/imbalanced]. This horizontal bar chart provides the exact counts and percentages for each pose type."*

**Key Numbers to Mention:**
- Total samples: 1,372
- Number of pose classes: [from your data]
- Largest class: [X]% of data
- Smallest class: [Y]% of data

**Impact Statement:**
*"This balanced/imbalanced distribution will inform our sampling strategy for model training."*

---

## 📈 SECTION 2: FEATURE CHARACTERISTICS (60 seconds)

### Point to: Multi-panel Histograms, Violin Plots, KDE Plots
**Script:**
*"Now let's examine our feature types. We have three main categories:"*

1. **Landmark Coordinates** *(point to histograms)*
   *"These histograms show the spatial distribution of body keypoints. The red dashed lines indicate means, and you can see some features follow normal distributions while others are skewed."*

2. **Joint Angles** *(point to violin plots)*
   *"These violin plots reveal joint flexibility ranges. The white box plots inside show medians and quartiles, while the violin shape shows the full distribution density. Notice they're naturally bounded between 0 and 180 degrees."*

3. **Distance Features** *(point to KDE plots)*
   *"These distance measurements show right-skewed distributions with the red KDE curves overlaying the histograms. This is expected since distances can't be negative."*

**Impact Statement:**
*"These different distribution patterns indicate we'll need robust feature scaling in our preprocessing pipeline."*

---

## 🔥 SECTION 3: CORRELATION INSIGHTS (45 seconds)

### Point to: Correlation Heatmaps
**Script:**
*"Our correlation analysis reveals important feature relationships. In this joint angles correlation matrix, red indicates positive correlations and blue shows negative correlations (RdBu_r colormap). The histogram on the right shows most correlations are moderate, but we did identify [X] feature pairs with high correlation above 0.7."*

**Point to: Cross-feature Correlation**
*"This comprehensive heatmap shows how different feature types relate to each other - angles, landmarks, and distances. The pattern suggests some redundancy that we can address through feature selection."*

**Impact Statement:**
*"High correlations suggest opportunities for dimensionality reduction using PCA or feature selection techniques."*

---

## 📏 SECTION 4: SCALE COMPARISON (30 seconds)

### Point to: Box Plots and Statistics Table
**Script:**
*"This comparison reveals dramatic scale differences between feature types. Landmark coordinates have the widest ranges, while angles are bounded. The statistics table quantifies these differences - notice the range column shows variations from [X] to [Y] across feature types."*

**Point to: Quality Assessment Table**
*"Importantly, our quality assessment shows 100% data completeness with zero missing values."*

**Impact Statement:**
*"These scale differences confirm that feature normalization is critical for our machine learning pipeline."*

---

## 🎯 SECTION 5: POSE DISCRIMINATION (75 seconds) - **MOST IMPORTANT**

### Point to: Box Plots by Pose Class
**Script:**
*"This is our most critical analysis. These grouped box plots show how feature values differ across pose classes. You can see clear separation between poses in key features, with median values labeled on each box."*

### Point to: ANOVA Results Table
*"Our statistical testing using ANOVA identified [X] features with significant differences between poses at p < 0.05. The stars indicate significance levels - three stars mean p < 0.001, which indicates very strong discrimination power."*

### Point to: Scatter Plot
*"This scatter plot uses the two most discriminative features to visualize pose separability in 2D space. Notice how well the different colored clusters separate - this suggests excellent classification potential."*

### Point to: Separability Matrix
*"Finally, this heatmap quantifies pose pair separability using Cohen's d effect size. Darker colors indicate better separability. Values above 0.8 suggest large effect sizes, meaning those pose pairs are easily distinguishable."*

**Impact Statement:**
*"These results confirm that our features provide excellent discrimination power for automated pose classification."*

---

## 🎯 CONCLUSION (30 seconds)
*"In summary, our comprehensive analysis demonstrates this dataset is exceptionally well-prepared for machine learning applications. We have identified the most discriminative features, confirmed data quality, and established clear preprocessing requirements. Our next steps include implementing feature scaling, applying the identified feature selection criteria, and developing pose classification models with the confidence that our statistical foundation is solid."*

---

## 🗣️ PRESENTATION DELIVERY TIPS

### Timing Breakdown (Total: about 6 minutes — roughly 5¼ minutes of talk plus a 1-minute Q&A buffer)
- Introduction: 30 seconds
- Section 1: 45 seconds  
- Section 2: 60 seconds
- Section 3: 45 seconds
- Section 4: 30 seconds
- Section 5: 75 seconds (most detailed)
- Conclusion: 30 seconds
- Q&A Buffer: 60 seconds

### Physical Presentation Tips:
1. **Use a pointer** - Point to specific parts of graphs while explaining
2. **Face the audience** - Not the poster
3. **Practice transitions** - "Moving to our next analysis..."
4. **Emphasize key numbers** - Pause when stating important statistics
5. **Use gestures** - Help explain concepts like "separation" or "correlation"

### Voice Modulation:
- **Excited tone** for good results (high separability, clean data)
- **Analytical tone** for technical explanations
- **Confident conclusion** about ML readiness

### Common Questions to Prepare For:
1. "What's your sample size?" → 1,372 samples
2. "How many features?" → 170+ across multiple modalities  
3. "Any missing data?" → Zero missing values, 100% complete
4. "Best features for classification?" → [Your ANOVA results]
5. "What algorithms will you use?" → [Your planned approach]


In [None]:
# ============== 2. CLUSTERING ALGORITHMS ==============
# Section banner.
print("\n" + "=" * 70)
print("           🎲 CLUSTERING ALGORITHMS")
print("=" * 70)

# The label encoder's class list gives the known number of pose classes.
n_classes = len(label_encoder.classes_)
print(f"Known number of pose classes: {n_classes}")

# ---- K-Means ----
print("\n📍 K-MEANS CLUSTERING")
print("-" * 30)

# Candidate k values: 2 up to (but excluding) min(10, n_classes + 3).
k_range = range(2, min(10, n_classes + 3))
inertias = []
silhouette_scores = []

# One fit per candidate k; record inertia (elbow curve) and silhouette score.
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_train_scaled)
    labels = kmeans.predict(X_train_scaled)
    sil_score = silhouette_score(X_train_scaled, labels)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(sil_score)

# Two side-by-side diagnostics: elbow curve (left), silhouette curve (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panel_specs = [
    (inertias, 'bo-', 'Inertia', 'K-Means Elbow Method'),
    (silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Analysis'),
]
for ax, (values, fmt, y_label, title) in zip(axes, panel_specs):
    ax.plot(k_range, values, fmt, linewidth=2, markersize=8)
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(y_label)
    ax.set_title(title, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# "Optimal" k is the candidate with the highest silhouette score.
best_k = k_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters (by silhouette): {best_k}")

# Final K-Means fits: once at the silhouette-optimal k ...
kmeans_optimal = KMeans(n_clusters=best_k, random_state=42, n_init=10)
kmeans_clusters = kmeans_optimal.fit_predict(X_train_scaled)

# ... and once at the known number of pose classes.
kmeans_true_k = KMeans(n_clusters=n_classes, random_state=42, n_init=10)
kmeans_true_clusters = kmeans_true_k.fit_predict(X_train_scaled)

for k_used, assignment in ((best_k, kmeans_clusters), (n_classes, kmeans_true_clusters)):
    print(f"K-Means (k={k_used}) Silhouette Score: {silhouette_score(X_train_scaled, assignment):.4f}")

# Gaussian Mixture Models
print("\n🌀 GAUSSIAN MIXTURE MODELS")
print("-" * 35)

# Sweep the same candidate counts as K-Means (k_range), recording the
# silhouette score of the hard assignments plus AIC and BIC for each fit.
gmm_scores, gmm_aic, gmm_bic = [], [], []

for n_components in k_range:
    gmm = GaussianMixture(n_components=n_components, random_state=42).fit(X_train_scaled)

    labels = gmm.predict(X_train_scaled)
    sil_score = silhouette_score(X_train_scaled, labels)

    gmm_scores.append(sil_score)
    gmm_aic.append(gmm.aic(X_train_scaled))
    gmm_bic.append(gmm.bic(X_train_scaled))

# The best component count is chosen by silhouette score (AIC/BIC are only recorded).
best_gmm_idx = np.argmax(gmm_scores)
best_n_components = k_range[best_gmm_idx]

# Refit at the silhouette-optimal component count ...
gmm_optimal = GaussianMixture(n_components=best_n_components, random_state=42)
gmm_clusters = gmm_optimal.fit_predict(X_train_scaled)

# ... and at the known number of pose classes.
gmm_true_k = GaussianMixture(n_components=n_classes, random_state=42)
gmm_true_clusters = gmm_true_k.fit_predict(X_train_scaled)

print(f"Optimal number of components: {best_n_components}")
print(f"GMM (n={best_n_components}) Silhouette Score: {gmm_scores[best_gmm_idx]:.4f}")
print(f"GMM (n={n_classes}) Silhouette Score: {silhouette_score(X_train_scaled, gmm_true_clusters):.4f}")

# Hierarchical Clustering
print(f"\n🌳 HIERARCHICAL CLUSTERING")
print("-" * 35)

# Compute hierarchical clustering
# The Ward linkage is built from the first 500 rows only, so the dendrogram
# below describes that subset rather than the full training matrix.
linkage_matrix = linkage(X_train_scaled[:500], method='ward')  # Use subset for efficiency

# Plot dendrogram
# truncate_mode='level' with p=5 limits how many levels of the tree are drawn.
plt.figure(figsize=(12, 8))
dendrogram(linkage_matrix, truncate_mode='level', p=5)
plt.title('Hierarchical Clustering Dendrogram', fontsize=14, fontweight='bold')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.tight_layout()
plt.show()

# Agglomerative clustering
# Unlike the dendrogram, this fit uses every row of X_train_scaled, with the
# cluster count fixed to the known number of pose classes (n_classes).
agg_clustering = AgglomerativeClustering(n_clusters=n_classes)
agg_clusters = agg_clustering.fit_predict(X_train_scaled)

print(f"Hierarchical Clustering Silhouette Score: {silhouette_score(X_train_scaled, agg_clusters):.4f}")

# Clustering Results Summary
print("\n📊 CLUSTERING RESULTS SUMMARY")
print("-" * 45)

# One (algorithm, cluster count, silhouette score) row per fitted model.
# Silhouette scores are evaluated on the training matrix the models were fit on.
summary_rows = [
    ('K-Means (optimal)', best_k, silhouette_score(X_train_scaled, kmeans_clusters)),
    (f'K-Means (k={n_classes})', n_classes, silhouette_score(X_train_scaled, kmeans_true_clusters)),
    ('GMM (optimal)', best_n_components, gmm_scores[best_gmm_idx]),
    (f'GMM (n={n_classes})', n_classes, silhouette_score(X_train_scaled, gmm_true_clusters)),
    ('Hierarchical', n_classes, silhouette_score(X_train_scaled, agg_clusters)),
]

clustering_results = {
    'Algorithm': [row[0] for row in summary_rows],
    'Clusters/Components': [row[1] for row in summary_rows],
    'Silhouette Score': [row[2] for row in summary_rows],
}

# Rank the algorithms from best to worst silhouette score.
clustering_df = pd.DataFrame(clustering_results).sort_values('Silhouette Score', ascending=False)
print(clustering_df.round(4).to_string(index=False))

print("\n✅ Clustering algorithms completed!")


In [None]:
# ============== 3. PRINCIPAL COMPONENT ANALYSIS (PCA) ==============
print("\n" + "=" * 70)
print("           🎯 PRINCIPAL COMPONENT ANALYSIS")
print("=" * 70)

# PCA Analysis
print("\n📊 PERFORMING PCA ANALYSIS")
print("-" * 35)

# Fit an unrestricted PCA so the full explained-variance spectrum is available.
pca_full = PCA().fit(X_train_scaled)

# Running total of explained variance, component by component.
cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)

# For each variance target, the smallest component count whose cumulative
# explained variance reaches it (index of first hit + 1).
var_thresholds = [0.85, 0.90, 0.95, 0.99]
components_needed = {}

for threshold in var_thresholds:
    n_components = np.flatnonzero(cumsum_var >= threshold)[0] + 1
    components_needed[threshold] = n_components
    print(f"Components needed for {threshold*100:.0f}% variance: {n_components}")

# Plot explained variance (at most the first 50 components are shown).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
n_shown = min(50, len(pca_full.explained_variance_ratio_))
component_axis = range(1, n_shown + 1)

# Left panel: per-component explained variance ratio.
axes[0].plot(component_axis, pca_full.explained_variance_ratio_[:n_shown],
             'bo-', linewidth=2, markersize=6)
axes[0].set_xlabel('Principal Component')
axes[0].set_ylabel('Explained Variance Ratio')
axes[0].set_title('Explained Variance by Component', fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Right panel: cumulative explained variance with 95% / 90% guide lines.
axes[1].plot(component_axis, cumsum_var[:n_shown],
             'ro-', linewidth=2, markersize=6)
axes[1].axhline(y=0.95, color='g', linestyle='--', label='95% Variance')
axes[1].axhline(y=0.90, color='orange', linestyle='--', label='90% Variance')
axes[1].set_xlabel('Number of Components')
axes[1].set_ylabel('Cumulative Explained Variance')
axes[1].set_title('Cumulative Explained Variance', fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Apply PCA with different numbers of components
# clone() is scikit-learn's supported way to get an unfitted copy of an
# estimator. Re-instantiating via Class(**est.get_params()) fails for nested
# or meta estimators, because get_params() also returns "child__param" keys
# that the constructor does not accept.
from sklearn.base import clone

# PCA cannot extract more components than min(n_samples, n_features), so the
# fixed 50-component configuration is clamped to what the data supports.
max_pca_components = min(X_train_scaled.shape)

pca_configs = [
    ('PCA_50', min(50, max_pca_components)),
    ('PCA_95%', components_needed[0.95]),
    ('PCA_90%', components_needed[0.90]),
    ('PCA_85%', components_needed[0.85])
]

# config name -> component count, retained variance, test accuracy, fitted
# PCA model and the projected train/test matrices.
pca_results = {}

for config_name, n_comp in pca_configs:
    print(f"\n🔄 Applying {config_name} ({n_comp} components)...")

    # Fit the projection on the training split only, then apply it to the test split.
    pca = PCA(n_components=n_comp, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Total share of variance retained by the kept components.
    variance_explained = np.sum(pca.explained_variance_ratio_)

    # Re-train a fresh copy of the best classifier from the previous step on
    # the reduced features and score it on the held-out split.
    clf_copy = clone(trained_models[best_model_name])
    clf_copy.fit(X_train_pca, y_train)
    y_pred_pca = clf_copy.predict(X_test_pca)
    accuracy_pca = accuracy_score(y_test, y_pred_pca)

    pca_results[config_name] = {
        'n_components': n_comp,
        'variance_explained': variance_explained,
        'accuracy': accuracy_pca,
        'pca_model': pca,
        'X_train_pca': X_train_pca,
        'X_test_pca': X_test_pca
    }

    print(f"  Components: {n_comp}")
    print(f"  Variance Explained: {variance_explained:.4f}")
    print(f"  {best_model_name} Accuracy: {accuracy_pca:.4f}")

# PCA Results Summary
print("\n📊 PCA RESULTS SUMMARY")
print("-" * 40)

# One row per PCA configuration. "Dimensionality Reduction" is the share of
# the original X columns that were dropped.
pca_summary = pd.DataFrame({
    'Configuration': list(pca_results),
    'Components': [entry['n_components'] for entry in pca_results.values()],
    'Variance Explained': [entry['variance_explained'] for entry in pca_results.values()],
    'Accuracy': [entry['accuracy'] for entry in pca_results.values()],
    'Dimensionality Reduction': [
        f"{(1 - entry['n_components']/X.shape[1])*100:.1f}%"
        for entry in pca_results.values()
    ],
})

print(pca_summary.round(4).to_string(index=False))

# Best PCA configuration = highest accuracy on the test split (first wins on ties).
best_pca_config = max(pca_results, key=lambda name: pca_results[name]['accuracy'])
best_entry = pca_results[best_pca_config]
print(f"\n🏆 Best PCA Configuration: {best_pca_config}")
print(f"   Components: {best_entry['n_components']}")
print(f"   Accuracy: {best_entry['accuracy']:.4f}")
print(f"   Variance Explained: {best_entry['variance_explained']:.4f}")

# Project the training data with the winning PCA and keep the first two
# components for a 2-D scatter plot.
best_pca = best_entry['pca_model']
X_train_pca_2d = best_pca.transform(X_train_scaled)[:, :2]

plt.figure(figsize=(12, 8))
colors = plt.cm.Set3(np.linspace(0, 1, len(label_encoder.classes_)))

# One scatter series per pose class; y_train holds the encoded class indices.
for i, class_name in enumerate(label_encoder.classes_):
    mask = y_train == i
    plt.scatter(
        X_train_pca_2d[mask, 0],
        X_train_pca_2d[mask, 1],
        c=[colors[i]],
        label=class_name,
        alpha=0.7,
        s=50,
    )

plt.xlabel(f'First Principal Component (Var: {best_pca.explained_variance_ratio_[0]:.3f})')
plt.ylabel(f'Second Principal Component (Var: {best_pca.explained_variance_ratio_[1]:.3f})')
plt.title('PCA: First Two Principal Components', fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("✅ PCA analysis completed!")


In [None]:
# ============== 4. ENSEMBLE LEARNING METHODS ==============
print("\n" + "=" * 70)
print("           🎭 ENSEMBLE LEARNING METHODS")
print("=" * 70)

# Advanced Ensemble Methods
print("\n🚀 IMPLEMENTING ENSEMBLE METHODS")
print("-" * 40)

# Base learners shared by both voting ensembles. SVC needs probability=True
# so it can take part in soft (probability-averaged) voting.
base_models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM_RBF': SVC(random_state=42, kernel='rbf', probability=True),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42)
}

# Fit every base learner on its own and report its test accuracy.
print("Training base models for ensemble...")
for name, model in base_models.items():
    model.fit(X_train_scaled, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_scaled))
    print(f"  {name}: {accuracy:.4f}")

# 1. Voting Classifier (Hard Voting): majority vote over predicted labels.
print("\n🗳️ VOTING CLASSIFIER")
print("-" * 25)

voting_hard = VotingClassifier(estimators=list(base_models.items()), voting='hard')
voting_hard.fit(X_train_scaled, y_train)
y_pred_voting_hard = voting_hard.predict(X_test_scaled)
accuracy_voting_hard = accuracy_score(y_test, y_pred_voting_hard)

# 2. Voting Classifier (Soft Voting): vote over predicted class probabilities.
voting_soft = VotingClassifier(estimators=list(base_models.items()), voting='soft')
voting_soft.fit(X_train_scaled, y_train)
y_pred_voting_soft = voting_soft.predict(X_test_scaled)
accuracy_voting_soft = accuracy_score(y_test, y_pred_voting_soft)

print(f"Hard Voting Accuracy: {accuracy_voting_hard:.4f}")
print(f"Soft Voting Accuracy: {accuracy_voting_soft:.4f}")

# 3. Bagging with Decision Trees
from sklearn.ensemble import BaggingClassifier

print(f"\n🎒 BAGGING CLASSIFIER")
print("-" * 25)

# The wrapped estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4. It is the first positional parameter under both
# names, so this call works on old and new releases alike.
bagging = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    random_state=42
)
bagging.fit(X_train_scaled, y_train)
y_pred_bagging = bagging.predict(X_test_scaled)
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)

print(f"Bagging Accuracy: {accuracy_bagging:.4f}")

# 4. AdaBoost
from sklearn.ensemble import AdaBoostClassifier

print(f"\n🚀 ADABOOST CLASSIFIER")
print("-" * 25)

# Depth-1 trees (decision stumps) are the weak learners being boosted.
# Passed positionally for the same version-compatibility reason as above.
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, random_state=42),
    n_estimators=100,
    random_state=42
)
adaboost.fit(X_train_scaled, y_train)
y_pred_adaboost = adaboost.predict(X_test_scaled)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)

print(f"AdaBoost Accuracy: {accuracy_adaboost:.4f}")

# 5. Extra Trees
from sklearn.ensemble import ExtraTreesClassifier

print(f"\n🌲 EXTRA TREES CLASSIFIER")
print("-" * 30)

extra_trees = ExtraTreesClassifier(
    n_estimators=100,
    random_state=42
)
extra_trees.fit(X_train_scaled, y_train)
y_pred_extra_trees = extra_trees.predict(X_test_scaled)
accuracy_extra_trees = accuracy_score(y_test, y_pred_extra_trees)

print(f"Extra Trees Accuracy: {accuracy_extra_trees:.4f}")

# 6. Stacking Classifier
from sklearn.ensemble import StackingClassifier
# NOTE(review): RidgeClassifier is imported but not used in this cell.
from sklearn.linear_model import RidgeClassifier

print(f"\n📚 STACKING CLASSIFIER")
print("-" * 25)

# Define base learners for stacking
# SVC is built with probability=True because stack_method='predict_proba'
# below requires every base learner to expose class probabilities.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('svm', SVC(random_state=42, probability=True)),
    ('gb', GradientBoostingClassifier(random_state=42))
]

# Meta-learner
# NOTE(review): unlike the LogisticRegression in base_models (max_iter=1000),
# this one keeps the library default iteration limit.
meta_learner = LogisticRegression(random_state=42)

# The meta-learner is trained on 5-fold cross-validated probability outputs
# of the base learners.
stacking = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba'
)
stacking.fit(X_train_scaled, y_train)
y_pred_stacking = stacking.predict(X_test_scaled)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)

print(f"Stacking Accuracy: {accuracy_stacking:.4f}")

# Ensemble Results Summary
print(f"\n📊 ENSEMBLE METHODS COMPARISON")
print("-" * 45)

# Six ensemble rows plus a reference row holding the highest accuracy found
# in `results` (the single-model scores from an earlier cell).
ensemble_results = {
    'Method': [
        'Voting (Hard)', 'Voting (Soft)', 'Bagging', 
        'AdaBoost', 'Extra Trees', 'Stacking',
        'Best Individual'
    ],
    'Accuracy': [
        accuracy_voting_hard, accuracy_voting_soft, accuracy_bagging,
        accuracy_adaboost, accuracy_extra_trees, accuracy_stacking,
        max([results[model]['accuracy'] for model in results.keys()])
    ],
    'Description': [
        'Majority vote (hard)', 'Probability vote (soft)', 'Bootstrap aggregating',
        'Adaptive boosting', 'Extremely randomized trees', 'Meta-learning',
        f'Best single model ({best_model_name})'
    ]
}

ensemble_df = pd.DataFrame(ensemble_results)
ensemble_df = ensemble_df.sort_values('Accuracy', ascending=False)
print(ensemble_df.round(4).to_string(index=False))

# Best ensemble method
# NOTE(review): the 'Best Individual' reference row is part of ensemble_df, so
# it is reported as the "best ensemble" whenever a single model scores highest.
# best_ensemble_idx is the original (pre-sort) row label of the top row; it is
# not read again in this cell.
best_ensemble_idx = ensemble_df.index[0]
best_ensemble_name = ensemble_df.iloc[0]['Method']
best_ensemble_accuracy = ensemble_df.iloc[0]['Accuracy']

print(f"\n🏆 Best Ensemble Method: {best_ensemble_name}")
print(f"   Accuracy: {best_ensemble_accuracy:.4f}")

# Feature importance from Random Forest (one of the best performers)
# The original guard tested for the key 'RandomForest' but then read
# 'Random Forest', so the plot was either skipped or raised KeyError. Accept
# either spelling in trained_models, and fall back to the Random Forest fitted
# in this cell's base_models so the plot is always produced.
rf_model = None
for rf_key in ('Random Forest', 'RandomForest'):
    if rf_key in trained_models:
        rf_model = trained_models[rf_key]
        break
if rf_model is None:
    rf_model = base_models['RandomForest']

feature_importance = rf_model.feature_importances_

# Get the top important features (20, or fewer if the model has fewer features).
# Assumes X.columns is in the same order as the columns the model was fit on.
feature_names = X.columns
n_top = min(20, len(feature_importance))
# argsort is ascending, so the most important feature ends up as the top bar.
top_features_idx = np.argsort(feature_importance)[-n_top:]

plt.figure(figsize=(12, 8))
plt.barh(range(n_top), feature_importance[top_features_idx])
plt.yticks(range(n_top), [feature_names[i] for i in top_features_idx])
plt.xlabel('Feature Importance')
plt.title(f'Top {n_top} Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("✅ Ensemble learning methods completed!")


In [None]:
# ============== 5. HIDDEN MARKOV MODEL (HMM) ==============
print("\n" + "="*70)
print("           🔄 HIDDEN MARKOV MODEL")
print("        SEQUENTIAL POSE PREDICTION")
print("="*70)

# Install hmmlearn if not available
try:
    from hmmlearn import hmm
except ImportError:
    print("Installing hmmlearn...")
    import subprocess
    import sys
    # Run pip through the kernel's own interpreter. A bare `pip` on PATH may
    # belong to a different Python environment, in which case the install
    # succeeds but the import below still fails.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'hmmlearn'])
    from hmmlearn import hmm

print(f"\n📊 PREPARING SEQUENTIAL DATA FOR HMM")
print("-" * 40)

# Create sequences for HMM
# Group data by pose_id to create sequences (simulating temporal data)
def create_sequences(X, y, sequence_length=5, order_key=None):
    """Cut feature rows into fixed-length, overlapping windows for HMM training.

    Rows are reordered by ascending ``order_key`` to simulate a temporal
    ordering, then sliced into windows of ``sequence_length`` consecutive rows.
    Consecutive windows start ``sequence_length // 2`` rows apart (at least 1).
    A window is kept only if all of its rows carry the same label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; accessed positionally through ``.iloc``.
    y : array-like
        One label per row of ``X``, positionally aligned with it.
    sequence_length : int, default 5
        Number of rows per window. Must be at least 1.
    order_key : array-like, optional
        One sort key per row of ``X``. When omitted, the notebook-level
        ``merged_df['pose_id']`` column is used (the original behaviour),
        which must then be row-aligned with ``X``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, labels)``. With at least one kept window, ``sequences``
        has shape ``(n_windows, sequence_length, n_features)`` and ``labels``
        has shape ``(n_windows,)``. Both are empty arrays otherwise.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    if order_key is None:
        # Sort by pose_id to simulate temporal ordering
        order_key = merged_df['pose_id'].values

    sorted_indices = np.argsort(np.asarray(order_key))
    X_sorted = X.iloc[sorted_indices]
    # Index labels positionally (like X.iloc), even if y is a list or Series.
    y_sorted = np.asarray(y)[sorted_indices]

    # Half-window stride; clamped to 1 because sequence_length == 1 would
    # otherwise give a zero step, which range() rejects.
    step = max(1, sequence_length // 2)

    sequences = []
    labels = []
    for start in range(0, len(X_sorted) - sequence_length + 1, step):
        stop = start + sequence_length
        seq_labels = y_sorted[start:stop]

        # Only include sequences with consistent labels (for simplicity)
        if len(np.unique(seq_labels)) == 1:
            sequences.append(X_sorted.iloc[start:stop].values)
            labels.append(seq_labels[0])

    return np.array(sequences), np.array(labels)

# Create sequences
# Windows are built from the unscaled feature frame X and the encoded labels.
sequence_length = 5
X_sequences, y_sequences = create_sequences(X, y_encoded, sequence_length)

print(f"Created {len(X_sequences)} sequences of length {sequence_length}")
print(f"Sequence shape: {X_sequences.shape}")

# Split sequences into train and test
# NOTE(review): create_sequences advances by sequence_length // 2 rows, so
# neighbouring windows share rows. A random split over windows can therefore
# put overlapping windows on both sides of the train/test boundary.
from sklearn.model_selection import train_test_split
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(
    X_sequences, y_sequences, test_size=0.2, random_state=42, stratify=y_sequences
)

print(f"Train sequences: {len(X_seq_train)}")
print(f"Test sequences: {len(X_seq_test)}")

# Train HMM for each pose class
print(f"\n🔄 TRAINING HMM MODELS")
print("-" * 30)

# Maps encoded class index -> fitted GaussianHMM. Classes with no training
# sequences, or whose fit raises, are left out of the dict.
hmm_models = {}
n_components = 3  # Number of hidden states per pose

for class_idx, class_name in enumerate(label_encoder.classes_):
    print(f"Training HMM for {class_name}...")
    
    # Get sequences for this class
    class_sequences = X_seq_train[y_seq_train == class_idx]
    
    if len(class_sequences) > 0:
        # Flatten sequences for training
        # All windows are stacked into one 2-D array; `lengths` below tells
        # the model where each window starts and ends.
        X_class = np.vstack(class_sequences)
        
        # Create sequence lengths array
        lengths = [sequence_length] * len(class_sequences)
        
        # Train Gaussian HMM
        model = hmm.GaussianHMM(n_components=n_components, covariance_type="diag", random_state=42)
        
        try:
            model.fit(X_class, lengths)
            hmm_models[class_idx] = model
            print(f"  ✓ {class_name}: {len(class_sequences)} sequences")
        except Exception as e:
            # Best effort: report the failure and continue with the other classes.
            print(f"  ✗ {class_name}: Failed to train ({str(e)})")
    else:
        print(f"  ✗ {class_name}: No sequences available")

print(f"\nSuccessfully trained {len(hmm_models)} HMM models")

# HMM Prediction Function
def predict_hmm(sequence, models, label_encoder):
    """Pick the class whose HMM assigns the highest score to ``sequence``.

    Parameters
    ----------
    sequence : array-like
        Observation sequence, passed unchanged to each model's ``score``.
    models : dict
        Maps class index -> fitted model exposing ``score(sequence)``.
    label_encoder : object
        Unused; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(best_class, best_score)``. Returns ``(-1, 0.0)`` when ``models`` is
        empty, and ``(-1, -inf)`` when no model produced a usable score.
    """
    if len(models) == 0:
        return -1, 0.0

    best_score = -np.inf
    best_class = -1

    for class_idx, model in models.items():
        try:
            score = model.score(sequence)
        except Exception:
            # Best effort: a model that cannot score this sequence is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate, so the cell can still be interrupted.
            continue

        if score > best_score:
            best_score = score
            best_class = class_idx

    return best_class, best_score

# Test HMM models
print(f"\n📈 TESTING HMM MODELS")
print("-" * 25)

hmm_predictions = []
hmm_scores = []

# Score every held-out window against all per-class HMMs.
for i, test_seq in enumerate(X_seq_test):
    pred_class, score = predict_hmm(test_seq, hmm_models, label_encoder)
    hmm_predictions.append(pred_class)
    hmm_scores.append(score)

# Calculate accuracy
# -1 is predict_hmm's "no usable model" marker; those windows are left out.
valid_predictions = [i for i, pred in enumerate(hmm_predictions) if pred != -1]
hmm_predictions_valid = [hmm_predictions[i] for i in valid_predictions]
y_test_valid = [y_seq_test[i] for i in valid_predictions]

if len(hmm_predictions_valid) > 0:
    hmm_accuracy = accuracy_score(y_test_valid, hmm_predictions_valid)
    print(f"HMM Accuracy: {hmm_accuracy:.4f}")
    print(f"Valid predictions: {len(valid_predictions)}/{len(X_seq_test)}")

    # Classification report
    if len(set(hmm_predictions_valid)) > 1:
        # classification_report covers every class present in y_true OR y_pred.
        # Building target_names from the predictions alone raises a length
        # mismatch whenever a true class is never predicted, so the label set
        # is passed explicitly and the names are derived from it.
        report_labels = sorted(set(y_test_valid) | set(hmm_predictions_valid))
        print(f"\nHMM Classification Report:")
        print(classification_report(
            y_test_valid,
            hmm_predictions_valid,
            labels=report_labels,
            target_names=[label_encoder.classes_[i] for i in report_labels],
        ))
else:
    # Reset explicitly so a value left in the kernel by an earlier run of this
    # cell cannot be picked up by later code.
    hmm_accuracy = 0.0
    print("No valid predictions from HMM models")

# Transition Matrices Visualization
if hmm_models:
    print("\n📊 HMM TRANSITION MATRICES")
    print("-" * 35)

    # One heatmap panel per model, capped at the first three models.
    fig, axes = plt.subplots(1, min(3, len(hmm_models)), figsize=(15, 5))
    if len(hmm_models) == 1:
        # A single panel comes back as a bare Axes; wrap it so indexing works.
        axes = [axes]

    for i, (class_idx, model) in enumerate(list(hmm_models.items())[:3]):
        class_name = label_encoder.classes_[class_idx]

        # With exactly one model, i is always 0, so this is also axes[0].
        ax = axes[i]

        sns.heatmap(
            model.transmat_,
            annot=True,
            cmap='Blues',
            ax=ax,
            fmt='.3f',
            cbar_kws={'label': 'Transition Probability'},
        )
        ax.set_title(f'Transition Matrix - {class_name}', fontweight='bold')
        ax.set_xlabel('To State')
        ax.set_ylabel('From State')

    plt.tight_layout()
    plt.show()

# Compare HMM with other methods
# clone() is scikit-learn's supported way to get an unfitted copy of an
# estimator. Re-instantiating via Class(**est.get_params()) fails for nested
# or meta estimators, because get_params() also returns "child__param" keys
# that the constructor does not accept.
from sklearn.base import clone

print(f"\n📊 SEQUENTIAL vs STATIC PREDICTION COMPARISON")
print("-" * 50)

# Test static classifier on flattened sequences
# Each window is flattened into a single feature vector.
X_seq_test_flat = X_seq_test.reshape(X_seq_test.shape[0], -1)
X_seq_train_flat = X_seq_train.reshape(X_seq_train.shape[0], -1)

# Scale the flattened sequences (scaler fitted on the training windows only)
scaler_seq = StandardScaler()
X_seq_train_flat_scaled = scaler_seq.fit_transform(X_seq_train_flat)
X_seq_test_flat_scaled = scaler_seq.transform(X_seq_test_flat)

# Train a static classifier on sequences: a fresh, unfitted copy of the best
# single-frame model with identical hyper-parameters.
static_seq_clf = clone(trained_models[best_model_name])
static_seq_clf.fit(X_seq_train_flat_scaled, y_seq_train)
static_seq_pred = static_seq_clf.predict(X_seq_test_flat_scaled)
static_seq_accuracy = accuracy_score(y_seq_test, static_seq_pred)

comparison_df = pd.DataFrame({
    'Method': ['HMM (Sequential)', f'{best_model_name} (Static on Sequences)', f'{best_model_name} (Single Frames)'],
    'Accuracy': [
        # hmm_accuracy exists only if the HMM evaluation produced valid
        # predictions; report 0.0 otherwise.
        hmm_accuracy if 'hmm_accuracy' in locals() else 0.0,
        static_seq_accuracy,
        results[best_model_name]['accuracy']
    ],
    'Data Type': ['Sequential', 'Flattened Sequences', 'Single Frames']
})

print(comparison_df.round(4).to_string(index=False))

print("✅ HMM sequential modeling completed!")
