# Data Exploration - CBF Training Dataset

This notebook explores the training dataset for the Control Barrier Function (CBF) neural network.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide plotting defaults: apply seaborn's 'whitegrid' style, then set
# the default matplotlib figure size used by figures that do not pass their own.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Read every array out of the .npz archive inside a context manager so the
# underlying file handle is closed once the data is in memory. `data` is kept
# as a plain dict so any later `data['key']` lookup keeps working.
with np.load('../data/raw/training_dataset_50k.npz') as archive:
    data = {key: archive[key] for key in archive.files}

states = data['states']
labels = data['labels']
# The masks are negated with `~` and used as row selectors below; casting to
# bool guarantees logical (not bitwise-integer) negation and boolean indexing
# even if the archive stored them as 0/1 integers.
safe_mask = data['safe_mask'].astype(bool)
boundary_mask = data['boundary_mask'].astype(bool)

# Sanity checks: every per-sample array must describe the same number of rows.
n_samples = len(states)
assert len(labels) == n_samples, "labels and states differ in length"
assert safe_mask.shape == (n_samples,), "safe_mask must hold one flag per sample"
assert boundary_mask.shape == (n_samples,), "boundary_mask must hold one flag per sample"

# "Unsafe" here means neither safe nor on the boundary.
unsafe_mask = ~safe_mask & ~boundary_mask

print(f"Dataset shape: {states.shape}")
print(f"Total samples: {len(states)}")
print(f"Safe samples: {np.sum(safe_mask)} ({100*np.mean(safe_mask):.1f}%)")
print(f"Unsafe samples: {np.sum(unsafe_mask)} ({100*np.mean(unsafe_mask):.1f}%)")
print(f"Boundary samples: {np.sum(boundary_mask)} ({100*np.mean(boundary_mask):.1f}%)")

## State Distribution Analysis

In [None]:
# Robot joint angles distribution: one histogram per joint on a 2x3 grid.
N_JOINTS = 6  # the first N_JOINTS state columns are plotted as joint angles

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()  # flatten the 2x3 grid so axes[i] pairs with state column i

for i in range(N_JOINTS):
    axes[i].hist(states[:, i], bins=50, alpha=0.7, edgecolor='black')
    axes[i].set_title(f'Joint {i+1} Angle')
    axes[i].set_xlabel('Angle (rad)')
    axes[i].set_ylabel('Frequency')

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when the
# target folder does not exist (e.g. on a fresh checkout).
figure_path = Path('../results/figures/joint_angles_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## Safe vs Unsafe State Comparison

In [None]:
# Compare human positions for safe vs unsafe
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# "Unsafe" means neither safe nor boundary; the same mask drives both panels
# so the histogram and the class-balance bars agree with each other.
unsafe_mask = ~safe_mask & ~boundary_mask

# Human X positions.
# The human position block spans columns 12-29 (18 values following the 12
# robot columns), so the stride-3 slice must stop at 30; an open-ended slice
# would also pull in velocity, object and task columns.
# NOTE(review): assumes positions are stored as (x, y, z) triples, so every
# third column starting at 12 is an X coordinate -- confirm against the
# dataset generator.
HUMAN_POS_START, HUMAN_POS_END = 12, 30
safe_px = states[safe_mask, HUMAN_POS_START:HUMAN_POS_END:3]
unsafe_px = states[unsafe_mask, HUMAN_POS_START:HUMAN_POS_END:3]

# Use one set of bin edges for both groups so the overlaid bars are comparable.
bin_edges = np.histogram_bin_edges(
    np.concatenate([safe_px.ravel(), unsafe_px.ravel()]), bins=50
)
axes[0].hist(safe_px.ravel(), bins=bin_edges, alpha=0.6, label='Safe', color='green')
axes[0].hist(unsafe_px.ravel(), bins=bin_edges, alpha=0.6, label='Unsafe', color='red')
axes[0].set_xlabel('Human X Position (m)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Human Position Distribution')
axes[0].legend()

# Label distribution: boundary samples get their own bar instead of being
# silently folded into "Unsafe".
class_counts = {
    'Safe': np.sum(safe_mask),
    'Unsafe': np.sum(unsafe_mask),
    'Boundary': np.sum(boundary_mask),
}
axes[1].bar(list(class_counts.keys()), list(class_counts.values()),
            color=['green', 'red', 'orange'], alpha=0.7)
axes[1].set_ylabel('Count')
axes[1].set_title('Class Balance')

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when the
# target folder does not exist (e.g. on a fresh checkout).
figure_path = Path('../results/figures/safe_unsafe_comparison.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## Feature Correlation Analysis

In [None]:
# Compute correlation matrix for subset of features
# Tick labels for the robot block: six `q` columns followed by six `qd` columns.
robot_labels = [f'q{i}' for i in range(6)] + [f'qd{i}' for i in range(6)]

# Robot states only: slice exactly as many columns as there are labels so the
# matrix and its tick labels line up (a wider slice would add unlabeled
# non-robot columns to the heatmap). Only the first 1000 rows are used.
feature_subset = states[:1000, :len(robot_labels)]
corr_matrix = np.corrcoef(feature_subset.T)  # transpose: corrcoef expects variables in rows

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0,
            xticklabels=robot_labels,
            yticklabels=robot_labels)
plt.title('Robot State Correlation Matrix')
plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError when the
# target folder does not exist (e.g. on a fresh checkout).
figure_path = Path('../results/figures/correlation_matrix.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## Summary Statistics

In [None]:
# State names, in column order: 6 + 6 + 18 + 18 + 12 + 1 + 5 = 66 entries.
state_names = (
    [f'q_r_{i}' for i in range(6)] +
    [f'qd_r_{i}' for i in range(6)] +
    [f'p_h_{i}' for i in range(18)] +
    [f'v_h_{i}' for i in range(18)] +
    [f's_obj_{i}' for i in range(12)] +
    ['s_conv'] +
    [f's_task_{i}' for i in range(5)]
)

# Fail early with a readable message if the declared layout does not match the
# loaded array, rather than relying on the DataFrame constructor's error.
assert states.shape[1] == len(state_names), (
    f"states has {states.shape[1]} columns but {len(state_names)} state names are defined"
)

# Create dataframe: one named column per state dimension plus the label.
df = pd.DataFrame(states, columns=state_names)
df['label'] = labels

# Summary statistics for the first 12 columns (the q_r_* and qd_r_* features).
print("\nSummary Statistics (first 12 features):")
print(df.iloc[:, :12].describe())