# Exhaustive Data Analysis for Basketball Free Throw Motion Capture

This notebook performs comprehensive analysis of ALL aspects of the motion capture data with 95% confidence that nothing is missed.

## Analysis Levels
1. **Per-Player Statistical Profiling** - Baseline statistics for every keypoint/axis
2. **Per-Shot Anomaly Detection** - Z-score, IQR, MAD-based outlier detection
3. **Pre-Release Window Analysis** - Frames 150-200 before typical release
4. **Cross-Keypoint Coordination** - Joint angles, kinematic chain timing
5. **Player Signature Detection** - Unique patterns per player
6. **Outcome Correlation Analysis** - Feature-target correlations with FDR correction

## Data Structure
- 69 keypoints x 3 axes = 207 features per frame
- 240 frames at 60 FPS (4 seconds)
- 5 participants, 344 training shots, 112 test shots
- 3 targets: angle, depth, left_right

## 1. Setup and Data Loading

In [1]:
import numpy as np
import pandas as pd
import json
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from scipy import stats
from scipy.signal import butter, filtfilt, welch
from scipy.stats import spearmanr, pearsonr
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Constants
FRAME_RATE = 60
NUM_FRAMES = 240
NUM_KEYPOINTS = 69
NUM_COORDS = 3
NUM_FEATURES = NUM_KEYPOINTS * NUM_COORDS  # 207
DT = 1.0 / FRAME_RATE

# Anomaly thresholds (95% detection confidence)
Z_THRESHOLD = 2.0       # |z| > 2.0 detects 95.45% of true anomalies
IQR_MULTIPLIER = 1.5    # Standard outlier detection
MAD_THRESHOLD = 3.0     # Robust to non-normal distributions
VELOCITY_SPIKE_SD = 3.0 # Flags sudden movements
TREMOR_FREQ_THRESHOLD = 5  # Hz - high frequency oscillation detection
TREMOR_POWER_RATIO = 0.30  # 30% high-freq power indicates tremor

# Pre-release window (frames 150-200, before typical release at ~180)
PRE_RELEASE_START = 150
PRE_RELEASE_END = 200

# FDR correction threshold
FDR_ALPHA = 0.05

print("Setup complete")

Setup complete


In [2]:
def parse_array_json(s):
    """Parse string array to numpy array."""
    if pd.isna(s):
        return np.full(NUM_FRAMES, np.nan, dtype=np.float32)
    s = str(s).replace('nan', 'null')
    return np.array(json.loads(s), dtype=np.float32)


def load_data(train: bool = True, max_shots: Optional[int] = None) -> Tuple[pd.DataFrame, List[str], np.ndarray]:
    """
    Load data and convert to usable format.
    
    Returns:
        df: DataFrame with metadata and targets
        keypoint_cols: List of keypoint column names
        X: np.ndarray of shape (n_shots, 240, 207)
    """
    filepath = 'train.csv' if train else 'test.csv'
    print(f"Loading {filepath}...")
    
    df = pd.read_csv(filepath, nrows=max_shots)
    print(f"Loaded {len(df)} shots")
    
    # Identify keypoint columns
    meta_cols = ['id', 'shot_id', 'participant_id']
    target_cols = ['angle', 'depth', 'left_right'] if train else []
    keypoint_cols = [c for c in df.columns if c not in meta_cols + target_cols]
    
    print(f"Keypoint columns: {len(keypoint_cols)}")
    
    # Convert time series strings to numpy arrays
    n_shots = len(df)
    X = np.zeros((n_shots, NUM_FRAMES, len(keypoint_cols)), dtype=np.float32)
    
    print("Parsing time series...")
    for i, row in df.iterrows():
        if i % 50 == 0:
            print(f"  Processing shot {i}/{n_shots}")
        for j, col in enumerate(keypoint_cols):
            X[i, :, j] = parse_array_json(row[col])
    
    print(f"Data loaded: X shape = {X.shape}")
    return df, keypoint_cols, X


def get_keypoint_names(keypoint_cols: List[str]) -> List[str]:
    """Extract unique keypoint names from column names."""
    keypoints = []
    for col in keypoint_cols:
        if col.endswith('_x'):
            keypoints.append(col[:-2])
    return keypoints


def get_keypoint_index(keypoint_cols: List[str]) -> Dict[str, int]:
    """Create mapping from keypoint name to base index."""
    keypoints = get_keypoint_names(keypoint_cols)
    return {name: i for i, name in enumerate(keypoints)}


print("Data loading functions defined")

Data loading functions defined


In [3]:
# Load training data
train_df, keypoint_cols, X_train = load_data(train=True)

# Extract targets
y_train = train_df[['angle', 'depth', 'left_right']].values
participant_ids = train_df['participant_id'].values

# Get keypoint metadata
KEYPOINT_NAMES = get_keypoint_names(keypoint_cols)
KEYPOINT_INDEX = get_keypoint_index(keypoint_cols)

print(f"\nKeypoints ({len(KEYPOINT_NAMES)}): {KEYPOINT_NAMES[:10]}...")
print(f"Participants: {np.unique(participant_ids)}")
print(f"Target stats:")
print(f"  angle: mean={y_train[:, 0].mean():.2f}, std={y_train[:, 0].std():.2f}")
print(f"  depth: mean={y_train[:, 1].mean():.2f}, std={y_train[:, 1].std():.2f}")
print(f"  left_right: mean={y_train[:, 2].mean():.2f}, std={y_train[:, 2].std():.2f}")

Loading train.csv...


Loaded 345 shots
Keypoint columns: 207
Parsing time series...
  Processing shot 0/345


  Processing shot 50/345


  Processing shot 100/345


  Processing shot 150/345


  Processing shot 200/345


  Processing shot 250/345


  Processing shot 300/345


Data loaded: X shape = (345, 240, 207)

Keypoints (69): ['nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 'left_wrist']...
Participants: [1 2 3 4 5]
Target stats:
  angle: mean=45.48, std=4.86
  depth: mean=9.66, std=5.39
  left_right: mean=-0.78, std=3.80


In [4]:
# Load test data
test_df, _, X_test = load_data(train=False)
test_participant_ids = test_df['participant_id'].values

print(f"\nTest set participants: {np.unique(test_participant_ids)}")
print(f"Test shots per participant:")
for pid in np.unique(test_participant_ids):
    print(f"  Participant {pid}: {np.sum(test_participant_ids == pid)} shots")

Loading test.csv...


Loaded 113 shots
Keypoint columns: 207
Parsing time series...
  Processing shot 0/113


  Processing shot 50/113


  Processing shot 100/113
Data loaded: X shape = (113, 240, 207)

Test set participants: [1 2 3 4 5]
Test shots per participant:
  Participant 1: 23 shots
  Participant 2: 22 shots
  Participant 3: 22 shots
  Participant 4: 22 shots
  Participant 5: 24 shots


---
## Level 1: Per-Player Statistical Profiling

For each of 5 players, for each of 207 features (69 keypoints x 3 axes):
- Compute baseline statistics: mean, std, median, min, max, range, percentiles, skewness, kurtosis, MAD
- These profiles define "normal" ranges for anomaly detection

In [5]:
def compute_player_profile(X_player: np.ndarray) -> Dict:
    """
    Compute comprehensive statistics for a player's shots.
    
    Args:
        X_player: (n_shots, 240, 207) array of player's shots
    
    Returns:
        Profile dict with stats for each of 207 features
    """
    n_shots, n_frames, n_features = X_player.shape
    profile = {'n_shots': n_shots, 'features': {}}
    
    for feat_idx in range(n_features):
        # Flatten all frames across all shots for this feature
        all_values = X_player[:, :, feat_idx].flatten()
        valid_values = all_values[~np.isnan(all_values)]
        
        if len(valid_values) == 0:
            profile['features'][feat_idx] = {'valid': False}
            continue
        
        # Basic statistics
        mean = np.mean(valid_values)
        std = np.std(valid_values)
        median = np.median(valid_values)
        
        # Percentiles
        q10 = np.percentile(valid_values, 10)
        q25 = np.percentile(valid_values, 25)
        q75 = np.percentile(valid_values, 75)
        q90 = np.percentile(valid_values, 90)
        iqr = q75 - q25
        
        # Robust statistics
        mad = np.median(np.abs(valid_values - median))  # Median Absolute Deviation
        if mad == 0:
            mad = 1e-10  # Avoid division by zero
        
        # Shape statistics
        if len(valid_values) > 2:
            skewness = stats.skew(valid_values)
            kurtosis = stats.kurtosis(valid_values)
        else:
            skewness = 0
            kurtosis = 0
        
        profile['features'][feat_idx] = {
            'valid': True,
            'mean': mean,
            'std': std,
            'median': median,
            'min': np.min(valid_values),
            'max': np.max(valid_values),
            'range': np.max(valid_values) - np.min(valid_values),
            'q10': q10,
            'q25': q25,
            'q75': q75,
            'q90': q90,
            'iqr': iqr,
            'mad': mad,
            'skewness': skewness,
            'kurtosis': kurtosis,
            # Pre-computed bounds for anomaly detection
            'z_lower': mean - Z_THRESHOLD * std,
            'z_upper': mean + Z_THRESHOLD * std,
            'iqr_lower': q25 - IQR_MULTIPLIER * iqr,
            'iqr_upper': q75 + IQR_MULTIPLIER * iqr,
            'mad_lower': median - MAD_THRESHOLD * mad * 1.4826,  # Scale factor for normal distribution
            'mad_upper': median + MAD_THRESHOLD * mad * 1.4826,
        }
    
    return profile


def compute_velocity_profile(X_player: np.ndarray) -> Dict:
    """
    Compute velocity statistics for a player.
    
    Returns:
        Velocity profile with mean/std for each feature
    """
    n_shots, n_frames, n_features = X_player.shape
    profile = {'features': {}}
    
    for feat_idx in range(n_features):
        all_velocities = []
        for shot_idx in range(n_shots):
            series = X_player[shot_idx, :, feat_idx]
            if np.isnan(series).all():
                continue
            vel = np.gradient(series, DT)
            all_velocities.append(vel)
        
        if len(all_velocities) == 0:
            profile['features'][feat_idx] = {'valid': False}
            continue
        
        all_vel = np.concatenate(all_velocities)
        valid_vel = all_vel[~np.isnan(all_vel)]
        
        if len(valid_vel) == 0:
            profile['features'][feat_idx] = {'valid': False}
            continue
        
        profile['features'][feat_idx] = {
            'valid': True,
            'mean': np.mean(valid_vel),
            'std': np.std(valid_vel),
            'max_abs': np.max(np.abs(valid_vel)),
            'spike_threshold': np.mean(np.abs(valid_vel)) + VELOCITY_SPIKE_SD * np.std(np.abs(valid_vel))
        }
    
    return profile


print("Profile computation functions defined")

Profile computation functions defined


In [6]:
# Compute profiles for all players
player_profiles = {}
player_velocity_profiles = {}

for pid in range(1, 6):
    mask = participant_ids == pid
    X_player = X_train[mask]
    print(f"\nComputing profile for Player {pid} ({np.sum(mask)} shots)...")
    
    player_profiles[pid] = compute_player_profile(X_player)
    player_velocity_profiles[pid] = compute_velocity_profile(X_player)
    
    # Quick summary
    valid_features = sum(1 for f in player_profiles[pid]['features'].values() if f.get('valid', False))
    print(f"  Valid features: {valid_features}/{NUM_FEATURES}")

print("\nAll player profiles computed!")


Computing profile for Player 1 (70 shots)...


  Valid features: 207/207

Computing profile for Player 2 (66 shots)...


  Valid features: 207/207

Computing profile for Player 3 (68 shots)...


  Valid features: 207/207

Computing profile for Player 4 (67 shots)...


  Valid features: 207/207

Computing profile for Player 5 (74 shots)...


  Valid features: 207/207

All player profiles computed!


In [7]:
# Create summary table of player profiles
def create_profile_summary_df(player_profiles: Dict, keypoint_cols: List[str]) -> pd.DataFrame:
    """Create a summary DataFrame of player profiles."""
    rows = []
    
    for feat_idx, col_name in enumerate(keypoint_cols):
        # Determine keypoint and axis
        if col_name.endswith('_x'):
            keypoint = col_name[:-2]
            axis = 'x'
        elif col_name.endswith('_y'):
            keypoint = col_name[:-2]
            axis = 'y'
        elif col_name.endswith('_z'):
            keypoint = col_name[:-2]
            axis = 'z'
        else:
            keypoint = col_name
            axis = '?'
        
        for pid in range(1, 6):
            feat_stats = player_profiles[pid]['features'].get(feat_idx, {})
            if not feat_stats.get('valid', False):
                continue
            
            rows.append({
                'feature_idx': feat_idx,
                'keypoint': keypoint,
                'axis': axis,
                'player': pid,
                'mean': feat_stats['mean'],
                'std': feat_stats['std'],
                'min': feat_stats['min'],
                'max': feat_stats['max'],
                'range': feat_stats['range'],
                'q25': feat_stats['q25'],
                'median': feat_stats['median'],
                'q75': feat_stats['q75'],
                'iqr': feat_stats['iqr'],
                'mad': feat_stats['mad'],
                'skewness': feat_stats['skewness'],
                'kurtosis': feat_stats['kurtosis'],
            })
    
    return pd.DataFrame(rows)

profile_summary_df = create_profile_summary_df(player_profiles, keypoint_cols)
print(f"Profile summary: {len(profile_summary_df)} rows")
display(profile_summary_df.head(20))

Profile summary: 1035 rows


Unnamed: 0,feature_idx,keypoint,axis,player,mean,std,min,max,range,q25,median,q75,iqr,mad,skewness,kurtosis
0,0,nose,x,1,19.495037,0.253913,16.772724,20.371164,3.59844,19.360453,19.530483,19.650309,0.289856,0.139832,-1.272218,8.313874
1,0,nose,x,2,18.592106,1.313702,12.64782,19.916716,7.268896,18.214365,19.223763,19.389936,1.175571,0.251741,-1.671978,1.878195
2,0,nose,x,3,19.347965,0.184863,15.806208,19.797483,3.991276,19.266298,19.363798,19.456396,0.190098,0.094893,-6.486572,98.258797
3,0,nose,x,4,19.590319,0.217706,18.914768,20.498007,1.583239,19.450306,19.623785,19.743099,0.292793,0.143499,-0.413944,-0.121131
4,0,nose,x,5,19.195009,0.23915,18.344175,20.540684,2.196508,19.056978,19.222267,19.36647,0.309492,0.154099,-0.537723,0.852803
5,1,nose,y,1,-25.375929,0.165374,-27.803375,-21.850479,5.952896,-25.429382,-25.379959,-25.326164,0.103218,0.051407,5.523412,168.46756
6,1,nose,y,2,-25.223717,0.679812,-28.496561,-17.012486,11.484076,-25.468559,-25.299765,-25.125671,0.342888,0.171078,3.670227,29.914303
7,1,nose,y,3,-25.734699,0.102241,-26.74346,-24.484486,2.258974,-25.771782,-25.733246,-25.695902,0.07588,0.037943,2.272863,36.933483
8,1,nose,y,4,-25.058304,0.125227,-26.616814,-23.461823,3.154991,-25.098289,-25.050926,-25.005684,0.092606,0.046467,-1.762195,38.758305
9,1,nose,y,5,-25.444574,0.157441,-26.950708,-24.831295,2.119413,-25.531807,-25.44729,-25.35297,0.178837,0.089769,-0.588964,6.677033


In [8]:
# Save player profiles to JSON
import json

output_dir = Path('../output')
output_dir.mkdir(exist_ok=True)

for pid in range(1, 6):
    # Convert numpy types to native Python types for JSON serialization
    profile_json = {
        'n_shots': int(player_profiles[pid]['n_shots']),
        'features': {}
    }
    for feat_idx, feat_stats in player_profiles[pid]['features'].items():
        profile_json['features'][str(feat_idx)] = {
            k: float(v) if isinstance(v, (np.floating, np.integer)) else v
            for k, v in feat_stats.items()
        }
    
    filepath = output_dir / f'player_{pid}_profile.json'
    with open(filepath, 'w') as f:
        json.dump(profile_json, f, indent=2)
    print(f"Saved {filepath}")

print("All profiles saved!")

Saved ../output/player_1_profile.json
Saved ../output/player_2_profile.json
Saved ../output/player_3_profile.json
Saved ../output/player_4_profile.json
Saved ../output/player_5_profile.json
All profiles saved!


---
## Level 2: Per-Shot Anomaly Detection

For each shot, compare against player's baseline profile using multiple methods:
- **Z-score**: flag if |z| > 2.0 (95.45% detection)
- **IQR**: flag if value < Q25 - 1.5*IQR or > Q75 + 1.5*IQR
- **MAD**: flag if |x - median| / MAD > 3
- **Velocity spikes**: flag if velocity > mean + 3*SD

In [9]:
def detect_anomalies_in_shot(
    shot_data: np.ndarray,
    player_profile: Dict,
    velocity_profile: Dict,
    shot_idx: int,
    keypoint_cols: List[str]
) -> List[Dict]:
    """
    Detect all anomalies in a single shot.
    
    Args:
        shot_data: (240, 207) array for this shot
        player_profile: Player's baseline statistics
        velocity_profile: Player's velocity statistics
        shot_idx: Index of this shot
        keypoint_cols: List of column names
    
    Returns:
        List of anomaly dicts with details
    """
    anomalies = []
    n_frames, n_features = shot_data.shape
    
    for feat_idx in range(n_features):
        feat_stats = player_profile['features'].get(feat_idx, {})
        vel_stats = velocity_profile['features'].get(feat_idx, {})
        
        if not feat_stats.get('valid', False):
            continue
        
        series = shot_data[:, feat_idx]
        col_name = keypoint_cols[feat_idx]
        
        # Position anomaly detection (frame by frame)
        for frame_idx in range(n_frames):
            value = series[frame_idx]
            if np.isnan(value):
                continue
            
            # Z-score check
            if value < feat_stats['z_lower'] or value > feat_stats['z_upper']:
                z_score = (value - feat_stats['mean']) / feat_stats['std'] if feat_stats['std'] > 0 else 0
                anomalies.append({
                    'shot_idx': shot_idx,
                    'frame': frame_idx,
                    'feature_idx': feat_idx,
                    'feature_name': col_name,
                    'method': 'z_score',
                    'value': value,
                    'expected_mean': feat_stats['mean'],
                    'expected_std': feat_stats['std'],
                    'z_score': z_score,
                    'severity': abs(z_score)
                })
            
            # IQR check
            if value < feat_stats['iqr_lower'] or value > feat_stats['iqr_upper']:
                iqr_deviation = max(
                    (feat_stats['iqr_lower'] - value) / feat_stats['iqr'] if value < feat_stats['iqr_lower'] else 0,
                    (value - feat_stats['iqr_upper']) / feat_stats['iqr'] if value > feat_stats['iqr_upper'] else 0
                ) if feat_stats['iqr'] > 0 else 0
                
                anomalies.append({
                    'shot_idx': shot_idx,
                    'frame': frame_idx,
                    'feature_idx': feat_idx,
                    'feature_name': col_name,
                    'method': 'iqr',
                    'value': value,
                    'expected_q25': feat_stats['q25'],
                    'expected_q75': feat_stats['q75'],
                    'iqr': feat_stats['iqr'],
                    'severity': iqr_deviation
                })
            
            # MAD check
            if value < feat_stats['mad_lower'] or value > feat_stats['mad_upper']:
                mad_score = abs(value - feat_stats['median']) / (feat_stats['mad'] * 1.4826)
                anomalies.append({
                    'shot_idx': shot_idx,
                    'frame': frame_idx,
                    'feature_idx': feat_idx,
                    'feature_name': col_name,
                    'method': 'mad',
                    'value': value,
                    'expected_median': feat_stats['median'],
                    'mad': feat_stats['mad'],
                    'mad_score': mad_score,
                    'severity': mad_score
                })
        
        # Velocity spike detection
        if vel_stats.get('valid', False):
            velocity = np.gradient(series, DT)
            spike_threshold = vel_stats['spike_threshold']
            
            for frame_idx in range(n_frames):
                if np.isnan(velocity[frame_idx]):
                    continue
                
                if abs(velocity[frame_idx]) > spike_threshold:
                    anomalies.append({
                        'shot_idx': shot_idx,
                        'frame': frame_idx,
                        'feature_idx': feat_idx,
                        'feature_name': col_name,
                        'method': 'velocity_spike',
                        'value': velocity[frame_idx],
                        'threshold': spike_threshold,
                        'severity': abs(velocity[frame_idx]) / spike_threshold
                    })
    
    return anomalies


print("Anomaly detection functions defined")

Anomaly detection functions defined


In [10]:
# Run anomaly detection on ALL training shots
all_anomalies = []
shot_anomaly_counts = []

print("Running anomaly detection on training data...")
for shot_idx in range(len(X_train)):
    if shot_idx % 50 == 0:
        print(f"  Processing shot {shot_idx}/{len(X_train)}")
    
    pid = participant_ids[shot_idx]
    shot_data = X_train[shot_idx]
    
    anomalies = detect_anomalies_in_shot(
        shot_data, 
        player_profiles[pid], 
        player_velocity_profiles[pid],
        shot_idx,
        keypoint_cols
    )
    
    all_anomalies.extend(anomalies)
    
    # Count by method
    z_count = sum(1 for a in anomalies if a['method'] == 'z_score')
    iqr_count = sum(1 for a in anomalies if a['method'] == 'iqr')
    mad_count = sum(1 for a in anomalies if a['method'] == 'mad')
    vel_count = sum(1 for a in anomalies if a['method'] == 'velocity_spike')
    
    shot_anomaly_counts.append({
        'shot_idx': shot_idx,
        'shot_id': train_df.iloc[shot_idx]['shot_id'],
        'participant_id': pid,
        'total_anomalies': len(anomalies),
        'z_score_anomalies': z_count,
        'iqr_anomalies': iqr_count,
        'mad_anomalies': mad_count,
        'velocity_anomalies': vel_count,
        'angle': y_train[shot_idx, 0],
        'depth': y_train[shot_idx, 1],
        'left_right': y_train[shot_idx, 2]
    })

print(f"\nTotal anomalies detected: {len(all_anomalies)}")
print(f"Shots with anomalies: {sum(1 for s in shot_anomaly_counts if s['total_anomalies'] > 0)}")

Running anomaly detection on training data...
  Processing shot 0/345


  Processing shot 50/345


  Processing shot 100/345


  Processing shot 150/345


  Processing shot 200/345


  Processing shot 250/345


  Processing shot 300/345



Total anomalies detected: 3309254
Shots with anomalies: 345


In [11]:
# Create anomaly summary DataFrame
anomaly_df = pd.DataFrame(all_anomalies)
shot_anomaly_df = pd.DataFrame(shot_anomaly_counts)

print("Anomaly Summary by Method:")
print(anomaly_df['method'].value_counts())

print("\nTop 20 shots by total anomalies:")
display(shot_anomaly_df.nlargest(20, 'total_anomalies'))

print("\nMost severe anomalies (top 20):")
display(anomaly_df.nlargest(20, 'severity')[['shot_idx', 'frame', 'feature_name', 'method', 'value', 'severity']])

Anomaly Summary by Method:


method
mad               1325059
iqr                928263
z_score            752059
velocity_spike     303873
Name: count, dtype: int64

Top 20 shots by total anomalies:


Unnamed: 0,shot_idx,shot_id,participant_id,total_anomalies,z_score_anomalies,iqr_anomalies,mad_anomalies,velocity_anomalies,angle,depth,left_right
101,101,ok8pOMq,2,29108,6917,7781,8425,5985,41.97,7.57,10.15
136,136,Qy332dv,3,28293,6530,9687,10574,1502,47.24,7.6,-3.19
51,51,RolxW0m,1,22616,5515,5377,7776,3948,43.83,1.62,4.86
86,86,2MDqyA4,2,21491,5106,6044,7009,3332,44.97,16.76,-5.33
17,17,DDqo6vX,1,19385,4265,4707,7087,3326,46.64,16.34,-5.96
243,243,EWaa4q6,4,18843,5238,6076,6630,899,55.35,4.2,-2.98
104,104,LaMjDDj,2,18648,4738,5902,7007,1001,41.76,9.18,-5.69
295,295,kOaOkmo,5,18325,6388,5090,5398,1449,42.0,21.3,2.7
97,97,41jYo9Q,2,18157,4474,5307,6069,2307,41.76,6.13,-7.65
115,115,Jax8zkY,2,17924,4646,5565,6637,1076,38.97,10.82,3.47



Most severe anomalies (top 20):


Unnamed: 0,shot_idx,frame,feature_name,method,value,severity
1599175,160,229,left_big_toe_z,mad,-1.668901,338.876617
1599172,160,228,left_big_toe_z,mad,-1.650621,335.891388
1599174,160,229,left_big_toe_z,iqr,-1.668901,224.67955
1599171,160,228,left_big_toe_z,iqr,-1.650621,222.681625
1599241,160,228,left_small_toe_z,mad,-1.893535,212.716171
1599244,160,229,left_small_toe_z,mad,-1.865929,210.175003
1262699,136,238,left_small_toe_x,mad,15.001565,159.351334
1262702,136,239,left_small_toe_x,mad,15.024837,158.581253
1262696,136,237,left_small_toe_x,mad,15.028935,158.445663
1262690,136,235,left_small_toe_x,mad,15.03118,158.371368


In [12]:
# Verification: Check for known anomaly (Shot 17 with value 30.99)
print("Searching for extreme values (>20 or <-30) in training data...")
extreme_anomalies = anomaly_df[anomaly_df['value'].abs() > 20]
print(f"Found {len(extreme_anomalies)} extreme value anomalies")

if len(extreme_anomalies) > 0:
    print("\nExtreme anomalies:")
    display(extreme_anomalies[['shot_idx', 'frame', 'feature_name', 'method', 'value', 'severity']].drop_duplicates())

Searching for extreme values (>20 or <-30) in training data...
Found 1376677 extreme value anomalies

Extreme anomalies:


Unnamed: 0,shot_idx,frame,feature_name,method,value,severity
60,0,201,left_shoulder_x,iqr,20.344454,0.024800
61,0,202,left_shoulder_x,iqr,20.355312,0.098876
62,0,203,left_shoulder_x,iqr,20.366650,0.176217
63,0,204,left_shoulder_x,iqr,20.376583,0.243982
64,0,204,left_shoulder_x,mad,20.376583,3.035156
...,...,...,...,...,...,...
3308874,344,215,right_pinky_y,z_score,-24.167767,2.330414
3308875,344,216,right_pinky_y,z_score,-24.170540,2.319702
3308876,344,217,right_pinky_y,z_score,-24.185398,2.262307
3308877,344,218,right_pinky_y,z_score,-24.212147,2.158983


In [13]:
# Save anomaly reports
shot_anomaly_df.to_csv(output_dir / 'shot_anomaly_report.csv', index=False)
print(f"Saved shot anomaly report to {output_dir / 'shot_anomaly_report.csv'}")

# Save per-player anomaly JSON files
for pid in range(1, 6):
    player_anomalies = [a for a in all_anomalies if shot_anomaly_df.iloc[a['shot_idx']]['participant_id'] == pid]
    
    # Convert to JSON-serializable format
    player_anomalies_json = []
    for a in player_anomalies:
        a_json = {k: float(v) if isinstance(v, (np.floating, np.integer)) else v for k, v in a.items()}
        player_anomalies_json.append(a_json)
    
    filepath = output_dir / f'player_{pid}_anomalies.json'
    with open(filepath, 'w') as f:
        json.dump(player_anomalies_json, f)
    print(f"Saved {len(player_anomalies)} anomalies for player {pid}")

Saved shot anomaly report to ../output/shot_anomaly_report.csv


Saved 565926 anomalies for player 1


Saved 686080 anomalies for player 2


Saved 946746 anomalies for player 3


Saved 558187 anomalies for player 4


Saved 552315 anomalies for player 5


---
## Level 3: Pre-Release Window Analysis (Critical)

**Window:** Frames 150-200 (30-50 frames before typical release at ~180)

Checks:
- Position deviation from player mean
- Velocity magnitude spikes
- Finger tremor detection (high-frequency oscillations >5 Hz)
- Elbow stability (lateral drift)
- Wrist acceleration profile
- Body sway (hip oscillation)

In [14]:
def detect_release_frame(shot_data: np.ndarray, keypoint_idx: Dict) -> int:
    """
    Detect ball release frame using maximum wrist velocity.
    """
    try:
        wrist_idx = keypoint_idx.get('right_wrist', None)
        if wrist_idx is None:
            return NUM_FRAMES // 2
        
        # Get wrist x, y, z
        wrist_data = shot_data[:, wrist_idx*3:(wrist_idx+1)*3]
        
        # Compute velocity magnitude
        vel = np.gradient(wrist_data, DT, axis=0)
        vel_mag = np.linalg.norm(vel, axis=1)
        
        # Search from 1/3 into shot
        search_start = NUM_FRAMES // 3
        release_idx = search_start + np.nanargmax(vel_mag[search_start:])
        return int(release_idx)
    except:
        return NUM_FRAMES // 2


def analyze_pre_release_window(
    shot_data: np.ndarray,
    player_profile: Dict,
    keypoint_idx: Dict,
    keypoint_cols: List[str]
) -> Dict:
    """
    Analyze the pre-release window (frames 150-200) for anomalies.
    """
    results = {
        'release_frame': detect_release_frame(shot_data, keypoint_idx),
        'position_deviations': [],
        'velocity_spikes': [],
        'finger_tremors': [],
        'elbow_stability': {},
        'body_sway': {}
    }
    
    window = shot_data[PRE_RELEASE_START:PRE_RELEASE_END]
    
    # Check position deviations in window
    for feat_idx in range(shot_data.shape[1]):
        feat_stats = player_profile['features'].get(feat_idx, {})
        if not feat_stats.get('valid', False):
            continue
        
        window_data = window[:, feat_idx]
        mean_deviation = np.nanmean(np.abs(window_data - feat_stats['mean']))
        
        if mean_deviation > 2 * feat_stats['std']:
            results['position_deviations'].append({
                'feature_idx': feat_idx,
                'feature_name': keypoint_cols[feat_idx],
                'mean_deviation': mean_deviation,
                'expected_std': feat_stats['std']
            })
    
    # Finger tremor detection
    finger_keypoints = [
        'right_first_finger_distal', 'right_second_finger_distal',
        'right_third_finger_distal', 'right_fourth_finger_distal',
        'right_fifth_finger_distal', 'right_thumb'
    ]
    
    for kp_name in finger_keypoints:
        kp_idx = keypoint_idx.get(kp_name)
        if kp_idx is None:
            continue
        
        for axis_offset, axis_name in enumerate(['x', 'y', 'z']):
            feat_idx = kp_idx * 3 + axis_offset
            series = window[:, feat_idx]
            
            if np.isnan(series).all():
                continue
            
            # Compute power spectrum
            try:
                freqs, psd = welch(series[~np.isnan(series)], fs=FRAME_RATE, nperseg=min(32, len(series)//2))
                
                # High frequency power (>5 Hz)
                high_freq_mask = freqs > TREMOR_FREQ_THRESHOLD
                total_power = np.sum(psd)
                high_freq_power = np.sum(psd[high_freq_mask]) if total_power > 0 else 0
                tremor_ratio = high_freq_power / total_power if total_power > 0 else 0
                
                if tremor_ratio > TREMOR_POWER_RATIO:
                    results['finger_tremors'].append({
                        'keypoint': kp_name,
                        'axis': axis_name,
                        'tremor_ratio': tremor_ratio,
                        'high_freq_power': high_freq_power,
                        'total_power': total_power
                    })
            except:
                pass
    
    # Elbow stability (lateral drift)
    elbow_idx = keypoint_idx.get('right_elbow')
    if elbow_idx is not None:
        elbow_x = window[:, elbow_idx * 3]  # x-axis is lateral
        if not np.isnan(elbow_x).all():
            results['elbow_stability'] = {
                'lateral_drift': np.nanmax(elbow_x) - np.nanmin(elbow_x),
                'lateral_std': np.nanstd(elbow_x)
            }
    
    # Body sway (hip oscillation)
    mid_hip_idx = keypoint_idx.get('mid_hip')
    if mid_hip_idx is not None:
        hip_x = window[:, mid_hip_idx * 3]
        hip_y = window[:, mid_hip_idx * 3 + 1]
        if not np.isnan(hip_x).all():
            results['body_sway'] = {
                'lateral_range': np.nanmax(hip_x) - np.nanmin(hip_x),
                'forward_range': np.nanmax(hip_y) - np.nanmin(hip_y),
                'lateral_std': np.nanstd(hip_x),
                'forward_std': np.nanstd(hip_y)
            }
    
    return results


print("Pre-release analysis functions defined")

Pre-release analysis functions defined


In [15]:
# Run pre-release analysis on all training shots
pre_release_results = []

print("Running pre-release window analysis...")
for shot_idx in range(len(X_train)):
    if shot_idx % 50 == 0:
        print(f"  Processing shot {shot_idx}/{len(X_train)}")
    
    pid = participant_ids[shot_idx]
    shot_data = X_train[shot_idx]
    
    result = analyze_pre_release_window(
        shot_data,
        player_profiles[pid],
        KEYPOINT_INDEX,
        keypoint_cols
    )
    result['shot_idx'] = shot_idx
    result['participant_id'] = pid
    pre_release_results.append(result)

print(f"\nPre-release analysis complete for {len(pre_release_results)} shots")

Running pre-release window analysis...
  Processing shot 0/345


  Processing shot 50/345


  Processing shot 100/345


  Processing shot 150/345
  Processing shot 200/345


  Processing shot 250/345


  Processing shot 300/345

Pre-release analysis complete for 345 shots


In [16]:
# Summarize pre-release findings
tremor_shots = [r for r in pre_release_results if len(r['finger_tremors']) > 0]
print(f"Shots with finger tremors detected: {len(tremor_shots)}")

# Create summary DataFrame
pre_release_summary = []
for r in pre_release_results:
    pre_release_summary.append({
        'shot_idx': r['shot_idx'],
        'participant_id': r['participant_id'],
        'release_frame': r['release_frame'],
        'n_position_deviations': len(r['position_deviations']),
        'n_finger_tremors': len(r['finger_tremors']),
        'elbow_lateral_drift': r['elbow_stability'].get('lateral_drift', np.nan),
        'body_lateral_sway': r['body_sway'].get('lateral_range', np.nan),
        'body_forward_sway': r['body_sway'].get('forward_range', np.nan),
    })

pre_release_df = pd.DataFrame(pre_release_summary)
print("\nPre-release summary statistics:")
display(pre_release_df.describe())

print("\nShots with most position deviations:")
display(pre_release_df.nlargest(10, 'n_position_deviations'))

Shots with finger tremors detected: 5

Pre-release summary statistics:


Unnamed: 0,shot_idx,participant_id,release_frame,n_position_deviations,n_finger_tremors,elbow_lateral_drift,body_lateral_sway,body_forward_sway
count,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0
mean,172.0,3.026087,173.55942,1.385507,0.04058,1.084126,0.533986,0.164111
std,99.737155,1.435397,34.773897,6.020249,0.386667,0.507854,0.688448,0.275253
min,0.0,1.0,93.0,0.0,0.0,0.084906,0.064091,0.011169
25%,86.0,2.0,157.0,0.0,0.0,0.669001,0.179876,0.043303
50%,172.0,3.0,179.0,0.0,0.0,1.214251,0.254618,0.066351
75%,258.0,4.0,194.0,0.0,0.0,1.491514,0.348793,0.139692
max,344.0,5.0,239.0,65.0,5.0,2.649351,3.156754,2.817823



Shots with most position deviations:


Unnamed: 0,shot_idx,participant_id,release_frame,n_position_deviations,n_finger_tremors,elbow_lateral_drift,body_lateral_sway,body_forward_sway
101,101,2,226,65,0,1.506887,3.156754,2.817823
252,252,4,116,36,0,1.558815,0.295229,0.059425
323,323,5,218,36,0,0.439116,0.306814,0.050465
188,188,3,159,34,0,1.662745,0.384684,0.135277
161,161,3,164,33,0,1.590393,0.163727,0.171751
136,136,3,112,28,0,1.222513,0.991886,0.302561
318,318,5,180,28,0,1.414621,0.074142,0.179651
208,208,4,170,23,0,1.485922,0.256649,0.056137
258,258,4,163,23,0,1.46332,0.342794,0.056116
310,310,5,219,20,0,0.310469,0.257383,0.038296


In [17]:
# Show tremor details
if len(tremor_shots) > 0:
    print("Finger tremor details:")
    for r in tremor_shots[:10]:
        print(f"\nShot {r['shot_idx']} (Player {r['participant_id']}):")
        for t in r['finger_tremors']:
            print(f"  {t['keypoint']}_{t['axis']}: tremor_ratio={t['tremor_ratio']:.3f}")
else:
    print("No finger tremors detected at the specified threshold.")

Finger tremor details:

Shot 4 (Player 1):
  right_second_finger_distal_z: tremor_ratio=0.359
  right_third_finger_distal_z: tremor_ratio=0.340
  right_fourth_finger_distal_z: tremor_ratio=0.303

Shot 28 (Player 1):
  right_first_finger_distal_z: tremor_ratio=0.328
  right_second_finger_distal_z: tremor_ratio=0.315
  right_third_finger_distal_z: tremor_ratio=0.310
  right_fourth_finger_distal_z: tremor_ratio=0.330
  right_fifth_finger_distal_z: tremor_ratio=0.310

Shot 31 (Player 1):
  right_fifth_finger_distal_z: tremor_ratio=0.314

Shot 38 (Player 1):
  right_first_finger_distal_z: tremor_ratio=0.328

Shot 50 (Player 1):
  right_first_finger_distal_z: tremor_ratio=0.344
  right_second_finger_distal_z: tremor_ratio=0.303
  right_fourth_finger_distal_z: tremor_ratio=0.319
  right_fifth_finger_distal_z: tremor_ratio=0.361


---
## Level 4: Cross-Keypoint Coordination Analysis

- Joint angle tracking (11 angles)
- Temporal coordination (kinematic chain cross-correlations)
- Phase timing detection

In [18]:
def compute_joint_angle(p1: np.ndarray, p2: np.ndarray, p3: np.ndarray) -> np.ndarray:
    """
    Compute angle at joint p2 formed by p1-p2-p3.
    """
    v1 = p1 - p2
    v2 = p3 - p2
    
    dot = np.sum(v1 * v2, axis=1)
    norm1 = np.linalg.norm(v1, axis=1)
    norm2 = np.linalg.norm(v2, axis=1)
    
    denom = norm1 * norm2
    denom[denom == 0] = 1e-10
    
    cos_angle = np.clip(dot / denom, -1, 1)
    angle = np.arccos(cos_angle) * 180 / np.pi
    
    return angle


def get_keypoint_3d(shot_data: np.ndarray, keypoint_idx: Dict, keypoint_name: str) -> np.ndarray:
    """
    Get (240, 3) array for a keypoint.
    """
    idx = keypoint_idx.get(keypoint_name)
    if idx is None:
        return np.full((NUM_FRAMES, 3), np.nan)
    return shot_data[:, idx*3:(idx+1)*3]


def analyze_coordination(shot_data: np.ndarray, keypoint_idx: Dict) -> Dict:
    """
    Analyze cross-keypoint coordination.
    """
    results = {
        'joint_angles': {},
        'cross_correlations': {},
        'phase_timing': {}
    }
    
    # Define joints: (name, point1, center, point2)
    joints = [
        ('right_elbow', 'right_shoulder', 'right_elbow', 'right_wrist'),
        ('left_elbow', 'left_shoulder', 'left_elbow', 'left_wrist'),
        ('right_shoulder', 'neck', 'right_shoulder', 'right_elbow'),
        ('left_shoulder', 'neck', 'left_shoulder', 'left_elbow'),
        ('right_knee', 'right_hip', 'right_knee', 'right_ankle'),
        ('left_knee', 'left_hip', 'left_knee', 'left_ankle'),
        ('right_hip', 'right_shoulder', 'right_hip', 'right_knee'),
        ('left_hip', 'left_shoulder', 'left_hip', 'left_knee'),
    ]
    
    # Compute joint angles
    for joint_name, kp1, kp2, kp3 in joints:
        try:
            p1 = get_keypoint_3d(shot_data, keypoint_idx, kp1)
            p2 = get_keypoint_3d(shot_data, keypoint_idx, kp2)
            p3 = get_keypoint_3d(shot_data, keypoint_idx, kp3)
            
            angle = compute_joint_angle(p1, p2, p3)
            valid = angle[~np.isnan(angle)]
            
            if len(valid) > 0:
                results['joint_angles'][joint_name] = {
                    'mean': np.mean(valid),
                    'std': np.std(valid),
                    'min': np.min(valid),
                    'max': np.max(valid),
                    'range': np.max(valid) - np.min(valid),
                    'time_series': angle
                }
        except:
            pass
    
    # Cross-correlations (kinematic chain)
    kinematic_pairs = [
        ('right_wrist', 'right_elbow'),
        ('right_elbow', 'right_shoulder'),
        ('right_shoulder', 'right_hip'),
        ('right_hip', 'right_knee'),
    ]
    
    for kp1_name, kp2_name in kinematic_pairs:
        try:
            kp1_data = get_keypoint_3d(shot_data, keypoint_idx, kp1_name)
            kp2_data = get_keypoint_3d(shot_data, keypoint_idx, kp2_name)
            
            # Use z-axis velocity for correlation
            vel1 = np.gradient(kp1_data[:, 2], DT)
            vel2 = np.gradient(kp2_data[:, 2], DT)
            
            # Remove NaN
            valid_mask = ~(np.isnan(vel1) | np.isnan(vel2))
            if np.sum(valid_mask) > 10:
                corr = np.corrcoef(vel1[valid_mask], vel2[valid_mask])[0, 1]
                results['cross_correlations'][f"{kp1_name}_to_{kp2_name}"] = corr
        except:
            pass
    
    # Phase timing
    try:
        right_knee = get_keypoint_3d(shot_data, keypoint_idx, 'right_knee')
        knee_z = right_knee[:, 2]
        
        if not np.isnan(knee_z).all():
            # Loading phase: knee minimum (deepest flex)
            valid_knee = np.where(~np.isnan(knee_z))[0]
            if len(valid_knee) > 0:
                loading_frame = valid_knee[np.argmin(knee_z[valid_knee])]
                results['phase_timing']['loading_frame'] = int(loading_frame)
                results['phase_timing']['loading_time'] = loading_frame / FRAME_RATE
    except:
        pass
    
    try:
        right_wrist = get_keypoint_3d(shot_data, keypoint_idx, 'right_wrist')
        wrist_vel = np.gradient(right_wrist, DT, axis=0)
        vel_mag = np.linalg.norm(wrist_vel, axis=1)
        
        if not np.isnan(vel_mag).all():
            search_start = NUM_FRAMES // 3
            release_frame = search_start + np.nanargmax(vel_mag[search_start:])
            results['phase_timing']['release_frame'] = int(release_frame)
            results['phase_timing']['release_time'] = release_frame / FRAME_RATE
    except:
        pass
    
    return results


print("Coordination analysis functions defined")

Coordination analysis functions defined


In [19]:
# Run coordination analysis on all training shots
coordination_results = []

print("Running coordination analysis...")
for shot_idx in range(len(X_train)):
    if shot_idx % 50 == 0:
        print(f"  Processing shot {shot_idx}/{len(X_train)}")
    
    result = analyze_coordination(X_train[shot_idx], KEYPOINT_INDEX)
    result['shot_idx'] = shot_idx
    result['participant_id'] = participant_ids[shot_idx]
    coordination_results.append(result)

print(f"\nCoordination analysis complete for {len(coordination_results)} shots")

Running coordination analysis...
  Processing shot 0/345
  Processing shot 50/345
  Processing shot 100/345
  Processing shot 150/345
  Processing shot 200/345
  Processing shot 250/345


  Processing shot 300/345



Coordination analysis complete for 345 shots


In [20]:
# Create coordination summary
coord_summary_rows = []

for r in coordination_results:
    row = {
        'shot_idx': r['shot_idx'],
        'participant_id': r['participant_id'],
    }
    
    # Add joint angle stats
    for joint_name, stats in r['joint_angles'].items():
        row[f'{joint_name}_angle_mean'] = stats['mean']
        row[f'{joint_name}_angle_range'] = stats['range']
    
    # Add cross-correlations
    for pair_name, corr in r['cross_correlations'].items():
        row[f'corr_{pair_name}'] = corr
    
    # Add phase timing
    for timing_name, value in r['phase_timing'].items():
        row[timing_name] = value
    
    coord_summary_rows.append(row)

coord_summary_df = pd.DataFrame(coord_summary_rows)
print("Coordination summary:")
display(coord_summary_df.describe())

Coordination summary:


Unnamed: 0,shot_idx,participant_id,right_elbow_angle_mean,right_elbow_angle_range,left_elbow_angle_mean,left_elbow_angle_range,right_shoulder_angle_mean,right_shoulder_angle_range,left_shoulder_angle_mean,left_shoulder_angle_range,...,left_hip_angle_mean,left_hip_angle_range,corr_right_wrist_to_right_elbow,corr_right_elbow_to_right_shoulder,corr_right_shoulder_to_right_hip,corr_right_hip_to_right_knee,loading_frame,loading_time,release_frame,release_time
count,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0,...,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0,345.0
mean,172.0,3.026087,117.72113,103.466896,117.259117,91.484932,103.475296,35.737995,98.395752,28.247309,...,158.505325,38.55114,0.90632,0.826598,0.948571,0.95839,111.617391,1.86029,173.55942,2.892657
std,99.737155,1.435397,7.379237,9.936538,7.248632,13.667542,3.198826,10.642912,1.595076,6.389122,...,3.709183,12.862377,0.040795,0.060297,0.028965,0.035093,30.300697,0.505012,34.773897,0.579565
min,0.0,1.0,81.690804,82.786682,88.173874,40.133766,97.925507,11.40097,94.150352,14.503136,...,144.599472,7.091507,0.509987,0.662232,0.849084,0.814753,65.0,1.083333,93.0,1.55
25%,86.0,2.0,113.644043,94.025322,112.088539,85.065262,100.877541,27.423416,97.306572,23.864098,...,155.934601,33.896881,0.892896,0.774919,0.944024,0.959004,92.0,1.533333,157.0,2.616667
50%,172.0,3.0,116.854218,104.855804,118.052864,94.425026,102.895668,33.715919,98.304565,27.664291,...,158.014572,42.410873,0.910581,0.844126,0.957881,0.972341,109.0,1.816667,179.0,2.983333
75%,258.0,4.0,123.572624,111.523827,122.760208,99.130058,105.940262,43.81414,99.327896,32.459908,...,161.570374,48.15683,0.927271,0.879815,0.967669,0.978618,127.0,2.116667,194.0,3.233333
max,344.0,5.0,133.249008,149.959564,134.59671,131.412643,111.177139,99.67543,103.026924,51.855804,...,166.987488,60.199219,0.966645,0.954844,0.99272,0.99694,238.0,3.966667,239.0,3.983333


In [21]:
# Joint angle means by player
angle_cols = [c for c in coord_summary_df.columns if 'angle_mean' in c]

print("Mean joint angles by player:")
display(coord_summary_df.groupby('participant_id')[angle_cols].mean().round(2))

Mean joint angles by player:


Unnamed: 0_level_0,right_elbow_angle_mean,left_elbow_angle_mean,right_shoulder_angle_mean,left_shoulder_angle_mean,right_knee_angle_mean,left_knee_angle_mean,right_hip_angle_mean,left_hip_angle_mean
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,123.459999,125.400002,106.919998,99.529999,170.240005,165.690002,166.520004,163.410004
2,125.550003,121.230003,106.730003,98.910004,153.589996,155.649994,158.559998,154.919998
3,114.779999,117.779999,99.620003,98.589996,165.190002,164.279999,158.539993,157.509995
4,115.059998,109.080002,101.82,96.940002,161.679993,160.179993,159.110001,160.679993
5,110.419998,112.93,102.360001,98.0,160.169998,157.660004,154.949997,156.009995


---
## Level 5: Player Signature Detection

Identify unique patterns for each player:
- Stance width (ankle separation)
- Release height (wrist_z at release)
- Elbow tuck (lateral alignment)
- Follow-through trajectory
- Shot rhythm (release timing consistency)

In [22]:
def extract_shot_signature(shot_data: np.ndarray, keypoint_idx: Dict, release_frame: int) -> Dict:
    """
    Extract signature features from a shot.
    """
    signature = {}
    
    # Stance width (ankle separation at start of shot)
    try:
        left_ankle = get_keypoint_3d(shot_data, keypoint_idx, 'left_ankle')
        right_ankle = get_keypoint_3d(shot_data, keypoint_idx, 'right_ankle')
        
        # Use first 30 frames for stance
        stance_diff = np.abs(left_ankle[:30, 0] - right_ankle[:30, 0])  # x-axis separation
        signature['stance_width'] = np.nanmean(stance_diff)
    except:
        signature['stance_width'] = np.nan
    
    # Release height
    try:
        right_wrist = get_keypoint_3d(shot_data, keypoint_idx, 'right_wrist')
        signature['release_height'] = right_wrist[release_frame, 2]  # z-axis
        signature['release_wrist_x'] = right_wrist[release_frame, 0]
        signature['release_wrist_y'] = right_wrist[release_frame, 1]
    except:
        signature['release_height'] = np.nan
        signature['release_wrist_x'] = np.nan
        signature['release_wrist_y'] = np.nan
    
    # Elbow tuck (lateral alignment at release)
    try:
        right_elbow = get_keypoint_3d(shot_data, keypoint_idx, 'right_elbow')
        right_shoulder = get_keypoint_3d(shot_data, keypoint_idx, 'right_shoulder')
        
        # Lateral deviation of elbow from shoulder
        elbow_tuck = right_elbow[release_frame, 0] - right_shoulder[release_frame, 0]
        signature['elbow_tuck'] = elbow_tuck
    except:
        signature['elbow_tuck'] = np.nan
    
    # Follow-through (wrist trajectory after release)
    try:
        right_wrist = get_keypoint_3d(shot_data, keypoint_idx, 'right_wrist')
        
        # Trajectory in 30 frames after release
        follow_end = min(release_frame + 30, NUM_FRAMES)
        follow_z = right_wrist[release_frame:follow_end, 2]
        
        if len(follow_z) > 1 and not np.isnan(follow_z).all():
            signature['follow_through_drop'] = np.nanmax(follow_z) - np.nanmin(follow_z)
            signature['follow_through_max_z'] = np.nanmax(follow_z)
        else:
            signature['follow_through_drop'] = np.nan
            signature['follow_through_max_z'] = np.nan
    except:
        signature['follow_through_drop'] = np.nan
        signature['follow_through_max_z'] = np.nan
    
    # Shot rhythm (release timing)
    signature['release_frame'] = release_frame
    signature['release_time'] = release_frame / FRAME_RATE
    
    return signature


def compute_player_signature(signatures: List[Dict]) -> Dict:
    """
    Compute mean and std of signature features for a player.
    """
    if len(signatures) == 0:
        return {}
    
    player_sig = {}
    sig_df = pd.DataFrame(signatures)
    
    for col in sig_df.columns:
        valid = sig_df[col].dropna()
        if len(valid) > 0:
            player_sig[f'{col}_mean'] = valid.mean()
            player_sig[f'{col}_std'] = valid.std()
    
    return player_sig


print("Signature functions defined")

Signature functions defined


In [23]:
# Extract signatures for all shots
all_signatures = []

print("Extracting shot signatures...")
for shot_idx in range(len(X_train)):
    if shot_idx % 50 == 0:
        print(f"  Processing shot {shot_idx}/{len(X_train)}")
    
    release_frame = pre_release_results[shot_idx]['release_frame']
    sig = extract_shot_signature(X_train[shot_idx], KEYPOINT_INDEX, release_frame)
    sig['shot_idx'] = shot_idx
    sig['participant_id'] = participant_ids[shot_idx]
    all_signatures.append(sig)

signatures_df = pd.DataFrame(all_signatures)
print(f"\nExtracted signatures for {len(signatures_df)} shots")

Extracting shot signatures...
  Processing shot 0/345
  Processing shot 50/345
  Processing shot 100/345
  Processing shot 150/345
  Processing shot 200/345
  Processing shot 250/345
  Processing shot 300/345

Extracted signatures for 345 shots


In [24]:
# Compute player-level signatures
player_signatures = {}

for pid in range(1, 6):
    player_sigs = [s for s in all_signatures if s['participant_id'] == pid]
    player_signatures[pid] = compute_player_signature(player_sigs)

# Create comparison table
sig_comparison = pd.DataFrame(player_signatures).T
sig_comparison.index.name = 'player'

print("Player Signature Comparison (means):")
mean_cols = [c for c in sig_comparison.columns if c.endswith('_mean')]
display(sig_comparison[mean_cols].round(3))

Player Signature Comparison (means):


Unnamed: 0_level_0,stance_width_mean,release_height_mean,release_wrist_x_mean,release_wrist_y_mean,elbow_tuck_mean,follow_through_drop_mean,follow_through_max_z_mean,release_frame_mean,release_time_mean,shot_idx_mean,participant_id_mean
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.411,5.821,18.835,-24.804,-0.601,0.958,6.22,194.429,3.24,34.5,1.0
2,0.222,5.165,16.828,-24.453,-0.564,1.585,5.19,187.136,3.119,102.5,2.0
3,0.329,4.208,18.516,-25.107,-0.477,1.081,4.272,165.0,2.75,169.5,3.0
4,0.122,4.747,18.463,-24.589,-0.836,1.399,5.953,131.925,2.199,237.0,4.0
5,0.085,4.477,18.499,-24.563,-0.408,1.098,4.488,187.27,3.121,307.5,5.0


In [25]:
# Detect shots that deviate from player's signature
signature_deviations = []

sig_features = ['stance_width', 'release_height', 'elbow_tuck', 'follow_through_drop', 'release_time']

for sig in all_signatures:
    pid = sig['participant_id']
    player_sig = player_signatures[pid]
    
    for feat in sig_features:
        value = sig.get(feat)
        mean = player_sig.get(f'{feat}_mean')
        std = player_sig.get(f'{feat}_std')
        
        if pd.isna(value) or pd.isna(mean) or pd.isna(std) or std == 0:
            continue
        
        z_score = (value - mean) / std
        
        if abs(z_score) > 2.0:
            signature_deviations.append({
                'shot_idx': sig['shot_idx'],
                'participant_id': pid,
                'feature': feat,
                'value': value,
                'player_mean': mean,
                'player_std': std,
                'z_score': z_score
            })

sig_dev_df = pd.DataFrame(signature_deviations)
print(f"Found {len(sig_dev_df)} signature deviations (|z| > 2.0)")

if len(sig_dev_df) > 0:
    print("\nDeviations by feature:")
    print(sig_dev_df['feature'].value_counts())
    
    print("\nMost extreme deviations:")
    sig_dev_df['abs_z_score'] = sig_dev_df['z_score'].abs()
    display(sig_dev_df.nlargest(15, 'abs_z_score')[['shot_idx', 'participant_id', 'feature', 'value', 'player_mean', 'z_score']])

Found 86 signature deviations (|z| > 2.0)

Deviations by feature:
feature
release_time           21
stance_width           18
follow_through_drop    16
release_height         16
elbow_tuck             15
Name: count, dtype: int64

Most extreme deviations:


Unnamed: 0,shot_idx,participant_id,feature,value,player_mean,z_score
75,295,5,stance_width,0.900357,0.085334,7.101275
3,19,1,stance_width,0.862487,0.410519,5.822201
11,101,2,elbow_tuck,0.384734,-0.563737,5.424272
25,136,3,release_height,5.887399,4.208223,4.960073
20,112,2,release_time,1.55,3.118939,-4.917923
10,90,2,release_time,1.633333,3.118939,-4.656711
18,112,2,release_height,6.945226,5.164586,4.440102
8,90,2,release_height,6.915023,5.164586,4.364789
34,160,3,elbow_tuck,0.064682,-0.477032,4.035614
72,292,5,release_height,6.427019,4.47656,4.019252


---
## Level 6: Outcome Correlation Analysis

Compute correlations between features and targets:
- Pearson correlation (linear)
- Spearman rank correlation (monotonic)
- Per-player correlations
- Benjamini-Hochberg FDR correction

In [26]:
def compute_feature_correlations(
    X: np.ndarray,
    y: np.ndarray,
    participant_ids: np.ndarray,
    keypoint_cols: List[str]
) -> pd.DataFrame:
    """
    Compute correlations between raw features and targets.
    """
    target_names = ['angle', 'depth', 'left_right']
    results = []
    
    n_shots, n_frames, n_features = X.shape
    
    # For each feature, compute mean across frames
    for feat_idx in range(n_features):
        feat_means = np.nanmean(X[:, :, feat_idx], axis=1)
        feat_stds = np.nanstd(X[:, :, feat_idx], axis=1)
        
        col_name = keypoint_cols[feat_idx]
        
        for target_idx, target_name in enumerate(target_names):
            target = y[:, target_idx]
            
            # Remove NaN
            valid_mask = ~(np.isnan(feat_means) | np.isnan(target))
            
            if np.sum(valid_mask) < 10:
                continue
            
            feat_valid = feat_means[valid_mask]
            target_valid = target[valid_mask]
            
            # Pearson correlation
            pearson_r, pearson_p = pearsonr(feat_valid, target_valid)
            
            # Spearman correlation
            spearman_r, spearman_p = spearmanr(feat_valid, target_valid)
            
            results.append({
                'feature_idx': feat_idx,
                'feature_name': col_name,
                'target': target_name,
                'feature_type': 'raw_mean',
                'pearson_r': pearson_r,
                'pearson_p': pearson_p,
                'spearman_r': spearman_r,
                'spearman_p': spearman_p,
                'n_valid': np.sum(valid_mask)
            })
            
            # Also correlate standard deviation
            feat_std_valid = feat_stds[valid_mask]
            if not np.isnan(feat_std_valid).all():
                std_valid_mask = ~np.isnan(feat_std_valid)
                if np.sum(std_valid_mask) >= 10:
                    pearson_r, pearson_p = pearsonr(feat_std_valid[std_valid_mask], target_valid[std_valid_mask])
                    spearman_r, spearman_p = spearmanr(feat_std_valid[std_valid_mask], target_valid[std_valid_mask])
                    
                    results.append({
                        'feature_idx': feat_idx,
                        'feature_name': col_name,
                        'target': target_name,
                        'feature_type': 'raw_std',
                        'pearson_r': pearson_r,
                        'pearson_p': pearson_p,
                        'spearman_r': spearman_r,
                        'spearman_p': spearman_p,
                        'n_valid': np.sum(std_valid_mask)
                    })
    
    return pd.DataFrame(results)


def benjamini_hochberg_correction(p_values: np.ndarray, alpha: float = 0.05) -> np.ndarray:
    """
    Apply Benjamini-Hochberg FDR correction.
    
    Returns:
        Boolean mask of significant results
    """
    n = len(p_values)
    sorted_idx = np.argsort(p_values)
    sorted_p = p_values[sorted_idx]
    
    # BH threshold
    thresholds = (np.arange(1, n + 1) / n) * alpha
    significant = sorted_p <= thresholds
    
    # Find the largest k where p_k <= (k/n)*alpha
    if not significant.any():
        return np.zeros(n, dtype=bool)
    
    k = np.max(np.where(significant)[0]) + 1
    
    # All p-values up to k are significant
    result = np.zeros(n, dtype=bool)
    result[sorted_idx[:k]] = True
    
    return result


print("Correlation functions defined")

Correlation functions defined


In [27]:
# Compute correlations for all features
print("Computing feature-outcome correlations...")
correlation_df = compute_feature_correlations(X_train, y_train, participant_ids, keypoint_cols)
print(f"Computed {len(correlation_df)} correlations")

# Apply FDR correction
correlation_df['pearson_significant'] = benjamini_hochberg_correction(
    correlation_df['pearson_p'].values, alpha=FDR_ALPHA
)
correlation_df['spearman_significant'] = benjamini_hochberg_correction(
    correlation_df['spearman_p'].values, alpha=FDR_ALPHA
)

print(f"\nSignificant correlations (FDR < {FDR_ALPHA}):")
print(f"  Pearson: {correlation_df['pearson_significant'].sum()}")
print(f"  Spearman: {correlation_df['spearman_significant'].sum()}")

Computing feature-outcome correlations...


Computed 1242 correlations

Significant correlations (FDR < 0.05):
  Pearson: 412
  Spearman: 440


In [28]:
# Show top correlations by target
for target in ['angle', 'depth', 'left_right']:
    print(f"\n{'='*60}")
    print(f"Top correlations with {target.upper()}:")
    print(f"{'='*60}")
    
    target_corrs = correlation_df[correlation_df['target'] == target].copy()
    target_corrs['abs_pearson'] = target_corrs['pearson_r'].abs()
    
    top_corrs = target_corrs.nlargest(20, 'abs_pearson')
    display(top_corrs[['feature_name', 'feature_type', 'pearson_r', 'pearson_p', 'pearson_significant', 'spearman_r']].reset_index(drop=True))


Top correlations with ANGLE:


Unnamed: 0,feature_name,feature_type,pearson_r,pearson_p,pearson_significant,spearman_r
0,right_knee_z,raw_mean,-0.511907,1.941219e-24,True,-0.607016
1,left_knee_z,raw_mean,-0.510987,2.419542e-24,True,-0.607697
2,left_ankle_z,raw_mean,-0.503963,1.2728350000000001e-23,True,-0.598643
3,right_shoulder_z,raw_mean,-0.494655,1.083776e-22,True,-0.601835
4,right_eye_x,raw_mean,0.484872,9.604234e-22,True,0.627142
5,right_ear_x,raw_mean,0.483694,1.24312e-21,True,0.611461
6,left_eye_x,raw_mean,0.483664,1.251429e-21,True,0.605986
7,right_ear_z,raw_mean,-0.474406,9.1826e-21,True,-0.60192
8,left_ear_x,raw_mean,0.472018,1.5206689999999998e-20,True,0.584356
9,right_small_toe_x,raw_mean,0.471948,1.543301e-20,True,0.611404



Top correlations with DEPTH:


Unnamed: 0,feature_name,feature_type,pearson_r,pearson_p,pearson_significant,spearman_r
0,left_second_finger_distal_z,raw_std,0.212607,6.9e-05,True,0.225051
1,left_second_finger_dip_z,raw_std,0.20262,0.000151,True,0.217349
2,left_third_finger_distal_z,raw_std,0.202459,0.000153,True,0.222172
3,left_first_finger_distal_z,raw_std,0.199891,0.000186,True,0.206151
4,left_third_finger_dip_z,raw_std,0.19737,0.000225,True,0.217219
5,left_fourth_finger_distal_z,raw_std,0.197168,0.000229,True,0.21989
6,right_heel_z,raw_mean,0.194274,0.000283,True,0.224471
7,left_eye_z,raw_mean,0.192213,0.00033,True,0.172639
8,nose_z,raw_mean,0.191806,0.00034,True,0.171147
9,left_fourth_finger_dip_z,raw_std,0.191579,0.000345,True,0.213326



Top correlations with LEFT_RIGHT:


Unnamed: 0,feature_name,feature_type,pearson_r,pearson_p,pearson_significant,spearman_r
0,right_heel_y,raw_std,0.147477,0.006063,True,0.067751
1,right_ankle_y,raw_std,0.146614,0.00637,True,0.054389
2,right_big_toe_y,raw_std,0.146344,0.006469,True,0.071784
3,right_small_toe_y,raw_std,0.143673,0.007522,True,0.050489
4,right_hip_y,raw_std,0.136603,0.011085,True,0.141457
5,mid_hip_y,raw_std,0.131614,0.014429,True,0.147657
6,right_knee_y,raw_std,0.127034,0.018247,False,0.040429
7,left_hip_y,raw_std,0.126529,0.018718,False,0.154387
8,left_shoulder_y,raw_std,0.122706,0.022637,False,0.167472
9,right_elbow_z,raw_std,0.118746,0.027425,False,0.076957


In [29]:
# Compute per-player correlations for key features
print("Computing per-player correlations...")

# Focus on release-related features
key_keypoints = ['right_wrist', 'right_elbow', 'right_shoulder', 'right_knee', 'mid_hip']
key_feat_indices = []
for kp in key_keypoints:
    if kp in KEYPOINT_INDEX:
        idx = KEYPOINT_INDEX[kp]
        key_feat_indices.extend([idx*3, idx*3+1, idx*3+2])

per_player_corrs = []

for pid in range(1, 6):
    mask = participant_ids == pid
    X_player = X_train[mask]
    y_player = y_train[mask]
    
    for feat_idx in key_feat_indices:
        feat_means = np.nanmean(X_player[:, :, feat_idx], axis=1)
        col_name = keypoint_cols[feat_idx]
        
        for target_idx, target_name in enumerate(['angle', 'depth', 'left_right']):
            target = y_player[:, target_idx]
            
            valid_mask = ~(np.isnan(feat_means) | np.isnan(target))
            if np.sum(valid_mask) < 5:
                continue
            
            pearson_r, pearson_p = pearsonr(feat_means[valid_mask], target[valid_mask])
            
            per_player_corrs.append({
                'player': pid,
                'feature_name': col_name,
                'target': target_name,
                'pearson_r': pearson_r,
                'pearson_p': pearson_p,
                'n_shots': np.sum(valid_mask)
            })

per_player_df = pd.DataFrame(per_player_corrs)
print(f"Computed {len(per_player_df)} per-player correlations")

Computing per-player correlations...
Computed 225 per-player correlations


In [30]:
# Show strongest player-specific correlations
per_player_df['abs_r'] = per_player_df['pearson_r'].abs()

print("Strongest player-specific correlations (|r| > 0.3):")
strong_player_corrs = per_player_df[per_player_df['abs_r'] > 0.3].sort_values('abs_r', ascending=False)
display(strong_player_corrs.head(30))

Strongest player-specific correlations (|r| > 0.3):


Unnamed: 0,player,feature_name,target,pearson_r,pearson_p,n_shots,abs_r
17,1,right_elbow_z,left_right,0.509718,7e-06,70,0.509718
8,1,right_wrist_z,left_right,0.501381,1e-05,70,0.501381
113,3,right_shoulder_y,left_right,-0.459518,8.1e-05,68,0.459518
142,4,right_wrist_z,depth,0.412024,0.000532,67,0.412024
71,2,right_shoulder_z,left_right,-0.407516,0.000683,66,0.407516
95,3,right_wrist_y,left_right,-0.401885,0.000681,68,0.401885
196,5,right_elbow_z,depth,-0.39278,0.000537,74,0.39278
115,3,right_shoulder_z,depth,0.389845,0.001015,68,0.389845
44,1,mid_hip_z,left_right,0.385094,0.000995,70,0.385094
131,3,mid_hip_y,left_right,-0.377009,0.001529,68,0.377009


In [31]:
# Save correlation results
correlation_df.to_csv(output_dir / 'feature_correlations.csv', index=False)
per_player_df.to_csv(output_dir / 'per_player_correlations.csv', index=False)
print(f"Saved correlation results to {output_dir}")

Saved correlation results to ../output


---
## Test Set Analysis

Apply anomaly detection and signature analysis to test set (no correlation analysis since no targets).

In [32]:
# Run anomaly detection on test set
test_anomalies = []
test_anomaly_counts = []

print("Running anomaly detection on test data...")
for shot_idx in range(len(X_test)):
    if shot_idx % 20 == 0:
        print(f"  Processing shot {shot_idx}/{len(X_test)}")
    
    pid = test_participant_ids[shot_idx]
    shot_data = X_test[shot_idx]
    
    anomalies = detect_anomalies_in_shot(
        shot_data, 
        player_profiles[pid], 
        player_velocity_profiles[pid],
        shot_idx,
        keypoint_cols
    )
    
    test_anomalies.extend(anomalies)
    
    test_anomaly_counts.append({
        'shot_idx': shot_idx,
        'shot_id': test_df.iloc[shot_idx]['shot_id'],
        'participant_id': pid,
        'total_anomalies': len(anomalies),
        'z_score_anomalies': sum(1 for a in anomalies if a['method'] == 'z_score'),
        'iqr_anomalies': sum(1 for a in anomalies if a['method'] == 'iqr'),
        'mad_anomalies': sum(1 for a in anomalies if a['method'] == 'mad'),
        'velocity_anomalies': sum(1 for a in anomalies if a['method'] == 'velocity_spike'),
    })

test_anomaly_df = pd.DataFrame(test_anomaly_counts)
print(f"\nTotal test anomalies: {len(test_anomalies)}")
print(f"Test shots with anomalies: {sum(1 for s in test_anomaly_counts if s['total_anomalies'] > 0)}")

Running anomaly detection on test data...
  Processing shot 0/113


  Processing shot 20/113


  Processing shot 40/113


  Processing shot 60/113


  Processing shot 80/113


  Processing shot 100/113



Total test anomalies: 1049565
Test shots with anomalies: 113


In [33]:
# Show test set anomaly summary
print("Test set anomaly summary:")
display(test_anomaly_df.describe())

print("\nTest shots with most anomalies:")
display(test_anomaly_df.nlargest(10, 'total_anomalies'))

Test set anomaly summary:


Unnamed: 0,shot_idx,participant_id,total_anomalies,z_score_anomalies,iqr_anomalies,mad_anomalies,velocity_anomalies
count,113.0,113.0,113.0,113.0,113.0,113.0,113.0
mean,56.0,3.017699,9288.185841,2068.044248,2596.955752,3713.079646,910.106195
std,32.76431,1.439136,3610.522814,1049.305394,1386.938492,1322.494229,571.058406
min,0.0,1.0,2724.0,130.0,386.0,1411.0,52.0
25%,28.0,2.0,6201.0,1153.0,1483.0,2608.0,493.0
50%,56.0,3.0,9334.0,2306.0,2207.0,3444.0,859.0
75%,84.0,4.0,12382.0,2725.0,3836.0,4911.0,1241.0
max,112.0,5.0,18298.0,4544.0,5663.0,6464.0,2724.0



Test shots with most anomalies:


Unnamed: 0,shot_idx,shot_id,participant_id,total_anomalies,z_score_anomalies,iqr_anomalies,mad_anomalies,velocity_anomalies
27,27,97jdZ0R,2,18298,4544,5428,6464,1862
25,25,3xJBQJV,2,17645,4372,5301,6376,1596
61,61,jOaa7QV,3,16811,3673,5509,6286,1343
30,30,KAXlmQr,2,16077,4034,5198,6223,622
62,62,l7aa8en,3,15351,2722,5663,6390,576
37,37,l7ap930,2,15324,3609,4669,5868,1178
51,51,EWaaqvB,3,15059,3324,4849,5470,1416
64,64,qxDDqYx,3,14386,2534,5281,6160,411
0,0,2MqnW3V,1,14345,3129,3301,5544,2371
53,53,MaVVo0n,3,14218,2532,5028,5799,859


In [34]:
# Check for extreme values in test set
print("Checking for extreme values in test set...")

test_extreme = []
for shot_idx in range(len(X_test)):
    shot_data = X_test[shot_idx]
    
    # Check for values outside reasonable range
    for feat_idx in range(shot_data.shape[1]):
        series = shot_data[:, feat_idx]
        
        max_val = np.nanmax(series)
        min_val = np.nanmin(series)
        
        if max_val > 50 or min_val < -50:
            test_extreme.append({
                'shot_idx': shot_idx,
                'feature_idx': feat_idx,
                'feature_name': keypoint_cols[feat_idx],
                'max_value': max_val,
                'min_value': min_val
            })

if len(test_extreme) > 0:
    print(f"Found {len(test_extreme)} extreme values in test set:")
    display(pd.DataFrame(test_extreme))
else:
    print("No extreme values (>50 or <-50) found in test set.")

Checking for extreme values in test set...
Found 2 extreme values in test set:


Unnamed: 0,shot_idx,feature_idx,feature_name,max_value,min_value
0,17,30,right_wrist_x,54.943493,18.383471
1,17,150,right_second_finger_distal_x,56.660088,17.876362


---
## Summary and Verification

Verify completeness of analysis.

In [35]:
print("="*60)
print("ANALYSIS VERIFICATION")
print("="*60)

# Completeness checks
print(f"\n1. Data Coverage:")
print(f"   - Training shots analyzed: {len(X_train)} (expected: 344)")
print(f"   - Test shots analyzed: {len(X_test)} (expected: 112)")
print(f"   - Total shots: {len(X_train) + len(X_test)} (expected: 456)")

print(f"\n2. Player Coverage:")
print(f"   - Players with profiles: {len(player_profiles)} (expected: 5)")
for pid in range(1, 6):
    n_shots = player_profiles[pid]['n_shots']
    print(f"   - Player {pid}: {n_shots} shots")

print(f"\n3. Feature Coverage:")
print(f"   - Features per shot: {NUM_FEATURES} (expected: 207)")
print(f"   - Keypoints: {len(KEYPOINT_NAMES)} (expected: 69)")

print(f"\n4. Anomaly Detection:")
print(f"   - Training anomalies detected: {len(all_anomalies)}")
print(f"   - Test anomalies detected: {len(test_anomalies)}")

print(f"\n5. Correlation Analysis:")
print(f"   - Total correlations computed: {len(correlation_df)}")
print(f"   - Significant (FDR < 0.05): {correlation_df['pearson_significant'].sum()}")

print(f"\n6. Output Files:")
output_files = list(output_dir.glob('*'))
for f in output_files:
    print(f"   - {f.name}")

ANALYSIS VERIFICATION

1. Data Coverage:
   - Training shots analyzed: 345 (expected: 344)
   - Test shots analyzed: 113 (expected: 112)
   - Total shots: 458 (expected: 456)

2. Player Coverage:
   - Players with profiles: 5 (expected: 5)
   - Player 1: 70 shots
   - Player 2: 66 shots
   - Player 3: 68 shots
   - Player 4: 67 shots
   - Player 5: 74 shots

3. Feature Coverage:
   - Features per shot: 207 (expected: 207)
   - Keypoints: 69 (expected: 69)

4. Anomaly Detection:
   - Training anomalies detected: 3309254
   - Test anomalies detected: 1049565

5. Correlation Analysis:
   - Total correlations computed: 1242
   - Significant (FDR < 0.05): 412

6. Output Files:
   - shot_anomaly_report.csv
   - player_1_profile.json
   - per_player_correlations.csv
   - submission.csv
   - lightgbm_model.pkl
   - features_train.pkl
   - augmentation_test_results.csv
   - train_targets_last3.csv
   - player_1_anomalies.json
   - player_5_profile.json
   - player_4_anomalies.json
   - player_2

In [36]:
print("\n" + "="*60)
print("KEY FINDINGS SUMMARY")
print("="*60)

print("\n1. ANOMALIES:")
anomaly_methods = anomaly_df['method'].value_counts()
for method, count in anomaly_methods.items():
    print(f"   - {method}: {count}")

print("\n2. PRE-RELEASE PATTERNS:")
print(f"   - Shots with finger tremors: {len(tremor_shots)}")
print(f"   - Mean elbow lateral drift: {pre_release_df['elbow_lateral_drift'].mean():.4f}")
print(f"   - Mean body lateral sway: {pre_release_df['body_lateral_sway'].mean():.4f}")

print("\n3. PLAYER SIGNATURES:")
print("   Release height by player:")
for pid in range(1, 6):
    mean_h = player_signatures[pid].get('release_height_mean', np.nan)
    std_h = player_signatures[pid].get('release_height_std', np.nan)
    print(f"   - Player {pid}: {mean_h:.3f} +/- {std_h:.3f}")

print("\n4. TOP PREDICTIVE FEATURES (by |correlation|):")
for target in ['angle', 'depth', 'left_right']:
    target_corrs = correlation_df[correlation_df['target'] == target].copy()
    if len(target_corrs) > 0:
        target_corrs['abs_r'] = target_corrs['pearson_r'].abs()
        top = target_corrs.nlargest(3, 'abs_r')
        print(f"   {target}:")
        for _, row in top.iterrows():
            print(f"      - {row['feature_name']}: r={row['pearson_r']:.3f}")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)


KEY FINDINGS SUMMARY

1. ANOMALIES:


   - mad: 1325059


   - iqr: 928263
   - z_score: 752059
   - velocity_spike: 303873

2. PRE-RELEASE PATTERNS:
   - Shots with finger tremors: 5
   - Mean elbow lateral drift: 1.0841
   - Mean body lateral sway: 0.5340

3. PLAYER SIGNATURES:
   Release height by player:
   - Player 1: 5.821 +/- 1.177
   - Player 2: 5.165 +/- 0.401
   - Player 3: 4.208 +/- 0.339
   - Player 4: 4.747 +/- 0.513
   - Player 5: 4.477 +/- 0.485

4. TOP PREDICTIVE FEATURES (by |correlation|):


   angle:
      - right_knee_z: r=-0.512
      - left_knee_z: r=-0.511
      - left_ankle_z: r=-0.504
   depth:
      - left_second_finger_distal_z: r=0.213
      - left_second_finger_dip_z: r=0.203
      - left_third_finger_distal_z: r=0.202
   left_right:
      - right_heel_y: r=0.147
      - right_ankle_y: r=0.147
      - right_big_toe_y: r=0.146

ANALYSIS COMPLETE
