In [1]:
import os
import sys
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from pathlib import Path

# --- Path Setup ---
# The project root is taken to be the parent of the current working directory,
# so this cell assumes the kernel runs from a direct sub-folder of the project
# (presumably `notebooks/` — TODO confirm). Started from anywhere else, every
# path derived below points at the wrong location.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
# Prepend so modules found in src/ win over same-named modules elsewhere on
# sys.path; the membership guard avoids duplicate entries when the cell is re-run.
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

# --- Imports from src ---
# Keep these below the sys.path edit: `config` and `skeleton_defs` are expected
# to be resolved from SRC_PATH.
from config import CONFIG
from skeleton_defs import SKELETON_HIERARCHY 

# --- External Metadata Integration (Subject Data) ---
# Location of the optional JSON file with the subject's anthropometric data
# (read by load_subject_metadata; the file is allowed to be absent).
METADATA_PATH = os.path.join(PROJECT_ROOT, "data", "subject_metadata.json")

def load_subject_metadata(path):
    """
    Load subject-specific data (weight, height) from a JSON file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the metadata JSON file.

    Returns
    -------
    object or None
        The decoded JSON document (whatever ``json.load`` produces), or ``None``
        when ``path`` does not exist. The caller decides how to react to a
        missing file or to missing values.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but does not contain valid JSON.
    """
    if not os.path.exists(path):
        return None

    # JSON is UTF-8 on the wire; without an explicit encoding, open() falls back
    # to the platform default and can mis-decode non-ASCII text (e.g. names).
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Global Subject Variables
# Defaults describe the fallback mode; they are only upgraded below when the
# metadata file provides both weight and height.
metadata = load_subject_metadata(METADATA_PATH)
SUBJECT_WEIGHT = None
SUBJECT_HEIGHT = None
PIPELINE_MODE = "Normalization (Unit Mass)"

# `subject_info` is only usable when it is a JSON object. A missing key, a JSON
# null or any other type is treated as "no subject section" instead of crashing.
info = metadata.get("subject_info") if isinstance(metadata, dict) else None

if metadata is None:
    # load_subject_metadata returns None only when the file does not exist.
    print("‚ÑπÔ∏è No metadata file found. Defaulting to Relative Normalization Mode.")
elif not isinstance(info, dict):
    # The file exists, so do not report it as missing.
    print("‚ö†Ô∏è Metadata file found but it has no usable 'subject_info' section. Defaulting to Relative Normalization Mode.")
else:
    SUBJECT_WEIGHT = info.get("weight_kg")
    SUBJECT_HEIGHT = info.get("height_cm")

    # Truthiness check: null and 0 both count as "not provided".
    if SUBJECT_WEIGHT and SUBJECT_HEIGHT:
        PIPELINE_MODE = "Scientific (Anthropometric)"
        print(f"‚úÖ Scientific Mode: Using Winter (2009) coefficients for {SUBJECT_WEIGHT}kg")
    else:
        print("‚ö†Ô∏è Subject info found but incomplete. Using Internal Normalization.")

# --- Directories ---
# Step 01 is read-only input for this notebook; the step 02 data and QC folders
# are the ones written to, so only those two are created.
_derivatives_root = os.path.join(PROJECT_ROOT, CONFIG['derivatives_dir'])
_qc_root = os.path.join(PROJECT_ROOT, CONFIG['qc_dir'])

DERIV_01 = os.path.join(_derivatives_root, "step_01_parse")
DERIV_02 = os.path.join(_derivatives_root, "step_02_preprocess")
QC_02 = os.path.join(_qc_root, "step_02_preprocess")

for _output_dir in (DERIV_02, QC_02):
    os.makedirs(_output_dir, exist_ok=True)

print("Ready. Mode:", PIPELINE_MODE)
print("Output directory:", DERIV_02)

‚ö†Ô∏è Subject info found but incomplete. Using Internal Normalization.
Ready. Mode: Normalization (Unit Mass)
Output directory: /Users/drorhazan/Documents/untitled folder/Gaga-mocap-Kinematics/derivatives/step_02_preprocess


In [2]:
# --- Data Loading and Run ID Definition ---
# The run identifier is the CSV file name from the config with its extension
# stripped, so this notebook always targets the parquet that notebook 01
# produced for the same recording.
csv_filename = Path(CONFIG['current_csv']).stem
RUN_ID = csv_filename
PARQUET_PATH = Path(DERIV_01, f"{RUN_ID}__parsed_run.parquet")

# Fail fast, with a readable hint, when the upstream artefact is missing.
if not PARQUET_PATH.exists():
    print(f"‚ùå ERROR: Expected parquet file not found: {PARQUET_PATH}")
    print("Did you run notebook 01 first?")
    raise FileNotFoundError(f"Parquet file not found: {PARQUET_PATH}")

print(f"Loading Run ID: {RUN_ID}")
print(f"File: {PARQUET_PATH}")

# df_raw is the untouched reference frame; later cells compare against it.
df_raw = pd.read_parquet(PARQUET_PATH)

print(f"‚úÖ Loaded successfully. Shape: {df_raw.shape}")

Loading Run ID: 734_T1_P1_R1_Take 2025-12-01 02.18.27 PM
File: /Users/drorhazan/Documents/untitled folder/Gaga-mocap-Kinematics/derivatives/step_01_parse/734_T1_P1_R1_Take 2025-12-01 02.18.27 PM__parsed_run.parquet
‚úÖ Loaded successfully. Shape: (30798, 359)


In [None]:
# ---03 Column Renaming and Scientific Joint Filtering ---

# Axis token -> channel suffix, per measurement kind.
_POSITION_AXES = {'X': 'px', 'Y': 'py', 'Z': 'pz'}
_ROTATION_AXES = {'X': 'qx', 'Y': 'qy', 'Z': 'qz', 'W': 'qw'}


def _standardize_column_name(col):
    """Map one raw, colon-delimited label to the 'Joint__channel' convention.

    Only the final token is treated as an axis, so letters X/Y/Z that occur
    inside a joint name are left alone. Labels that are not strings, or that are
    already in the target convention, are returned unchanged.
    """
    if not isinstance(col, str):
        return col

    tokens = [tok for tok in col.split(':') if tok != 'Skeleton']
    if not tokens:
        return col

    body, axis = tokens[:-1], tokens[-1]
    if 'Rotation' in body and axis in _ROTATION_AXES:
        # 'Skeleton:Joint:Rotation:X' -> 'Joint__qx' (suffixes used by gap filling)
        body = [tok for tok in body if tok != 'Rotation']
        channel = _ROTATION_AXES[axis]
    else:
        # 'Skeleton:Joint:Position:X' -> 'Joint__px'
        body = [tok for tok in body if tok != 'Position']
        channel = _POSITION_AXES.get(axis, axis)

    return '__'.join(body + [channel])


def clean_and_filter_joints(df, keywords_to_drop):
    """
    Standardize column names and drop non-essential joints.

    1. Renames columns to the 'Joint__channel' convention. Position axes become
       px/py/pz and rotation axes become qx/qy/qz/qw. The axis mapping is applied
       to the trailing token only; a blanket substring replace would corrupt any
       label containing X, Y or Z and would tag rotation axes as positions.
    2. Drops every column whose name contains one of ``keywords_to_drop``
       (case-insensitive substring match).
    3. Prints a warning if any segment on the built-in critical list is absent
       after filtering. The check is informational and never raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed capture data. Not modified.
    keywords_to_drop : iterable of str
        Substrings identifying joints to remove (e.g. finger digits).

    Returns
    -------
    pandas.DataFrame
        A new frame with standardized names and the matching columns removed.
    """
    # 1. Standardize names
    df = df.rename(columns={col: _standardize_column_name(col) for col in df.columns})

    # 2. Filter non-essential joints (keywords lower-cased once, outside the loop)
    lowered_keywords = [kw.lower() for kw in keywords_to_drop]
    cols_to_drop = [
        col for col in df.columns
        if any(kw in str(col).lower() for kw in lowered_keywords)
    ]
    df_filtered = df.drop(columns=cols_to_drop)

    # 3. Integrity check: segments the downstream notebooks are stated to need
    critical_segments = [
        "Hips", "Spine", "Spine1", "Head",                 # Core / Trunk
        "LeftUpLeg", "LeftLeg", "RightUpLeg", "RightLeg",  # Lower Body
        "LeftArm", "RightArm"                              # Upper Body
    ]

    remaining_joints = {
        c.split('__')[0] for c in df_filtered.columns
        if isinstance(c, str) and '__' in c
    }
    missing_critical = [s for s in critical_segments if s not in remaining_joints]

    if missing_critical:
        print(f"‚ö†Ô∏è SCIENTIFIC WARNING: Missing joints required for CoM/Angles: {missing_critical}")
        print("Check if they were accidentally dropped or missing in the Raw CSV.")
    else:
        print("üíé SCIENTIFIC INTEGRITY: All hierarchy-essential segments are present.")

    return df_filtered

# --- EXECUTE ---
# Only finger digits are listed; 'ToeBase' is deliberately absent so the toe
# segment of the hierarchy survives the filter.
DROP_KEYWORDS = [
    "Thumb",
    "Index",
    "Middle",
    "Ring",
    "Pinky",
    "Finger",
]

df_preprocessed = clean_and_filter_joints(df=df_raw, keywords_to_drop=DROP_KEYWORDS)

print(f"‚úÖ Data standardized to Hierarchy Report. Shape: {df_preprocessed.shape}")

üíé SCIENTIFIC INTEGRITY: All hierarchy-essential segments are present.
‚úÖ Data standardized to Hierarchy Report. Shape: (30798, 149)


In [None]:
# --- UPDATED CELL 04: Build Kinematics Map (The Scientific Blueprint) ---

def build_map_from_available_joints(df_columns, hierarchy_dict):
    """
    Build the kinematics map for the joints that are present in the data.

    Parameters
    ----------
    df_columns : iterable of str
        Column labels of the preprocessed frame. A joint counts as present when
        at least one label of the form '<Joint>__<channel>' exists for it.
    hierarchy_dict : dict
        Maps each segment name to a dict with the keys 'parent' (a segment name,
        or None for a root) and 'angle_name'.

    Returns
    -------
    dict
        ``{segment: {"parent", "angle_name", "is_global"}}`` for every segment
        that is present and whose parent (if it has one) is present too.

    Notes
    -----
    Orphaned joints (present, but parent missing) are counted separately and
    reported on their own line, so that skipped + orphaned + mapped always adds
    up to the schema total. The extra line is printed only when orphans exist.
    """
    print(f"\n{'='*20} BUILDING KINEMATICS MAP {'='*20}")

    # Joint name = text before the first '__' of each channel column.
    existing_segments = {c.split('__')[0] for c in df_columns if '__' in c}

    kinematics_map = {}
    skipped_count = 0
    orphaned_count = 0

    for segment, info in hierarchy_dict.items():
        parent = info['parent']
        angle_name = info['angle_name']

        # 1. Joint absent from the data (never captured, or filtered out)
        if segment not in existing_segments:
            skipped_count += 1
            continue

        # 2. Relative calculations need the parent segment as well
        if parent is not None and parent not in existing_segments:
            print(f"‚ö†Ô∏è SCIENTIFIC WARNING: Orphaned Joint '{segment}'.")
            print(f"   Cannot calculate '{angle_name}' because parent '{parent}' was filtered out.")
            orphaned_count += 1
            continue

        # 3. Validation passed
        kinematics_map[segment] = {
            "parent": parent,
            "angle_name": angle_name,
            "is_global": (parent is None)
        }

    kept_count = len(kinematics_map)

    print(f"Total defined in Schema: {len(hierarchy_dict)}")
    print(f"Skipped (Missing/Filtered): {skipped_count}")
    if orphaned_count:
        print(f"Orphaned (Parent Missing): {orphaned_count}")
    print(f"Mapped (Ready for Physics):  {kept_count}")
    print(f"{'='*45}\n")

    return kinematics_map

# --- EXECUTE ---
# Built from the filtered frame of Cell 3, so only surviving joints are mapped.
kinematics_map = build_map_from_available_joints(
    df_columns=df_preprocessed.columns,
    hierarchy_dict=SKELETON_HIERARCHY,
)


Total defined in Schema: 27
Skipped (Missing/Filtered): 6
Mapped (Ready for Physics):  21



In [14]:
# --- CELL 05: Gap Filling and Rotational Re-normalization ---
# RATIONALE: Biomechanical analysis requires continuous derivative signals (velocity/acceleration).
# Small gaps (<100ms) are safely interpolated to prevent signal fragmentation.

# Configuration: 10 frames at 120Hz = 83.3ms. 
# Scientific Limit: Skurowski (2021) suggests avoiding interpolation for gaps > 100ms in dynamic movement.
# NOTE(review): the limit is expressed in frames, so the 83.3 ms figure only holds
# for a 120 Hz recording — confirm against the actual capture rate of the take.
# NOTE(review): the summary export in the final cell reads the gap size from
# CONFIG ('MAX_GAP_SIZE', falling back to 10), not from this constant; keep the
# two in sync when changing this value.
MAX_GAP_SIZE = 10  

def _short_gap_mask(is_missing, max_gap):
    """Mark the missing samples that belong to runs of at most ``max_gap`` samples.

    ``is_missing`` is a 1-D boolean array-like; the result is a boolean array of
    the same length. ``max_gap=None`` means "no limit" (every gap is fillable).
    """
    is_missing = np.asarray(is_missing, dtype=bool)
    if max_gap is None:
        return is_missing.copy()

    # Pad with False so runs touching either end still produce a start/stop pair.
    padded = np.concatenate(([False], is_missing, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])

    fillable = np.zeros(is_missing.shape, dtype=bool)
    # Edges alternate start, stop (stop exclusive) for each run of missing samples.
    for start, stop in zip(edges[::2], edges[1::2]):
        if stop - start <= max_gap:
            fillable[start:stop] = True
    return fillable


def fill_missing_data(df, max_gap):
    """
    Fill short gaps by linear interpolation and re-normalize quaternions.

    1. For every numeric column, NaN runs of at most ``max_gap`` samples are
       filled linearly. Runs touching the start or end of the column are filled
       with the nearest valid value. Longer runs are left entirely as NaN.
       Passing ``limit=max_gap`` to ``DataFrame.interpolate`` would not do this:
       it caps how many samples are filled per gap, so it fills the edges of
       long gaps too.
    2. For every segment with all four '__qx/__qy/__qz/__qw' columns, each row
       is divided by its norm, because component-wise interpolation does not
       keep quaternions at unit length. Rows with zero norm are left unchanged;
       rows with a NaN component stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified. Column labels are assumed to be unique.
    max_gap : int or None
        Longest NaN run, in samples, that may be filled. ``None`` fills all gaps.

    Returns
    -------
    pandas.DataFrame
        A filled and re-normalized copy of ``df``.
    """
    df_clean = df.copy()

    # 1. Linear interpolation, restricted to numeric columns and to short gaps
    for col in df_clean.select_dtypes(include='number').columns:
        series = df_clean[col]
        fillable = _short_gap_mask(series.isna().to_numpy(), max_gap)
        if not fillable.any():
            continue
        interpolated = series.interpolate(method='linear', limit_direction='both')
        # Take interpolated values only where the gap is short enough.
        df_clean[col] = series.where(~fillable, interpolated)

    # 2. Quaternion re-normalization
    # NOTE(review): component-wise interpolation does not account for q / -q
    # sign flips between neighbouring frames; confirm the upstream export keeps
    # quaternions sign-continuous.
    quat_suffixes = ('__qx', '__qy', '__qz', '__qw')
    segments = {
        c.split('__')[0] for c in df_clean.columns
        if isinstance(c, str) and c.endswith(quat_suffixes)
    }

    for seg in segments:
        comp_cols = [f"{seg}{suffix}" for suffix in quat_suffixes]
        if any(c not in df_clean.columns for c in comp_cols):
            continue  # Skip segments that lack one of the four components

        quat = df_clean[comp_cols]
        # skipna=False so a NaN component yields a NaN norm instead of a partial sum.
        norms = np.sqrt((quat ** 2).sum(axis=1, skipna=False))
        norms = norms.where(norms != 0, 1.0)  # Avoid division by zero
        df_clean[comp_cols] = quat.div(norms, axis=0)

    return df_clean

print(f"Running Scientific Gap Filling (Max Gap: {MAX_GAP_SIZE} frames)...")
# Continues the chain from Cell 03: the filtered frame is replaced by its
# gap-filled version under the same name.
df_preprocessed = fill_missing_data(df=df_preprocessed, max_gap=MAX_GAP_SIZE)

# Whatever is still NaN at this point was not filled by the step above.
remaining_nans = df_preprocessed.isna().sum().sum()
print("‚úÖ Gap Filling Complete.")
print(f"üìä Quality Control: Remaining NaNs (Critical Gaps): {remaining_nans}")

Running Scientific Gap Filling (Max Gap: 10 frames)...
‚úÖ Gap Filling Complete.
üìä Quality Control: Remaining NaNs (Critical Gaps): 0


In [None]:
# --- CELL 06: Missing Data Scientific Report ---

def print_missing_data_report(df_raw, df_filled):
    """
    Print how much data was missing before and after gap filling.

    The comparison is restricted to the columns both frames share. Otherwise
    NaNs in columns that were dropped between the two frames (e.g. filtered
    joints) would be counted as "interpolated" and would inflate the
    denominator. If the frames share no column labels at all, the totals fall
    back to whole-frame counts and a note is printed.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame before gap filling.
    df_filled : pandas.DataFrame
        Frame after gap filling.

    Returns
    -------
    None
        The report is written to stdout.
    """
    print(f"\n{'='*20} DATA RELIABILITY REPORT {'='*20}")

    shared_cols = [c for c in df_filled.columns if c in df_raw.columns]
    if shared_cols:
        raw_view, filled_view = df_raw[shared_cols], df_filled[shared_cols]
    else:
        raw_view, filled_view = df_raw, df_filled
        print("note: frames share no column labels; totals are computed over the whole frames.")

    total_cells = raw_view.size
    nans_before = raw_view.isna().sum().sum()
    nans_after = filled_view.isna().sum().sum()
    interpolated_points = nans_before - nans_after

    def _percent(count):
        # Guard the empty-frame case instead of dividing by zero.
        if not total_cells:
            return 0.0
        return round(float(count) / total_cells * 100, 3)

    print(f"Overall Dataset Integrity:")
    print(f"- Original Missing Data: {_percent(nans_before)}%")
    print(f"- Interpolated Data:      {_percent(interpolated_points)}%")
    print(f"- Remaining Unsolved Gaps: {_percent(nans_after)}%")

    # Per-column breakdown: the five columns with the most filled samples
    print(f"\nPer-Joint Integrity Check (Top 5 Interpolated):")
    diff = (raw_view.isna().sum() - filled_view.isna().sum()).sort_values(ascending=False)
    n_rows = len(df_raw)
    for joint, count in diff.head(5).items():
        if count > 0:
            percentage = float(count) / n_rows * 100
            print(f"  * {str(joint).replace('__px', ''):<15} : {round(percentage, 2)}% Interpolated")

    print(f"{'='*50}\n")

# --- EXECUTE ---
# Reference frame: df_raw as loaded (before cleaning and filling);
# candidate frame: the current df_preprocessed.
print_missing_data_report(df_raw=df_raw, df_filled=df_preprocessed)
# Static annotation: this line is printed as-is, whatever the report above shows.
print("note: Original data = 0% - Optitrack confidence-based cleaning")


Overall Dataset Integrity:
- Original Missing Data: 0.0%
- Interpolated Data:      0.0%
- Remaining Unsolved Gaps: 0.0%

Per-Joint Integrity Check (Top 5 Interpolated):



In [17]:
# --- QC Stage: Bone Length Check (Fail Fast) ---
# SCIENTIFIC RATIONALE: Skurowski (2021) identifies bone length consistency 
# as the primary metric for motion capture data quality. High CV% indicates 
# marker occlusion or reconstruction artifacts.

def run_bone_length_qc(df, hierarchy, cfg, unit_to_mm=1000.0):
    """
    Quality gate: check that parent->child joint distances stay constant.

    For every non-root entry of ``hierarchy``, the per-frame Euclidean distance
    between the child and parent positions is computed, then summarized as mean
    length and coefficient of variation (CV = std / mean, NaN frames ignored).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with '<Joint>__px', '<Joint>__py', '<Joint>__pz' columns.
    hierarchy : dict
        Maps each child joint to a dict with a 'parent' key (None for the root).
    cfg : dict
        Must contain 'THRESH', a mapping that may provide 'BONE_CV_WARN'
        (default 0.02) and 'BONE_CV_ALERT' (default 0.05).
    unit_to_mm : float, optional
        Factor converting position units to millimetres for the 'Mean_mm'
        column. The default 1000.0 assumes positions in metres.
        NOTE(review): the recorded output of this notebook lists means of
        roughly 80,000-390,000 "mm", which suggests that take was already in
        millimetres; pass ``unit_to_mm=1.0`` in that case — confirm the export
        units.

    Returns
    -------
    pandas.DataFrame
        Columns 'Bone', 'Mean_mm', 'CV%', 'Status', sorted by 'CV%' descending.
        The columns are present even when no bone could be checked.

    Notes
    -----
    Bones whose joint columns are missing are skipped. A bone with no valid
    frame at all gets NaN metrics and a FAIL status, since it cannot be
    validated.
    """
    print(f"\n{'='*20} BONE LENGTH QC (Scientific Validation) {'='*20}")

    # Thresholds: 2% for Warning, 5% for Critical failure
    thresh_warn = cfg['THRESH'].get('BONE_CV_WARN', 0.02)
    thresh_alert = cfg['THRESH'].get('BONE_CV_ALERT', 0.05)

    results = []

    for child_name, info in hierarchy.items():
        parent_name = info['parent']
        if parent_name is None:
            continue  # Skip Root

        try:
            c_pos = df[[f"{child_name}__px", f"{child_name}__py", f"{child_name}__pz"]].values
            p_pos = df[[f"{parent_name}__px", f"{parent_name}__py", f"{parent_name}__pz"]].values
        except KeyError:
            continue  # One of the two joints has no position columns

        lengths = np.linalg.norm(c_pos - p_pos, axis=1)

        if lengths.size == 0 or np.isnan(lengths).all():
            # No usable frame: report as failed instead of a spurious CV of 0.
            mean_l = float('nan')
            cv = float('nan')
            status = "FAIL üî¥"
        else:
            mean_l = np.nanmean(lengths)
            std_l = np.nanstd(lengths)
            cv = std_l / mean_l if mean_l > 0 else 0.0

            status = "PASS"
            if cv > thresh_alert:
                status = "FAIL üî¥"
            elif cv > thresh_warn:
                status = "WARN üü°"

        results.append({
            "Bone": f"{parent_name}->{child_name}",
            "Mean_mm": round(mean_l * unit_to_mm, 1),
            "CV%": round(cv * 100, 2),
            "Status": status
        })

    # Explicit columns keep the frame usable (sortable, filterable) when empty.
    df_qc = pd.DataFrame(results, columns=["Bone", "Mean_mm", "CV%", "Status"])
    df_qc = df_qc.sort_values("CV%", ascending=False)

    # Summary reporting
    n_fails = int(df_qc['Status'].astype(str).str.contains("FAIL").sum())
    print(f"Checked {len(df_qc)} bones.")

    if df_qc.empty:
        print("‚ö†Ô∏è SCIENTIFIC WARNING: No bones could be checked (no parent/child position columns found).")
    elif n_fails > 0:
        print(f"‚õî SCIENTIFIC ALERT: {n_fails} bones exceed the 5% instability threshold.")
    else:
        print("‚úÖ SUCCESS: Rigid body integrity confirmed. Ready for Kinematic Derivatives.")

    return df_qc

# --- EXECUTE ---
# The QC table is printed in full so it can be quoted in a report.

# 1. Run the QC on the gap-filled frame, using only the validated joint chains
df_bone_qc = run_bone_length_qc(df=df_preprocessed, hierarchy=kinematics_map, cfg=CONFIG)

# 2. One line per bone, in the order returned by the QC
_table_rule = "-" * 65
print(f"\n{'='*25} DETAILED SEGMENT ANALYSIS {'='*25}")
print(f"{'Bone Segment':<30} | {'Mean (mm)':<10} | {'CV (%)':<8} | {'Status'}")
print(_table_rule)

for _, row in df_bone_qc.iterrows():
    # Bones with a CV below 1% get a visual marker
    if row['CV%'] < 1.0:
        precision_star = "‚≠ê"
    else:
        precision_star = "  "
    print(f"{row['Bone']:<30} | {row['Mean_mm']:<10} | {row['CV%']:<8} | {row['Status']} {precision_star}")

# 3. Aggregate figure for the master report
mean_overall_cv = df_bone_qc['CV%'].mean()
print(_table_rule)
print(f"üìä SCIENTIFIC SUMMARY: Mean Segment CV across all bones: {mean_overall_cv:.2f}%")
print("RATIONALE: A mean CV below 2% indicates high-fidelity tracking (Skurowski, 2021).")


Checked 20 bones.
‚úÖ SUCCESS: Rigid body integrity confirmed. Ready for Kinematic Derivatives.

Bone Segment                   | Mean (mm)  | CV (%)   | Status
-----------------------------------------------------------------
Hips->Spine                    | 80827.4    | 4.89     | WARN üü°   
Neck->Head                     | 137793.9   | 2.37     | WARN üü°   
Spine->Spine1                  | 213042.1   | 1.85     | PASS   
Spine1->Neck                   | 220539.0   | 1.49     | PASS   
Spine1->LeftShoulder           | 174661.6   | 0.0      | PASS ‚≠ê
RightArm->RightForeArm         | 251313.9   | 0.0      | PASS ‚≠ê
RightShoulder->RightArm        | 160475.6   | 0.0      | PASS ‚≠ê
Spine1->RightShoulder          | 174661.7   | 0.0      | PASS ‚≠ê
LeftForeArm->LeftHand          | 233159.2   | 0.0      | PASS ‚≠ê
LeftArm->LeftForeArm           | 251313.9   | 0.0      | PASS ‚≠ê
LeftShoulder->LeftArm          | 160475.6   | 0.0      | PASS ‚≠ê
RightLeg->RightFoot            | 389935.

In [18]:
# --- CELL 08: Scientific Data Persistence ---
# Two artefacts are written for the downstream notebooks, both keyed by RUN_ID:
# the preprocessed frame (parquet) and the kinematics map (JSON) built from it.

# 1. Preprocessed data; the index is not stored
out_parquet_path = os.path.join(DERIV_02, f"{RUN_ID}__preprocessed.parquet")
df_preprocessed.to_parquet(out_parquet_path, index=False)

# 2. Kinematics map, serialized with a 4-space indent
out_map_path = os.path.join(DERIV_02, f"{RUN_ID}__kinematics_map.json")
with open(out_map_path, 'w') as f:
    f.write(json.dumps(kinematics_map, indent=4))

print("\n‚úÖ PERSISTENCE SUCCESS!")
print(f"üìä Kinematic Data: {out_parquet_path}")
print(f"üß¨ Kinematics Map: {out_map_path}")
print("\nProceeding to Notebook 03 (Resample).")


‚úÖ PERSISTENCE SUCCESS!
üìä Kinematic Data: /Users/drorhazan/Documents/untitled folder/Gaga-mocap-Kinematics/derivatives/step_02_preprocess/734_T1_P1_R1_Take 2025-12-01 02.18.27 PM__preprocessed.parquet
üß¨ Kinematics Map: /Users/drorhazan/Documents/untitled folder/Gaga-mocap-Kinematics/derivatives/step_02_preprocess/734_T1_P1_R1_Take 2025-12-01 02.18.27 PM__kinematics_map.json

Proceeding to Notebook 03 (Resample).


In [19]:
# --- CELL 09: FINAL CELL - Export Preprocessing Summary for Master Report ---

def export_preprocess_summary(df_pre, df_post, df_bone_qc, run_id, save_dir, cfg):
    """
    Write a QC summary of the preprocessing step to JSON and print its key figures.

    Parameters
    ----------
    df_pre : pandas.DataFrame
        Frame before preprocessing.
    df_post : pandas.DataFrame
        Frame after preprocessing.
    df_bone_qc : pandas.DataFrame
        Bone QC table with 'Bone', 'CV%' and 'Status' columns. An empty or
        column-less frame is accepted and reported as "no QC data".
    run_id : str
        Identifier used in the output file name.
    save_dir : str
        Existing directory that receives '<run_id>__preprocess_summary.json'.
    cfg : dict
        Configuration; 'MAX_GAP_SIZE' is read from it (default 10).
        NOTE(review): this is the configured value, which is not necessarily
        the gap limit the notebook actually applied — confirm they match.

    Returns
    -------
    None

    Notes
    -----
    Each missing-data percentage is relative to the size of its own frame, so
    columns dropped during preprocessing do not dilute the "post" figure.
    """
    def _missing_percent(frame):
        # Percentage of NaN cells in `frame`; 0.0 for a frame without cells.
        n_cells = frame.size
        if not n_cells:
            return 0.0
        return round(float(frame.isna().sum().sum()) / n_cells * 100, 3)

    # Bone QC metrics; 100.0 is the "no usable QC data" sentinel.
    has_qc = (not df_bone_qc.empty) and {'Bone', 'CV%', 'Status'} <= set(df_bone_qc.columns)

    mean_cv = float(df_bone_qc['CV%'].mean()) if has_qc else 100.0
    if np.isnan(mean_cv):
        mean_cv = 100.0

    if has_qc:
        flagged = df_bone_qc['Status'].astype(str).str.contains("FAIL|WARN")
        alerts = df_bone_qc.loc[flagged, 'Bone'].tolist()
        # Rank here instead of trusting the caller's row order; the stable sort
        # keeps the caller's order among ties.
        ranked = df_bone_qc.sort_values('CV%', ascending=False, kind='mergesort')
        worst_bone = ranked.iloc[0]['Bone']
    else:
        alerts = []
        worst_bone = "None"

    summary = {
        "run_id": run_id,
        "raw_missing_percent": _missing_percent(df_pre),
        "post_missing_percent": _missing_percent(df_post),
        "max_interpolation_gap": cfg.get('MAX_GAP_SIZE', 10),
        "bone_qc_mean_cv": round(mean_cv, 3),
        "bone_qc_status": "GOLD" if mean_cv < 1.0 else "SILVER" if mean_cv < 5.0 else "REJECT",
        "bone_qc_alerts": alerts,
        "worst_bone": worst_bone,
        "interpolation_method": "linear_quaternion_normalized"
    }

    # Save to JSON
    out_path = os.path.join(save_dir, f"{run_id}__preprocess_summary.json")
    with open(out_path, 'w') as f:
        json.dump(summary, f, indent=4)

    print(f"\n{'='*20} PREPROCESS SUMMARY EXPORTED {'='*20}")
    print(f"‚úÖ Summary Path: {out_path}")
    print(f"üìä Bone QC Mean CV: {summary['bone_qc_mean_cv']}% (Status: {summary['bone_qc_status']})")
    print(f"üìâ Missing Data: {summary['raw_missing_percent']}% -> {summary['post_missing_percent']}%")
    print(f"{'='*50}\n")

# --- EXECUTE ---
# The summary lands next to the other step-02 artefacts for this run.
export_preprocess_summary(
    df_pre=df_raw,
    df_post=df_preprocessed,
    df_bone_qc=df_bone_qc,
    run_id=RUN_ID,
    save_dir=DERIV_02,
    cfg=CONFIG,
)


‚úÖ Summary Path: /Users/drorhazan/Documents/untitled folder/Gaga-mocap-Kinematics/derivatives/step_02_preprocess/734_T1_P1_R1_Take 2025-12-01 02.18.27 PM__preprocess_summary.json
üìä Bone QC Mean CV: 0.53% (Status: GOLD)
üìâ Missing Data: 0.0% -> 0.0%

