### Pre-process OpenPose files

This code reads in the raw OpenPose files, selects the relevant columns to keep, interpolates gaps, filters the data, and calculates various metrics (including Euclidean distance, velocity, and acceleration) for the head and body keypoints.

In [None]:
# Import relevant packages
from utils_dir import preprocess_utils as pre
import os
from tqdm import tqdm

def generate_file_name(name):
    """Return *name* with every "_pose_combined" occurrence replaced by "_clean".

    Names that do not contain "_pose_combined" are returned unchanged.
    """
    raw_marker, clean_marker = "_pose_combined", "_clean"
    return name.replace(raw_marker, clean_marker)

In [None]:
"""
Pipeline to process raw OpenPose timeseries data, calculate metrics, and store results.

Steps included:
1. Set the input directory and specify the keypoint columns to keep for analysis.
2. Set the (optional) directory for filtered/pre-processed data.
3. Run pre-processing on all CSV files in the input directory.
4. Extract processed data and missing-data information.
5. Calculate metrics using sliding windows on the processed data.
6. Set the output directory for the metrics CSV files.
7. Store each metrics DataFrame as a CSV file in the output directory.
"""

# 1. Set input directory path and the keypoints to retain

directory = "...../Raw_Pose_Data"  # Directory containing the OpenPose CSV files.

columns_to_keep = ["Nose", "Neck", "RShoulder", "LShoulder", "REye", "LEye"]  # Keypoints to retain.

# 2. Set directory for filtered/pre-processed data (optional)

# Single source of truth for the filtered-data location: this one value decides
# whether a directory is created and is what gets handed to the pipeline below.
# With None, the pipeline receives filtered_data_path=None and no directory is
# created; set it to a directory path to have that directory created and passed on.
filtered_dir = None
if filtered_dir is not None:
    os.makedirs(filtered_dir, exist_ok=True)  # Ensure directory exists

# 3. Run the pre-processing pipeline

results = pre.process_all_csv_files(
    directory,                        # Directory of raw CSV files
    columns_to_keep,                  # Keypoints to retain
    filtered_data_path=filtered_dir,  # Optional path to store filtered data
    conf_threshold=0.3,               # Confidence threshold for keypoints
    interpolate_max=25,               # Maximum frames to interpolate missing values
    meta_data='metaData_coding.csv',  # Optional meta-data file for files
    # Optional list of files/couples to remove.
    # These couples were removed due to issues with video recording.
    rem_couples=['163', '164'],
)

# 4. Extract results from pre-processing

processed_data = results['processed_data']                  # Dictionary of file_name -> DataFrame
missing_info_all_files = results['missing_info_all_files']  # Info about missing data

# 5. Calculate metrics for processed data

metrics_data_frames = pre.calculate_metrics_for_dataframes(
    processed_data=processed_data,
    columns_to_keep=columns_to_keep,
    # NOTE(review): presumably 25 frames per second x 60 seconds, i.e. a
    # one-minute window expressed in frames -- confirm against preprocess_utils.
    window_size=25 * 60,
    window_overlap=0.5,
    diagnostics_plots=False,  # Whether to generate diagnostics plots
)

# 6. Set output directory for metrics CSVs

output_directory = "processed_timeseries"  # Directory to store final metrics CSV files
os.makedirs(output_directory, exist_ok=True)


# 7. Save each metrics DataFrame as a CSV

for name, df in tqdm(metrics_data_frames.items(), desc=f"Storing files to {output_directory}"):
    # Derive the output file name from the input name ("_pose_combined" -> "_clean")
    new_file_name = generate_file_name(name)
    output_file_path = os.path.join(output_directory, f"{new_file_name}.csv")
    # Save DataFrame without row indices
    df.to_csv(output_file_path, index=False)
