In [1]:
import numpy as np
import xarray as xr
from pathlib import Path
from ethograph.utils.io import TrialTree, set_media_files
from movement.kinematics import compute_velocity, compute_speed

from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import xarray as xr


In [2]:

def from_csv_with_behaviors(
    file_path: Path | str,
    fps: Optional[float] = None,
) -> xr.Dataset:
    """Convert a CSV with pose and behavior data to a movement-style dataset.

    The CSV is read with one row per frame. Expected columns are looked up by
    name; a column that is absent is skipped and its slot keeps the fill value
    (0 for positions and center of mass, NaN for labels, 1 for confidence,
    which is never read from the CSV).

    Parameters
    ----------
    file_path : Path or str
        Path to CSV file containing pose and behavior data.
    fps : float, optional
        Frames per second of the video. When given, the ``time`` coordinate
        is in seconds (frame index / fps) and ``fps`` is stored in the
        dataset attributes. When ``None``, the ``time`` coordinate holds the
        frame indices (as floats) and no ``fps`` attribute is set.

    Returns
    -------
    xr.Dataset
        Dataset with the variables

        - ``position``: (time, position_type, space, keypoints, individuals),
          holding both the "aligned" and "absolute" pose tracks
        - ``confidence``: (time, position_type, keypoints, individuals)
        - ``labels``: (time, individuals), coarse behavior annotations
        - ``center_of_mass``: (time, space, individuals)

    Raises
    ------
    ValueError
        If ``fps`` is given but is not strictly positive.
    """
    if fps is not None and fps <= 0:
        raise ValueError(f"fps must be a positive number, got {fps!r}")

    df = pd.read_csv(file_path)

    # Fixed layout of the recording: names used both as coordinates and to
    # build the CSV column names below.
    keypoint_names = [
        "HeadF", "HeadB", "HeadL", "SpineF", "SpineM", "SpineL",
        "Offset1", "Offset2", "HipL", "HipR", "ShoulderL", "ShoulderR"
    ]
    individual_names = ["an1", "an2"]
    position_types = ["aligned", "absolute"]
    space_names = ["x", "y", "z"]
    # Map our position types to the CSV column prefixes
    csv_prefixes = {
        "aligned": "alignedPosition",
        "absolute": "absolutePosition",
    }

    n_frames = len(df)
    n_keypoints = len(keypoint_names)
    n_individuals = len(individual_names)
    n_space = len(space_names)
    n_position_types = len(position_types)

    # Positions default to 0 and confidence to 1; confidence carries the
    # position_type dimension as well so it lines up with `position`.
    position_array = np.zeros(
        (n_frames, n_position_types, n_space, n_keypoints, n_individuals)
    )
    confidence_array = np.ones(
        (n_frames, n_position_types, n_keypoints, n_individuals)
    )

    # Fill position data for both aligned and absolute tracks
    for p, pos_type in enumerate(position_types):
        csv_prefix = csv_prefixes[pos_type]
        for i, individual in enumerate(individual_names):
            for j, keypoint in enumerate(keypoint_names):
                for k, coord in enumerate(space_names):
                    col_name = f"{csv_prefix}_{individual}_{keypoint}_{coord}"
                    if col_name in df.columns:
                        position_array[:, p, k, j, i] = df[col_name].values

    # Without an fps the time axis cannot be expressed in seconds, so fall
    # back to frame indices instead of dividing by None.
    time_coords = np.arange(n_frames, dtype=float)
    if fps is not None:
        time_coords = time_coords / fps

    # Create base dataset with position_type as a coordinate
    ds = xr.Dataset(
        data_vars={
            "position": xr.DataArray(
                position_array,
                dims=["time", "position_type", "space", "keypoints", "individuals"],
            ),
            "confidence": xr.DataArray(
                confidence_array,
                dims=["time", "position_type", "keypoints", "individuals"],  # confidence across space
            ),
        },
        coords={
            "time": time_coords,
            "position_type": position_types,
            "space": space_names,
            "keypoints": keypoint_names,
            "individuals": individual_names,
        },
        attrs={
            "source_software": "DeepLabCut",  # for compatibility with movement napari
            "ds_type": "poses",
        }
    )

    # Only record fps when it is known; a None attribute is left out so the
    # dataset attributes stay plain numbers/strings.
    if fps is not None:
        ds.attrs["fps"] = fps

    # Add behavioral annotations (column name -> index along `individuals`)
    behavior_mapping = {
        "behaviorCoarse_an1": 0,
        "behaviorCoarse_an2": 1,
    }
    behavior_coarse = np.full((n_frames, n_individuals), np.nan)

    for col, ind_idx in behavior_mapping.items():
        if col in df.columns:
            behavior_coarse[:, ind_idx] = df[col].values

    ds["labels"] = xr.DataArray(
        behavior_coarse,
        dims=["time", "individuals"],
        attrs={
            "description": "Coarse behavior annotations",
            "classes": [
                "Idle", "SmallMovement", "HeadTilt", "Groom", "Sniff",
                "Investigate", "RearUp", "RearDown", "CrouchExplore",
                "Amble", "Locomotion"
            ]
        }
    )

    # Add center of mass data if present.
    # NOTE(review): the column prefix is spelled "centerOfmass" (lowercase m);
    # confirm it matches the CSV header, since a mismatch silently leaves zeros.
    com_data = np.zeros((n_frames, n_space, n_individuals))
    for i, individual in enumerate(individual_names):
        for j, coord in enumerate(space_names):
            col_name = f"centerOfmass_{individual}_{coord}"
            if col_name in df.columns:
                com_data[:, j, i] = df[col_name].values

    ds["center_of_mass"] = xr.DataArray(
        com_data,
        dims=["time", "space", "individuals"],
        attrs={"description": "Center of mass for each individual"}
    )

    return ds



# Load the full recording (all frames) from the marker CSV into one dataset.
ds_full = from_csv_with_behaviors(
    file_path=r"C:\Users\aksel\Documents\Code\EthoGraph\data\20210119_Recording_SR1_SR2_social_vidtwo\markerDataset.csv",
    fps=120,  # frames per second of the recording; set your actual fps
)

In [3]:
# Configuration: chunking, video layout and recording parameters
CHUNK_SIZE = 3_500  # frames per chunk (one chunk becomes one trial)
VIDEO_DIR = Path("data") / "20210119_Recording_SR1_SR2_social_vidtwo" / "videos"
CAMERAS = [f"Camera{idx}" for idx in range(1, 7)]  # "Camera1" .. "Camera6"
# NOTE(review): N_FRAMES is not referenced in the visible cells, and the
# recorded split output (30 chunks of 3500 + 3000 discarded = 108000 frames)
# does not match this value — confirm before relying on it.
N_FRAMES = 213_500
FPS = 120  # frames per second

In [4]:
def get_video_paths_for_chunk(start_frame: int, video_dir: Path, cameras: list[str]) -> list[str]:
    """Build one video path per camera for the chunk starting at ``start_frame``.

    Each returned path has the form ``<camera>/<start_frame>.mp4`` (as a
    string), in the same order as ``cameras``.

    Note: ``video_dir`` is accepted but not used when building the paths, so
    the results are relative paths that start at the camera folder name.
    """
    filename = f"{start_frame}.mp4"
    return [str(Path(camera_name) / filename) for camera_name in cameras]


def split_dataset_into_chunks(
    ds_full: xr.Dataset,
    chunk_size: int,
    video_dir: Path,
    cameras: list[str],
    fps: int,
) -> tuple[list[xr.Dataset], xr.Dataset | None]:
    """Split a full dataset into fixed-size chunks with video assignments.

    The dataset is cut along ``time`` into consecutive, non-overlapping
    chunks of ``chunk_size`` frames. Trailing frames that do not fill a whole
    chunk are discarded (a message is printed). For every chunk:

    - the ``time`` coordinate is reset to start at 0 (frame index / fps),
    - ``trial`` (chunk index) and ``original_start_frame`` are stored in attrs,
    - video files are attached via ``set_media_files``,
    - ``velocity`` and ``speed`` are computed from the "aligned" positions
      and tagged with ``type = "features"``,
    - ``attrs["cameras"]`` is set to generic labels ``cam1`` .. ``camN``.

    Parameters
    ----------
    ds_full : xr.Dataset
        Dataset with a ``time`` dimension and a ``position`` variable that has
        a ``position_type`` coordinate containing "aligned".
    chunk_size : int
        Number of frames per chunk; must be positive.
    video_dir : Path
        Passed through to ``get_video_paths_for_chunk``.
    cameras : list of str
        Camera folder names used to build the per-chunk video paths.
    fps : int
        Frames per second, used to rebuild each chunk's time coordinate.

    Returns
    -------
    tuple
        ``(datasets, last_chunk)`` where ``datasets`` is the list of chunk
        datasets and ``last_chunk`` is the final element of that list, or
        ``None`` when the dataset is shorter than one chunk.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not strictly positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n_frames = ds_full.sizes["time"]
    n_chunks, remaining = divmod(n_frames, chunk_size)

    datasets = []

    for i in range(n_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size
        start_frame = start_idx  # Frame number for video filename

        # Slice the dataset
        ds_chunk = ds_full.isel(time=slice(start_idx, end_idx)).copy()

        # Reset time coordinate to start from 0 for each chunk
        ds_chunk = ds_chunk.assign_coords(time=np.arange(chunk_size) / fps)

        # Set trial number and remember where the chunk came from
        ds_chunk.attrs["trial"] = i
        ds_chunk.attrs["original_start_frame"] = start_frame

        # Assign video files
        video_paths = get_video_paths_for_chunk(start_frame, video_dir, cameras)
        ds_chunk = set_media_files(ds_chunk, cameras=video_paths)

        # Kinematic features are derived from the aligned pose tracks only
        aligned_position = ds_chunk.position.sel(position_type="aligned")

        ds_chunk["velocity"] = compute_velocity(aligned_position)
        ds_chunk["velocity"].attrs["type"] = "features"

        ds_chunk["speed"] = compute_speed(aligned_position)
        ds_chunk["speed"].attrs["type"] = "features"

        # One generic label per entry in `cameras` (cam1..camN); for the
        # six-camera setup this yields cam1..cam6.
        ds_chunk.attrs["cameras"] = [f"cam{k}" for k in range(1, len(cameras) + 1)]

        datasets.append(ds_chunk)

    if remaining > 0:
        print(f"Discarded {remaining} frames at the end (not a full chunk)")

    print(f"Created {len(datasets)} chunks of {chunk_size} frames each")

    # The second return value is the last chunk; None when no full chunk fit,
    # instead of failing on an unbound loop variable.
    last_chunk = datasets[-1] if datasets else None
    return datasets, last_chunk


In [5]:
# Split the full recording into fixed-size chunks; `ds_chunk` is the last
# chunk produced, kept for quick inspection.
datasets, ds_chunk = split_dataset_into_chunks(ds_full, CHUNK_SIZE, VIDEO_DIR, CAMERAS, FPS)

Discarded 3000 frames at the end (not a full chunk)
Created 30 chunks of 3500 frames each


In [6]:
# Bundle the per-chunk datasets into a single TrialTree (one trial per chunk)
dt = TrialTree.from_datasets(datasets)
print(
    f"Created TrialTree with {len(dt.trials)} trials",
    f"Trial numbers: {dt.trials}",
    sep="\n",
)

Extracted type_vars_dict: {'individuals': array(['an1', 'an2'], dtype='<U3'), 'features': ['velocity', 'speed'], 'cameras': array(['cam1', 'cam2', 'cam3', 'cam4', 'cam5', 'cam6'], dtype='<U4'), 'keypoints': array(['HeadF', 'HeadB', 'HeadL', 'SpineF', 'SpineM', 'SpineL', 'Offset1',
       'Offset2', 'HipL', 'HipR', 'ShoulderL', 'ShoulderR'], dtype='<U9'), 'trial_conditions': ['original_start_frame']}
Created TrialTree with 30 trials
Trial numbers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [8]:
# Save the TrialTree to a netCDF file.
# NOTE(review): the destination is an absolute, machine-specific path; anyone
# re-running this notebook elsewhere needs to adjust it.
output_path = r"C:\Users\aksel\Documents\Code\EthoGraph\data\20210119_Recording_SR1_SR2_social_vidtwo\pair24.nc"
dt.to_netcdf(output_path)
print(f"Saved to {output_path}")

Saved to C:\Users\aksel\Documents\Code\EthoGraph\data\20210119_Recording_SR1_SR2_social_vidtwo\pair24.nc
