# Create Dataset - Moll 2025 (2 videos)

Dataset includes behavioural data from 2 trials:

- Videos published in Moll et al., 2025¹, available online [here](https://www.sciencedirect.com/science/article/pii/S0960982225011005?via%3Dihub#mmc1). We recorded with two cameras (`cam-1`, `cam-2`), but only the video from the left camera (`cam-1`) is shared.
- DeepLabCut² pose files generated for the videos of each camera, and a 3D pose file (`*_DLC_3D.csv`) generated using [3D triangulation](https://deeplabcut.github.io/DeepLabCut/docs/Overviewof3D.html).
- Video feature files (`*_s3d.npy`) generated using the Video Features repository³.
- `Trial_data.nc` file with behavioural features (kinematic, video features), changepoints, custom colours, and trial metadata.

Below is an example script showing how to generate the `Trial_data.nc` file from the raw data.



---

¹ Moll, F. W., Würzler, J., & Nieder, A. (2025). Learned precision tool use in carrion crows. Current Biology, 35(19), 4845-4852.e3. https://doi.org/10.1016/j.cub.2025.08.033

² Nath, T., Mathis, A., Chen, A. C., Patel, A., Bethge, M., & Mathis, M. W. (2019). Using DeepLabCut for 3D markerless pose estimation across species and behaviors. Nature Protocols, 14(7), 2152–2176. https://doi.org/10.1038/s41596-019-0176-0

³ Iashin, V. (2020). Video Features [Computer software]. https://github.com/v-iashin/video_features

<img src="assets/moll1.png" width="600">
<img src="assets/moll2.png" width="900">

Left: Figure 1C from Moll et al., 2025¹

Right: Screenshot from GUI. Bottom line plot shows speed of beak tip.

### File structure
```
Moll2025/
├── labels/                                    # GUI saved label files
├── 2024-12-17_115_Crow1-cam-1.mp4             # video (trial 115)
├── 2024-12-17_115_Crow1-cam-1DLC.csv          # 2D pose cam-1
├── 2024-12-17_115_Crow1-cam-2DLC.csv          # 2D pose cam-2
├── 2024-12-17_115_Crow1_DLC_3D.csv            # 3D triangulated pose
├── 2024-12-17_115_Crow1-cam-1_s3d.npy         # Video features
│
├── 2024-12-18_041_Crow1-cam-1.mp4             # video (trial 41)
├── 2024-12-18_041_Crow1-cam-1DLC.csv          # 2D pose cam-1
├── 2024-12-18_041_Crow1-cam-2DLC.csv          # 2D pose cam-2
├── 2024-12-18_041_Crow1_DLC_3D.csv            # 3D triangulated pose
├── 2024-12-18_041_Crow1-cam-1_s3d.npy         # Video features
└── Trial_data.nc                              # all behavioural and meta data in one place
```

### Filename convention

```
2024-12-17_115_Crow1-cam-1DLC.csv
2024-12-17_115_Crow1-cam-1.mp4
│            │   │
│            │   └── bird ID
│            └────── trial number
└─────────────────── session date
```
Using a similar naming convention across related files (video, 2D pose, 3D pose) makes it easier to match file names across trials using the [`set_media_attrs`](../ethograph/utils/io.py) function.


### Download example data

Run the cell below to download the raw data from the GitHub release. If you already have the files locally, this will be skipped automatically. Video features (`*_s3d.npy`) are not included in the release — the cell that loads them will be skipped if missing.

In [None]:
from ethograph import get_project_root
from ethograph.utils.download import download_example_dataset

# Target folder for the example data: <project root>/data/Moll2025.
data_folder = get_project_root() / "data" / "Moll2025"
# Fetch the "moll2025" example dataset into data_folder.
download_example_dataset("moll2025", data_folder)

# Show where the data ended up.
print(f"\ndata_folder:  {data_folder}")

In [None]:
import glob
import os
import warnings
import numpy as np

from movement.io import load_poses
from movement.kinematics import compute_pairwise_distances, compute_velocity, compute_acceleration
from movement.utils.vector import compute_norm

from ethograph.utils.paths import extract_trial_info_from_filename
from ethograph import TrialTree, add_angle_rgb_to_ds,  add_changepoints_to_ds, set_media_attrs, get_project_root
from ethograph.features.mov_features import compute_distance_to_constant, Position3DCalibration
from ethograph.features.changepoints import find_troughs_binary, find_nearest_turning_points_binary
from ethograph.features.preprocessing import gaussian_smoothing
# Silence the "confidence array not provided" warning coming from
# movement.validators.datasets (presumably emitted while the pose files
# are loaded — the 3D files carry no confidence values; TODO confirm).
warnings.filterwarnings(
    "ignore",
    message="Confidence array was not provided.Setting to an array of NaNs",
    module="movement.validators.datasets",
)

# Config
data_folder = get_project_root() / "data" / "Moll2025"

# One "*_3D.csv" pose file per trial. Sorted so trials are always processed
# in the same order; builtin sorted() keeps the paths as plain str.
dlc_3d_paths = sorted(glob.glob(os.path.join(data_folder, "*_3D.csv")))
if not dlc_3d_paths:
    # Fail early instead of silently building an empty dataset further down.
    raise FileNotFoundError(
        f"No '*_3D.csv' pose files found in {data_folder}"
    )

fps = 200
clip_distance = 50  # Exclude unrealistic distance features (> 50cm)

# Keyword arguments forwarded to gaussian_smoothing.
smoothing_params = {"sigma": 1.5, "axis": 0, "mode": "constant", "cval": np.nan}


# We found out that using a subset of s3d video features
# (with high Cohen's D for a label) works better than all 1024
good_s3d_feats = [
    326, 327, 292, 363, 219, 192, 260, 66, 332, 199,
    288, 763, 837, 182, 24, 218, 213, 21, 733, 242,
]  # Crow 1


# Stationary locations
disp_xyz = [-10.23, -5.907, -1.395]
box1 = np.array([-7.08514973, 0.14055037, 0.58930513])  # front left
box2 = np.array([-6.97786264, 9.87752058, 0.96104736])  # back left
box3 = np.array([6.81007923, 9.72017014, 0.77115876])  # back right
box4 = np.array([6.77518696, -0.02424044, 0.60184316])  # front right


# --- Loop-invariant settings (built once, not per trial) -------------------

# Wider kernel (2x sigma) used only for the acceleration estimate.
smooth_2x = {**smoothing_params, "sigma": smoothing_params["sigma"] * 2}

# Keypoints for which velocity, speed and acceleration are computed.
tracked_keypoints = ["stickTip", "beakTip"]

# Variable-name prefix -> keypoint label. The lower-case "sticktip" prefix is
# kept on purpose so the stored variable names stay exactly as before.
corner_sources = {"sticktip": "stickTip", "beakTip": "beakTip"}

# Variable-name part -> keypoint label of the corresponding box corner.
# NOTE(review): "box1".."box4" are looked up as keypoint labels in
# ds.position; the box1..box4 coordinate arrays defined in the config are not
# used here — confirm this is intended.
corner_targets = {
    "cornerLFront": "box1",
    "cornerLBack": "box2",
    "cornerRBack": "box3",
    "cornerRFront": "box4",
}

# Trial number -> pellet position (trial meta data known for the two trials).
pellet_position_by_trial = {41: "right", 115: "left"}


# --- Build one dataset per trial --------------------------------------------

ds_list = []
for dlc_3d_path in dlc_3d_paths:
    ds = load_poses.from_dlc_file(dlc_3d_path, fps=fps)

    # Meta data (available in filename)
    session_date, trial_num, bird = extract_trial_info_from_filename(dlc_3d_path)
    ds = ds.assign_coords(individuals=[bird])
    ds.attrs["trial"] = trial_num
    ds.attrs["bird"] = bird
    ds.attrs["session_date"] = session_date

    # Derive the related media file names from the 3D pose file name,
    # e.g. mapping trial 41 -> 2024-12-18_041_Crow1-cam-1.mp4
    base_name = os.path.basename(dlc_3d_path)
    ds = set_media_attrs(
        ds,
        cameras=[
            base_name.replace("_DLC_3D.csv", "-cam-1.mp4"),
            # base_name.replace("_DLC_3D.csv", "-cam-2.mp4"),
        ],
        pose=[
            base_name.replace("_DLC_3D.csv", "-cam-1DLC.csv"),
            base_name.replace("_DLC_3D.csv", "-cam-2DLC.csv"),
        ],
    )

    # Transform 2D pose -> 3D pose
    calibration = Position3DCalibration()
    ds = calibration.transform(ds)

    # Kinematic features. Velocity and acceleration are clipped to fixed
    # ranges; acceleration is derived from a second, wider smoothing pass
    # applied on top of the already smoothed position.
    ds["position"] = gaussian_smoothing(ds.position, **smoothing_params)
    ds["velocity"] = compute_velocity(
        ds.position.sel(keypoints=tracked_keypoints)
    ).clip(min=-150, max=150)
    ds["speed"] = compute_norm(ds.velocity.sel(keypoints=tracked_keypoints))
    position_smooth = gaussian_smoothing(ds.position, **smooth_2x)
    ds["acceleration"] = compute_acceleration(
        position_smooth.sel(keypoints=tracked_keypoints)
    ).clip(min=-1500, max=1500)

    # Distance features (all clipped to [0, clip_distance])
    ds["pellet_beakTip_dist"] = compute_pairwise_distances(
        ds.position, "keypoints", {"pellet": "beakTip"}
    ).clip(0, clip_distance)
    ds["pellet_stickTip_dist"] = compute_pairwise_distances(
        ds.position, "keypoints", {"pellet": "stickTip"}
    ).clip(0, clip_distance)
    ds["disp_beakTip_dist"] = compute_distance_to_constant(
        ds.position, reference_point=disp_xyz, keypoint="beakTip"
    ).clip(0, clip_distance)
    ds["disp_stickTip_dist"] = compute_distance_to_constant(
        ds.position, reference_point=disp_xyz, keypoint="stickTip"
    ).clip(0, clip_distance)

    # Distance of stick tip / beak tip to each of the four box corners.
    # Produces e.g. "sticktip_cornerLFront_dist", "beakTip_cornerRBack_dist".
    for prefix, keypoint in corner_sources.items():
        for corner, box in corner_targets.items():
            ds[f"{prefix}_{corner}_dist"] = compute_pairwise_distances(
                ds.position, "keypoints", {keypoint: box}
            ).clip(0, clip_distance)

    # Only keep subset of keypoints
    ds = ds.sel(keypoints=["beakTip", "stickTip", "pellet"])

    # Video Features (not included in GitHub release download)
    s3d_file = base_name.replace("_DLC_3D.csv", "-cam-1_s3d.npy")
    s3d_path = os.path.join(data_folder, s3d_file)
    if os.path.exists(s3d_path):
        s3d_data = np.load(s3d_path)
        # Keep only the selected feature columns.
        ds["s3d"] = (("time", "s3d_dims"), s3d_data[:, good_s3d_feats])
    else:
        print(f"Skipping video features (not found): {s3d_file}")

    # Filter for variables shown in Feature DropDown ('Data controls')
    for var in ds.data_vars:
        ds[var].attrs["type"] = "features"

    # For changepoint correction on speed curve
    ds = add_changepoints_to_ds(
        ds=ds,
        target_feature="speed",
        changepoint_name="troughs",
        changepoint_func=find_troughs_binary,
        prominence=0.5,
        distance=2,
    )
    ds = add_changepoints_to_ds(
        ds=ds,
        target_feature="speed",
        changepoint_name="turning_points",
        changepoint_func=find_nearest_turning_points_binary,
        threshold=1.0,
        max_value=50,
        prominence=5,
        width=2,
    )

    # Colouring for Lineplots
    ds = add_angle_rgb_to_ds(ds, smoothing_params=smoothing_params)

    # Trial meta data -> Filter trials in 'Navigation controls'.
    # Trials without an entry in the table get no "pellet_position" attribute.
    pellet_position = pellet_position_by_trial.get(int(trial_num))
    if pellet_position is not None:
        ds.attrs["pellet_position"] = pellet_position

    # TODO: for later use — read pellet position and number of pellets from
    # 'trial_info.csv' (detected from video using custom matlab code, not
    # shared) instead of the hard-coded table above:
    # if trial_num in trial_info.index:
    #     row = trial_info.loc[trial_num]
    #     poscat_map = {0: "no pellet", 1: "left", 2: "middle", 3: "right"}
    #     ds.attrs["pellet_position"] = poscat_map.get(int(row["poscat"]))
    #     ds.attrs["num_pellets"] = int(row["num_pellets"])

    ds_list.append(ds)



# Combine the per-trial datasets into one TrialTree and write it to data_folder.
# NOTE(review): the output is named "Trial_data2.nc", whereas the text above
# refers to "Trial_data.nc" — presumably to avoid overwriting an existing
# file; confirm which name is intended.
dt = TrialTree.from_datasets(ds_list)
dt.to_netcdf(os.path.join(data_folder, "Trial_data2.nc"))