# Encounters between ants
Function to calculate the encounters between specific ants (e.g., focal and caregiver ants) from a ```.mymridon``` experiment file. <br><br>
There is probably an easier way to do this by querying individual frames directly.

In [1]:
from datetime import datetime, timedelta  # For convenient handling of time and date

import numpy as np
import pandas as pd  # Used to create a dataframe, similar to the structure used in R
import py_fort_myrmidon as fm

### Function to output trajectories of all ants

In [2]:
def trajectory_output_all(start_time, end_time, exp):
    """
    Function to extract daily trajectories, grouped by AntID. While it is setup to extract daily trajectories, it can work for any arbitrary time duration
    :param start_time: The start datetime object. this will be converted to a fort-myrmidon Time object
    :param end_time: The end datetime object. this will be converted to a fort-myrmidon Time object
    :param exp: The name of the experiment i.e., the myrmidon file
    :param matcher_query: The fm matcher corresponding to the focal IDs
    :return: Outputs a pandas dataframe containing AntID, Space, Time, X_coordinates and Y_coordinates of each ID averaged over 1 second from the X and Y coordinates. Averagingg is done to have a dataset which can be merged across IDs using at the resolution of 1s.
    """
    start = datetime.now()
    t_begin = fm.Time(start_time)
    t_stop = fm.Time(end_time)
    trajectory = fm.Query.ComputeAntTrajectories(
        experiment=exp,
        start=t_begin,
        end=t_stop,
        # matcher=matcher_query,
        maximumGap=fm.Duration.Parse("1000h"),
        reportProgress=False,
    )
    # Make a list of lists with trajectory values needed. Position is an array of 5 columns, so specific columns are called
    traj_list = [
        [
            trajectory.Ant,
            trajectory.Space,
            trajectory.Start.ToDateTime(),
            trajectory.Positions[:, 0],
            trajectory.Positions[:, 1],
            trajectory.Positions[:, 2],
        ]
        for trajectory in trajectory
    ]
    # Make the list into a dataframe
    traj_df = pd.DataFrame(
        traj_list,
        columns=["AntID", "Space", "StartTime", "Pos_time", "X_coor", "Y_coor"],
    )
    # Explode columns which are in the form of lists to expand the dataframe
    traj_df = traj_df.explode(column=["Pos_time", "X_coor", "Y_coor"])
    # Coerce coordinates to integer
    traj_df["X_coor"] = pd.to_numeric(traj_df["X_coor"], errors="coerce")
    traj_df["Y_coor"] = pd.to_numeric(traj_df["Y_coor"], errors="coerce")
    # Convert Pos_time to timedelta and obtain actual datetime for each trajectory entry
    traj_df["Pos_time"] = pd.to_numeric(traj_df["Pos_time"], errors="coerce")
    traj_df["Pos_time"] = pd.to_timedelta(
        traj_df["Pos_time"], unit="S", errors="coerce"
    )
    traj_df["Time"] = traj_df["StartTime"] + traj_df["Pos_time"]
    # Drop unwanted ccolumns
    traj_df = traj_df.drop(["StartTime", "Pos_time"], axis=1)
    # Reorder columns
    traj_df = traj_df[["AntID", "Space", "Time", "X_coor", "Y_coor"]]
    if traj_df.empty:  # If no trajectories are output
        # empty_row = pd.DataFrame([{'AntID': 'Unknown', 'Space':np.nan, 'Time':np.nan, 'X_coor':np.nan, 'Y_coor':np.nan}]) # Create empty row with unknown as antID
        # traj_df = pd.concat([empty_row]) # Add empty row to dataframe
        print("No trajectories found. Created empty dataframe")
        return traj_df  # Return empty dataframe
    # Obtain average X and Y coordinates per second
    # traj_df = (
    #     traj_df.groupby([pd.Grouper(key="Time", freq="1s"), "AntID", "Space"])
    #     .agg(X_mean=("X_coor", "mean"), Y_mean=("Y_coor", "mean"))
    #     .reset_index()
    # )
    end = datetime.now()
    # print("Trajectories output in", end-start)
    return traj_df

### Function to calculate duration of encounters
This function will calculate the duration of time an ant spends in the encounter zone after moving from the away zone to the encounter zone. This will require first identifying instances of an encounter, then calculating duration when the ant is within the encounter threshold

In [40]:
# Function to run over each sequence of encounters to extract sub-sequences where individuals are within the encounter threshold (enc_dummy=1).
# The number and total duration of these sequences is output
def time_within_encounter_threshold(displacement_dataset, encounter_sequence):
    """Function to quantify sub-sequences when an individual is within the encounter threshold during an encounter.
    These sub-sequences have a enc_dummy value of 1. The function extracts these sub-sequences, obtains their start and end time points
    from the displacment dataset and outputs the number and total duration of all sub-sequences within an encounter sequence.
    Note that the encounter sequence should only contain values of 0.5 and 1, since an encounter by definition is within two instances of enc_dummy == 0

    Args:
        displacement_dataset (pandas.DataFrame): A pandas dataframe which contains at least the main index values and a column with Time
        encounter_sequence (numpy.array): A numpy array containing the sequence of enc_dummy values with 0.5 and 1 corresponding to when displacement is within away threshold and within encounter threshold

    Returns:
        total_enc (int): The number of sub-sequences which are within the encounter threshold
        total_enc (numpy.float64): The total duration of all sub-sequences which are within the encounter threshold
    """
    # Add 0.5 to end of the list, then find out indices where consecutive difference is not 0
    change_indices = np.where(
        np.diff(np.concatenate(([0.5], encounter_sequence, [0.5]))) != 0
    )[0]
    # Get start indices starting from the bginning and jumping by 2 upto the end
    start_indices = [encounter_sequence.index.values[x] for x in change_indices[::2]]
    # Get end indices starting from 1 going upto the end jumping by 2. Subtract 1 from all index values to account for the extra value added at the concatenation step
    end_indices = [encounter_sequence.index.values[x] for x in change_indices[1::2] - 1]
    # Get time of starting indices
    start_times = [displacement_dataset.loc[x, "Time"] for x in start_indices]
    # Get time of ending indices
    end_times = [displacement_dataset.loc[x, "Time"] for x in end_indices]
    # Get time duration within encounter thresholds by subtraction
    enc_times = np.subtract(end_times, start_times)
    enc_times_sec = [x.total_seconds() for x in enc_times]
    # Obtain total time and number of times ants are within encounter threshold consecutively
    total_enc_times = np.sum(enc_times_sec)
    total_enc = len(enc_times_sec)
    return total_enc, total_enc_times

In [38]:
def encounter_duration(displacement_dataset, encounter_threshold, away_threshold):
    """Function to calculate duration of encounters between a focal ant and another individual.
    Encounters are defined as when an ant moves from beyond the away threshold to within the encounter threshold with respect to the focal ant.
    Two metrics of encounter duration are calculated - one based on the total time from when an ant moves within the encounter threshold to when it crosses the away threshold again
    and another based on only the total time spent within the encounter threshold during the encounter
    Outputs a dataframe with these durations calculated for each encounter between the 2 ants

    Args:
        displacement_dataset (pandas.DataFrame): A pandas dataframe containing at least Timestamps and displacement between 2 ants at each timestamp. It should also contain an index which is used to match sequence starts and stops to timestamps
        encounter_threshold (int): The value of displacement for the encounter threshold
        away_threshold (int): The value of displacement for the away threshold

    Returns:
        enc_df(pandas.DataFrame): A dataframe containing the number of the encounter, the start time, total duration,
    number of times within the encounter where the displacement between the two ants were within the encounter threshold and the total duration of these phases
    """
    # First interpolate (linearly) missing displacement values
    displacement_dataset["disp"].interpolate(
        method="linear", limit_direction="forward", inplace=True
    )
    # Create a new column based on converting the thresholds to dummy numbers. Values given are 1, if displacement < encounter threshold, 0.5, if encounter threshold < disp < away threshold and 0 if disp > away threshold.
    # Due to the linear interpolation there are no np.nans
    displacement_dataset.loc[:, ["enc_dummy"]] = pd.cut(
        displacement_dataset.disp,
        [0, encounter_threshold, away_threshold, np.inf],
        labels=[1, 0.5, 0],
    )
    # Convert datatype to float from category (due to pd.cut) for downstream functions
    displacement_dataset = displacement_dataset.astype({"enc_dummy": float})
    # Check if 1 is present at least once in the dataset. If not create a dataframe with 0 values to return
    if 1 not in displacement_dataset.enc_dummy.values:
        enc_df = pd.DataFrame(data=[[0, np.nan, 0.0, 0, 0.0]])
        enc_df.columns = [
            "enc_number",
            "enc_start_time",
            "enc_duration",
            "enc_sequences",
            "enc_sequences_duration",
        ]
        return enc_df
    # Identify index values where enc_dummy is 0 i.e., displacement is > away threshold
    away_indices = displacement_dataset[
        displacement_dataset["enc_dummy"] == 0
    ].index.values
    # Check if there is none or only 1 index with enc_dummy=0
    if away_indices.size < 2:
        enc_df = pd.DataFrame(data=[[0, np.nan, 0.0, 0, 0.0]])
        enc_df.columns = [
            "enc_number",
            "enc_start_time",
            "enc_duration",
            "enc_sequences",
            "enc_sequences_duration",
        ]
        return enc_df
    # Check if first value is not starting index and insert it if not
    if away_indices[0] != np.take(displacement_dataset.index.values, 0):
        away_indices = np.insert(
            away_indices, 0, np.take(displacement_dataset.index.values, 0)
        )  # Insert starting value of index range
    # Get pairs of consecutive indices
    away_indices_pair = list(zip(away_indices, away_indices[1:]))
    # Get values between the consecutive indices with value of 1
    seq_bw_away = [
        displacement_dataset.loc[x + 1 : y - 1, "enc_dummy"]
        for (x, y) in away_indices_pair
    ]
    # Remove list elements which have only 1 index value
    seq_bw_away_sub = [x for x in seq_bw_away if x.size > 1]
    # Subset list elements which have at least 1 value of 1
    # This list of lists will be used for all subsequent calculations
    enc_seq = [x for x in seq_bw_away_sub if np.in1d(1, x)]

    # Calculate overall encounter duration as time from when an individual crosses threshold (enc_dummy=1) to when it crosses away threshold again (enc_dummy=0)
    # Get index values where 1 is present for first time in each list within encounter sequences.
    # Then use this to obtain the start times of the encounters
    enc_start_times = [
        displacement_dataset.loc[x.index.values[np.where(x == 1)[0][0]], "Time"]
        for x in enc_seq
    ]
    # Get index values of last time point in each encounter.
    # Then use this to obtain the end times of the whole encounters
    enc_end_times = [
        displacement_dataset.loc[x.index.values[-1], "Time"] for x in enc_seq
    ]
    # Subtract end and start times element wise and convert to seconds
    enc_time = np.subtract(enc_end_times, enc_start_times)
    enc_time_sec = [x.total_seconds() for x in enc_time]

    # Calculate number of instances within each encounter where the individual is within encounter threshold for a sequential period of time and the total duration of these instances
    # This is based on extracting all consecutive sequences where enc_dummy=1 and calculating the start and end time of these sequences
    # The function `time_within_encounter_threshold` runs over each encounter sequence to extract the sub-sequences
    enc_sub_seq = [
        time_within_encounter_threshold(displacement_dataset, x) for x in enc_seq
    ]
    enc_sub_seq_num = [x for x, y in enc_sub_seq]
    enc_sub_seq_time = [y for x, y in enc_sub_seq]

    # Create a dataframe combining all the paeameters
    # First calculate number of encounters
    enc_num = np.arange(1, len(enc_seq) + 1)
    # Create a dataframe
    enc_df = pd.DataFrame(
        data=[enc_num, enc_start_times, enc_time_sec, enc_sub_seq_num, enc_sub_seq_time]
    )
    # transpose dataframe
    enc_df = enc_df.T
    # Add column names
    enc_df.columns = [
        "enc_number",
        "enc_start_time",
        "enc_duration",
        "enc_sequences",
        "enc_sequences_duration",
    ]
    enc_df = enc_df.astype(
        {
            "enc_number": int,
            "enc_duration": float,
            "enc_sequences": int,
            "enc_sequences_duration": float,
        }
    )
    return enc_df

### Function to calculate encounter durations between specific ants

This function combines obtaining all the trajectories from the `trajectory_output_all` function, then rearranges this to obtain the displacement between the focal ant and all other ants at each time point (in this every second, since the trajectories are summarised to the nearest second by averaging the X and Y coordinate for each second). This dataset is then grouped by focalID and antID and then the `encounter_duration` function is run over these groups.

In [4]:
def focal_encounters(
    start_time, end_time, exp, focal_ID, exp_day, encounter_threshold, away_threshold
):
    """
    Function to obtain trajectories for focal and caregiver antIDs, merge by time and calculate displacement of each caregiver ID from the focal ID at every second
    :param start_time: Starting time to obtain trajectories from. Passed on to function trajectory_output
    :param end_time: Ending time to obtain trajectories from. Passed on to function trajectory_output
    :param exp: Location of myrmidon file
    :param focal_ID: Injured AntID
    :param exp_day: Day of the experiment. This is added to the dataframe for identification
    :param encounter_threshold: Threshold displacement to use as encounter
    :param away_threshold: Threshold displacement to use as the start/end of an encounter
    :return: Returns a datafarme containing the Time (in bins of 1s based on function trajectory_output), the focal and caregiver ID, the space in which the focal and caregiver ants are present, and the displacement between them (calculated as np.nan if they are in different spaces. In a CSV output this will be converted to a blank entry).
    """
    start = datetime.now()
    # # Focal Ant matcher
    # focal_matcher = fm.Matcher.AntID(focal_ID)
    # # Caregiver individual matchers
    # # others = [fm.Matcher.AntID(x) for x in other_IDs]
    # # Create single matcher object by unpacking the list within an Or Matcher
    # #others_matcher = fm.Matcher.Or(*others)
    # # Focal Ant trajectory
    # focal_traj = trajectory_output(start_time, end_time, exp, focal_matcher)
    # All ant trajectories
    other_traj = trajectory_output_all(start_time, end_time, exp)
    # Focal ant trajectory
    focal_traj = other_traj[other_traj["AntID"] == focal_ID]
    # Sort Time column for both dataframes
    other_traj = other_traj.sort_values("Time")
    focal_traj = focal_traj.sort_values("Time")

    # If focal trajectory is an empty dataframe, create a dataframe with na values for encounter parameters
    if focal_traj.empty:
        full_traj = other_traj.rename(columns={"Space": "Space_ant"})
        full_traj["focalID"] = focal_ID
        full_traj["Space_focal"] = full_traj["disp"] = (
            np.nan
        )  # Create columns with na values
        full_traj = full_traj[
            ["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]
        ]
        full_traj["exp_day"] = exp_day
        full_traj = full_traj[full_traj["focalID"] != full_traj["AntID"]].reset_index()
        # Group data frame, create columns with na and output encounter dataframe
        enc_df = (
            full_traj.groupby(["exp_day", "focalID", "AntID"])
            .apply(lambda x: pd.Series([np.nan] * 5))
            .reset_index()
            .rename(
                columns={
                    0: "enc_number",
                    1: "enc_start_time",
                    2: "enc_duration",
                    3: "enc_sequences",
                    4: "enc_sequences_duration",
                }
            )
        )
        print(
            f"{'Focal ID trajectory is empty for list item '}{exp_day}{' .Returning dataframe with no displacement and encounters calculated'}"
        )
        return enc_df
    # If trajectory of all other individuals is an empty dataframe, create a dataframe with na values for encounter parameters
    if other_traj.empty:
        full_traj = focal_traj.rename(
            columns={"AntID": "focalID", "Space": "Space_focal"}
        )
        full_traj["AntID"] = full_traj["Space_ant"] = full_traj["disp"] = (
            np.nan
        )  # Create columns with na values
        full_traj = full_traj[
            ["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]
        ]
        full_traj["exp_day"] = exp_day
        # Group data frame, create columns with na and output encounter dataframe
        enc_df = (
            full_traj.groupby(["exp_day", "focalID", "AntID"])
            .apply(lambda x: pd.Series([np.nan] * 5))
            .reset_index()
            .rename(
                columns={
                    0: "enc_number",
                    1: "enc_start_time",
                    2: "enc_duration",
                    3: "enc_sequences",
                    4: "enc_sequences_duration",
                }
            )
        )
        print(
            f"{'Caregiver ID trajectories are empty for list item '}{exp_day}{' .Returning dataframe with no displacement and encounters calculated'}"
        )
        return enc_df

    # Merge focal and caregiver trajectories on Time column using merge_asof to match nearest time values
    full_traj = pd.merge_asof(
        other_traj,
        focal_traj,
        on="Time",
        suffixes=("_ant", "_focal"),
        direction="nearest",
        tolerance=pd.Timedelta("1s"),
    )
    # Obtain X coordinate and Y coordinate difference between Focal and Caregivers, for each row
    full_traj["X_diff"] = full_traj["X_coor_focal"] - full_traj["X_coor_ant"]
    full_traj["Y_diff"] = full_traj["Y_coor_focal"] - full_traj["Y_coor_ant"]
    # Obtain displacement
    full_traj["disp"] = np.linalg.norm(
        full_traj[["X_diff", "Y_diff"]].to_numpy(), axis=1
    )
    # Rename columns
    full_traj = full_traj.rename(
        columns={"AntID_focal": "focalID", "AntID_ant": "AntID"}
    )
    # Subset specific columns
    full_traj = full_traj[
        ["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]
    ]
    # Add experimental day
    full_traj["exp_day"] = exp_day
    # Remove instances where the focal ant's displacement is calculated wrt itself.
    full_traj = full_traj[full_traj["focalID"] != full_traj["AntID"]].reset_index(
        drop=True
    )
    # Replace with arbitrary high value of displacemeent if focal ant and caregiver are in different spaces. Use notnull to filter out instances where focal or caregiver space is not known. The higgh value will ensure that this case is always considered as > away_threshold in count_encounters function
    full_traj.loc[
        (
            (full_traj.Space_focal.notnull())
            & (full_traj.Space_ant.notnull())
            & (full_traj.Space_focal != full_traj.Space_ant)
        ),
        "disp",
    ] = 100000
    # Apply encounter_duration function over grouped dataframe, reset index and rename columns
    enc_df = (
        full_traj.groupby(["exp_day", "focalID", "AntID"])
        .apply(lambda x: encounter_duration(x, encounter_threshold, away_threshold))
        .reset_index()
        .drop("level_3", axis=1)
    )
    end = datetime.now()
    print(
        f"{'Encounters for experimental day '}{exp_day}{' calculated in '}{end - start}"
    )
    return enc_df

### Function to run encounter calculations over multiple phases of one experiment

In [5]:
def calculate_encounters_cluster(exp_list, control_list, pre_list, post_list, colonyID):
    """
    Helper function to run focal_caregivers_disp_exp and count_encounters over multiple lists associated with different phases for one colony
    :param exp_list: List of start_time, end_time, exp, focalID, careggiverIDs and exp_day for Experimental phase
    :param control_list: List for control phase
    :param pre_list: List for pre-experimental phase
    :param post_list: List for post-experimental phase
    :param colonyID: colonyID corresponding to the myrmidon file
    :return: Dataframe combining the encounters for each of the 4 phases for one colony
    """
    print("Experimental phase")
    exp_enc = [
        focal_encounters(
            start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
        )
        for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in exp_list
    ]
    exp_enc = pd.concat(exp_enc)
    exp_enc["phase"] = "Exp"
    print("Control phase")
    control_enc = [
        focal_encounters(
            start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
        )
        for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in control_list
    ]
    control_enc = pd.concat(control_enc)
    control_enc["phase"] = "Control"
    print("Pre-Experimental phase")
    pre_enc = [
        focal_encounters(
            start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
        )
        for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in pre_list
    ]
    pre_enc = pd.concat(pre_enc)
    pre_enc["phase"] = "Pre"
    print("Post-Experimental phase")
    post_enc = [
        focal_encounters(
            start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
        )
        for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in post_list
    ]
    post_enc = pd.concat(post_enc)
    post_enc["phase"] = "Post"
    # Combine encounter dataframes
    enc_list = [exp_enc, control_enc, pre_enc, post_enc]
    enc = pd.concat(enc_list)
    # Add colonyID
    enc["colony"] = colonyID
    # Sort by values
    enc = enc.sort_values(
        by=["colony", "phase", "exp_day", "focalID", "AntID"]
    ).reset_index(drop=True)
    enc = enc[
        [
            "colony",
            "phase",
            "exp_day",
            "focalID",
            "AntID",
            "enc_number",
            "enc_start_time",
            "enc_duration",
            "enc_sequences",
            "enc_sequences_duration",
        ]
    ]
    return enc

# Injury Experiments

In [6]:
# Thresholds
encounter_threshold = 300  # threshold for counting as an encounter
away_threshold = 1000  # Threshold for counting as the end of an encounter

## Colony Cfel 42

In [7]:
f_myrmidon = "/media/ebiag/Ebi-2/Woundcare Experiment1/Cfell_wound_col42.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
# Create list of focal ants
focal = [106, 63, 23, 53, 19, 22, 24, 103, 94, 102]
# Experimental phase
day_starts_exp = [
    datetime(2022, 5, 2, 16, 3).astimezone(tz=None),
    datetime(2022, 5, 3, 15, 53).astimezone(tz=None),
    datetime(2022, 5, 4, 15, 50).astimezone(tz=None),
    datetime(2022, 5, 5, 15, 50).astimezone(tz=None),
    datetime(2022, 5, 6, 15, 55).astimezone(tz=None),
] * 2
day_ends_exp = [day_time + timedelta(hours=6) for day_time in day_starts_exp]
exp_days = [1, 2, 3, 4, 5] * 2

disp_list_exp = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_exp, day_ends_exp, focal, exp_days)
]

# Control Phase
day_starts_control = list(
    np.repeat(datetime(2022, 5, 1, 15, 54).astimezone(tz=None), 10)
)
day_ends_control = [day_time + timedelta(hours=6) for day_time in day_starts_control]

disp_list_control = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_control, day_ends_control, focal, exp_days
    )
]

# Pre experimental phase
day_starts_pre = [
    datetime(2022, 5, 2, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 3, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 4, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 5, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 6, 9, 0).astimezone(tz=None),
] * 2
day_ends_pre = [day_time + timedelta(hours=6) for day_time in day_starts_pre]

disp_list_pre = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_pre, day_ends_pre, focal, exp_days)
]

# Post experimental phase
day_starts_post = [
    datetime(2022, 5, 3, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 4, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 5, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 6, 9, 0).astimezone(tz=None),
    datetime(2022, 5, 7, 9, 0).astimezone(tz=None),
] * 2
day_ends_post = [day_time + timedelta(hours=6) for day_time in day_starts_post]

disp_list_post = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_post, day_ends_post, focal, exp_days
    )
]

In [19]:
exp_enc = [
    focal_encounters(
        start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
    )
    for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in disp_list_exp
]
exp_enc = pd.concat(exp_enc)
exp_enc["phase"] = "Exp"

Encounters for experimental day 1 calculated in 0:02:44.439539
Encounters for experimental day 2 calculated in 0:04:45.928922
Encounters for experimental day 3 calculated in 0:02:00.413341
Encounters for experimental day 4 calculated in 0:02:07.269116
Encounters for experimental day 5 calculated in 0:02:14.014184
Encounters for experimental day 1 calculated in 0:03:35.573845
Encounters for experimental day 2 calculated in 0:02:42.963323
Encounters for experimental day 3 calculated in 0:03:48.218443
Encounters for experimental day 4 calculated in 0:00:57.718380
Encounters for experimental day 5 calculated in 0:02:08.735015


In [20]:
ctrl_enc = [
    focal_encounters(
        start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
    )
    for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in disp_list_control
]
ctrl_enc = pd.concat(ctrl_enc)
ctrl_enc["phase"] = "Control"

Encounters for experimental day 1 calculated in 0:04:09.546647


KeyboardInterrupt: 

In [None]:
pre_enc = [
    focal_encounters(
        start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
    )
    for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in disp_list_pre
]
pre_enc = pd.concat(pre_enc)
pre_enc["phase"] = "Pre"

In [None]:
post_enc = [
    focal_encounters(
        start, end, exp, focalID, exp_day, encounter_threshold, away_threshold
    )
    for start, end, exp, focalID, exp_day, encounter_threshold, away_threshold in disp_list_post
]
post_enc = pd.concat(post_enc)
post_enc["phase"] = "Post"

In [None]:
colonyID = "Cfel42"
# Combine encounter dataframes
enc_list = [exp_enc, ctrl_enc, pre_enc, post_enc]
enc = pd.concat(enc_list)
# Add colonyID
enc["colony"] = colonyID
# Sort by values
enc = enc.sort_values(
    by=["colony", "phase", "exp_day", "focalID", "AntID"]
).reset_index(drop=True)
enc = enc[
    [
        "colony",
        "phase",
        "exp_day",
        "focalID",
        "AntID",
        "enc_number",
        "enc_start_time",
        "enc_duration",
        "enc_sequences",
        "enc_sequences_duration",
    ]
]
# Save to CSV
enc.to_csv("Cfel42_AllAnts_Focal_Encounters.csv", index=False)

In [None]:
cfel42_enc = calculate_encounters_cluster(
    disp_list_exp, disp_list_control, disp_list_pre, disp_list_post, "Cfel42"
)

Experimental phase
Encounters for experimental day 1 calculated in 0:03:13.661084
Encounters for experimental day 2 calculated in 0:05:28.910801
Encounters for experimental day 3 calculated in 0:02:39.460039
Encounters for experimental day 4 calculated in 0:02:57.221570
Encounters for experimental day 5 calculated in 0:03:14.032626
Encounters for experimental day 1 calculated in 0:05:36.792071
Encounters for experimental day 2 calculated in 0:05:04.176355
Encounters for experimental day 3 calculated in 0:08:20.989096
Encounters for experimental day 4 calculated in 0:03:00.067046
Encounters for experimental day 5 calculated in 0:08:58.424003
Control phase


In [None]:
cfel42_enc.to_csv("Cfel42_AllAnts_Focal_Encounters.csv", index=False)

## Colony Cfel 1

In [None]:
f_myrmidon = "/media/ebiag/Ebi-2/Woundcare Experiment2/woundcare_cfell1_T2.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
# Focal Ants
focal = [87, 37, 58, 38, 3, 67, 2, 46, 30, 54]
exp_days = [1, 2, 3, 4, 5] * 2
# Experimental Phase list
day_starts_exp = [
    datetime(2022, 6, 5, 14, 57).astimezone(tz=None),
    datetime(2022, 6, 6, 14, 30).astimezone(tz=None),
    datetime(2022, 6, 7, 14, 49).astimezone(tz=None),
    datetime(2022, 6, 8, 14, 43).astimezone(tz=None),
    datetime(2022, 6, 9, 15, 5).astimezone(tz=None),
] * 2
day_ends_exp = [day_time + timedelta(hours=6) for day_time in day_starts_exp]

disp_list_exp = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_exp, day_ends_exp, focal, exp_days)
]

# Control phase list
day_starts_control = list(
    np.repeat(datetime(2022, 6, 4, 14, 48).astimezone(tz=None), 10)
)
day_ends_control = [day_time + timedelta(hours=6) for day_time in day_starts_control]

disp_list_control = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_control, day_ends_control, focal, exp_days
    )
]

# Pre Experimental phase list
day_starts_pre = [
    datetime(2022, 6, 5, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 6, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 7, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 8, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 9, 8, 0).astimezone(tz=None),
] * 2
day_ends_pre = [day_time + timedelta(hours=6) for day_time in day_starts_pre]

disp_list_pre = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_pre, day_ends_pre, focal, exp_days)
]

# Post experimental phase list
day_starts_post = [
    datetime(2022, 6, 6, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 7, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 8, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 9, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 10, 8, 0).astimezone(tz=None),
] * 2
day_ends_post = [day_time + timedelta(hours=6) for day_time in day_starts_post]

disp_list_post = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_post, day_ends_post, focal, exp_days
    )
]

In [None]:
cfel1_enc = calculate_encounters_cluster(
    disp_list_exp, disp_list_control, disp_list_pre, disp_list_post, "Cfel1"
)

In [None]:
cfel1_enc.to_csv("Cfel1_AllAnts_Focal_Encounters.csv", index=False)

## Colony Cfel 54

In [None]:
f_myrmidon = "/media/ebiag/Ebi-2/Woundcare Experiment3/woundcare_cfell54_T3.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
# Focal Ants
focal = [108, 114, 62, 12, 53, 107, 9, 87, 83, 101]
exp_days = [1, 2, 3, 4, 5] * 2
# Experimental Phase list
day_starts_exp = [
    datetime(2022, 6, 20, 14, 35).astimezone(tz=None),
    datetime(2022, 6, 21, 14, 21).astimezone(tz=None),
    datetime(2022, 6, 22, 14, 28).astimezone(tz=None),
    datetime(2022, 6, 23, 14, 14).astimezone(tz=None),
    datetime(2022, 6, 24, 14, 31).astimezone(tz=None),
]
day_ends_exp = [day_time + timedelta(hours=6) for day_time in day_starts_exp]

disp_list_exp = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_exp, day_ends_exp, focal, exp_days)
]

# Control phase list
day_starts_control = list(
    np.repeat(datetime(2022, 6, 19, 14, 25).astimezone(tz=None), 10)
)
day_ends_control = [day_time + timedelta(hours=6) for day_time in day_starts_control]

disp_list_control = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_control, day_ends_control, focal, exp_days
    )
]

# Pre Experimental phase list
day_starts_pre = [
    datetime(2022, 6, 20, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 21, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 22, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 23, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 24, 8, 0).astimezone(tz=None),
] * 2
day_ends_pre = [day_time + timedelta(hours=6) for day_time in day_starts_pre]

disp_list_pre = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_pre, day_ends_pre, focal, exp_days)
]

# Post experimental phase list
day_starts_post = [
    datetime(2022, 6, 21, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 22, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 23, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 24, 8, 0).astimezone(tz=None),
    datetime(2022, 6, 25, 8, 0).astimezone(tz=None),
] * 2
day_ends_post = [day_time + timedelta(hours=6) for day_time in day_starts_post]

disp_list_post = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_post, day_ends_post, focal, exp_days
    )
]

In [None]:
cfel54_enc = calculate_encounters_cluster(
    disp_list_exp, disp_list_control, disp_list_pre, disp_list_post, "Cfel54"
)

In [None]:
cfel54_enc.to_csv("Cfel54_AllAnts_Focal_Encounters.csv", index=False)

# Infection Experiments

## Colony Cfel 13

In [None]:
f_myrmidon = "/media/ebiag/Ebi-3/InfectionExp_Cfel13/InfectionExp_Cfel13.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
# Focal Ants
focal = [9, 82, 40, 7, 55, 80, 26, 22, 27, 98]
exp_days = [1, 2, 3, 4, 5] * 2
# Experimental Phase list
day_starts_exp = [
    datetime(2023, 4, 24, 15, 29).astimezone(tz=None),
    datetime(2023, 4, 25, 14, 19).astimezone(tz=None),
    datetime(2023, 4, 26, 15, 3).astimezone(tz=None),
    datetime(2023, 4, 27, 16, 43).astimezone(tz=None),
    datetime(2023, 4, 28, 14, 27).astimezone(tz=None),
] * 2
day_ends_exp = [day_time + timedelta(hours=6) for day_time in day_starts_exp]

disp_list_exp = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_exp, day_ends_exp, focal, exp_days)
]

# Control phase list
day_starts_control = list(
    np.repeat(datetime(2023, 4, 23, 15, 5).astimezone(tz=None), 10)
)
day_ends_control = [day_time + timedelta(hours=6) for day_time in day_starts_control]

disp_list_control = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_control, day_ends_control, focal, exp_days
    )
]

# Pre Experimental phase list
day_starts_pre = [
    datetime(2023, 4, 24, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 25, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 26, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 27, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 28, 8, 0).astimezone(tz=None),
] * 2
day_ends_pre = [day_time + timedelta(hours=6) for day_time in day_starts_pre]

disp_list_pre = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_pre, day_ends_pre, focal, exp_days)
]

# Post experimental phase list
day_starts_post = [
    datetime(2023, 4, 25, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 26, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 27, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 28, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 29, 8, 0).astimezone(tz=None),
] * 2
day_ends_post = [day_time + timedelta(hours=6) for day_time in day_starts_post]

disp_list_post = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_post, day_ends_post, focal, exp_days
    )
]

In [None]:
cfel13_enc = calculate_encounters_cluster(
    disp_list_exp, disp_list_control, disp_list_pre, disp_list_post, "Cfel13"
)

In [None]:
cfel13_enc.to_csv("Cfel13_AllAnts_Focal_Encounters.csv", index=False)

## Colony Cfel 55

In [None]:
f_myrmidon = "/media/ebiag/Ebi-3/InfectionExp_Cfel55/InfectionExpCol55.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
# Focal Ants
focal = [30, 36, 44, 53, 55, 72, 15, 57, 67, 81]
exp_days = [1, 2, 3, 4, 5] * 2
# Experimental Phase list
day_starts_exp = [
    datetime(2023, 4, 20, 15, 45).astimezone(tz=None),
    datetime(2023, 4, 21, 14, 48).astimezone(tz=None),
    datetime(2023, 4, 22, 14, 17).astimezone(tz=None),
    datetime(2023, 4, 23, 14, 0).astimezone(tz=None),
    datetime(2023, 4, 24, 14, 54).astimezone(tz=None),
] * 2
day_ends_exp = [day_time + timedelta(hours=6) for day_time in day_starts_exp]

disp_list_exp = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_exp, day_ends_exp, focal, exp_days)
]

# Control phase list
day_starts_control = list(
    np.repeat(datetime(2023, 4, 18, 14, 40).astimezone(tz=None), 10)
)
day_ends_control = [day_time + timedelta(hours=6) for day_time in day_starts_control]

disp_list_control = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_control, day_ends_control, focal, exp_days
    )
]

# Pre Experimental phase list
day_starts_pre = [
    datetime(2023, 4, 20, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 21, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 22, 7, 30).astimezone(tz=None),
    datetime(2023, 4, 23, 7, 30).astimezone(tz=None),
    datetime(2023, 4, 24, 8, 0).astimezone(tz=None),
] * 2
day_ends_pre = [day_time + timedelta(hours=6) for day_time in day_starts_pre]

disp_list_pre = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_pre, day_ends_pre, focal, exp_days)
]

# Post experimental phase list
day_starts_post = [
    datetime(2023, 4, 21, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 22, 7, 30).astimezone(tz=None),
    datetime(2023, 4, 23, 7, 30).astimezone(tz=None),
    datetime(2023, 4, 24, 8, 0).astimezone(tz=None),
    datetime(2023, 4, 25, 8, 0).astimezone(tz=None),
] * 2
day_ends_post = [day_time + timedelta(hours=6) for day_time in day_starts_post]

disp_list_post = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_post, day_ends_post, focal, exp_days
    )
]

In [None]:
cfel55_enc = calculate_encounters_cluster(
    disp_list_exp, disp_list_control, disp_list_pre, disp_list_post, "Cfel55"
)

In [None]:
cfel55_enc.to_csv("Cfel55_AllAnts_Focal_Encounters.csv", index=False)

## Colony Cfel 64

In [None]:
f_myrmidon = "/media/ebiag/Ebi-1/InfectionExp_Cfel64/InfectionExpCol64.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
# Focal Ants
focal = [6, 104, 115, 78, 59, 32, 86, 1, 38, 3]
exp_days = [1, 2, 3, 4, 5] * 2
# Experimental Phase list
day_starts_exp = [
    datetime(2023, 6, 1, 15, 51).astimezone(tz=None),
    datetime(2023, 6, 2, 14, 44).astimezone(tz=None),
    datetime(2023, 6, 3, 14, 50).astimezone(tz=None),
    datetime(2023, 6, 4, 14, 43).astimezone(tz=None),
    datetime(2023, 6, 5, 14, 52).astimezone(tz=None),
] * 2
day_ends_exp = [day_time + timedelta(hours=6) for day_time in day_starts_exp]

disp_list_exp = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_exp, day_ends_exp, focal, exp_days)
]

# Control phase list
day_starts_control = list(
    np.repeat(datetime(2023, 5, 31, 15, 5).astimezone(tz=None), 10)
)
day_ends_control = [day_time + timedelta(hours=6) for day_time in day_starts_control]

disp_list_control = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_control, day_ends_control, focal, exp_days
    )
]

# Pre Experimental phase list
day_starts_pre = [
    datetime(2023, 6, 1, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 2, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 3, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 4, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 5, 8, 0).astimezone(tz=None),
] * 2
day_ends_pre = [day_time + timedelta(hours=6) for day_time in day_starts_pre]

disp_list_pre = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(day_starts_pre, day_ends_pre, focal, exp_days)
]

# Post experimental phase list
day_starts_post = [
    datetime(2023, 6, 2, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 3, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 4, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 5, 8, 0).astimezone(tz=None),
    datetime(2023, 6, 6, 8, 0).astimezone(tz=None),
] * 2
day_ends_post = [day_time + timedelta(hours=6) for day_time in day_starts_post]

disp_list_post = [
    (start, end, exp, focal, exp_day, encounter_threshold, away_threshold)
    for start, end, focal, exp_day in zip(
        day_starts_post, day_ends_post, focal, exp_days
    )
]

In [None]:
cfel64_enc = calculate_encounters_cluster(
    disp_list_exp, disp_list_control, disp_list_pre, disp_list_post, "Cfel64"
)

In [None]:
cfel64_enc.to_csv("Cfel64_AllAnts_Focal_Encounters.csv", index=False)

### Optimised code

In [None]:
def encounter_duration_optimized(
    displacement_dataset, encounter_threshold, away_threshold
):
    """Function to calculate duration of encounters between a focal ant and another individual.
    Optimized version with faster NumPy operations and reduced overhead.

    Args:
        displacement_dataset (pandas.DataFrame): A pandas dataframe containing at least Timestamps and displacement between 2 ants at each timestamp.
        encounter_threshold (int): The value of displacement for the encounter threshold
        away_threshold (int): The value of displacement for the away threshold

    Returns:
        enc_df(pandas.DataFrame): A dataframe containing encounter metrics
    """
    # Quick check for minimum dataset size
    if len(displacement_dataset) < 3:
        return pd.DataFrame(
            {
                "enc_number": [0],
                "enc_start_time": [np.nan],
                "enc_duration": [0.0],
                "enc_sequences": [0],
                "enc_sequences_duration": [0.0],
            }
        )

    # Work with copies of only the columns we need
    disp_values = displacement_dataset["disp"].values
    # time_values = displacement_dataset["Time"].values
    index_values = displacement_dataset.index.values

    # Interpolate missing values only if there are NaNs
    if np.isnan(disp_values).any():
        valid_indices = ~np.isnan(disp_values)
        if np.any(valid_indices):  # Make sure there's at least one valid value
            valid_idx = np.where(valid_indices)[0]
            valid_disp = disp_values[valid_indices]
            # Create interpolator function
            interp_indices = np.arange(len(disp_values))
            disp_values = np.interp(interp_indices, valid_idx, valid_disp)

    # Create encounter dummy values using NumPy's faster conditional selection
    # 1.0: within encounter threshold, 0.5: between encounter and away, 0.0: beyond away threshold
    enc_dummy = np.select(
        [
            disp_values <= encounter_threshold,
            (disp_values > encounter_threshold) & (disp_values <= away_threshold),
        ],
        [1.0, 0.5],
        default=0.0,
    )

    # Early check if no encounters exist
    if 1.0 not in enc_dummy:
        return pd.DataFrame(
            {
                "enc_number": [0],
                "enc_start_time": [np.nan],
                "enc_duration": [0.0],
                "enc_sequences": [0],
                "enc_sequences_duration": [0.0],
            }
        )

    # Find indices where ants are beyond away threshold (dummy=0)
    away_indices = np.where(enc_dummy == 0.0)[0]

    # If no away indices or too few, handle edge case
    if len(away_indices) < 2:
        if len(away_indices) == 0 and 1.0 in enc_dummy:
            # Special case: entire sequence might be an encounter without an "away" period
            # Find first occurrence of value 1
            first_encounter = np.where(enc_dummy == 1.0)[0][0]
            start_time = displacement_dataset.loc[index_values[first_encounter], "Time"]
            end_time = displacement_dataset.loc[index_values[-1], "Time"]
            duration = (end_time - start_time).total_seconds()

            # Count sequences where dummy=1 (continuous stretches)
            switches = np.diff(np.concatenate(([0], enc_dummy == 1.0, [0])))
            enc_sequences = np.sum(switches == 1)

            # Calculate total duration within encounter threshold
            total_encounter_time = (
                duration if enc_dummy[first_encounter:].all() else np.nan
            )

            return pd.DataFrame(
                {
                    "enc_number": [1],
                    "enc_start_time": [start_time],
                    "enc_duration": [duration],
                    "enc_sequences": [enc_sequences],
                    "enc_sequences_duration": [total_encounter_time],
                }
            )
        else:
            return pd.DataFrame(
                {
                    "enc_number": [0],
                    "enc_start_time": [np.nan],
                    "enc_duration": [0.0],
                    "enc_sequences": [0],
                    "enc_sequences_duration": [0.0],
                }
            )

    # Ensure first away index is at the start if needed
    if away_indices[0] > 0:
        away_indices = np.insert(away_indices, 0, 0)

    # Make pairs of consecutive away indices
    away_indices_pair = list(zip(away_indices, away_indices[1:]))

    # Store output data
    segments = []
    segment_details = []
    segment_start_times = []
    segment_end_times = []
    segment_durations = []
    segment_encounter_counts = []
    segment_encounter_durations = []

    # Process each segment between consecutive away points
    for start_idx, end_idx in away_indices_pair:
        if end_idx - start_idx <= 1:  # Skip if segment is too small
            continue

        segment = enc_dummy[start_idx:end_idx]

        if 1.0 not in segment:  # Skip if no encounters in segment
            continue

        segments.append(segment)
        segment_details.append((start_idx, end_idx))

        # Get segment start time (first encounter index)
        first_encounter_offset = np.where(segment == 1.0)[0][0]
        first_encounter_idx = start_idx + first_encounter_offset
        segment_start_time = displacement_dataset.loc[
            index_values[first_encounter_idx], "Time"
        ]
        segment_start_times.append(segment_start_time)

        # Get segment end time (last index in segment)
        segment_end_time = displacement_dataset.loc[index_values[end_idx - 1], "Time"]
        segment_end_times.append(segment_end_time)

        # Calculate segment duration
        segment_duration = (segment_end_time - segment_start_time).total_seconds()
        segment_durations.append(segment_duration)

        # FIXED: Calculate encounter sub-sequences properly
        # Find sub-sequences where ants are within encounter threshold (dummy=1)
        segment_with_borders = np.concatenate(([0.5], segment, [0.5]))
        transitions = np.diff(segment_with_borders)

        # Specifically identify transitions TO and FROM value 1.0
        transitions_to_one = np.where(transitions > 0)[
            0
        ]  # Identifies 0.5→1.0 transitions
        transitions_from_one = np.where(transitions < 0)[
            0
        ]  # Identifies 1.0→0.5 transitions

        # If the first element is already 1.0, insert a starting transition
        if segment[0] == 1.0:
            transitions_to_one = np.insert(transitions_to_one, 0, 0)

        # If the last element is 1.0, add a final transition
        if segment[-1] == 1.0:
            transitions_from_one = np.append(transitions_from_one, len(segment))

        # Ensure we have matching transition pairs by taking the minimum length
        subsequence_count = min(len(transitions_to_one), len(transitions_from_one))
        total_subsequence_duration = 0.0

        for j in range(subsequence_count):
            # Account for concatenation offset in to_one transitions
            subseq_start_idx = start_idx + transitions_to_one[j]

            # Account for concatenation and inclusive end in from_one transitions
            subseq_end_idx = (
                start_idx + transitions_from_one[j] - 1
            )  # -1 for inclusive end

            if subseq_end_idx >= subseq_start_idx:
                subseq_start_time = displacement_dataset.loc[
                    index_values[subseq_start_idx], "Time"
                ]
                subseq_end_time = displacement_dataset.loc[
                    index_values[subseq_end_idx], "Time"
                ]
                subseq_duration = (subseq_end_time - subseq_start_time).total_seconds()
                total_subsequence_duration += subseq_duration

        segment_encounter_counts.append(subsequence_count)
        segment_encounter_durations.append(total_subsequence_duration)

    # Make sure all arrays have the same length
    num_segments = len(segments)

    if num_segments == 0:
        return pd.DataFrame(
            {
                "enc_number": [0],
                "enc_start_time": [np.nan],
                "enc_duration": [0.0],
                "enc_sequences": [0],
                "enc_sequences_duration": [0.0],
            }
        )

    # Verify all arrays have the same length
    arrays = [
        segment_start_times,
        segment_durations,
        segment_encounter_counts,
        segment_encounter_durations,
    ]
    if not all(len(arr) == num_segments for arr in arrays):
        # Debug information
        print(
            f"Array length mismatch: segments={num_segments}, "
            f"start_times={len(segment_start_times)}, "
            f"durations={len(segment_durations)}, "
            f"encounter_counts={len(segment_encounter_counts)}, "
            f"encounter_durations={len(segment_encounter_durations)}"
        )

        # Ensure all arrays have the same length by truncating to shortest
        min_length = min(len(arr) for arr in arrays)
        segment_start_times = segment_start_times[:min_length]
        segment_durations = segment_durations[:min_length]
        segment_encounter_counts = segment_encounter_counts[:min_length]
        segment_encounter_durations = segment_encounter_durations[:min_length]

    # Create encounter numbers
    enc_numbers = np.arange(1, len(segment_start_times) + 1)

    # Create output dataframe
    enc_df = pd.DataFrame(
        {
            "enc_number": enc_numbers,
            "enc_start_time": segment_start_times,
            "enc_duration": segment_durations,
            "enc_sequences": segment_encounter_counts,
            "enc_sequences_duration": segment_encounter_durations,
        }
    )

    return enc_df
    # # Create pairs of consecutive away indices
    # # This identifies segments bounded by "away" states
    # segments = []
    # segment_start_times = []
    # segment_end_times = []
    # segment_encounter_counts = []
    # segment_encounter_durations = []

    # # Process each segment bounded by "away" periods
    # for i in range(len(away_indices) - 1):
    #     start_idx = away_indices[i] + 1
    #     end_idx = away_indices[i+1]

    #     # Skip if segment is too small
    #     if end_idx - start_idx <= 1:
    #         continue

    #     # Get the segment of dummy values
    #     segment = enc_dummy[start_idx:end_idx]

    #     # Check if segment contains any encounters (dummy=1)
    #     if 1.0 not in segment:
    #         continue

    #     segments.append((start_idx, end_idx))

    #     # Find first encounter (dummy=1) within segment
    #     first_encounter_offset = np.where(segment == 1.0)[0][0]
    #     first_encounter_idx = start_idx + first_encounter_offset

    #     # Get start and end times for the segment
    #     start_time = displacement_dataset.loc[index_values[first_encounter_idx], "Time"]
    #     end_time = displacement_dataset.loc[index_values[end_idx-1], "Time"]

    #     segment_start_times.append(start_time)
    #     segment_end_times.append(end_time)

    #     # Calculate overall duration
    #     duration = (end_time - start_time).total_seconds()

    #     # Find sub-sequences where ants are within encounter threshold (dummy=1)
    #     # We need to specifically identify transitions TO and FROM value 1.0
    #     segment_with_borders = np.concatenate(([0.5], segment, [0.5]))
    #     transitions = np.diff(segment_with_borders)
    #     transitions_to_one = np.where(transitions > 0)[0]   # Identifies 0.5→1.0
    #     transitions_from_one = np.where(transitions < 0)[0] # Identifies 1.0→0.5

    #     # Ensure we have matching transition pairs
    #     subsequence_count = min(len(transitions_to_one), len(transitions_from_one))
    #     total_subsequence_duration = 0.0
    #     subseq_durations = []

    #     for j in range(subsequence_count):
    #         # Get indices for transitions TO encounter and FROM encounter
    #         subseq_start_idx = start_idx + transitions_to_one[j]
    #         subseq_end_idx = start_idx + transitions_from_one[j] - 1  # -1 for inclusive end

    #         if subseq_end_idx >= subseq_start_idx:
    #             subseq_start_time = displacement_dataset.loc[index_values[subseq_start_idx], "Time"]
    #             subseq_end_time = displacement_dataset.loc[index_values[subseq_end_idx], "Time"]
    #             subseq_duration = (subseq_end_time - subseq_start_time).total_seconds()
    #             subseq_durations.append(subseq_duration)
    #             total_subsequence_duration += subseq_duration

    # # If no valid segments were found
    # if not segments:
    #     return pd.DataFrame({
    #         "enc_number": [0], "enc_start_time": [np.nan], "enc_duration": [0.0],
    #         "enc_sequences": [0], "enc_sequences_duration": [0.0]
    #     })

    # # Calculate segment durations
    # segment_durations = [(end_time - start_time).total_seconds()
    #                      for start_time, end_time in zip(segment_start_times, segment_end_times)]

    # # Create encounter numbers
    # enc_numbers = np.arange(1, len(segments) + 1)

    # # Create output dataframe directly with all data
    # enc_df = pd.DataFrame({
    #     "enc_number": enc_numbers,
    #     "enc_start_time": segment_start_times,
    #     "enc_duration": segment_durations,
    #     "enc_sequences": segment_encounter_counts,
    #     "enc_sequences_duration": segment_encounter_durations
    # })

    # return enc_df

In [None]:
def focal_encounters(
    start_time, end_time, exp, focal_ID, exp_day, encounter_threshold, away_threshold
):
    """
    Function to obtain trajectories for focal and caregiver antIDs, merge by time and calculate displacement of each caregiver ID from the focal ID at every second
    :param start_time: Starting time to obtain trajectories from. Passed on to function trajectory_output
    :param end_time: Ending time to obtain trajectories from. Passed on to function trajectory_output
    :param exp: Location of myrmidon file
    :param focal_ID: Injured AntID
    :param exp_day: Day of the experiment. This is added to the dataframe for identification
    :param encounter_threshold: Threshold displacement to use as encounter
    :param away_threshold: Threshold displacement to use as the start/end of an encounter
    :return: Returns a datafarme containing the Time (in bins of 1s based on function trajectory_output), the focal and caregiver ID, the space in which the focal and caregiver ants are present, and the displacement between them (calculated as np.nan if they are in different spaces. In a CSV output this will be converted to a blank entry).
    """
    start = datetime.now()
    # # Focal Ant matcher
    # focal_matcher = fm.Matcher.AntID(focal_ID)
    # # Caregiver individual matchers
    # # others = [fm.Matcher.AntID(x) for x in other_IDs]
    # # Create single matcher object by unpacking the list within an Or Matcher
    # #others_matcher = fm.Matcher.Or(*others)
    # # Focal Ant trajectory
    # focal_traj = trajectory_output(start_time, end_time, exp, focal_matcher)
    # All ant trajectories
    other_traj = trajectory_output_all(start_time, end_time, exp)
    # Focal ant trajectory
    focal_traj = other_traj[other_traj["AntID"] == focal_ID]
    # Sort Time column for both dataframes
    other_traj = other_traj.sort_values("Time")
    focal_traj = focal_traj.sort_values("Time")

    # If focal trajectory is an empty dataframe, create a dataframe with na values for encounter parameters
    if focal_traj.empty:
        full_traj = other_traj.rename(columns={"Space": "Space_ant"})
        full_traj["focalID"] = focal_ID
        full_traj["Space_focal"] = full_traj["disp"] = (
            np.nan
        )  # Create columns with na values
        full_traj = full_traj[
            ["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]
        ]
        full_traj["exp_day"] = exp_day
        full_traj = full_traj[full_traj["focalID"] != full_traj["AntID"]].reset_index()
        # Group data frame, create columns with na and output encounter dataframe
        enc_df = (
            full_traj.groupby(["exp_day", "focalID", "AntID"])
            .apply(lambda x: pd.Series([np.nan] * 5))
            .reset_index()
            .rename(
                columns={
                    0: "enc_number",
                    1: "enc_start_time",
                    2: "enc_duration",
                    3: "enc_sequences",
                    4: "enc_sequences_duration",
                }
            )
        )
        print(
            f"{'Focal ID trajectory is empty for list item '}{exp_day}{' .Returning dataframe with no displacement and encounters calculated'}"
        )
        return enc_df
    # If trajectory of all other individuals is an empty dataframe, create a dataframe with na values for encounter parameters
    if other_traj.empty:
        full_traj = focal_traj.rename(
            columns={"AntID": "focalID", "Space": "Space_focal"}
        )
        full_traj["AntID"] = full_traj["Space_ant"] = full_traj["disp"] = (
            np.nan
        )  # Create columns with na values
        full_traj = full_traj[
            ["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]
        ]
        full_traj["exp_day"] = exp_day
        # Group data frame, create columns with na and output encounter dataframe
        enc_df = (
            full_traj.groupby(["exp_day", "focalID", "AntID"])
            .apply(lambda x: pd.Series([np.nan] * 5))
            .reset_index()
            .rename(
                columns={
                    0: "enc_number",
                    1: "enc_start_time",
                    2: "enc_duration",
                    3: "enc_sequences",
                    4: "enc_sequences_duration",
                }
            )
        )
        print(
            f"{'Caregiver ID trajectories are empty for list item '}{exp_day}{' .Returning dataframe with no displacement and encounters calculated'}"
        )
        return enc_df

    # Merge focal and caregiver trajectories on Time column using merge_asof to match nearest time values
    full_traj = pd.merge_asof(
        other_traj,
        focal_traj,
        on="Time",
        suffixes=("_ant", "_focal"),
        direction="nearest",
        tolerance=pd.Timedelta("1s"),
    )
    # Obtain X coordinate and Y coordinate difference between Focal and Caregivers, for each row
    full_traj["X_diff"] = full_traj["X_coor_focal"] - full_traj["X_coor_ant"]
    full_traj["Y_diff"] = full_traj["Y_coor_focal"] - full_traj["Y_coor_ant"]
    # Obtain displacement
    full_traj["disp"] = np.linalg.norm(
        full_traj[["X_diff", "Y_diff"]].to_numpy(), axis=1
    )
    # Rename columns
    full_traj = full_traj.rename(
        columns={"AntID_focal": "focalID", "AntID_ant": "AntID"}
    )
    # Subset specific columns
    full_traj = full_traj[
        ["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]
    ]
    # Add experimental day
    full_traj["exp_day"] = exp_day
    # Remove instances where the focal ant's displacement is calculated wrt itself.
    full_traj = full_traj[full_traj["focalID"] != full_traj["AntID"]].reset_index(
        drop=True
    )
    # Replace with arbitrary high value of displacemeent if focal ant and caregiver are in different spaces. Use notnull to filter out instances where focal or caregiver space is not known. The higgh value will ensure that this case is always considered as > away_threshold in count_encounters function
    full_traj.loc[
        (
            (full_traj.Space_focal.notnull())
            & (full_traj.Space_ant.notnull())
            & (full_traj.Space_focal != full_traj.Space_ant)
        ),
        "disp",
    ] = 100000
    # Apply encounter_duration function over grouped dataframe, reset index and rename columns
    enc_df = (
        full_traj.groupby(["exp_day", "focalID", "AntID"])
        .apply(
            lambda x: encounter_duration_optimized(
                x, encounter_threshold, away_threshold
            )
        )
        .reset_index()
        .drop("level_3", axis=1)
    )
    end = datetime.now()
    print(
        f"{'Encounters for experimental day '}{exp_day}{' calculated in '}{end - start}"
    )
    return enc_df

In [23]:
f_myrmidon = "/media/ebiag/Ebi-2/Woundcare Experiment1/Cfell_wound_col42.myrmidon"
exp = fm.Experiment.Open(f_myrmidon)
focal_ID = 106
start_time = datetime(2022, 5, 2, 16, 3).astimezone(tz=None)
end_time = start_time + timedelta(hours=6)

In [None]:
other_traj = trajectory_output_all(start_time, end_time, exp)
# Focal ant trajectory
focal_traj = other_traj[other_traj["AntID"] == focal_ID]
# Sort Time column for both dataframes
other_traj = other_traj.sort_values("Time")
focal_traj = focal_traj.sort_values("Time")
# Merge focal and caregiver trajectories on Time column using merge_asof to match nearest time values
full_traj = pd.merge_asof(
    other_traj,
    focal_traj,
    on="Time",
    suffixes=("_ant", "_focal"),
    direction="nearest",
    tolerance=pd.Timedelta("1s"),
)
# Obtain X coordinate and Y coordinate difference between Focal and Caregivers, for each row
full_traj["X_diff"] = full_traj["X_coor_focal"] - full_traj["X_coor_ant"]
full_traj["Y_diff"] = full_traj["Y_coor_focal"] - full_traj["Y_coor_ant"]
# Obtain displacement
full_traj["disp"] = np.linalg.norm(full_traj[["X_diff", "Y_diff"]].to_numpy(), axis=1)
# Rename columns
full_traj = full_traj.rename(columns={"AntID_focal": "focalID", "AntID_ant": "AntID"})
# Subset specific columns
full_traj = full_traj[["Time", "focalID", "AntID", "disp", "Space_focal", "Space_ant"]]
# Add experimental day
full_traj["exp_day"] = 1
# Remove instances where the focal ant's displacement is calculated wrt itself.
full_traj = full_traj[full_traj["focalID"] != full_traj["AntID"]].reset_index(drop=True)
# Replace with arbitrary high value of displacemeent if focal ant and caregiver are in different spaces. Use notnull to filter out instances where focal or caregiver space is not known. The higgh value will ensure that this case is always considered as > away_threshold in count_encounters function
full_traj.loc[
    (
        (full_traj.Space_focal.notnull())
        & (full_traj.Space_ant.notnull())
        & (full_traj.Space_focal != full_traj.Space_ant)
    ),
    "disp",
] = 100000

In [25]:
# Apply encounter_duration function over grouped dataframe, reset index and rename columns
enc_df = (
    full_traj.groupby(["exp_day", "focalID", "AntID"])
    .apply(lambda x: encounter_duration(x, encounter_threshold, away_threshold))
    .reset_index()
    .drop("level_3", axis=1)
)

In [None]:
# Apply encounter_duration function over grouped dataframe, reset index and rename columns
enc_df_optimised = (
    full_traj.groupby(["exp_day", "focalID", "AntID"])
    .apply(
        lambda x: encounter_duration_optimized(x, encounter_threshold, away_threshold)
    )
    .reset_index()
    .drop("level_3", axis=1)
)

In [27]:
# Compare enc_df and enc_df optimised to see whether they match up
enc_df_merged = enc_df.merge(
    enc_df_optimised,
    on=["exp_day", "focalID", "AntID"],
    suffixes=("_original", "_optimized"),
)

In [None]:
# Example test
ant_id = 1  # Use a specific ant ID that shows differences
focal_id = 106  # Use your focal ant ID

# Get data for just this pair
pair_data = full_traj[
    (full_traj["AntID"] == ant_id) & (full_traj["focalID"] == focal_id)
].copy()

# Run both functions
orig_result = encounter_duration(pair_data, encounter_threshold, away_threshold)
optimised_result = encounter_duration_optimized(
    pair_data, encounter_threshold, away_threshold
)


In [None]:
displacement_dataset = full_traj[
    (full_traj["AntID"] == ant_id) & (full_traj["focalID"] == focal_id)
].copy()
# Work with copies of only the columns we need
disp_values = displacement_dataset["disp"].values
# time_values = displacement_dataset["Time"].values
index_values = displacement_dataset.index.values
# Interpolate missing values only if there are NaNs
if np.isnan(disp_values).any():
    print("NA values")
    valid_indices = ~np.isnan(disp_values)
    if np.any(valid_indices):  # Make sure there's at least one valid value
        valid_idx = np.where(valid_indices)[0]
        valid_disp = disp_values[valid_indices]
        # Create interpolator function
        interp_indices = np.arange(len(disp_values))
        disp_values = np.interp(interp_indices, valid_idx, valid_disp)

In [None]:
# Create encounter dummy values using NumPy's faster conditional selection
# 1.0: within encounter threshold, 0.5: between encounter and away, 0.0: beyond away threshold
enc_dummy = np.select(
    [
        disp_values <= encounter_threshold,
        (disp_values > encounter_threshold) & (disp_values <= away_threshold),
    ],
    [1.0, 0.5],
    default=0.0,
)

In [47]:
# Find indices where ants are beyond away threshold (dummy=0)
away_indices = np.where(enc_dummy == 0.0)[0]
if away_indices[0] > 0:
    away_indices = np.insert(away_indices, 0, 0)

In [51]:
# Make pairs of consecutive away indices
away_indices_pair = list(zip(away_indices, away_indices[1:]))

# Store output data
segments = []
segment_details = []
segment_start_times = []
segment_end_times = []
segment_durations = []
segment_encounter_counts = []
segment_encounter_durations = []

In [None]:
# Process each segment between consecutive away points
for start_idx, end_idx in away_indices_pair:
    if end_idx - start_idx <= 1:  # Skip if segment is too small
        continue
    segment = enc_dummy[start_idx:end_idx]

    if 1.0 not in segment:  # Skip if no encounters in segment
        continue

    segments.append(segment)
    segment_details.append((start_idx, end_idx))

    # Get segment start time (first encounter index)
    first_encounter_offset = np.where(segment == 1.0)[0][0]
    first_encounter_idx = start_idx + first_encounter_offset
    segment_start_time = displacement_dataset.loc[
        index_values[first_encounter_idx], "Time"
    ]
    segment_start_times.append(segment_start_time)

    # Get segment end time (last index in segment)
    segment_end_time = displacement_dataset.loc[index_values[end_idx - 1], "Time"]
    segment_end_times.append(segment_end_time)

    # Calculate segment duration
    segment_duration = (segment_end_time - segment_start_time).total_seconds()
    segment_durations.append(segment_duration)

    # FIXED: Calculate encounter sub-sequences properly
# Directly mimic the original time_within_encounter_threshold function
# Add 0.5 to beginning and end for boundary detection
segment_with_borders = np.concatenate(([0.5], segment, [0.5]))
differences = np.diff(segment_with_borders)
change_indices = np.where(differences != 0)[0]

# Debug output
print(f"Segment with borders: {segment_with_borders}")
print(f"Differences: {differences}")
print(f"Change indices: {change_indices}")

# The original function pairs indices differently
# It groups them by even/odd positions
start_indices = []
end_indices = []

# Only proceed if we have enough change points
if len(change_indices) >= 2:
    # Get every even-indexed change point (0, 2, 4...)
    for i in range(0, len(change_indices), 2):
        if i + 1 < len(change_indices):
            # Convert from change index to segment index (adjusting for the added 0.5 at start)
            start_idx_in_segment = change_indices[i] - 1 + start_idx
            # The original code does "-1" to get inclusive end
            end_idx_in_segment = change_indices[i + 1] - 1 - 1 + start_idx

            # Only keep pairs where start≤end and the segment contains value 1.0
            if end_idx_in_segment >= start_idx_in_segment:
                # Check if this range includes encounter values
                segment_slice = segment[
                    change_indices[i] - 1 : change_indices[i + 1] - 1
                ]
                if 1.0 in segment_slice:
                    start_indices.append(start_idx_in_segment)
                    end_indices.append(end_idx_in_segment)

subsequence_count = len(start_indices)
total_subsequence_duration = 0.0

# Calculate duration for each subsequence
for j in range(subsequence_count):
    subseq_start_idx = start_indices[j]
    subseq_end_idx = end_indices[j]

    subseq_start_time = displacement_dataset.loc[index_values[subseq_start_idx], "Time"]
    subseq_end_time = displacement_dataset.loc[index_values[subseq_end_idx], "Time"]
    subseq_duration = (subseq_end_time - subseq_start_time).total_seconds()
    total_subsequence_duration += subseq_duration

segment_encounter_counts.append(subsequence_count)
segment_encounter_durations.append(total_subsequence_duration)

Segment with borders: [0.5 0.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.  1.
 1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0

IndexError: index 79994 is out of bounds for axis 0 with size 79964

In [59]:
print(f"Number of segments: {len(segments)}")
print(f"Segment start times: {segment_start_times}")
print(f"Segment end times: {segment_end_times}")
print(f"Segment durations: {segment_durations}")
print(f"Segment encounter counts: {segment_encounter_counts}")
print(f"Segment encounter durations: {segment_encounter_durations}")

Number of segments: 42
Segment start times: [Timestamp('2022-05-02 16:17:17.602161'), Timestamp('2022-05-02 17:44:48.659337'), Timestamp('2022-05-02 18:50:05.672149'), Timestamp('2022-05-02 20:00:49.707241'), Timestamp('2022-05-02 21:26:30.163532'), Timestamp('2022-05-02 21:32:26.967445'), Timestamp('2022-05-02 21:57:31.583962'), Timestamp('2022-05-02 16:17:17.602161'), Timestamp('2022-05-02 17:44:48.659337'), Timestamp('2022-05-02 18:50:05.672149'), Timestamp('2022-05-02 20:00:49.707241'), Timestamp('2022-05-02 21:26:30.163532'), Timestamp('2022-05-02 21:32:26.967445'), Timestamp('2022-05-02 21:57:31.583962'), Timestamp('2022-05-02 16:17:17.602161'), Timestamp('2022-05-02 17:44:48.659337'), Timestamp('2022-05-02 18:50:05.672149'), Timestamp('2022-05-02 20:00:49.707241'), Timestamp('2022-05-02 21:26:30.163532'), Timestamp('2022-05-02 21:32:26.967445'), Timestamp('2022-05-02 21:57:31.583962'), Timestamp('2022-05-02 16:17:17.602161'), Timestamp('2022-05-02 17:44:48.659337'), Timestamp('2

In [None]:
# Sample data - replace this with your actual data for the focal 106, ant ID 1 case
# Let's assume pair_data is already defined with your real data

print("===== STEP-BY-STEP COMPARISON =====\n")

# Make copies to avoid modifying the original data
original_data = pair_data.copy()
optimized_data = pair_data.copy()

print(f"Input dataset shape: {pair_data.shape}")
print(f"First few rows of input data:\n{pair_data.head()}\n")

# STEP 1: Check if dataset is too small
print("STEP 1: Check for minimum dataset size")
if len(pair_data) < 3:
    print("Dataset too small, both functions will return empty result")
else:
    print(f"Dataset has {len(pair_data)} rows, proceeding with analysis")

# STEP 2: Interpolate missing values
print("\nSTEP 2: Interpolate missing displacement values")

# Original version
print("Original: Using pandas interpolate with forward fill")
nan_before_orig = original_data["disp"].isna().sum()
print(f"Original: NaN values before interpolation: {nan_before_orig}")

original_data["disp"].interpolate(
    method="linear", limit_direction="forward", inplace=True
)

nan_after_orig = original_data["disp"].isna().sum()
print(f"Original: NaN values after interpolation: {nan_after_orig}")

# Optimized version
print("\nOptimized: Using NumPy interpolation")
disp_values = optimized_data["disp"].values
index_values = optimized_data.index.values

nan_before_opt = np.isnan(disp_values).sum()
print(f"Optimized: NaN values before interpolation: {nan_before_opt}")

if np.isnan(disp_values).any():
    valid_indices = ~np.isnan(disp_values)
    if np.any(valid_indices):
        valid_idx = np.where(valid_indices)[0]
        valid_disp = disp_values[valid_indices]
        interp_indices = np.arange(len(disp_values))
        disp_values = np.interp(interp_indices, valid_idx, valid_disp)

nan_after_opt = np.isnan(disp_values).sum()
print(f"Optimized: NaN values after interpolation: {nan_after_opt}")

# Compare interpolated values
print("\nComparison of interpolated values (first 5):")
print(f"Original: {original_data['disp'].head().values}")
print(f"Optimized: {disp_values[:5]}")

if not np.allclose(
    original_data["disp"].head().values, disp_values[:5], equal_nan=True
):
    print("❌ DIFFERENCE DETECTED: Interpolated values don't match")
else:
    print("✓ Interpolated values match")

# STEP 3: Create encounter dummy column
print("\nSTEP 3: Create encounter dummy values")

# Original version
print("Original: Using pd.cut")
original_data.loc[:, ["enc_dummy"]] = pd.cut(
    original_data.disp,
    [0, encounter_threshold, away_threshold, np.inf],
    labels=[1, 0.5, 0],
)
original_data = original_data.astype({"enc_dummy": float})

# Optimized version
print("Optimized: Using np.select")
enc_dummy = np.select(
    [
        disp_values <= encounter_threshold,
        (disp_values > encounter_threshold) & (disp_values <= away_threshold),
    ],
    [1.0, 0.5],
    default=0.0,
)

# Compare dummy value counts
print("\nDummy value counts:")
orig_counts = original_data["enc_dummy"].value_counts().to_dict()
opt_counts = pd.Series(enc_dummy).value_counts().to_dict()
print(f"Original: {orig_counts}")
print(f"Optimized: {opt_counts}")

# Compare the first 5 values
print("\nFirst 5 dummy values:")
print(f"Original: {original_data['enc_dummy'].head().values}")
print(f"Optimized: {enc_dummy[:5]}")

if not np.allclose(original_data["enc_dummy"].head().values, enc_dummy[:5]):
    print("❌ DIFFERENCE DETECTED: Dummy values don't match")
else:
    print("✓ Dummy values match")

# STEP 4: Check if encounters exist
print("\nSTEP 4: Check if encounters exist")

# Original version
has_encounters_orig = 1 in original_data.enc_dummy.values
print(f"Original: Has encounters: {has_encounters_orig}")

# Optimized version
has_encounters_opt = 1.0 in enc_dummy
print(f"Optimized: Has encounters: {has_encounters_opt}")

if has_encounters_orig != has_encounters_opt:
    print("❌ DIFFERENCE DETECTED: Encounter existence check doesn't match")
else:
    print("✓ Encounter existence check matches")

if not has_encounters_orig:
    print("No encounters detected, both functions will return empty result")
    # Exit here

# STEP 5: Find 'away' indices
print("\nSTEP 5: Find indices where ants are beyond away threshold (dummy=0)")

# Original version
away_indices_orig = original_data[original_data["enc_dummy"] == 0].index.values
print(f"Original: Found {len(away_indices_orig)} 'away' indices")
if len(away_indices_orig) > 0:
    print(
        f"Original: First few away indices: {away_indices_orig[:5] if len(away_indices_orig) >= 5 else away_indices_orig}"
    )

# Optimized version
away_indices_opt = np.where(enc_dummy == 0.0)[0]
print(f"Optimized: Found {len(away_indices_opt)} 'away' indices")
if len(away_indices_opt) > 0:
    print(
        f"Optimized: First few away indices: {away_indices_opt[:5] if len(away_indices_opt) >= 5 else away_indices_opt}"
    )

    # Map optimized indices to original indices for comparison
    mapped_indices = index_values[away_indices_opt]
    print(
        f"Optimized (mapped to original indices): {mapped_indices[:5] if len(mapped_indices) >= 5 else mapped_indices}"
    )

if len(away_indices_orig) != len(away_indices_opt):
    print(
        f"❌ DIFFERENCE DETECTED: Number of 'away' indices doesn't match ({len(away_indices_orig)} vs {len(away_indices_opt)})"
    )
else:
    print("✓ Number of 'away' indices matches")

# Step 6: Insert starting index if needed
print("\nSTEP 6: Insert starting index if needed")

# Original version
first_index_orig = np.take(original_data.index.values, 0)
print(f"Original: First index in dataset is {first_index_orig}")

if away_indices_orig.size > 0 and away_indices_orig[0] != first_index_orig:
    print(f"Original: Inserting starting index {first_index_orig}")
    away_indices_orig = np.insert(away_indices_orig, 0, first_index_orig)
else:
    print("Original: No need to insert starting index")

# Optimized version
if len(away_indices_opt) > 0 and away_indices_opt[0] > 0:
    print(f"Optimized: Inserting starting index 0")
    away_indices_opt = np.insert(away_indices_opt, 0, 0)
else:
    print("Optimized: No need to insert starting index")

print(
    f"Original: Away indices after insertion: {away_indices_orig[:5] if len(away_indices_orig) >= 5 else away_indices_orig}"
)
print(
    f"Optimized: Away indices after insertion: {away_indices_opt[:5] if len(away_indices_opt) >= 5 else away_indices_opt}"
)

# Count after insertion
if len(away_indices_orig) != len(away_indices_opt):
    print(
        f"❌ DIFFERENCE DETECTED: Number of 'away' indices after insertion doesn't match ({len(away_indices_orig)} vs {len(away_indices_opt)})"
    )
else:
    print("✓ Number of 'away' indices after insertion matches")

===== STEP-BY-STEP COMPARISON =====

Input dataset shape: (79964, 8)
First few rows of input data:
                            Time  focalID  AntID      disp  Space_focal  \
86025 2022-05-02 16:07:06.945717    106.0      1  100000.0          2.0   
86117 2022-05-02 16:07:07.145721    106.0      1  100000.0          2.0   
86188 2022-05-02 16:07:07.345724    106.0      1  100000.0          2.0   
86237 2022-05-02 16:07:07.545728    106.0      1  100000.0          2.0   
86322 2022-05-02 16:07:07.745732    106.0      1  100000.0          2.0   

       Space_ant  exp_day enc_dummy  
86025          1        1       0.0  
86117          1        1       0.0  
86188          1        1       0.0  
86237          1        1       0.0  
86322          1        1       0.0  

STEP 1: Check for minimum dataset size
Dataset has 79964 rows, proceeding with analysis

STEP 2: Interpolate missing displacement values
Original: Using pandas interpolate with forward fill
Original: NaN values before int

In [None]:
# Step 7: Create pairs of consecutive away indices
print("\nSTEP 7: Create pairs of consecutive away indices")

# Original version
away_indices_pair_orig = list(zip(away_indices_orig, away_indices_orig[1:]))
print(f"Original: Number of away index pairs: {len(away_indices_pair_orig)}")
if len(away_indices_pair_orig) > 0:
    print(f"Original: First few pairs: {away_indices_pair_orig[:3]}")

# Optimized version
away_indices_pair_opt = list(zip(away_indices_opt, away_indices_opt[1:]))
print(f"Optimized: Number of away index pairs: {len(away_indices_pair_opt)}")
if len(away_indices_pair_opt) > 0:
    print(f"Optimized: First few pairs: {away_indices_pair_opt[:3]}")

if len(away_indices_pair_orig) != len(away_indices_pair_opt):
    print(
        f"❌ DIFFERENCE DETECTED: Number of away index pairs doesn't match ({len(away_indices_pair_orig)} vs {len(away_indices_pair_opt)})"
    )
else:
    print("✓ Number of away index pairs matches")

# Step 8: Extract sequences between away points
print("\nSTEP 8: Extract sequences between away points")

# Original version
seq_bw_away = [
    original_data.loc[x + 1 : y - 1, "enc_dummy"] for x, y in away_indices_pair_orig
]
print(f"Original: Extracted {len(seq_bw_away)} sequences between 'away' points")

seq_bw_away_sub = [x for x in seq_bw_away if x.size > 1]
print(f"Original: After removing sequences with size ≤ 1: {len(seq_bw_away_sub)}")

enc_seq_orig = [x for x in seq_bw_away_sub if np.in1d(1, x)]
print(f"Original: After keeping only sequences with encounters: {len(enc_seq_orig)}")

# Print some details about these sequences
if len(enc_seq_orig) > 0:
    print("Original: First sequence details:")
    for i, seq in enumerate(enc_seq_orig[:1]):  # Just show the first one
        print(f"  Sequence {i + 1}: Length={len(seq)}, Values={seq.values}")
        print(f"  Index values: {seq.index.values}")
        print(f"  Contains {sum(seq == 1)} values of 1 (encounters)")

# Optimized version
enc_seq_opt = []
segment_details = []

for i, (start_idx, end_idx) in enumerate(away_indices_pair_opt):
    if end_idx - start_idx <= 1:  # Skip if too small
        continue

    segment = enc_dummy[start_idx:end_idx]
    if 1.0 not in segment:  # Skip if no encounters
        continue

    enc_seq_opt.append(segment)
    segment_details.append((start_idx, end_idx))

print(f"Optimized: Found {len(enc_seq_opt)} sequences with encounters")

if len(enc_seq_opt) > 0:
    print("Optimized: First sequence details:")
    for i, seq in enumerate(enc_seq_opt[:1]):  # Just show the first one
        start_idx, end_idx = segment_details[i]
        print(
            f"  Sequence {i + 1}: Length={len(seq)}, Start={start_idx}, End={end_idx}"
        )
        print(f"  Values: {seq}")
        print(f"  Contains {sum(seq == 1.0)} values of 1 (encounters)")

if len(enc_seq_orig) != len(enc_seq_opt):
    print(
        f"❌ DIFFERENCE DETECTED: Number of encounter sequences doesn't match ({len(enc_seq_orig)} vs {len(enc_seq_opt)})"
    )
else:
    print("✓ Number of encounter sequences matches")



STEP 7: Create pairs of consecutive away indices
Original: Number of away index pairs: 75747
Original: First few pairs: [(86025, 86117), (86117, 86188), (86188, 86237)]
Optimized: Number of away index pairs: 75747
Optimized: First few pairs: [(0, 1), (1, 2), (2, 3)]
✓ Number of away index pairs matches

STEP 8: Extract sequences between away points
Original: Extracted 75747 sequences between 'away' points
Original: After removing sequences with size ≤ 1: 43
Original: After keeping only sequences with encounters: 7
Original: First sequence details:
  Sequence 1: Length=63, Values=[1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 1.  1.  1.  1.  1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
  Index values: [297324 297394 297461 297600 297738 297808 297877 297946 298012 298075
 298143 298212 298283 298351 298418 298484 298550 298616 298681 29874

In [None]:
# Step 9: Calculate encounter start times and end times
print("\nSTEP 9: Calculate encounter start and end times")

if len(enc_seq_orig) > 0:
    # Original version
    enc_start_times_orig = []
    for seq in enc_seq_orig:
        first_enc_idx = seq.index.values[np.where(seq == 1)[0][0]]
        start_time = original_data.loc[first_enc_idx, "Time"]
        enc_start_times_orig.append(start_time)

    enc_end_times_orig = [
        original_data.loc[seq.index.values[-1], "Time"] for seq in enc_seq_orig
    ]

    print(f"Original: First encounter start time: {enc_start_times_orig[0]}")
    print(f"Original: First encounter end time: {enc_end_times_orig[0]}")

    # Optimized version
    enc_start_times_opt = []
    enc_end_times_opt = []

    for i, segment in enumerate(enc_seq_opt):
        start_idx, end_idx = segment_details[i]
        first_enc_offset = np.where(segment == 1.0)[0][0]
        first_enc_idx = start_idx + first_enc_offset

        start_time = optimized_data.loc[index_values[first_enc_idx], "Time"]
        end_time = optimized_data.loc[index_values[end_idx - 1], "Time"]

        enc_start_times_opt.append(start_time)
        enc_end_times_opt.append(end_time)

    print(f"Optimized: First encounter start time: {enc_start_times_opt[0]}")
    print(f"Optimized: First encounter end time: {enc_end_times_opt[0]}")

    if enc_start_times_orig[0] != enc_start_times_opt[0]:
        print("❌ DIFFERENCE DETECTED: First encounter start time doesn't match")
    else:
        print("✓ First encounter start time matches")

    if enc_end_times_orig[0] != enc_end_times_opt[0]:
        print("❌ DIFFERENCE DETECTED: First encounter end time doesn't match")
    else:
        print("✓ First encounter end time matches")



STEP 9: Calculate encounter start and end times
Original: First encounter start time: 2022-05-02 16:17:17.602161
Original: First encounter end time: 2022-05-02 16:17:31.802316
Optimized: First encounter start time: 2022-05-02 16:17:17.602161
Optimized: First encounter end time: 2022-05-02 16:17:31.802316
✓ First encounter start time matches
✓ First encounter end time matches


In [None]:
# Step 10: Calculate encounter durations
print("\nSTEP 10: Calculate encounter durations")

if len(enc_seq_orig) > 0:
    # Original version
    enc_durations_orig = [
        (end - start).total_seconds()
        for start, end in zip(enc_start_times_orig, enc_end_times_orig)
    ]
    print(f"Original: First encounter duration: {enc_durations_orig[0]} seconds")

    # Optimized version
    enc_durations_opt = [
        (end - start).total_seconds()
        for start, end in zip(enc_start_times_opt, enc_end_times_opt)
    ]
    print(f"Optimized: First encounter duration: {enc_durations_opt[0]} seconds")

    if (
        abs(enc_durations_orig[0] - enc_durations_opt[0]) > 0.001
    ):  # Allow small floating point differences
        print(
            f"❌ DIFFERENCE DETECTED: First encounter duration doesn't match ({enc_durations_orig[0]} vs {enc_durations_opt[0]})"
        )
    else:
        print("✓ First encounter duration matches")


STEP 10: Calculate encounter durations
Original: First encounter duration: 14.200155 seconds
Optimized: First encounter duration: 14.200155 seconds
✓ First encounter duration matches


In [None]:
# Step 11: Calculate encounter sub-sequences and their durations (the critical step)
print("\nSTEP 11: Calculate encounter sub-sequences and durations (CRITICAL STEP)")

if len(enc_seq_orig) > 0:
    # Original version using time_within_encounter_threshold
    def time_within_encounter_threshold(ds, encounter_sequence):
        """Original function to calculate sub-sequences when an individual is within the encounter threshold"""
        # Add 0.5 to beginning and end for boundary detection
        change_indices = np.where(
            np.diff(np.concatenate(([0.5], encounter_sequence, [0.5]))) != 0
        )[0]

        # Get start indices (even-indexed change points)
        start_indices = [
            encounter_sequence.index.values[x] for x in change_indices[::2]
        ]

        # Get end indices (odd-indexed change points, minus 1 for inclusivity)
        end_indices = [
            encounter_sequence.index.values[x] for x in change_indices[1::2] - 1
        ]

        # Get times for start and end
        start_times = [ds.loc[x, "Time"] for x in start_indices]
        end_times = [ds.loc[x, "Time"] for x in end_indices]

        # Calculate durations
        enc_times = np.subtract(end_times, start_times)
        enc_times_sec = [x.total_seconds() for x in enc_times]

        # Total count and duration
        total_enc = len(enc_times_sec)
        total_enc_times = np.sum(enc_times_sec)

        return total_enc, total_enc_times, start_indices, end_indices, enc_times_sec

    # Apply to first sequence as example
    first_seq = enc_seq_orig[0]
    num_subseq_orig, total_duration_orig, starts_orig, ends_orig, durations_orig = (
        time_within_encounter_threshold(original_data, first_seq)
    )

    print(f"Original: First encounter has {num_subseq_orig} sub-sequences")
    print(f"Original: Total duration of sub-sequences: {total_duration_orig} seconds")
    print(f"Original: Sub-sequence start indices: {starts_orig}")
    print(f"Original: Sub-sequence end indices: {ends_orig}")
    print(f"Original: Individual sub-sequence durations: {durations_orig}")

    # Optimized version - let's extract just the first sequence to match
    first_segment = enc_seq_opt[0]
    start_idx, end_idx = segment_details[0]

    # This is the critical part that differs in the optimized version:
    # Find sub-sequences where ants are within encounter threshold (dummy=1)
    bordered_segment = np.concatenate(([0.5], first_segment, [0.5]))
    boundaries = np.diff(bordered_segment) != 0
    change_indices = np.where(boundaries)[0]

    print(f"Optimized: Found {len(change_indices)} change points in first segment")
    print(f"Optimized: Change indices: {change_indices}")

    # Check values at change points in bordered_segment
    print("Optimized: Values at change points:")
    for i, idx in enumerate(change_indices):
        if idx > 0 and idx < len(bordered_segment):
            before = bordered_segment[idx - 1]
            after = bordered_segment[idx]
            print(f"  Change {i + 1}: Index {idx}, Before={before}, After={after}")

    # Pair start and end indices of encounter sub-sequences
    subsequence_count = len(change_indices) // 2
    total_subsequence_duration = 0.0
    subseq_start_indices = []
    subseq_end_indices = []
    subseq_durations = []

    for j in range(subsequence_count):
        subseq_start_idx = (
            start_idx + change_indices[j * 2] - 1
        )  # -1 to account for concatenation
        subseq_end_idx = (
            start_idx + change_indices[j * 2 + 1] - 1 - 1
        )  # Additional -1 for inclusive end

        # Check if we're only processing subsequences with value 1
        value_at_start = (
            enc_dummy[subseq_start_idx] if subseq_start_idx < len(enc_dummy) else None
        )
        print(
            f"  Subsequence {j + 1}: Start idx={subseq_start_idx}, value={value_at_start}"
        )

        if subseq_end_idx >= subseq_start_idx:
            subseq_start_indices.append(subseq_start_idx)
            subseq_end_indices.append(subseq_end_idx)

            subseq_start_time = optimized_data.loc[
                index_values[subseq_start_idx], "Time"
            ]
            subseq_end_time = optimized_data.loc[index_values[subseq_end_idx], "Time"]
            subseq_duration = (subseq_end_time - subseq_start_time).total_seconds()

            subseq_durations.append(subseq_duration)
            total_subsequence_duration += subseq_duration

    print(f"Optimized: First encounter has {subsequence_count} sub-sequences")
    print(
        f"Optimized: Total duration of sub-sequences: {total_subsequence_duration} seconds"
    )
    print(f"Optimized: Sub-sequence start indices: {subseq_start_indices}")
    print(f"Optimized: Sub-sequence end indices: {subseq_end_indices}")
    print(f"Optimized: Individual sub-sequence durations: {subseq_durations}")

    if num_subseq_orig != subsequence_count:
        print(
            f"❌ DIFFERENCE DETECTED: Number of sub-sequences doesn't match ({num_subseq_orig} vs {subsequence_count})"
        )
    else:
        print("✓ Number of sub-sequences matches")

    if abs(total_duration_orig - total_subsequence_duration) > 0.001:
        print(
            f"❌ DIFFERENCE DETECTED: Total sub-sequence duration doesn't match ({total_duration_orig} vs {total_subsequence_duration})"
        )
    else:
        print("✓ Total sub-sequence duration matches")



STEP 11: Calculate encounter sub-sequences and durations (CRITICAL STEP)
Original: First encounter has 2 sub-sequences
Original: Total duration of sub-sequences: 1.600017 seconds
Original: Sub-sequence start indices: [297324, 300462]
Original: Sub-sequence end indices: [297461, 300854]
Original: Individual sub-sequence durations: [0.400004, 1.200013]
Optimized: Found 5 change points in first segment
Optimized: Change indices: [ 0  1  4 39 46]
Optimized: Values at change points:
  Change 2: Index 1, Before=0.5, After=0.0
  Change 3: Index 4, Before=1.0, After=1.0
  Change 4: Index 39, Before=0.5, After=0.5
  Change 5: Index 46, Before=1.0, After=1.0
  Subsequence 1: Start idx=2012, value=0.0
  Subsequence 2: Start idx=2016, value=1.0
Optimized: First encounter has 2 sub-sequences
Optimized: Total duration of sub-sequences: 8.600093 seconds
Optimized: Sub-sequence start indices: [2012, 2016]
Optimized: Sub-sequence end indices: [2012, 2050]
Optimized: Individual sub-sequence durations: 

In [34]:
# Deep investigation of the subsequence identification
print("\nDEEP INVESTIGATION OF SUBSEQUENCE IDENTIFICATION:")
print("Original: Detailed analysis of first encounter sequence")
enc_dummy_values = first_seq.values
print(f"  Sequence values: {enc_dummy_values}")
print(f"  Sequence indices: {first_seq.index.values}")



DEEP INVESTIGATION OF SUBSEQUENCE IDENTIFICATION:
Original: Detailed analysis of first encounter sequence
  Sequence values: [1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 1.  1.  1.  1.  1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
  Sequence indices: [297324 297394 297461 297600 297738 297808 297877 297946 298012 298075
 298143 298212 298283 298351 298418 298484 298550 298616 298681 298749
 298817 298888 298958 299025 299091 299160 299232 299299 299363 299428
 299491 299556 299623 300075 300138 300267 300330 300398 300462 300525
 300588 300654 300721 300787 300854 300919 300985 301052 301119 301185
 301252 301316 301382 301448 301512 301577 301640 301704 301771 301839
 301908 301977 302044]


In [None]:
# Deep investigation of the subsequence identification
print("\nDEEP INVESTIGATION OF SUBSEQUENCE IDENTIFICATION:")
print("Original: Detailed analysis of first encounter sequence")
enc_dummy_values = first_seq.values
print(f"  Sequence values: {enc_dummy_values}")
print(f"  Sequence indices: {first_seq.index.values}")

# Create a diagram to visualize the sequence
print("\nOriginal sequence visualization (1=encounter, 0.5=between thresholds):")
enc_str = ""
for val in enc_dummy_values:
    if val == 1.0:
        enc_str += "E"  # Encounter
    elif val == 0.5:
        enc_str += "B"  # Between thresholds
    else:
        enc_str += "A"  # Away
print(enc_str)

print("\nOptimized sequence visualization:")
opt_str = ""
for val in first_segment:
    if val == 1.0:
        opt_str += "E"  # Encounter
    elif val == 0.5:
        opt_str += "B"  # Between thresholds
    else:
        opt_str += "A"  # Away
print(opt_str)

# Check the behavior with the concatenation and diff
orig_concat = np.concatenate(([0.5], enc_dummy_values, [0.5]))
orig_diff = np.diff(orig_concat)
orig_changes = np.where(orig_diff != 0)[0]
print(f"Original: Changes after concatenation at indices: {orig_changes}")

opt_concat = np.concatenate(([0.5], first_segment, [0.5]))
opt_diff = np.diff(opt_concat)
opt_changes = np.where(opt_diff != 0)[0]
print(f"Optimized: Changes after concatenation at indices: {opt_changes}")

# Compare critical calculations
print("\nCOMPARISON OF CRITICAL SUBSEQUENCE CALCULATIONS:")
print("Original:")
for i in range(0, len(orig_changes), 2):
    if i + 1 < len(orig_changes):
        print(
            f"  Pair {i // 2 + 1}: Start={orig_changes[i]}, End={orig_changes[i + 1]}"
        )
        # What values are at these points?
        start_val = (
            orig_concat[orig_changes[i]]
            if orig_changes[i] < len(orig_concat)
            else "out of bounds"
        )
        end_val = (
            orig_concat[orig_changes[i + 1]]
            if orig_changes[i + 1] < len(orig_concat)
            else "out of bounds"
        )
        print(f"  Values: Start={start_val}, End={end_val}")

print("Optimized:")
for i in range(0, len(opt_changes), 2):
    if i + 1 < len(opt_changes):
        print(f"  Pair {i // 2 + 1}: Start={opt_changes[i]}, End={opt_changes[i + 1]}")
        # What values are at these points?
        start_val = (
            opt_concat[opt_changes[i]]
            if opt_changes[i] < len(opt_concat)
            else "out of bounds"
        )
        end_val = (
            opt_concat[opt_changes[i + 1]]
            if opt_changes[i + 1] < len(opt_concat)
            else "out of bounds"
        )
        print(f"  Values: Start={start_val}, End={end_val}")

# Final output for the first encounter
print("\nFINAL OUTPUT FOR FIRST ENCOUNTER:")
print(
    f"Original: {num_subseq_orig} subsequences with total duration {total_duration_orig} seconds"
)
print(
    f"Optimized: {subsequence_count} subsequences with total duration {total_subsequence_duration} seconds"
)


DEEP INVESTIGATION OF SUBSEQUENCE IDENTIFICATION:
Original: Detailed analysis of first encounter sequence
  Sequence values: [1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 1.  1.  1.  1.  1.  1.  1.  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
  Sequence indices: [297324 297394 297461 297600 297738 297808 297877 297946 298012 298075
 298143 298212 298283 298351 298418 298484 298550 298616 298681 298749
 298817 298888 298958 299025 299091 299160 299232 299299 299363 299428
 299491 299556 299623 300075 300138 300267 300330 300398 300462 300525
 300588 300654 300721 300787 300854 300919 300985 301052 301119 301185
 301252 301316 301382 301448 301512 301577 301640 301704 301771 301839
 301908 301977 302044]

Original sequence visualization (1=encounter, 0.5=between thresholds):
EEEBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBEEEEEEEBBBBBBBBBBBBBBBBBB

Optimized s