In [None]:
import glob
import os
import pandas as pd

In this script we will:  
1. Attribute a single behaviour to an acceleration sample based on majority rule  
2. Identify outlier samples in resting based on VeDBA values and remove these from the training data  
3. Create the correct parquet files for use in the [tsai](https://timeseriesai.github.io/tsai/data.preparation.html) package  

The behavioral attribution is based on annotations in the [./data/raw/annotations/](./data/raw/annotations) folder.  

## Obtain burst attribution based on majority rule

In [None]:
# Read the four per-burst annotation CSVs. Each frame is tagged at read time
# with the burst dataset it belongs to, so rows remain distinguishable once
# the frames are stacked.
annotations_burst1 = pd.read_csv(
    "../data/raw/annotations/annotations_burst1.csv"
).assign(burst="burst_1")
annotations_burst2 = pd.read_csv(
    "../data/raw/annotations/annotations_burst2.csv"
).assign(burst="burst_2")
annotations_burst3 = pd.read_csv(
    "../data/raw/annotations/annotations_burst3.csv"
).assign(burst="burst_3")
annotations_burst4 = pd.read_csv(
    "../data/raw/annotations/annotations_burst4.csv"
).assign(burst="burst_4")

# Stack the tagged frames row-wise (original row labels of each file are kept)
annotations_combined = pd.concat(
    [
        annotations_burst1,
        annotations_burst2,
        annotations_burst3,
        annotations_burst4,
    ]
)

In [None]:
# Obtain unique value counts for the behaviours: one count of annotation rows
# per distinct Final_behaviour label. As the last expression of the cell, the
# result is displayed as the cell output.
annotations_combined["Final_behaviour"].value_counts()
# List of 20 behaviours (author's note on the displayed result)

In [None]:
# Behaviours whose samples are excluded from the analysis
beh_remove = [
    "Out of sight",
    "Other",
    "Mating",
    "Vigilant",
    "Male inspection",
    "Social",
    "Aggressive display",
    "Jumping",
]
# All excluded behaviours are pooled into a single "Remove" label, which makes
# the data we keep conservative. For example, a burst with 10 Resting,
# 5 Eating, 7 Out of sight and 4 Vigilant samples ends up as "Remove", because
# the pooled Remove count (7 + 4 = 11) is the majority and exceeds Resting (10).
# This should help reduce noise in the data.

# Build the merged label column in one pass from the raw Final_behaviour
# labels; each step relabels only the rows matching its condition.
annotations_combined["attribution_merged"] = (
    annotations_combined["Final_behaviour"]
    # Excluded behaviours -> "Remove"
    .mask(lambda labels: labels.isin(beh_remove), "Remove")
    # Foraging ground and Foraging canopy -> "Eating"
    .mask(
        lambda labels: labels.isin(["Foraging ground", "Foraging canopy"]),
        "Eating",
    )
    # Aggression -> "Running" (rows already labelled "Running" are unchanged)
    .mask(lambda labels: labels == "Aggression", "Running")
    # Canopy movement -> "Walking"
    .mask(lambda labels: labels == "Canopy movement", "Walking")
)

# Value counts of the attribution_merged column
annotations_combined["attribution_merged"].value_counts()

In [None]:
def majority_label(labels):
    """Return the label that ``value_counts`` lists first for ``labels``.

    ``value_counts`` orders labels by descending frequency, so the first index
    entry is the most frequent label.

    NOTE(review): ties are resolved by whichever label ``value_counts`` happens
    to list first, and a group with no countable labels would fail on the
    ``[0]`` lookup -- confirm both are acceptable for this dataset.
    """
    return labels.value_counts().index[0]


# One row per (burst, Ind_ID, new_burst) holding its majority behaviour
annotations_majority = (
    annotations_combined
    .groupby(["burst", "Ind_ID", "new_burst"])["attribution_merged"]
    .agg(majority_label)
    .reset_index()
)

# Print the distribution of majority labels separately for each burst dataset
print("\nValue counts per burst:")
for burst in annotations_majority["burst"].unique():
    in_burst = annotations_majority["burst"] == burst
    print(f"\n{burst}:")
    print(annotations_majority.loc[in_burst, "attribution_merged"].value_counts())

In [None]:
# Proportion of bursts which are being removed from the dataset, per burst
# dataset. The boolean "is this burst labelled Remove?" flag is averaged within
# each burst group; the mean of a boolean column is the fraction of True rows.
# This avoids DataFrameGroupBy.apply on a frame that still contains the
# grouping column, which newer pandas versions warn about, and stays vectorised.
(
    annotations_majority["attribution_merged"]
    .eq("Remove")
    .groupby(annotations_majority["burst"])
    .mean()
    .rename("proportion_removed")
)

# Around 16% of the data is being removed from each burst dataset
# (author's note on the result of the original run)

In [None]:
# Save the annotations_majority dataframe.
# The file goes to data/temp rather than data/raw: it is a derived product, and
# the "Modify burst attribution based on outlier" section reads it back from
# ../data/temp/annotations/attributions_merged_majority.csv. Writing it to
# data/raw left that later read pointing at a file this notebook never created.
majority_csv_path = "../data/temp/annotations/attributions_merged_majority.csv"
# Make sure the target folder exists so a fresh checkout can run top to bottom
os.makedirs(os.path.dirname(majority_csv_path), exist_ok=True)
annotations_majority.to_csv(majority_csv_path, index=False)

## Shortlist bursts with VeDBA > 0.1

We use a threshold of 0.1 to remove outlier Resting bursts based on visual inspection of the VeDBA values across all bursts. See [R file](02_vedba_outliers.R).

In [None]:
# Load features parquet files and subset bursts with vedba values greater than 0.1
# Load a sample file to check
pq_file_path = (
    "../data/raw/features/Burst_1/annotated_features_burst_1_uncorrected.parquet"
)
feat = pd.read_parquet(
    pq_file_path,
    engine="pyarrow",
    # Only request rows whose mean_vedba is above the 0.1 outlier threshold
    filters=[("mean_vedba", ">", 0.1)],
    # Select only the columns we need (the identifiers of the matching bursts)
    columns=["Ind_ID", "new_burst"],
)

In [None]:
# Run over all features parquet files
features_dir = "../data/raw/features/"
# Get all parquet files even in sub directories
# ("**" together with recursive=True makes glob descend into every sub-folder)
pq_files = glob.glob(os.path.join(features_dir, "**", "*.parquet"), recursive=True)

In [None]:
# For every features file, collect the bursts whose mean_vedba is greater than 0.1
all_shortlist = []
for pq_file_path in pq_files:
    # Read only the identifiers of the rows above the threshold
    feat = pd.read_parquet(
        pq_file_path,
        engine="pyarrow",
        filters=[("mean_vedba", ">", 0.1)],
        columns=["Ind_ID", "new_burst"],
    )

    # A file with no burst above the threshold contributes nothing
    if feat.empty:
        continue

    # Metadata is encoded in the file name, e.g.
    # annotated_features_burst_1_uncorrected.parquet:
    # the 4th "_"-separated token is the burst number and the last token
    # (minus the extension) is the correction type
    file_name = os.path.basename(pq_file_path)
    name_parts = file_name.split("_")
    burst = name_parts[3]
    correction_type = name_parts[-1].split(".")[0]

    # Attach the metadata and put the columns in their final order
    feat = feat.assign(
        burst="burst_" + burst,
        correction_type=correction_type,
    )[["burst", "correction_type", "Ind_ID", "new_burst"]]

    all_shortlist.append(feat)

In [None]:
# Combine into one df
# ignore_index=True discards the per-file row labels and renumbers the rows
# NOTE(review): all_shortlist stays empty when no file has a burst above the
# threshold, and pd.concat cannot combine an empty list -- confirm that failing
# here is the intended behaviour in that case
combined_shortlist = pd.concat(all_shortlist, ignore_index=True)

In [None]:
# Output the shortlist of bursts whose mean_vedba exceeds 0.1.
# The target folder is created first: to_csv does not create missing
# directories, so a fresh checkout without data/temp/annotations would fail here.
shortlist_csv_path = "../data/temp/annotations/shortlist_bursts_vedba_01.csv"
os.makedirs(os.path.dirname(shortlist_csv_path), exist_ok=True)
combined_shortlist.to_csv(shortlist_csv_path, index=False)

## Modify burst attribution based on outlier
We need to identify Resting bursts with a mean VeDBA greater than 0.1 and relabel them as `Remove`.  

In [None]:
# Load attributions data
# Expects attributions_merged_majority.csv to exist under
# ../data/temp/annotations/ -- verify that the step which saves the majority
# attributions writes to this same location.
# Note: this rebinds annotations_combined, which earlier in the notebook held
# the concatenated raw annotations.
annotations_combined = pd.read_csv(
    "../data/temp/annotations/attributions_merged_majority.csv"
)

In [None]:
# Load outlier information: the shortlist of bursts with mean_vedba > 0.1 that
# was written out earlier in this notebook
# (columns: burst, correction_type, Ind_ID, new_burst)
outlier_info = pd.read_csv("../data/temp/annotations/shortlist_bursts_vedba_01.csv")

In [None]:
# Attach attribution information to outlier dataset.
# The join key includes "burst": bursts are identified by
# (burst, Ind_ID, new_burst) everywhere else in this notebook (the majority
# group-by and the later merge back onto the attributions). Joining on
# Ind_ID/new_burst alone would pair an outlier row with the attributions of
# every burst dataset sharing those two IDs, duplicating rows and attaching
# labels from the wrong dataset.
outlier_info = outlier_info.merge(
    annotations_combined[["burst", "Ind_ID", "new_burst", "attribution_merged"]],
    on=["burst", "Ind_ID", "new_burst"],
    how="left",
)

In [None]:
# Keep only shortlist rows derived from the uncorrected files, as these are
# the raw data
is_uncorrected = outlier_info["correction_type"] == "uncorrected"
# Reduce to the burst identifiers and keep each burst once
outlier_info = outlier_info.loc[
    is_uncorrected, ["burst", "Ind_ID", "new_burst"]
].drop_duplicates()
# 366 bursts to be removed (author's note from the original run)

In [None]:
# Left-join the outlier shortlist onto the attributions. The indicator column
# "_merge" is "both" for bursts that appear in the shortlist.
annotations_outliers = annotations_combined.merge(
    outlier_info,
    how="left",
    on=["burst", "Ind_ID", "new_burst"],
    indicator=True,
)

# Relabel as "Remove" only those shortlisted bursts whose majority behaviour
# is "Resting"; every other row keeps its label
is_resting_outlier = (annotations_outliers["_merge"] == "both") & (
    annotations_outliers["attribution_merged"] == "Resting"
)
annotations_outliers.loc[is_resting_outlier, "attribution_merged"] = "Remove"

# The merge indicator has served its purpose
annotations_outliers = annotations_outliers.drop(columns="_merge")

# Value counts
annotations_outliers["attribution_merged"].value_counts()

## Obtain Active/Inactive attribution
We will use Resting, Sleeping and Grooming receiver as Inactive behaviours. Bursts labelled `Remove` stay `Remove`, and every other behaviour is treated as Active.

In [None]:
# Behaviours that count as Inactive
inactive_beh = ["Resting", "Sleeping", "Grooming receiver"]

# Derive the activity label: start from "Active" everywhere, then overwrite
# the inactive behaviours, and finally make sure "Remove" rows stay "Remove"
merged_labels = annotations_outliers["attribution_merged"]
activity = pd.Series("Active", index=merged_labels.index)
activity.loc[merged_labels.isin(inactive_beh)] = "Inactive"
activity.loc[merged_labels == "Remove"] = "Remove"
annotations_outliers["activity"] = activity

annotations_outliers["activity"].value_counts()

In [None]:
# Build an id of the form "<Ind_ID>_<new_burst>" from the two identifier
# columns, both rendered as strings
annotations_outliers["id"] = (
    annotations_outliers["Ind_ID"]
    .astype(str)
    .str.cat(annotations_outliers["new_burst"].astype(str), sep="_")
)

In [None]:
# Output the final dataframe: the burst-level attributions after the Resting
# outliers were relabelled, including the activity and id columns added above.
# index=False keeps the row index out of the CSV.
annotations_outliers.to_csv(
    "../data/temp/annotations/attributions_merged_majority_outliers.csv",
    index=False,
)

## Modify accelerometer data for tsai

In [None]:
def acc_long_to_tsai_wide(acc):
    """Reshape long-format accelerometer samples into one row per burst and axis.

    Parameters
    ----------
    acc : pandas.DataFrame
        Must contain the columns ``Ind_ID``, ``burst_id``, ``new_burst``,
        ``burst_start_time``, ``sample_number``, ``X``, ``Y`` and ``Z``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ind_ID``, ``new_burst``, ``feature`` (one of "X", "Y", "Z")
        followed by one column per ``sample_number`` (column names are the
        sample numbers as strings).
    """
    # Melt the X, Y, Z columns into a single (feature, value) pair per reading
    acc_melted = pd.melt(
        acc,
        id_vars=[
            "Ind_ID",
            "burst_id",
            "new_burst",
            "burst_start_time",
            "sample_number",
        ],
        value_vars=["X", "Y", "Z"],
        var_name="feature",
        value_name="value",
    )
    # Spread sample_number across the columns. pivot_table aggregates with its
    # default (mean), so repeated readings for the same burst/feature/sample
    # would be averaged rather than rejected.
    acc_wide = acc_melted.pivot_table(
        index=[
            "Ind_ID",
            "burst_id",
            "new_burst",
            "burst_start_time",
            "feature",
        ],
        columns="sample_number",
        values="value",
    )
    # Use plain string column names for the sample numbers
    acc_wide.columns = [str(col) for col in acc_wide.columns]
    # burst_id and burst_start_time were only needed to keep bursts apart
    # during the pivot; they are not part of the output
    return acc_wide.reset_index().drop(columns=["burst_id", "burst_start_time"])


def tsai_output_path(pq_file_name, output_dir):
    """Return the output path for the converted version of ``pq_file_name``.

    The burst number is taken from the 4th "_"-separated token of the file name
    (e.g. ``annotated_acc_burst_1_rotbasal.parquet`` -> ``1``), the file is
    placed in ``<output_dir>/Burst_<n>/`` and ``_acc_`` in its name is replaced
    with ``_acc_tsai_``.
    """
    file_name = os.path.basename(pq_file_name)
    burst = file_name.split("_")[3]
    burst_output_dir = f"{output_dir}/Burst_{burst}/"
    return os.path.join(burst_output_dir, file_name.replace("_acc_", "_acc_tsai_"))


# Get all parquet files even in sub directories
acc_dir = "../data/raw/acc/"
acc_files = glob.glob(os.path.join(acc_dir, "**", "*.parquet"), recursive=True)

# Output directory
output_dir = "../data/temp/acc_tsai"

for pq_file_name in acc_files:
    # Load, reshape, and write each file to its per-burst output folder
    acc = pd.read_parquet(
        pq_file_name,
        engine="pyarrow",
    )
    acc_wide_alt = acc_long_to_tsai_wide(acc)
    output_file_name = tsai_output_path(pq_file_name, output_dir)
    # Create the output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
    acc_wide_alt.to_parquet(
        output_file_name,
        index=False,
        engine="pyarrow",
    )
    print(f"Saved {output_file_name}")

In [None]:
# Check output file: read one of the converted files back from disk
# NOTE(review): the frame is loaded but not displayed or asserted on in the
# visible part of this cell -- consider inspecting it (e.g. its head/shape)
output_file_name = (
    "../data/temp/acc_tsai/Burst_1/annotated_acc_tsai_burst_1_rotbasal.parquet"
)
acc_check = pd.read_parquet(
    output_file_name,
    engine="pyarrow",
)