In [18]:
# %%

import pandas as pd
from rich.pretty import pprint
from sklearn.model_selection import train_test_split

In [19]:
# %%

# Read the shipsEar CSV into a DataFrame; every later cell works from `df`
df = pd.read_csv(
    filepath_or_buffer="/disk/scratch4/felix/sonar_datasets/shipsear/shipsEar.csv"
)

In [20]:
# %%

def duration_to_seconds(duration):
    """Convert a colon-separated duration string into a number of seconds.

    Accepts ``"MM:SS"`` (the original format) and, additionally, ``"H:MM:SS"``.

    Args:
        duration: Duration string whose fields are integers separated by ":".

    Returns:
        int: Total duration in seconds.

    Raises:
        ValueError: If the string does not have two or three fields, or if a
            field is not an integer.
    """
    parts = [int(field) for field in duration.split(":")]
    if len(parts) not in (2, 3):
        raise ValueError(
            f"Expected a duration like 'MM:SS' or 'H:MM:SS', got {duration!r}"
        )

    # Fold from the most significant field down: each step promotes the running
    # total by a factor of 60 (hours -> minutes -> seconds).
    total_seconds = 0
    for value in parts:
        total_seconds = total_seconds * 60 + value
    return total_seconds


# Numeric duration column, so recording lengths can be summed per ship later on
df["Duration_seconds"] = df["Duration"].apply(duration_to_seconds)
# One fishboat type label is not capitalised; lower-case every label so it is
# grouped with the other fishboats
df["Type"] = df["Type"].str.lower()

# Count the unique ships recorded for each type.
# Strangely, the paper never mentions the 'Ship' column, but it likely identifies
# unique ships, which is what the splitting further down is based on.
ship_counts = df.groupby("Type")["Ship"].nunique()

# Keep only types with more than two unique ships (i.e. at least three); the split
# further down is ship-wise into train/validation/test.
valid_types = ship_counts[ship_counts > 2].index

# Filter the DataFrame to include only the valid types. `.copy()` makes the result an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[df["Type"].isin(valid_types)].copy()

# Recompute the per-(type, ship) ID lists on the pruned data
result = df.groupby(["Type", "Ship"])["ID"].apply(list).to_dict()

# Convert the result to the desired format: {type: {ship_number: [id]}}
formatted_result = {}
for (type_, ship), ids in result.items():
    formatted_result.setdefault(type_, {})[ship] = ids

# Print the formatted result
pprint(formatted_result)
# Number of types remaining after pruning (fewer than the dataset's original 12 types)
print(f"Total types: {len(formatted_result)}")

Total types: 6


In [21]:
# %%

# Some of the original 12 types contain only 1 ship / 1 audio file, so a ship-wise
# split cannot be done on all of them; those types were pruned earlier.
# NOTE: no further regrouping is applied here - 'Class' is a straight copy of 'Type'.
# `assign` returns a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopyWarning.
df = df.assign(Class=df["Type"])

# Fail fast, before anything is grouped by 'Class', if any row has no class
unmapped_types = df[df["Class"].isna()]["Type"].unique()
if len(unmapped_types) > 0:
    raise ValueError(
        f"Mapping failed for the following types: {unmapped_types}. Please update the mapping dictionary."
    )

# Group the DataFrame by 'Class' and 'Ship'
result = df.groupby(["Class", "Ship"])["ID"].apply(list).to_dict()

# Convert the result to the desired format: {class: {ship_number: [id]}}
class_formatted_result = {}
for (class_, ship), ids in result.items():
    class_formatted_result.setdefault(class_, {})[ship] = ids

In [22]:
# %%

####
# For readability, label every ship with the first Name recorded under its Ship ID
ship_to_name = df.groupby("Ship")["Name"].apply(list).to_dict()

# Same nesting as class_formatted_result, but the inner keys are the ship's first
# Name instead of its Ship ID: {class: {first_name: [id]}}
name_formatted_result = {
    class_: {ship_to_name[ship][0]: ids for ship, ids in ship_data.items()}
    for class_, ship_data in class_formatted_result.items()
}

# Show the name-keyed version of the mapping
print("\nFormatted Result by Class (Vessel Size + Noise) with Ship Replaced by Name:")
pprint(name_formatted_result)

#


Formatted Result by Class (Vessel Size + Noise) with Ship Replaced by Name:


In [23]:
# %%

# Total recorded seconds per (Class, Ship), ordered from the shortest to the longest ship
ship_durations = (
    df.groupby(["Class", "Ship"])["Duration_seconds"]
    .sum()
    .reset_index()
    .sort_values(by="Duration_seconds", ascending=True)
)


# Split the Ships into train, validation, and test sets for each class
def split_ships_by_class_and_duration(
    ship_durations, test_size=0.2, val_size=0.2, random_state=42
):
    train_ships = {}
    val_ships = {}
    test_ships = {}

    for class_ in ship_durations["Class"].unique():
        class_data = ship_durations[ship_durations["Class"] == class_]

        # Calculate target durations for each split
        total_duration = class_data["Duration_seconds"].sum()
        test_duration = total_duration * test_size
        val_duration = total_duration * val_size

        # Initialize splits
        train_ships[class_] = []
        val_ships[class_] = []
        test_ships[class_] = []

        current_train_duration = 0
        current_val_duration = 0
        current_test_duration = 0

        for _, row in class_data.iterrows():
            ship = row["Ship"]
            duration = row["Duration_seconds"]
            if current_val_duration < val_duration:
                val_ships[class_].append(ship)
                current_val_duration += duration
            elif current_test_duration < test_duration:
                test_ships[class_].append(ship)
                current_test_duration += duration
            else:
                train_ships[class_].append(ship)
                current_train_duration += duration

    return train_ships, val_ships, test_ships


# Run the duration-based split, targeting 10% of each class's duration for
# validation and 10% for test
train_ships, val_ships, test_ships = split_ships_by_class_and_duration(
    ship_durations, val_size=0.1, test_size=0.1
)

# A row belongs to a split when its Ship is listed under its own Class in that split
train_mask = df.apply(lambda rec: rec["Ship"] in train_ships[rec["Class"]], axis=1)
val_mask = df.apply(lambda rec: rec["Ship"] in val_ships[rec["Class"]], axis=1)
test_mask = df.apply(lambda rec: rec["Ship"] in test_ships[rec["Class"]], axis=1)

train_df = df[train_mask]
val_df = df[val_mask]
test_df = df[test_mask]


# Print duration details for each split
def print_duration_details(df, split_name):
    """Print the total duration of a split, then one line per class.

    Only classes that actually occur in ``df`` are listed, in sorted order.

    Args:
        df: DataFrame with "Class", "ID" and "Duration_seconds" columns.
        split_name: Label printed in the header line (e.g. "Train").
    """
    overall_seconds = df["Duration_seconds"].sum()
    print(f"\n{split_name} Duration: {overall_seconds} seconds")

    for class_ in sorted(df["Class"].unique()):
        # Select this class's rows once and reuse them for both figures
        class_rows = df[df["Class"] == class_]
        class_duration = class_rows["Duration_seconds"].sum()
        class_ids = class_rows["ID"].tolist()
        print(f"  {class_}: {class_duration} seconds, Assigned IDs: {class_ids}")


# (report label, frame, output file) for each split
split_outputs = [
    ("Train", train_df, "pruned_train.csv"),
    ("Validation", val_df, "pruned_validation.csv"),
    ("Test", test_df, "pruned_test.csv"),
]

# Write every split to disk first ...
for _, split_frame, csv_name in split_outputs:
    split_frame.to_csv(csv_name, index=False)

# ... then print the per-class duration summary of each one
for split_label, split_frame, _ in split_outputs:
    print_duration_details(split_frame, split_label)


Train Duration: 6385 seconds
  fishboat: 230 seconds, Assigned IDs: [75, 76]
  motorboat: 630 seconds, Assigned IDs: [21, 26, 39, 27, 72, 51, 52, 70]
  natural ambient noise: 770 seconds, Assigned IDs: [81, 82, 83, 84, 86, 87, 88]
  ocean liner: 644 seconds, Assigned IDs: [25, 22, 24, 23, 69]
  passengers: 3025 seconds, Assigned IDs: [40, 60, 6, 61, 62, 7, 59, 17, 34, 63, 8, 35, 64, 10, 9, 67, 43, 13, 42, 14]
  roro: 1086 seconds, Assigned IDs: [18, 20, 19]

Validation Duration: 1567 seconds
  fishboat: 139 seconds, Assigned IDs: [73]
  motorboat: 193 seconds, Assigned IDs: [33, 77, 50]
  natural ambient noise: 186 seconds, Assigned IDs: [85, 91, 92]
  ocean liner: 135 seconds, Assigned IDs: [71]
  passengers: 726 seconds, Assigned IDs: [38, 32, 53, 55, 54]
  roro: 188 seconds, Assigned IDs: [78]

Test Duration: 1441 seconds
  fishboat: 145 seconds, Assigned IDs: [74]
  motorboat: 191 seconds, Assigned IDs: [45, 79]
  natural ambient noise: 184 seconds, Assigned IDs: [89, 90]
  ocean 

In [24]:
# %%

# Sanity check: no Ship may appear in more than one split
train_ship_ids = set(train_df["Ship"])
val_ship_ids = set(val_df["Ship"])
test_ship_ids = set(test_df["Ship"])

assert train_ship_ids.isdisjoint(val_ship_ids), "Ships overlap between Train and Validation!"
assert train_ship_ids.isdisjoint(test_ship_ids), "Ships overlap between Train and Test!"
assert val_ship_ids.isdisjoint(test_ship_ids), "Ships overlap between Validation and Test!"