Count all files in the consvmets data
- load their csv
- assign each folder to a dataset
- count audio files

In [1]:
import pandas as pd
import csv
import os
import soundfile as sf
import wave

In [90]:
# define master folder
master_folder = r'D:\CV4Eco\Data\Conservation Metrics data\cmi_soundfiles_20230719\src'

# folder to write results to
results_folder = r'C:\Users\ben07\OneDrive - University College London\Google\work\data_wrangling\csvs'

# Load the cmi_labels_20230713_215906.csv file
cmi_labels_df = pd.read_csv(r'C:\Users\ben07\OneDrive - University College London\Google\work\data_wrangling\csvs\cmi_labels_20230719.csv')

### Find all raw audio files

In [3]:
# list all dirs in master folder
def list_directories(master_folder):
    with os.scandir(master_folder) as entries:
        for entry in entries:
            if entry.is_dir():
                print(entry.name)

# Run
dataset_list = list_directories(master_folder)

Atlantic
CMI_PalmyraReef_2023_R1_FLAC
CMI_Palmyra_HydroMoth_2021_R1
CRIMP_Kaneohe_2022_R1
CRIMP_Kaneohe_2023_R1_FLAC
HIMB_Halos_2022_R1
HIMB_Halos_2022_R2
HIMB_Halos_2022_R3
HIMB_Halos_2022_R4
HIMB_Halos_2022_R5
HIMB_Halos_2023_R1
HIMB_Halos_2023_R2
HIMB_Halos_2023_R3_FLAC
HIMB_Halos_2023_R4_FLAC
mozambique
NOAA_SanctSound_FLKeys
Rice_Pacific
SOS_STX_2022_R6_FLAC
SOS_STX_2022_R7_FLAC
SOS_STX_ButlerBay
SOS_Thailand_2023_R1_FLAC
SOS_ZAN_ChumbeIsland


Make a lengthy dictionary storing each dataset and current known metadata (None where unknown)

In [40]:
datasets = {
        "palmyra_hydromoth": {"folders": ["CMI_Palmyra_HydroMoth_2021_R1"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": "Hydromoth"},

        "palmyra_other": {"folders": ["CMI_PalmyraReef_2023_R1_FLAC"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "hawaii_kaneohe": {"folders": ["CRIMP_Kaneohe_2022_R1", 
                                       "CRIMP_Kaneohe_2023_R1_FLAC"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "hawaii_himb": {"folders": ["HIMB_Halos_2022_R1",
                                    "HIMB_Halos_2022_R2",
                                    "HIMB_Halos_2022_R3",
                                    "HIMB_Halos_2022_R4",
                                    "HIMB_Halos_2022_R5",
                                    "HIMB_Halos_2023_R1",
                                    "HIMB_Halos_2023_R2",
                                    "HIMB_Halos_2023_R3_FLAC",
                                    "HIMB_Halos_2023_R4_FLAC"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "mozambique": {"folders": ["mozambique"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "florida_noaa": {"folders": ["NOAA_SanctSound_FLKeys"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "florida_noaa": {"folders": ["NOAA_SanctSound_FLKeys"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "philippines_rice": {"folders": ["Rice_Pacific"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "usvi": {"folders": ["SOS_STX_2022_R6_FLAC",
                             "SOS_STX_2022_R7_FLAC",
                             "SOS_STX_ButlerBay"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "thailand": {"folders": ["SOS_Thailand_2023_R1_FLAC"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},


        "zanzibar": {"folders": ["SOS_ZAN_ChumbeIsland"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "belize": {"folders": ["Atlantic/Belize"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "bermuda": {"folders": ["Atlantic/Bermuda"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None},

        "mexico": {"folders": ["Atlantic/Cozumel"],
                        "data_owner": None, 
                        "data_provider": "Conservation metrics",
                        "site": None,
                        "hydrophone": None}
                }

#### Create csv of all raw audio files in all datasets

In [41]:
# function to get info from eahc individual audio file
def get_audio_info(filepath):
    file_format, sample_rate, sample_length = None, None, None
    if filepath.lower().endswith('.flac'):
        file_format = 'flac'
        with sf.SoundFile(filepath) as f:
            sample_rate = f.samplerate
            sample_length = len(f) / f.samplerate
    elif filepath.lower().endswith('.wav'):
        file_format = 'wav'
        with wave.open(filepath, 'rb') as f:
            sample_rate = f.getframerate()
            sample_length = f.getnframes() / f.getframerate()
    return file_format, sample_rate, sample_length

# function to run through each folder and find each audio file, then store its info
def collect_audio_files(folder_path, dataset_name, extra_info, file_list):
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(('.flac', '.wav')):
                full_path = os.path.join(root, file)
                rel_path = os.path.relpath(root, start = master_folder)
                file_format, sample_rate, sample_length = get_audio_info(full_path)
                full_rel_path = os.path.join(rel_path, file)
                file_info = {
                    "filepath": full_rel_path,
                    "dataset": dataset_name,
                    "file_format": file_format,
                    "sample_rate": sample_rate,
                    "sample_length": sample_length
                }
                # add the extra keys in the dict to to the file list
                file_info.update(extra_info) 
                file_list.append(file_info)

# Make file list
file_list = []

# loop to start running through each dataset and its folders
for dataset, attributes in datasets.items():
    folders = attributes.get("folders", [])
    extra_info = {k: v for k, v in attributes.items() if k != "folders"}  # Exclude 'folders' from extra_info
    for folder in folders:
        folder_path = os.path.join(master_folder, folder)
        collect_audio_files(folder_path, dataset, extra_info, file_list)

# Create DataFrame from file_list
df = pd.DataFrame(file_list)

# Count the number of audio files per dataset
dataset_counts = df.groupby('dataset').size()

# Print dataset counts
print("Number of audio files per dataset:")
print(dataset_counts)

# Save DataFrame to CSV
df.to_csv(results_folder + "/consvmets_data.csv", index=False)

# Print total count
print(f"Total audio files: {len(df)}")


Number of audio files per dataset:
dataset
belize                 315
bermuda               3766
florida_noaa         29959
hawaii_himb          24416
hawaii_kaneohe        1446
mexico                1086
mozambique            3909
palmyra_hydromoth     2414
palmyra_other        22289
philippines_rice       857
thailand               116
usvi                  4482
zanzibar              2869
dtype: int64
Total audio files: 97924


### Now find all annotated files

Conservation metrics original cmi csv is only of annotations, we:
- Find every entry in the cmi csv url column that is in the new csv we made above. This must account for back/forward slashes and the 'src/' prefix.
- Make a new df that is a copy of the csv made above, with an added annotation_count column. This will include how many annotations given to each file (most will be 0)

for every entry in the filepath column of new_consvmets_df, check the last 3 characters of the entry []. if these are 'wav' replace these last 3 characters with 'flac'.

code to do this please

In [91]:
# Some cmi file entries in the csv end .wav, even though the real files are .flac, so fix
cmi_labels_df_new = cmi_labels_df.copy()

# Check if the last 3 characters are 'wav' and replace them with 'flac'
cmi_labels_df_new['url'] = cmi_labels_df_new['url'].apply(
    lambda x: x[:-3] + 'flac' if x[-3:].lower() == 'wav' else x)


In [92]:
# Create a new dataframe that is a copy of consvmets_df and add an 'annotation_count' column initialized to zero.
new_consvmets_df = df.copy()

# Create a new dataframe that is a copy of consvmets_df and add an 'annotation_count' column initialized to zero.
new_consvmets_df['annotation_count'] = 0

# Replace backslashes with forward slashes in the 'filepath' column
new_consvmets_df['filepath'] = new_consvmets_df['filepath'].str.replace('\\', '/')

# Add the prefix 'src/' to the entries in the 'filepath' column
new_consvmets_df['filepath'] = 'src/' + new_consvmets_df['filepath']

# Count the occurrences of each unique 'url' in cmi_labels_df
url_count_series = cmi_labels_df_new['url'].value_counts()

# Map the counts to the 'filepath' in new_consvmets_df
new_consvmets_df['annotation_count'] = new_consvmets_df['filepath'].map(url_count_series).fillna(0).astype(int)

# Save the new dataframe if needed
new_consvmets_df.to_csv(results_folder + '/consvmets_data_annotation_count.csv', index=False)

# Display or use the new dataframe
new_consvmets_df


Unnamed: 0,filepath,dataset,file_format,sample_rate,sample_length,data_owner,data_provider,site,hydrophone,annotation_count
0,src/CMI_Palmyra_HydroMoth_2021_R1/242A26046037...,palmyra_hydromoth,flac,96000,10.000000,,Conservation metrics,,Hydromoth,0
1,src/CMI_Palmyra_HydroMoth_2021_R1/242A26046037...,palmyra_hydromoth,flac,96000,10.000000,,Conservation metrics,,Hydromoth,0
2,src/CMI_Palmyra_HydroMoth_2021_R1/242A26046037...,palmyra_hydromoth,flac,96000,10.000000,,Conservation metrics,,Hydromoth,0
3,src/CMI_Palmyra_HydroMoth_2021_R1/242A26046037...,palmyra_hydromoth,flac,96000,10.000000,,Conservation metrics,,Hydromoth,0
4,src/CMI_Palmyra_HydroMoth_2021_R1/242A26046037...,palmyra_hydromoth,flac,96000,10.000000,,Conservation metrics,,Hydromoth,0
...,...,...,...,...,...,...,...,...,...,...
97919,src/Atlantic/Cozumel/Cozumel_wavs/AMAR394_1_03...,mexico,flac,32000,10.000000,,Conservation metrics,,,0
97920,src/Atlantic/Cozumel/Cozumel_wavs/AMAR394_1_03...,mexico,flac,32000,10.000000,,Conservation metrics,,,1
97921,src/Atlantic/Cozumel/Cozumel_wavs/AMAR394_1_03...,mexico,flac,32000,10.000000,,Conservation metrics,,,0
97922,src/Atlantic/Cozumel/Cozumel_wavs/AMAR394_1_03...,mexico,flac,32000,10.000000,,Conservation metrics,,,1


# to do here!
-  create a csv that summarises each dataset (including capturing any difference in length)
    - and num classes (will need other csv for that)
    - track num clips with an annotation (can we assume at least that whole clip has been inspected?)
- send this to jill and ask if they can fill any metadata gaps.

here is my new csv.

I want to groupby the dataset column. for each group I want to know:
- how many entries there are in the dataset (call this file_count)
- how many of these have an annotation_count that isnt 0
- the mean annotation_count for each file
- the number of different file_format entries, list these
- the number of different sample_rate entries, list these
- the number of entries which are not of sample_length 10
- the data_provider entry (will be the same for each whole dataset)
- the data_owner entry (will be the same for each whole dataset)
- the hydrophone entry (will be the same for each whole dataset)
- the number of different entries for sites (put None if None)

give me the code to this on my machine

In [93]:
# Define an aggregation dictionary to perform multiple operations at once
aggregation_dict = {
    'filepath': ['count'],
    'annotation_count': [lambda x: (x != 0).sum()],
    'sample_length': [lambda x: (x != 10).sum()],
    'file_format': ['nunique', lambda x: list(x.unique())],
    'sample_rate': ['nunique', lambda x: list(x.unique())],
    'data_provider': ['first'],
    'data_owner': ['first'],
    'hydrophone': ['first'],
    'site': ['nunique', lambda x: None if x.isnull().all() else list(x.unique())]
}

# Group by the 'dataset' column and aggregate
grouped_df = new_consvmets_df.groupby('dataset').agg(aggregation_dict).reset_index()

# Rename columns for better readability
grouped_df.columns = [
    'dataset', 'file_count', 'files_with_annotation',
    'num_entries_not_sample_length_10', 'num_unique_file_formats', 
    'unique_file_formats',
    'num_unique_sample_rates', 'unique_sample_rates',
    'data_provider', 'data_owner', 'hydrophone', 'num_unique_sites', 'unique_sites'
]

# Display or use the grouped dataframe
grouped_df


Unnamed: 0,dataset,file_count,files_with_annotation,num_entries_not_sample_length_10,num_unique_file_formats,unique_file_formats,num_unique_sample_rates,unique_sample_rates,data_provider,data_owner,hydrophone,num_unique_sites,unique_sites
0,belize,315,294,144,1,[flac],1,[48000],Conservation metrics,,,0,
1,bermuda,3766,2184,218,1,[flac],1,[24000],Conservation metrics,,,0,
2,florida_noaa,29959,15389,18,1,[flac],1,[48000],Conservation metrics,,,0,
3,hawaii_himb,24416,11675,89,1,[flac],2,"[24000, 48000]",Conservation metrics,,,0,
4,hawaii_kaneohe,1446,785,13,1,[flac],2,"[24000, 48000]",Conservation metrics,,,0,
5,mexico,1086,683,9,1,[flac],1,[32000],Conservation metrics,,,0,
6,mozambique,3909,2456,172,1,[flac],1,[96000],Conservation metrics,,,0,
7,palmyra_hydromoth,2414,0,2,1,[flac],1,[96000],Conservation metrics,,Hydromoth,0,
8,palmyra_other,22289,10530,501,1,[flac],1,[48000],Conservation metrics,,,0,
9,philippines_rice,857,492,3,1,[flac],1,[96000],Conservation metrics,,,0,
