In [21]:
import json
import pandas as pd
import os
from datasets import Dataset, DatasetDict

### Create proper metadata files in CSV format

In [None]:
def append_entry(entries, category, audio_dir_name, is_missing):
    """Build one metadata record from ``category`` and append it to ``entries``.

    The record carries the audio directory name, the category id/name, the
    taxonomy fields copied verbatim from ``category`` and a ``file_path``.
    ``file_path`` is ``None`` when ``is_missing`` is true; otherwise it is
    ``audio_dir_name`` joined with ``category['file_name']``.
    """
    # Fields copied from the category under the same key, in output order.
    copied_fields = (
        'common_name',
        'supercategory',
        'kingdom',
        'phylum',
        'class',
        'order',
        'family',
        'genus',
        'specific_epithet',
        'evaluation',
    )

    record = {
        'audio_dir_name': audio_dir_name,
        'category_id': category['id'],
        'category_name': category['name'],
    }
    for field in copied_fields:
        record[field] = category[field]

    # Missing entries have no audio file, hence no path.
    record['file_path'] = (
        None if is_missing else os.path.join(audio_dir_name, category['file_name'])
    )
    entries.append(record)

def convert_json_to_csv(folder: str, path_to_dir: str = '/workspace/data_inatsounds'):
    """
    Convert the JSON metadata of one dataset split to CSV format.

    Reads ``<path_to_dir>/<folder>.json`` and, for every entry of its
    ``categories`` list, looks for ``.wav`` files inside
    ``<path_to_dir>/<folder>/<audio_dir_name>``. One row per audio file found
    is written to ``<folder>.csv``; every category whose directory is absent
    or holds no ``.wav`` file gets one row in ``missing_<folder>.csv``. The
    two totals are written to ``<folder>_log.txt``. All three output files are
    created in the current working directory.

    Args:
        folder: Split name; it names both the JSON file and the audio
            sub-directory (the notebook uses 'train', 'test' and 'val').
        path_to_dir: Directory holding the JSON files and the audio folders.

    Returns:
        None. When the JSON file does not exist a message is printed and no
        output file is written.
    """
    path_to_json = os.path.join(path_to_dir, f'{folder}.json')
    path_to_audio_files = os.path.join(path_to_dir, folder)

    valid_csv = f'{folder}.csv'
    missing_csv = f'missing_{folder}.csv'
    log_path = f'{folder}_log.txt'

    print(f"Converting JSON file {path_to_json} to CSV...")

    try:
        with open(path_to_json, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"File {path_to_json} not found.")
        return

    rows = []
    missing_entries = []

    for category in data['categories']:
        audio_dir_name = category['audio_dir_name']
        audio_files_path = os.path.join(path_to_audio_files, audio_dir_name)

        # isdir rather than exists: a regular file with this name would make
        # os.listdir raise. Sorting makes the row order reproducible, since
        # os.listdir returns entries in arbitrary order.
        if os.path.isdir(audio_files_path):
            audio_files = sorted(
                f for f in os.listdir(audio_files_path) if f.endswith('.wav')
            )
        else:
            audio_files = []

        if not audio_files:
            # Directory absent or without any .wav file.
            append_entry(missing_entries, category, audio_dir_name, is_missing=True)
            continue

        for audio_file in audio_files:
            # Hand append_entry a copy carrying the file name instead of
            # mutating the category dict loaded from JSON.
            append_entry(
                rows,
                {**category, 'file_name': audio_file},
                audio_dir_name,
                is_missing=False,
            )

    pd.DataFrame(rows).to_csv(valid_csv, index=False)
    pd.DataFrame(missing_entries).to_csv(missing_csv, index=False)

    with open(log_path, 'w', encoding='utf-8') as log_file:
        log_file.write(f"Total valid entries: {len(rows)}\n")
        log_file.write(f"Total missing entries: {len(missing_entries)}\n")

    # Report the files that were actually written for this split.
    print(f"Metadata has been converted to {valid_csv}")
    print(f"Missing entries have been logged in {missing_csv}")
    print(f"General information about missing entries has been logged in {log_path}")


In [19]:
# Run the JSON -> CSV conversion once per split.
for folder in ('train', 'test', 'val'):
    convert_json_to_csv(folder)

Converting JSON file /workspace/data_inatsounds/train.json to CSV...
Metadata has been converted to metadata.csv
Missing entries have been logged in missing_metadata.csv
General information about missing entries has been logged in missing_log.txt
Converting JSON file /workspace/data_inatsounds/test.json to CSV...
Metadata has been converted to metadata.csv
Missing entries have been logged in missing_metadata.csv
General information about missing entries has been logged in missing_log.txt
Converting JSON file /workspace/data_inatsounds/val.json to CSV...
Metadata has been converted to metadata.csv
Missing entries have been logged in missing_metadata.csv
General information about missing entries has been logged in missing_log.txt


### Create dataset

In [23]:
def load_audio_dataset(csv_file: str, audio_dir: str):
    """Read a metadata CSV and return it as a Hugging Face ``Dataset``.

    Every value of the existing ``file_path`` column is replaced by
    ``os.path.join(audio_dir, value)`` before the frame is converted.
    """
    def with_audio_dir(relative_path):
        # Prefix the path stored in the CSV with the audio directory.
        return os.path.join(audio_dir, relative_path)

    metadata = pd.read_csv(csv_file)
    metadata['file_path'] = metadata['file_path'].map(with_audio_dir)

    return Dataset.from_pandas(metadata)

In [24]:
# Each split is read from '<split>.csv' with '<split>' as its audio directory.
train_dataset, val_dataset, test_dataset = (
    load_audio_dataset(f'{split}.csv', split) for split in ('train', 'val', 'test')
)

In [25]:
# Group the three splits; the 'val' split is stored under the key 'validation'.
dataset_dict = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

In [27]:
# Write the DatasetDict (train / validation / test) to disk at /workspace.
dataset_dict.save_to_disk('/workspace')

Saving the dataset (0/1 shards):   0%|          | 0/137012 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/45698 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/49527 [00:00<?, ? examples/s]