In [5]:
import json
import pandas as pd
import os

In [17]:
import json
import pandas as pd
import os

def append_entry(entries, category, audio_dir_name, is_missing):
    """Append an entry to the list of entries."""
    entry = {
        'audio_dir_name': audio_dir_name,
        'category_id': category['id'],
        'category_name': category['name'],
        'common_name': category['common_name'],
        'supercategory': category['supercategory'],
        'kingdom': category['kingdom'],
        'phylum': category['phylum'],
        'class': category['class'],
        'order': category['order'],
        'family': category['family'],
        'genus': category['genus'],
        'specific_epithet': category['specific_epithet'],
        'evaluation': category['evaluation'],
    }
    if is_missing:
        entry['file_path'] = None  # No file path for missing entries
    else:
        entry['file_path'] = os.path.join(audio_dir_name, category['file_name'])  # Add file path for valid entries
    entries.append(entry)

def convert_json_to_csv(folder: str):
    """
    Convert JSON metadata to CSV format.
    The JSON file should contain information about audio files and their categories.
    """
    path_to_dir = '/workspace/data_inatsounds'  # Update this path to your audio files directory
    path_to_json = os.path.join(path_to_dir, f'{folder}.json')
    path_to_audio_files = os.path.join(path_to_dir, folder)

    print(f"Converting JSON file {path_to_json} to CSV...")

    try:
        with open(path_to_json, 'r') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"File {path_to_json} not found.")
        return

    rows = []
    missing_entries = []

    for category in data['categories']:
        audio_dir_name = category['audio_dir_name']
        
        audio_files_path = os.path.join(path_to_audio_files, audio_dir_name)
        
        # Check if the directory exists
        if os.path.exists(audio_files_path):
            audio_files = [f for f in os.listdir(audio_files_path) if f.endswith('.wav')]
            
            if audio_files:  # If there are audio files
                for audio_file in audio_files:
                    category['file_name'] = audio_file  # Add the file name to the category
                    append_entry(rows, category, audio_dir_name, is_missing=False)
            else:  # If no audio files are found
                append_entry(missing_entries, category, audio_dir_name, is_missing=True)
        else:  # If the directory does not exist
            append_entry(missing_entries, category, audio_dir_name, is_missing=True)

    # Create DataFrames
    valid_df = pd.DataFrame(rows)
    missing_df = pd.DataFrame(missing_entries)

    # Save to CSV
    valid_df.to_csv(f'{folder}.csv', index=False)
    missing_df.to_csv(f'missing_{folder}.csv', index=False)

    # Log general information about missing entries
    total_missing = len(missing_entries)
    total_valid = len(rows)

    with open(f'{folder}_log.txt', 'w') as log_file:
        log_file.write(f"Total valid entries: {total_valid}\n")
        log_file.write(f"Total missing entries: {total_missing}\n")

    print("Metadata has been converted to metadata.csv")
    print("Missing entries have been logged in missing_metadata.csv")
    print("General information about missing entries has been logged in missing_log.txt")


In [None]:
for folder in ['train', 'test', 'val']:
    convert_json_to_csv(folder)

Converting JSON file /workspace/data_inatsounds/train.json to CSV...
Metadata has been converted to metadata.csv
Missing entries have been logged in missing_metadata.csv
General information about missing entries has been logged in missing_log.txt
Converting JSON file /workspace/data_inatsounds/test.json to CSV...
Metadata has been converted to metadata.csv
Missing entries have been logged in missing_metadata.csv
General information about missing entries has been logged in missing_log.txt
Converting JSON file /workspace/data_inatsounds/validation.json to CSV...
File /workspace/data_inatsounds/validation.json not found.
