In [1]:
from collections import Counter

from datasets import load_dataset, Features, Value

# Load every test-set label CSV with an explicit schema, then rename the
# "shared" metadata columns to the names the train set ends up with.
# NOTE(review): "subspecies" keeps its name here, while the train set renames
# its "subspecies" column to "sub_species" — confirm the difference is intended.
original_test_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/testset/labels/*.csv",
    # TODO: Add all features available in BirdSet
    features=Features({
        column: Value(dtype)
        for column, dtype in {
            "ebird_code": "string",
            "common_name": "string",
            "vocalization_type": "string",
            "start_time": "float",
            "end_time": "float",
            "audio_filename": "string",
            # shared
            "high_freq": "float",
            "low_freq": "float",
            "lat": "float",
            "long": "float",
            "GPS_uncertainty_km": "float",
            "validator": "string",
            "validation_time": "string",
            # other specific
            "project_code": "string",
            "recorder": "string",
            "pointofvocalization": "string",
            "correct": "string",
            "time_UTC": "string",
            "tsn_code": "string",
            "GBIF_ID": "string",
            "scientific_name": "string",
            "subspecies": "string",
            "microphone": "string",
            "source": "string",
            "recordist": "string",
        }.items()
    }),
    cache_dir=None,
    num_proc=1,
    # Not needed for local datasets; kept for consistency.
    trust_remote_code=True,
).rename_columns({
    "GPS_uncertainty_km": "gps_uncertainty",
    "validator": "validator_name",
    "validation_time": "validation_date",
})

# Load every train-set label CSV (semicolon-delimited) with an explicit schema,
# then strip the unit suffixes / long names from the column headers.
# NOTE(review): "subspecies" becomes "sub_species" here, but the test set keeps
# its "subspecies" column un-renamed — confirm the difference is intended.
original_train_dataset = load_dataset(
    "csv",
    data_files="/workspace/oekofor/trainset/csvlabels/*.csv",
    # TODO: Add all features available in BirdSet
    features=Features({
        column: Value(dtype)
        for column, dtype in {
            "ebird_code": "string",
            "call_type": "string",
            "start_sample [s]": "float",
            "end_sample [s]": "float",
            "actual_filename": "string",
            # shared
            "high_freq [Hz]": "float",
            "low_freq [Hz]": "float",
            "latitude [WGS84]": "float",
            "longitude [WGS84]": "float",
            "uncertainty of GPS data [km]": "float",
            "name_of_validator": "string",
            "date_of_validation": "string",
            # other specific
            "call_center [s]": "float",
            "randomised_call_center [s]": "float",
            "subspecies": "string",
            "sex": "string",
            "correct": "string",
            "confidence": "float",
            "date": "string",
            "BirdNET Version": "string",
            "notes": "string",
        }.items()
    }),
    delimiter=";",
    cache_dir=None,
    num_proc=1,
    # Not needed for local datasets; kept for consistency.
    trust_remote_code=True,
).rename_columns({
    # event boundaries
    "start_sample [s]": "start_time",
    "end_sample [s]": "end_time",
    # shared metadata
    "high_freq [Hz]": "high_freq",
    "low_freq [Hz]": "low_freq",
    "latitude [WGS84]": "lat",
    "longitude [WGS84]": "long",
    "uncertainty of GPS data [km]": "gps_uncertainty",
    "name_of_validator": "validator_name",
    "date_of_validation": "validation_date",
    # train-specific
    "randomised_call_center [s]": "rand_center",
    "subspecies": "sub_species",
    "BirdNET Version": "birdnet_version",
})

Resolving data files:   0%|          | 0/153 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

In [2]:
def printTopValuesOfColumn(column_name, top_k=10):
    """Print the most frequent values of `column_name` for both label sets.

    Reports the test dataset first, then the train dataset, each under its own
    header line. Both are read from the 'train' split of the corresponding
    module-level dataset object.

    Args:
        column_name: Column to summarise; it is looked up in both datasets.
        top_k: Forwarded to `printTopValuesOfColumnOf` as the number of values to list.
    """
    print("--- For test dataset ---")
    test_rows = original_test_dataset['train']
    printTopValuesOfColumnOf(test_rows, column_name, top_k)

    print("--- For train dataset ---")
    train_rows = original_train_dataset['train']
    printTopValuesOfColumnOf(train_rows, column_name, top_k)

def printTopValuesOfColumnOf(dataset, column_name, top_k=10):
    """Print the `top_k` most frequent values of one column with their counts.

    Args:
        dataset: Any object for which `dataset[column_name]` yields an iterable
            of hashable values (e.g. one split of a loaded dataset).
        column_name: Name of the column to tally.
        top_k: Upper bound on the number of distinct values listed; it is
            applied as a slice bound on the ranked list.

    Output is a header line followed by one "  value: count" line per value,
    ordered by descending count. Values with equal counts keep the order in
    which they were first encountered.
    """
    value_counts = Counter(dataset[column_name])
    # most_common() without an argument is a stable descending sort by count,
    # so ties stay in first-seen order; the slice then applies the limit.
    ranked = value_counts.most_common()[:top_k]
    print(f"Top {top_k} values for column '{column_name}':")
    for value, count in ranked:
        print(f"  {value}: {count}")

In [3]:
# Check how the test-only 'pointofvocalization' column is populated
# (list up to 20 distinct values).
printTopValuesOfColumnOf(
    dataset=original_test_dataset["train"],
    column_name='pointofvocalization',
    top_k=20,
)

Top 20 values for column 'pointofvocalization':
  None: 16020
