# Helper

## Dataset

In [128]:
from datasets import load_dataset, Dataset

def load_birdset_train_split(name: str):
    return load_dataset(
    path="DBD-research-group/BirdSet",
    name=name,
    cache_dir=f"/home/rantjuschin/data_birdset/{name}",
    trust_remote_code=True,
    split="train"
)

## Data Selection

In [129]:
from datasets import Dataset
import soundfile as sf

def get_all_legal_indeces(dataset: Dataset, condition: callable) -> list[int]:
    legal_indeces = []
    
    for i in range(len(dataset)):
        if condition(dataset, i):
            legal_indeces.append(i)

    return legal_indeces


def strict_condition(dataset: Dataset, idx: int) -> bool:
    """
    This condition only allows files that up to 5s long so that no event detection has to occur when sampling.
    """
    file_info = sf.info(dataset[idx]["filepath"])
    if file_info.duration <= 5:
        return True
    

def lenient_condition(dataset: Dataset, idx: int) -> bool:
    """
    This condition allows files up to 10s but only if one bird occurence is in the file.
    """
    file_info = sf.info(dataset[idx]["filepath"])
    if file_info.duration <= 10 and (not dataset[idx]["ebird_code_secondary"]) and len(dataset[idx]["ebird_code_multilabel"]) == 1:
        return True

    
def count_labels_from_legal_indeces(dataset: Dataset, indeces: list[int]) -> dict:
    all_labels = set(dataset["ebird_code"])
    unique_label_counts = {label: 0 for label in all_labels}
    for i in indeces:
       unique_label_counts[dataset[i]["ebird_code"]] = unique_label_counts.get(dataset[i]["ebird_code"]) + 1
    return unique_label_counts



## Few-Shot analysis

In [130]:
def analyse_with_few_shot_counts(few_shot_counts: list[int], dataset_label_counts: dict) -> dict:
    results = {}
    for condition, label_counts in dataset_label_counts.items():
        results[condition] = {}
        for few_shot_count in few_shot_counts:
            limited_label_counts = {label: min(few_shot_count, count) for label, count in label_counts.items()}
            
            max_possible_sample_count = len(label_counts) * few_shot_count
            actual_sample_count = sum(limited_label_counts.values())
            total_difference = max_possible_sample_count - actual_sample_count

            max_difference = 0
            min_difference = few_shot_count
            incomplete_labels = 0
            for label_count in limited_label_counts.values():
                if label_count < few_shot_count:
                    incomplete_labels += 1
                    max_difference = max(max_difference, few_shot_count - label_count)
                    min_difference = min(min_difference, few_shot_count - label_count)
            
            # avg and deviaton are only computed on labels that are actually missing samples
            if incomplete_labels > 0:
                avg_difference = total_difference / max(incomplete_labels, 1)
                
                total_difference_to_avg = 0
                for label_count in limited_label_counts.values():
                    if label_count < few_shot_count:
                        total_difference_to_avg += (label_count - avg_difference)**2
                std_deviation = total_difference_to_avg / max((incomplete_labels - 1), 1) 
                std_deviation = std_deviation**(1/2)
            else:
                avg_difference = 0
                std_deviation = 0

            results[condition][few_shot_count] = {"max_possible_sample_count":max_possible_sample_count, 
                                    "actual_sample_count": actual_sample_count,
                                    "total_difference": total_difference, 
                                    "incomplete_labels": incomplete_labels,
                                    "max_difference": max_difference,
                                    "min_difference": min_difference,
                                    "avg_difference": avg_difference,
                                    "std_deviation": std_deviation,
                                    }
    return results

def print_results(results):
    for condition in results.keys():
        print(f"\n{condition} results:")
        printed_first_row = False
        first_row = "\t"
        for few_shot_count, values in results[condition].items():
            if not printed_first_row:
                for key in values.keys():
                    if key in ["max_difference", "min_difference", "avg_difference", "std_deviation"]:
                        first_row += key + "\t\t" 
                    else:
                        first_row += key + "\t"
                print(first_row)
                printed_first_row = True

            output = str(few_shot_count) + "\t\t\t"
            for value in values.values():
                output += f"{value:.2f}\t\t\t"
            print(output)

# PER

In [131]:
PER = load_birdset_train_split("PER")
PER

Dataset({
    features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel'],
    num_rows: 16802
})

In [132]:
PER_label_counts = {"strict_condition": count_labels_from_legal_indeces(PER, get_all_legal_indeces(PER, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(PER, get_all_legal_indeces(PER, lenient_condition))}

In [133]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], PER_label_counts)
print_results(results)


strict_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			264.00			226.00			38.00			22.00			2.00			1.00			1.73			1.56			
4			528.00			381.00			147.00			64.00			4.00			1.00			2.30			1.30			
8			1056.00			547.00			509.00			102.00			8.00			1.00			4.99			2.84			
16			2112.00			682.00			1430.00			121.00			16.00			1.00			11.82			8.40			
32			4224.00			752.00			3472.00			130.00			32.00			4.00			26.71			22.15			
64			8448.00			774.00			7674.00			132.00			64.00			11.00			58.14			52.96			
128			16896.00			774.00			16122.00			132.00			128.00			75.00			122.14			116.93			

lenient_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			264.00			254.00			10.00			7.00			2.00			1.00			1.43			1.07			
4			528.00			495.00			33.00			14.00			4.00			1.00			2.36			1.42			
8			1056

# NES

In [134]:
NES = load_birdset_train_split("NES")
NES

Dataset({
    features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel'],
    num_rows: 16116
})

In [135]:
NES_label_counts = {"strict_condition": count_labels_from_legal_indeces(NES, get_all_legal_indeces(NES, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(NES, get_all_legal_indeces(NES, lenient_condition))}

In [136]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], NES_label_counts)
print_results(results)


strict_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			178.00			152.00			26.00			16.00			2.00			1.00			1.62			1.38			
4			356.00			267.00			89.00			34.00			4.00			1.00			2.62			1.65			
8			712.00			431.00			281.00			54.00			8.00			1.00			5.20			3.24			
16			1424.00			614.00			810.00			72.00			16.00			2.00			11.25			7.68			
32			2848.00			748.00			2100.00			85.00			32.00			1.00			24.71			18.97			
64			5696.00			807.00			4889.00			88.00			64.00			11.00			55.56			48.34			
128			11392.00			811.00			10581.00			89.00			128.00			60.00			118.89			110.99			

lenient_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			178.00			171.00			7.00			5.00			2.00			1.00			1.40			1.05			
4			356.00			328.00			28.00			12.00			4.00			1.00			2.33			1.28			
8			712.00			604.

# UHH

In [137]:
UHH = load_birdset_train_split("UHH")
UHH

Dataset({
    features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel'],
    num_rows: 3626
})

In [138]:
UHH_label_counts = {"strict_condition": count_labels_from_legal_indeces(UHH, get_all_legal_indeces(UHH, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(UHH, get_all_legal_indeces(UHH, lenient_condition))}

In [139]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], UHH_label_counts)
print_results(results)


strict_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			50.00			28.00			22.00			14.00			2.00			1.00			1.57			1.29			
4			100.00			43.00			57.00			19.00			4.00			1.00			3.00			2.33			
8			200.00			62.00			138.00			21.00			8.00			1.00			6.57			5.56			
16			400.00			85.00			315.00			23.00			16.00			3.00			13.70			12.13			
32			800.00			115.00			685.00			24.00			32.00			2.00			28.54			26.45			
64			1600.00			147.00			1453.00			24.00			64.00			34.00			60.54			58.68			
128			3200.00			211.00			2989.00			24.00			128.00			98.00			124.54			123.86			

lenient_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			50.00			39.00			11.00			8.00			2.00			1.00			1.38			0.95			
4			100.00			64.00			36.00			14.00			4.00			1.00			2.57			1.61			
8			200.00			100.00			100.00

# HSN

In [140]:
HSN = load_birdset_train_split("HSN")
HSN

Dataset({
    features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel'],
    num_rows: 5460
})

In [141]:
HSN_label_counts = {"strict_condition": count_labels_from_legal_indeces(HSN, get_all_legal_indeces(HSN, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(HSN, get_all_legal_indeces(HSN, lenient_condition))}

In [142]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], HSN_label_counts)
print_results(results)


strict_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			42.00			40.00			2.00			2.00			1.00			1.00			1.00			0.00			
4			84.00			75.00			9.00			5.00			3.00			1.00			1.80			1.18			
8			168.00			127.00			41.00			9.00			7.00			1.00			4.56			2.22			
16			336.00			209.00			127.00			14.00			15.00			1.00			9.07			5.67			
32			672.00			262.00			410.00			19.00			31.00			9.00			21.58			13.72			
64			1344.00			294.00			1050.00			20.00			63.00			32.00			52.50			42.97			
128			2688.00			314.00			2374.00			21.00			127.00			44.00			113.05			102.11			

lenient_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			42.00			42.00			0.00			0.00			0.00			2.00			0.00			0.00			
4			84.00			83.00			1.00			1.00			1.00			1.00			1.00			2.00			
8			168.00			151.00			17.00			5.00			5.

# NBP

In [143]:
NBP = load_birdset_train_split("NBP")
NBP

Downloading data:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/980M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.84M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

Extracting train split:   0%|          | 0/32 [00:00<?, ?it/s]

Extracting test split:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting test_5s split:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating test_5s split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio', 'filepath', 'start_time', 'end_time', 'low_freq', 'high_freq', 'ebird_code', 'ebird_code_multilabel', 'ebird_code_secondary', 'call_type', 'sex', 'lat', 'long', 'length', 'microphone', 'license', 'source', 'local_time', 'detected_events', 'event_cluster', 'peaks', 'quality', 'recordist', 'genus', 'species_group', 'order', 'genus_multilabel', 'species_group_multilabel', 'order_multilabel'],
    num_rows: 24327
})

In [144]:
NBP_label_counts = {"strict_condition": count_labels_from_legal_indeces(NBP, get_all_legal_indeces(NBP, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(NBP, get_all_legal_indeces(NBP, lenient_condition))}

In [145]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], NBP_label_counts)
print_results(results)


strict_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			102.00			102.00			0.00			0.00			0.00			2.00			0.00			0.00			
4			204.00			204.00			0.00			0.00			0.00			4.00			0.00			0.00			
8			408.00			401.00			7.00			4.00			3.00			1.00			1.75			5.28			
16			816.00			756.00			60.00			10.00			11.00			1.00			6.00			5.64			
32			1632.00			1171.00			461.00			35.00			27.00			3.00			13.17			8.93			
64			3264.00			1478.00			1786.00			46.00			59.00			1.00			38.83			19.47			
128			6528.00			1656.00			4872.00			51.00			123.00			2.00			95.53			69.07			

lenient_condition results:
	max_possible_sample_count	actual_sample_count	total_difference	incomplete_labels	max_difference		min_difference		avg_difference		std_deviation		
2			102.00			102.00			0.00			0.00			0.00			2.00			0.00			0.00			
4			204.00			204.00			0.00			0.00			0.00			4.00			0.00			0.00			
8			408.00			408.00			0.00			0.00	

# POW

In [146]:
POW = load_birdset_train_split("POW")
POW

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
POW_label_counts = {"strict_condition": count_labels_from_legal_indeces(POW, get_all_legal_indeces(POW, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(POW, get_all_legal_indeces(POW, lenient_condition))}

In [None]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], POW_label_counts)
print_results(results)

# SSW

In [None]:
SSW = load_birdset_train_split("SSW")
SSW

In [None]:
SSW_label_counts = {"strict_condition": count_labels_from_legal_indeces(SSW, get_all_legal_indeces(SSW, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(SSW, get_all_legal_indeces(SSW, lenient_condition))}

In [None]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], SSW_label_counts)
print_results(results)

# SNE

In [None]:
SNE = load_birdset_train_split("SNE")
SNE

In [None]:
SNE_label_counts = {"strict_condition": count_labels_from_legal_indeces(SNE, get_all_legal_indeces(SNE, strict_condition)),
                    "lenient_condition": count_labels_from_legal_indeces(SNE, get_all_legal_indeces(SNE, lenient_condition))}

In [None]:
results = analyse_with_few_shot_counts([2, 4, 8, 16, 32, 64, 128], SNE_label_counts)
print_results(results)