# How to Analyze
Use the `analyze_unlabled_data_dir` and `analyze_labled_data_dir` methods to analyze the corresponding data.

Beforehand: Adjust `LABLED_DATA_PATH` and `UNLABLED_DATA_PATH` as well as `LOG_PATH`.

In [2]:
import pandas as pd
import os
import wave
from pydub import AudioSegment
import datetime

In [3]:
def get_wav_duration(file_path: str) -> float:
    with wave.open(file_path, "rb") as wav_file:
        num_frames = wav_file.getnframes()
        frame_rate = wav_file.getframerate()

        duration = num_frames / float(frame_rate)

    return duration


def get_flac_duration(file_path: str) -> float:
    audio = AudioSegment.from_file(file_path, format="flac")
    duration = len(audio) / 1000.0
    return duration

In [4]:
AUDIO_FORMATS = ["wav", "flac"]
AUDIO_FORMATS_ANALYZER = {"wav": get_wav_duration, "flac": get_flac_duration}
CSV_FORMATS = ["txt"]
LOG_PATH = "/home/rschwinger/oekofor-log"  # ! YOU HAVE TO CREATE THIS DIR ON YOUR OWN
LABLED_DATA_PATH = "/data/deepbirdraw/Labels_DeepBirdDetect"
UNLABLED_DATA_PATH = "/data/deepbirdraw/Aufnahmen"
DELIMITER = "\t"
LOG_FREQUENCY = 5000  # log every X files in case the program terminates

In [5]:
call_types = {}
recording_duration = {}
recording_num = {}
common_names = []

In [7]:
def get_file_names_in_dir(dir: str) -> list[str]:
    all_files = []

    for root, dirs, files in os.walk(dir):
        for file in files:
            file_path = os.path.join(root, file)
            all_files.append(file_path)

    return all_files

In [8]:
def reduce_files_names_to_relevant_files(files: list[str]) -> list[str]:
    return [f for f in files if f.split(".")[-1] in CSV_FORMATS]

TypeError: 'type' object is not subscriptable

In [115]:
def read_csv_file(file: str) -> pd.DataFrame:
    return pd.read_csv(file, delimiter=DELIMITER)

In [116]:
def get_audio_file_duration(file_path: str) -> float:
    file_name, file_extension = os.path.splitext(file_path)

    for audio_format in AUDIO_FORMATS:
        if os.path.isfile(file_name + "." + audio_format):
            # TODO Something broken here
            print(file_name + "." + audio_format)
            return AUDIO_FORMATS_ANALYZER[audio_format](file_name + "." + audio_format)
    print(f"Did not find file format: {file_path}")
    return 0

In [117]:
def extract_relevant_information(file: str) -> None:
    file_extension = file.split(".")[-1]

    if not file_extension in CSV_FORMATS:
        raise ValueError(f"Unknown file extension for csv: {file_extension}")

    df = read_csv_file(file)

    common_name = df.at[0, "common_name"]
    call_type = df.at[0, "call_type"]
    duration = get_audio_file_duration(file)

    # add to call_types
    if not call_type in call_types:
        call_types[common_name] = [call_type]
    else:
        if not call_type in call_types[common_name]:
            call_types[common_name].append(call_type)

    # add to recording_duration
    if not common_name in recording_duration:
        recording_duration[common_name] = duration
    else:
        recording_duration[common_name] += duration

    # add to recording_duration
    if not common_name in recording_num:
        recording_num[common_name] = 1
    else:
        recording_num[common_name] += 1

    # add to common_names
    if not common_name in common_names:
        common_names.append(common_name)

In [118]:
def log_common_names() -> None:
    with open(os.path.join(LOG_PATH, "common_names.log"), "w") as f:
        for common_name in common_names:
            f.write(common_name + "\n")

In [119]:
def write_log(files_processed: int, relevant_files: int, total_files: int) -> None:
    file_name = f"{LOG_PATH}/labled_log_{str(datetime.datetime.now())}.txt"

    with open(file_name, "w") as file:
        if relevant_files == 0:
            procentage = 0
        else:
            procentage = files_processed / relevant_files
        file.write(
            f"Log with progress {files_processed} / {relevant_files} ({round(procentage, 2)}%) \n"
        )
        file.write("Meta Information\n:")
        file.write(f"Total Files: {total_files}\n")
        file.write(f"Relevant Files: {relevant_files}\n")
        file.write(f"Files Processed: {files_processed}\n")
        file.write(f"LOG_PATH: {LOG_PATH}\n")
        file.write(f"LABLED_DATA_PATH: {LABLED_DATA_PATH}\n")
        file.write(f"UNLABLED_DATA_PATH: {UNLABLED_DATA_PATH}\n")
        file.write(f"DELIMITER: {DELIMITER}\n")
        file.write(f"LOG_FREQUENCY: {LOG_FREQUENCY}\n")
        file.write("\n")

        file.write("General Dataset Information:\n")
        file.write(f"Total number of classes: {len(common_names)}\n")
        file.write(f"Total number of recordings: {sum(recording_num.values())}\n")
        file.write(
            f"Total duration of recordings: {sum(recording_duration.values())}\n"
        )
        file.write("\n")

        for common_name in common_names:
            file.write(f"Common Name: {common_name}\n")
            file.write(f"Number of recordings: {recording_num[common_name]}\n")
            file.write(
                f"Total duration of recordings: {recording_duration[common_name]}\n"
            )
            file.write(f"Call Types: {call_types[common_name]}\n")
            file.write("\n")

In [120]:
def write_unlabled_log(duration, files):
    with open(f"{LOG_PATH}/unlabled_log_{str(datetime.datetime.now())}.txt", "w") as f:
        f.write(f"Total duration of recordings: {duration}\n")
        f.write(f"Total number of recordings: {len(files)}\n")

In [121]:
from tqdm import tqdm

In [122]:
def analyze_labled_data_dir() -> None:
    files = get_file_names_in_dir(LABLED_DATA_PATH)
    relevant_files = reduce_files_names_to_relevant_files(files)

    for i, file in tqdm(enumerate(relevant_files)):
        extract_relevant_information(file)

        if i % LOG_FREQUENCY == 0 and not i == 0:
            write_log(i, len(relevant_files), len(files))

    write_log(len(relevant_files), len(relevant_files), len(files))
    log_common_names()
    print("Finished analyzing labled data")

In [123]:
def analyze_unlabled_data_dir() -> None:
    files = get_file_names_in_dir(UNLABLED_DATA_PATH)
    duration = 0
    # get duration of all audio files
    for i, file in tqdm(enumerate(files)):
        if file.split(".")[-1] in AUDIO_FORMATS:
            duration += get_audio_file_duration(file)
        else:
            print(f"File {file} is not a valid audio file")

        if i % LOG_FREQUENCY == 0 and not i == 0:
            write_unlabled_log(duration, files)

    write_unlabled_log(duration, files)

    print("finished analyzing unlabled data")

In [124]:
analyze_labled_data_dir()

0it [00:00, ?it/s]

/data/deepbirdraw/Labels_DeepBirdDetect/Common Quail_s (Gesang)_1_0.88_Ralph Martin_SN11_2022-06-20T011034+0000_s161.1.flac





CouldntDecodeError: Decoding failed. ffmpeg returned error code: 1

Output from ffmpeg/avlib:

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
  libavutil      56. 70.100 / 56. 70.100
  libavcodec     58.134.100 / 58.134.100
  libavformat    58. 76.100 / 58. 76.100
  libavdevice    58. 13.100 / 58. 13.100
  libavfilter     7.110.100 /  7.110.100
  libswscale      5.  9.100 /  5.  9.100
  libswresample   3.  9.100 /  3.  9.100
  libpostproc    55.  9.100 / 55.  9.100
[flac @ 0x5611b96086c0] Could not find codec parameters for stream 0 (Audio: flac, 0 channels): unspecified sample format
Consider increasing the value for the 'analyzeduration' (0) and 'probesize' (5000000) options
Input #0, flac, from '/data/deepbirdraw/Labels_DeepBirdDetect/Common Quail_s (Gesang)_1_0.88_Ralph Martin_SN11_2022-06-20T011034+0000_s161.1.flac':
  Duration: N/A, bitrate: N/A
  Stream #0:0: Audio: flac, 0 channels
Output #0, wav, to 'pipe:':
Output file #0 does not contain any stream


In [None]:
analyze_unlabled_data_dir()