# File Listing and Labeling
List the dataset's audio files together with their assigned labels, export the table to CSV, and summarize recording durations.

In [80]:
# 1) Import required libraries
from pathlib import Path
import yaml
import pandas as pd

# Lower-case file suffixes (leading dot included) treated as audio recordings.
SUPPORTED_EXTS = {
    ".wav",
    ".mp3",
    ".m4a",
    ".wma",
    ".ogg",
}


In [81]:
# 2) Define base path and helper functions
from pathlib import Path
# Everything is resolved relative to the current working directory.
base_dir = Path.cwd()
recordings_root = base_dir.joinpath("Recordings_1")
print(f"Using recordings root: {recordings_root}")


def parse_labels_yaml(yaml_path: Path):
    """Read a ``labels.yaml`` file and normalise its entries.

    Every top-level key is treated as a file name. Its value may be:

    * a list/tuple with at least three items ``(gender, in_group, speaker_name)``;
    * a mapping with the optional keys ``gender``, ``in_group`` and
      ``speaker_name`` (the name of the folder containing the YAML file is used
      when ``speaker_name`` is absent);
    * a tuple-like string such as ``"(M,True,Name)"`` or ``"(M, False, Name)"``.

    Entries in any other shape are skipped.

    Args:
        yaml_path: Location of the YAML file.

    Returns:
        A dict mapping each file name to a dict with the keys ``gender``,
        ``in_group`` (bool), ``speaker_name`` and ``source`` (always ``"yaml"``).
        The result is empty when the file does not exist, is empty, or does not
        contain a mapping at its top level.
    """
    if not yaml_path.exists():
        return {}
    with open(yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict):
        # A top-level list or scalar carries no "file name -> entry" pairs.
        return {}

    def to_flag(raw):
        # Text flags are compared with the accepted spellings, because
        # bool("False") and bool("no") would both evaluate to True.
        if isinstance(raw, str):
            return raw.strip().lower() in {"true", "1", "yes"}
        return bool(raw)

    def coerce_entry(value, default_speaker):
        # Returns (gender, in_group, speaker_name), or None for unsupported shapes.
        if isinstance(value, (list, tuple)) and len(value) >= 3:
            return value[0], to_flag(value[1]), value[2]
        if isinstance(value, dict):
            return (
                value.get("gender", "Unknown"),
                to_flag(value.get("in_group", False)),
                value.get("speaker_name", default_speaker),
            )
        if isinstance(value, str):
            cleaned = value.strip()
            if cleaned.startswith("(") and cleaned.endswith(")"):
                cleaned = cleaned[1:-1]
            parts = [p.strip() for p in cleaned.split(",")]
            if len(parts) >= 3:
                return parts[0], to_flag(parts[1]), parts[2]
        return None

    parsed = {}
    for filename, value in data.items():
        coerced = coerce_entry(value, yaml_path.parent.name)
        if coerced:
            gender, in_group, speaker_name = coerced
            parsed[filename] = {
                "gender": gender,
                "in_group": in_group,
                "speaker_name": speaker_name,
                "source": "yaml",
            }
    return parsed


def build_speaker_maps(root: Path):
    """Give every speaker found in the YAML metadata an integer label.

    The sub-folders of ``root`` are visited in sorted order and each folder's
    ``labels.yaml`` is parsed. Speaker names receive consecutive labels
    starting at 0, in order of first appearance.

    Args:
        root: Directory whose sub-folders hold the recordings and YAML files.

    Returns:
        ``(speaker_to_label, label_to_speaker)``: a name -> label dict and the
        matching label -> name dict.
    """
    speaker_to_label = {}
    label_to_speaker = {}
    subfolders = (entry for entry in sorted(root.iterdir()) if entry.is_dir())
    for subfolder in subfolders:
        for record in parse_labels_yaml(subfolder / "labels.yaml").values():
            name = record["speaker_name"]
            if name in speaker_to_label:
                continue
            # The next free label equals the number of speakers registered so far.
            next_label = len(speaker_to_label)
            speaker_to_label[name] = next_label
            label_to_speaker[next_label] = name
    return speaker_to_label, label_to_speaker

Using recordings root: c:\Users\pczec\Desktop\Studia\SEM5\IML\IML-PW\Recordings_1


In [82]:
# 3) List files and assign labels

def list_files(root: Path):
    """List every supported audio file under ``root`` with its label metadata.

    Each sub-folder of ``root`` is scanned (in sorted order) for files whose
    lower-cased suffix is in ``SUPPORTED_EXTS``. Metadata is looked up by file
    name in that folder's ``labels.yaml``; files without an entry get
    ``speaker_name=None``, ``gender="Unknown"``, ``in_group=False``,
    ``source="missing_in_yaml"`` and the speaker label ``-1``.

    Args:
        root: Directory whose sub-folders contain the recordings.

    Returns:
        A DataFrame with the columns ``file_path`` (relative to ``root``),
        ``speaker_label``, ``speaker_name``, ``gender``, ``in_group`` and
        ``source``. The columns are present even when no file was found, so
        later column lookups keep working on an empty dataset.
    """
    columns = [
        "file_path",
        "speaker_label",
        "speaker_name",
        "gender",
        "in_group",
        "source",
    ]
    speaker_to_label, _ = build_speaker_maps(root)
    rows = []
    for folder in sorted(root.iterdir()):
        if not folder.is_dir():
            continue
        meta = parse_labels_yaml(folder / "labels.yaml")
        for audio_file in sorted(folder.iterdir()):
            # Skip sub-directories that merely carry an audio-like suffix.
            if not audio_file.is_file():
                continue
            if audio_file.suffix.lower() not in SUPPORTED_EXTS:
                continue
            info = meta.get(
                audio_file.name,
                {
                    "speaker_name": None,
                    "gender": "Unknown",
                    "in_group": False,
                    "source": "missing_in_yaml",
                },
            )
            speaker_name = info["speaker_name"]
            # Files without a speaker name get the sentinel label -1.
            speaker_label = speaker_to_label.get(speaker_name, -1) if speaker_name else -1
            rows.append(
                {
                    "file_path": str(audio_file.relative_to(root)),
                    "speaker_label": speaker_label,
                    "speaker_name": speaker_name,
                    "gender": info.get("gender", "Unknown"),
                    "in_group": info.get("in_group", False),
                    "source": info.get("source", "folder_fallback"),
                }
            )
    # Passing the column list keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


# Build the label table, then print its first rows and the number of files found.
df = list_files(root=recordings_root)
print(df.head(n=5))
print(f"Total files: {len(df)}")

                               file_path  speaker_label speaker_name gender  \
0                     Aleksander\adi.wav             13          Adi      M   
1    Aleksander\Alexander-aleksander.mp3              0   Aleksander      M   
2             Aleksander\churchill-1.wav             10    Churchill      M   
3                     Aleksander\fdr.wav             14          FDR      M   
4  Aleksander\Gallic-Wars-Aleksander.mp3              0   Aleksander      M   

   in_group source  
0     False   yaml  
1      True   yaml  
2     False   yaml  
3     False   yaml  
4      True   yaml  
Total files: 107


In [83]:
# 4) Display summary counts per speaker
# Count files per (label, name) pair. dropna=False keeps the rows whose
# speaker_name is missing (label -1), so unlabeled files still show up in the
# summary instead of being dropped by groupby's default NA handling.
speaker_counts = (
    df.groupby(["speaker_label", "speaker_name"], dropna=False)
    .size()
    .reset_index(name="count")
)
display(speaker_counts)


Unnamed: 0,speaker_label,speaker_name,count
0,0,Aleksander,4
1,1,Szyc,1
2,2,AnnaAleksander,3
3,3,Pati,1
4,4,LenaW,1
5,5,GrianYT,3
6,6,Oversimplified,2
7,7,KryptydaYT,1
8,8,queenElisabeth,4
9,9,Thatcher,1


In [84]:
# 5) Export to CSV
# Write the label table into the base directory, leaving out the DataFrame index.
out_csv = base_dir.joinpath("file_labels.csv")
df.to_csv(path_or_buf=out_csv, index=False)
print(f"Saved: {out_csv}")

Saved: c:\Users\pczec\Desktop\Studia\SEM5\IML\IML-PW\file_labels.csv


In [85]:
# 6) Analyze files per person in class 1 (in_group=True)
# Keep the in-group rows only, then count files per speaker, largest count first.
class1_df = df[df["in_group"].eq(True)]
class1_counts = (
    class1_df.groupby(["speaker_label", "speaker_name"])
    .size()
    .reset_index(name="file_count")
    .sort_values("file_count", ascending=False)
)
print("Files per person in Class 1 (in_group=True):")
display(class1_counts)
print(f"Total files in Class 1: {len(class1_df)}")


Files per person in Class 1 (in_group=True):


Unnamed: 0,speaker_label,speaker_name,file_count
0,0,Aleksander,4
1,17,Mantas,4
2,25,Michal,3
3,31,Piotr,1
4,48,Rafal,1


Total files in Class 1: 13


In [86]:
# 8) Calculate recording duration (Parallelized for speed)
from joblib import Parallel, delayed
import librosa

def get_duration_fast(file_path):
    """Return the duration librosa reports for ``file_path``.

    The file is handed to ``librosa.get_duration`` by path, so no samples are
    loaded by this function itself. Any exception is reported with ``print``
    and replaced by ``0.0``, so one unreadable file does not stop the run.
    """
    try:
        return librosa.get_duration(path=file_path)
    except Exception as e:
        print(f"Error getting duration for {file_path}: {e}")
    # Reached only after a failure was reported above.
    return 0.0

# Resolve every listed relative path against the recordings root.
file_paths = [recordings_root / record.file_path for record in df.itertuples(index=False)]

# n_jobs=-1 lets joblib use every available core; results keep the input order.
print(f"Calculating durations for {len(file_paths)} files in parallel...")
durations = Parallel(n_jobs=-1, backend="loky", verbose=1)(
    delayed(get_duration_fast)(fp) for fp in file_paths
)

# Store the per-file durations on the table and report the grand total.
df["duration_seconds"] = durations
print(f"Total duration: {sum(durations) / 60:.2f} minutes")

Calculating durations for 107 files in parallel...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  84 out of 107 | elapsed:    1.6s remaining:    0.4s


Total duration: 495.34 minutes


[Parallel(n_jobs=-1)]: Done 107 out of 107 | elapsed:    3.1s finished


In [87]:
# 9) Duration per speaker in Class 1
# Work on a copy of the in-group rows and total their durations per speaker.
class1_duration = df[df["in_group"] == True].copy()
duration_by_speaker = (
    class1_duration.groupby(["speaker_label", "speaker_name"])["duration_seconds"]
    .sum()
    .reset_index()
    .rename(columns={"duration_seconds": "total_duration_seconds"})
    .assign(total_duration_minutes=lambda frame: frame["total_duration_seconds"] / 60)
    .sort_values("total_duration_seconds", ascending=False)
)

print("Recording duration per speaker in Class 1:")
display(duration_by_speaker[["speaker_label", "speaker_name", "total_duration_minutes"]])
print(f"\nTotal Class 1 duration: {duration_by_speaker['total_duration_seconds'].sum() / 60:.2f} minutes")

Recording duration per speaker in Class 1:


Unnamed: 0,speaker_label,speaker_name,total_duration_minutes
1,17,Mantas,22.73
3,31,Piotr,20.096667
2,25,Michal,13.876667
0,0,Aleksander,13.392436
4,48,Rafal,3.177229



Total Class 1 duration: 73.27 minutes


In [88]:
# 10) Duration per directory
from pathlib import Path

# The first component of each relative path is the folder the file sits in;
# the column is recomputed every time this cell runs.
df["directory"] = df["file_path"].apply(lambda rel: Path(rel).parts[0])

# Total the durations per folder, longest first, in seconds and minutes.
duration_by_dir = (
    df.groupby("directory")["duration_seconds"]
    .sum()
    .reset_index()
    .rename(columns={"duration_seconds": "total_duration_seconds"})
    .assign(total_duration_minutes=lambda frame: frame["total_duration_seconds"] / 60)
    .sort_values("total_duration_seconds", ascending=False)
)

print("Recording duration per directory:")
display(duration_by_dir)
print(f"\nTotal duration: {duration_by_dir['total_duration_seconds'].sum() / 60:.2f} minutes")

Recording duration per directory:


Unnamed: 0,directory,total_duration_seconds,total_duration_minutes
1,Mantas,7867.244898,131.120748
0,Aleksander,7586.250014,126.4375
3,Rafał,7229.873334,120.497889
2,Piotr,4677.972154,77.966203
4,michał,2359.3,39.321667



Total duration: 495.34 minutes
