In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf

import sys, os
# Make the parent of the current working directory importable, adding it to
# sys.path only when it is not already listed.
# NOTE(review): this is ONE level above the working directory, whereas
# PROJECT_ROOT further down is TWO levels above — confirm which is intended.
_parent_of_cwd = os.path.join(os.getcwd(), "../")
project_root = os.path.abspath(_parent_of_cwd)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Resolve every dataset location relative to the notebook's working directory.
CWD = Path().resolve()
# Two levels up (expected layout: notebooks/hls_cmds → heart_and_lungsounds).
PROJECT_ROOT = CWD.parents[1]
RAW_ROOT = PROJECT_ROOT.joinpath("data", "raw", "hls_cmds")

# Audio folders, one per recording category.
DIR_HS = RAW_ROOT.joinpath("HS")
DIR_LS = RAW_ROOT.joinpath("LS")
DIR_MIX = RAW_ROOT.joinpath("Mix")

# Metadata tables stored next to the audio folders.
CSV_HS = RAW_ROOT.joinpath("HS.csv")
CSV_LS = RAW_ROOT.joinpath("LS.csv")
CSV_MIX = RAW_ROOT.joinpath("Mix.csv")

# Echo the resolved locations as a quick visual sanity check.
for location in (DIR_HS, DIR_LS, DIR_MIX, CSV_HS, CSV_LS, CSV_MIX):
    print(location)

# Project-local plotting helpers.
# NOTE(review): presumably importable thanks to the sys.path entry added
# earlier in this cell — confirm where the `src` package actually lives.
from src.plot_style import set_default_style
from src.plots import histplot, countplot

# NOTE(review): presumably applies the project-wide plot styling before any
# figure is drawn — confirm in src/plot_style.
set_default_style()

C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\HS
C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS
C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\Mix
C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\HS.csv
C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS.csv
C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\Mix.csv


In [2]:
def audio_info(path: Path) -> dict:
    """Read header-level metadata for one audio file.

    Parameters
    ----------
    path : Path
        Location of the audio file; passed to ``sf.info`` as a string.

    Returns
    -------
    dict
        ``filename`` (final path component), ``samplerate``, ``duration_sec``
        (frame count divided by sample rate), ``channels`` and ``path``
        (string form of ``path``).
    """
    path_text = str(path)
    header = sf.info(path_text)

    record = {"filename": path.name}
    record["samplerate"] = header.samplerate
    record["duration_sec"] = header.frames / header.samplerate
    record["channels"] = header.channels
    record["path"] = path_text
    return record

def scan_folder(folder: Path) -> pd.DataFrame:
    """Collect audio metadata for every ``*.wav`` file directly inside ``folder``.

    Parameters
    ----------
    folder : Path
        Directory to scan (not recursive).

    Returns
    -------
    pd.DataFrame
        One row per file, in sorted path order, with the columns produced by
        ``audio_info``. When no file matches, an empty frame that still has
        the columns ``filename``, ``samplerate``, ``duration_sec``,
        ``channels`` and ``path`` is returned, so later column lookups such
        as ``["filename"]`` do not raise ``KeyError``.
    """
    rows = [audio_info(wav_path) for wav_path in sorted(folder.glob("*.wav"))]
    if not rows:
        # pd.DataFrame([]) has no columns at all; keep the schema explicit.
        return pd.DataFrame(
            columns=["filename", "samplerate", "duration_sec", "channels", "path"]
        )
    return pd.DataFrame(rows)

def parse_hs_ls_filename(fname: str) -> dict:
    """Split an HS/LS file name into gender, sound code and location code.

    Format:
        Gender_SoundType_Location.wav
    Example:
        F_AF_A.wav
        F_CC_LLA.wav

    Parameters
    ----------
    fname : str
        File name, with or without a trailing ``.wav`` (any letter case).

    Returns
    -------
    dict
        Keys ``gender``, ``sound_code`` and ``loc_code``. All three are
        ``None`` when the name does not consist of exactly three
        underscore-separated parts.
    """
    # Strip only a trailing ".wav", ignoring case. str.replace would also
    # remove ".wav" from the middle of a name and would leave ".WAV" in place,
    # which ends up inside loc_code.
    base = fname[:-4] if fname[-4:].lower() == ".wav" else fname
    parts = base.split("_")
    if len(parts) != 3:
        return {"gender": None, "sound_code": None, "loc_code": None}
    gender, sound_code, loc_code = parts
    return {"gender": gender, "sound_code": sound_code, "loc_code": loc_code}

def parse_mix_filename(fname: str) -> dict:
    """Split a Mix file name into its one-letter prefix and numeric id.

    Format:
        M0002.wav, H0002.wav, L0002.wav

    Parameters
    ----------
    fname : str
        File name, with or without a trailing ``.wav`` (any letter case).

    Returns
    -------
    dict
        ``prefix`` is the first character of the stem (H / L / M in the
        documented format) and ``id`` is the remainder as an ``int``.
        ``id`` is ``None`` when the remainder is not made of ASCII digits;
        both values are ``None`` when the stem is empty.
    """
    # Strip only a trailing ".wav", ignoring case (str.replace would also hit
    # ".wav" elsewhere in the name and miss ".WAV").
    base = fname[:-4] if fname[-4:].lower() == ".wav" else fname
    if not base:
        # Nothing left to index into; base[0] would raise IndexError.
        return {"prefix": None, "id": None}

    prefix, num = base[0], base[1:]
    # str.isdigit() alone accepts characters such as "²" that int() rejects.
    is_plain_number = num.isascii() and num.isdigit()
    return {"prefix": prefix, "id": int(num) if is_plain_number else None}

In [3]:
# Heart-sound code used in the file names -> full label used for the CSV join.
HS_MAP = dict(
    [
        ("N", "Normal"),
        ("LDM", "Late Diastolic Murmur"),
        ("MSM", "Mid Systolic Murmur"),
        ("LSM", "Late Systolic Murmur"),
        ("AF", "Atrial Fibrillation"),
        ("S4", "S4"),
        ("ESM", "Early Systolic Murmur"),
        ("S3", "S3"),
        ("T", "Tachycardia"),
        ("AVB", "AV Block"),
    ]
)

# Lung-sound code used in the file names -> full label used for the CSV join.
LS_MAP = dict(
    [
        ("N", "Normal"),
        ("W", "Wheezing"),
        ("FC", "Fine Crackles"),
        ("R", "Rhonchi"),
        ("PR", "Pleural Rub"),
        ("CC", "Coarse Crackles"),
    ]
)

# Location codes whose file-name spelling differs from the CSV spelling.
# "A" in the file names corresponds to "Apex" in the CSV. All other codes
# (RUSB, LUSB, LLSB, RC, LC, RUA, LUA, LMA, RMA, LLA, RLA, ...) are already
# identical between file name and CSV and therefore need no entry here.
LOC_MAP = {"A": "Apex"}

In [4]:
# Load the three metadata tables with the "utf-8-sig" codec, which tolerates
# a leading byte-order mark.
hs_csv, ls_csv, mix_csv = (
    pd.read_csv(csv_path, encoding="utf-8-sig")
    for csv_path in (CSV_HS, CSV_LS, CSV_MIX)
)

# Show which columns each table provides.
for heading, table in (
    ("HS.csv columns :", hs_csv),
    ("LS.csv columns :", ls_csv),
    ("Mix.csv columns:", mix_csv),
):
    print(heading, table.columns.tolist())


HS.csv columns : ['Gender', 'Heart Sound Type', 'Location', 'Heart Sound ID']
LS.csv columns : ['Gender', 'Lung Sound Type', 'Location', 'Lung Sound ID']
Mix.csv columns: ['Gender', 'Heart Sound Type', 'Lung Sound Type', 'Location', 'Heart Sound ID', 'Lung Sound ID', 'Mixed Sound ID']


In [5]:
# Gather per-file audio metadata for each category folder.
hs_audio, ls_audio, mix_audio = (
    scan_folder(audio_dir) for audio_dir in (DIR_HS, DIR_LS, DIR_MIX)
)

# Numeric join key: the first run of digits found in "Mixed Sound ID".
mixed_id_text = mix_csv["Mixed Sound ID"].astype(str)
mix_csv["mix_id"] = mixed_id_text.str.extract(r"(\d+)", expand=False).astype(int)

# Report how many audio files were found per category.
for heading, audio_table in (
    ("HS audio files :", hs_audio),
    ("LS audio files :", ls_audio),
    ("Mix audio files:", mix_audio),
):
    print(heading, len(audio_table))

HS audio files : 50
LS audio files : 50
Mix audio files: 435


In [6]:
def _with_parsed_columns(audio: pd.DataFrame, parser) -> pd.DataFrame:
    """Return ``audio`` extended with the fields ``parser`` extracts per file name.

    ``parser`` is called once for every value in ``audio["filename"]`` and
    must return a dict; the dict keys become the new column names.
    """
    # Build the parsed table in one go instead of Series.apply(pd.Series),
    # which creates one Series object per row.
    parsed = pd.DataFrame(
        [parser(name) for name in audio["filename"]],
        index=audio.index,
    )
    # Drop columns left over from an earlier run of this cell; without this a
    # second execution raises ValueError (overlapping columns) in join().
    return audio.drop(columns=parsed.columns, errors="ignore").join(parsed)


# parse filenames for HS/LS (adds gender / sound_code / loc_code)
hs_audio = _with_parsed_columns(hs_audio, parse_hs_ls_filename)
ls_audio = _with_parsed_columns(ls_audio, parse_hs_ls_filename)

# parse filenames for Mix (adds prefix / id)
mix_audio = _with_parsed_columns(mix_audio, parse_mix_filename)

In [7]:
# Translate the short codes parsed from the file names into the spellings used
# by the CSV metadata. Sound codes missing from the map become NaN; location
# codes missing from LOC_MAP are kept unchanged.
for audio_table, sound_map in ((hs_audio, HS_MAP), (ls_audio, LS_MAP)):
    audio_table["sound_full"] = audio_table["sound_code"].map(sound_map)
    audio_table["loc_full"] = audio_table["loc_code"].map(
        lambda code: LOC_MAP.get(code, code)
    )

In [8]:
# Heart sounds: attach the CSV metadata to every audio file.
# The join keys pair up as follows:
#   gender     <-> Gender            (F/M)
#   sound_full <-> Heart Sound Type  (e.g. "Atrial Fibrillation")
#   loc_full   <-> Location          (e.g. "Apex", "RUSB", "LLSB", ...)
# how="left" keeps every audio row, including rows without a CSV match.
df_hs = pd.merge(
    hs_audio,
    hs_csv,
    how="left",
    left_on=["gender", "sound_full", "loc_full"],
    right_on=["Gender", "Heart Sound Type", "Location"],
)

In [9]:
# Lung sounds: attach the CSV metadata to every audio file.
# Left side keys (from the file names) and right side keys (from LS.csv),
# matched position by position; sound_full is e.g. "Coarse Crackles".
ls_audio_keys = ["gender", "sound_full", "loc_full"]
ls_csv_keys = ["Gender", "Lung Sound Type", "Location"]

# how="left" keeps every audio row, including rows without a CSV match.
df_ls = pd.merge(
    ls_audio,
    ls_csv,
    how="left",
    left_on=ls_audio_keys,
    right_on=ls_csv_keys,
)


In [10]:
# Mix recordings: attach the CSV metadata via the numeric id.
#   mix_audio["id"]     is e.g. 1, 2, 3 ..., derived from H0001 / M0001 / L0001
#   mix_csv["mix_id"]   is the number extracted from "Mixed Sound ID"
# how="left" keeps every audio row, including rows without a CSV match.
df_mix = pd.merge(
    mix_audio,
    mix_csv,
    how="left",
    left_on="id",
    right_on="mix_id",
)

In [11]:
# Tag every merged table with the category it belongs to.
for merged_table, category_label in ((df_hs, "HS"), (df_ls, "LS"), (df_mix, "Mix")):
    merged_table["category"] = category_label

# Row counts after the merges.
print("Rows HS :", df_hs.shape[0])
print("Rows LS :", df_ls.shape[0])
print("Rows Mix:", df_mix.shape[0])

# Quick check of metadata coverage: rows whose ID column is empty found no
# matching CSV row in the left join.
print("\nHS – missing Heart Sound ID:", df_hs["Heart Sound ID"].isnull().sum())
print("LS – missing Lung  Sound ID:", df_ls["Lung Sound ID"].isnull().sum())
print("Mix – missing Mixed Sound ID:", df_mix["Mixed Sound ID"].isnull().sum())

Rows HS : 50
Rows LS : 50
Rows Mix: 435

HS – missing Heart Sound ID: 0
LS – missing Lung  Sound ID: 0
Mix – missing Mixed Sound ID: 0


In [16]:
# Format overview of the heart-sound recordings: row count, then how many
# files use each sample rate and each channel count.
print("Total HS recordings:", df_hs.shape[0])
for heading, column in (("Samplerates:", "samplerate"), ("Channels:", "channels")):
    print(heading, df_hs[column].value_counts().to_dict())

Total HS recordings: 50
Samplerates: {4000: 50}
Channels: {1: 50}


In [17]:
# Summary statistics of the heart-sound clip lengths, in seconds.
hs_duration_stats = df_hs["duration_sec"].describe()
print("\nDuration [s]:", hs_duration_stats, sep="\n")


Duration [s]:
count    50.0
mean     15.0
std       0.0
min      15.0
25%      15.0
50%      15.0
75%      15.0
max      15.0
Name: duration_sec, dtype: float64


In [None]:
# Number of heart-sound recordings per sound code taken from the file names.
hs_sound_counts = df_hs["sound_code"].value_counts()
print("\nSound types (from filename):", hs_sound_counts.to_dict(), sep="\n")


Sound types (from filename):
{'N': 9, 'MSM': 7, 'LDM': 6, 'ESM': 6, 'S3': 5, 'LSM': 5, 'AF': 4, 'T': 3, 'AVB': 3, 'S4': 2}


In [19]:
# Number of heart-sound recordings per location code taken from the file names.
hs_location_counts = df_hs["loc_code"].value_counts()
print("\nLocations (from filename):", hs_location_counts.to_dict(), sep="\n")


Locations (from filename):
{'LUSB': 13, 'A': 10, 'LLSB': 10, 'RUSB': 7, 'LC': 6, 'RC': 4}


In [None]:
# Every heart-sound row should carry an ID from HS.csv; count the rows that
# do not.
hs_id_is_missing = df_hs["Heart Sound ID"].isnull()
missing_hsid = hs_id_is_missing.sum()
print(f"\nMissing Heart Sound ID values: {missing_hsid}")


Missing Heart Sound ID values: 0


In [None]:
# Format overview of the lung-sound recordings: row count, then how many
# files use each sample rate and each channel count.
print("Total LS recordings:", df_ls.shape[0])
for heading, column in (("Samplerates:", "samplerate"), ("Channels:", "channels")):
    print(heading, df_ls[column].value_counts().to_dict())

Total LS recordings: 50
Samplerates: {4000: 50}
Channels: {1: 50}


In [23]:
# Summary statistics of the lung-sound clip lengths, in seconds.
ls_duration_stats = df_ls["duration_sec"].describe()
print("\nDuration [s]:", ls_duration_stats, sep="\n")


Duration [s]:
count    50.0
mean     15.0
std       0.0
min      15.0
25%      15.0
50%      15.0
75%      15.0
max      15.0
Name: duration_sec, dtype: float64


In [24]:
# Number of lung-sound recordings per sound code taken from the file names.
ls_sound_counts = df_ls["sound_code"].value_counts()
print("\nSound types (from filename):", ls_sound_counts.to_dict(), sep="\n")


Sound types (from filename):
{'N': 12, 'CC': 9, 'PR': 9, 'R': 8, 'W': 7, 'FC': 5}


In [25]:
# Number of lung-sound recordings per location code taken from the file names.
ls_location_counts = df_ls["loc_code"].value_counts()
print("\nLocations (from filename):", ls_location_counts.to_dict(), sep="\n")


Locations (from filename):
{'LUA': 11, 'RLA': 10, 'LMA': 9, 'LLA': 8, 'RUA': 7, 'RMA': 5}


In [26]:
# Every lung-sound row should carry an ID from LS.csv; count the rows that
# do not.
ls_id_is_missing = df_ls["Lung Sound ID"].isnull()
missing_lsid = ls_id_is_missing.sum()
print(f"\nMissing Lung Sound ID values: {missing_lsid}")


Missing Lung Sound ID values: 0


In [None]:
# Size of the mix set and how its files split over the H / L / M prefixes.
mix_total = df_mix.shape[0]
mix_prefix_counts = df_mix["prefix"].value_counts().to_dict()
print("Total Mix recordings:", mix_total)
print("By prefix (H/L/M):", mix_prefix_counts)

Total Mix recordings: 435
By prefix (H/L/M): {'H': 145, 'L': 145, 'M': 145}


In [28]:
# How many mix files use each sample rate and each channel count.
for heading, column in (("\nSamplerates:", "samplerate"), ("Channels:", "channels")):
    print(heading, df_mix[column].value_counts().to_dict())


Samplerates: {4000: 435}
Channels: {1: 435}


In [29]:
# Summary statistics of the mix clip lengths, in seconds.
mix_duration_stats = df_mix["duration_sec"].describe()
print("\nDuration [s]:", mix_duration_stats, sep="\n")


Duration [s]:
count    435.0
mean      15.0
std        0.0
min       15.0
25%       15.0
50%       15.0
75%       15.0
max       15.0
Name: duration_sec, dtype: float64


In [None]:
# Label distribution of the mix recordings according to the CSV metadata.
# Each heading is always printed; the counts (NaN included) follow only when
# the column is present in df_mix.
for heading, column in (
    ("\nHeart Sound Types (from CSV):", "Heart Sound Type"),
    ("\nLung Sound Types (from CSV):", "Lung Sound Type"),
):
    print(heading)
    if column in df_mix.columns:
        print(df_mix[column].value_counts(dropna=False).to_dict())

# Rows without a Mixed Sound ID found no CSV match in the left join.
mixed_id_is_missing = df_mix["Mixed Sound ID"].isnull()
missing_mixed = mixed_id_is_missing.sum()
print(f"\nMissing Mixed Sound ID values: {missing_mixed}")


Heart Sound Types (from CSV):
{'Late Systolic Murmur': 51, 'Tachycardia': 48, 'S4': 48, 'S3': 45, 'Atrial Fibrillation': 45, 'Mid Systolic Murmur': 42, 'Early Systolic Murmur': 39, 'AV Block': 39, 'Late Diastolic Murmur': 39, 'Normal': 39}

Lung Sound Types (from CSV):
{'Normal': 84, 'Wheezing': 84, 'Pleural Rub': 75, 'Rhonchi': 69, 'Fine Crackles': 66, 'Coarse Crackles': 57}

Missing Mixed Sound ID values: 0


In [31]:
# Sanity checks on the mix set: 145 files for each of the H / L / M prefixes.
prefix_counts = df_mix["prefix"].value_counts()
for prefix, failure_message in (
    ("H", "Mix: Expected 145 H files"),
    ("L", "Mix: Expected 145 L files"),
    ("M", "Mix: Expected 145 M files"),
):
    assert prefix_counts.get(prefix, 0) == 145, failure_message

# Every clip must have a positive length.
all_durations_positive = df_mix["duration_sec"].gt(0).all()
assert all_durations_positive, "Mix: non-positive durations"

# NOTE(review): this allows up to 5 distinct sample rates although the summary
# above shows a single one (4000) — confirm the bound is intentional.
distinct_samplerates = df_mix["samplerate"].nunique()
assert distinct_samplerates <= 5, "Mix: unexpected number of samplerates"