In [1]:
from pathlib import Path
import os
import pandas as pd
import soundfile as sf


In [2]:
# All dataset paths are derived from the kernel's current working directory.
# The notebook is expected to be started from <project root>/notebooks/hls_cmds,
# so the project root sits two levels above the working directory.
CWD = Path().resolve()
PROJECT_ROOT = CWD.parents[1]   # notebooks/hls_cmds -> heart_and_lungsounds
RAW_ROOT = PROJECT_ROOT / "data" / "raw" / "hls_cmds"

# folders holding the heart-sound, lung-sound and mixed wav recordings
DIR_HS = RAW_ROOT / "HS"
DIR_LS = RAW_ROOT / "LS"
DIR_MIX = RAW_ROOT / "Mix"

# metadata tables that accompany each of the three folders
CSV_HS = RAW_ROOT / "HS.csv"
CSV_LS = RAW_ROOT / "LS.csv"
CSV_MIX = RAW_ROOT / "Mix.csv"


In [3]:
# Recording locations in the order used for every per-location table.
CHEST_ORDER = [
    "RUSB",
    "LUSB",
    "LLSB",
    "RC",
    "LC",
    "Apex",
    "RUA",
    "LUA",
    "RMA",
    "LMA",
    "RLA",
    "LLA",
]

# Heart-sound code (middle token of an HS filename) -> label used in HS.csv.
HS_MAP = dict(
    N="Normal",
    LDM="Late Diastolic Murmur",
    MSM="Mid Systolic Murmur",
    LSM="Late Systolic Murmur",
    AF="Atrial Fibrillation",
    S4="S4",
    ESM="Early Systolic Murmur",
    S3="S3",
    T="Tachycardia",
    AVB="AV Block",
)

# Lung-sound code (middle token of an LS filename) -> label used in LS.csv.
LS_MAP = dict(
    N="Normal",
    W="Wheezing",
    FC="Fine Crackles",
    R="Rhonchi",
    PR="Pleural Rub",
    CC="Coarse Crackles",
)

# Location code in a filename -> location label in the csv.
# Only "A" differs (the csv spells it "Apex"); every other code
# (RUSB, LUSB, LLSB, RC, LC, RUA, LUA, LMA, RMA, LLA, RLA, ...) is already
# identical between filename and csv, so it is passed through unmapped.
LOC_MAP = dict(A="Apex")


In [4]:
def audio_info(path: Path) -> dict:
    """
    read the header of one audio file and summarise it as a flat record.

    returns a dict with the keys file_name, samplerate, duration_sec,
    channels and path; duration_sec is the frame count divided by the
    sample rate reported by soundfile.
    """
    header = sf.info(str(path))
    rate = header.samplerate
    n_frames = header.frames
    return dict(
        file_name=path.name,
        samplerate=rate,
        duration_sec=n_frames / rate,
        channels=header.channels,
        path=str(path),
    )


def scan_folder(folder: Path) -> pd.DataFrame:
    """
    collect audio_info() for every *.wav directly inside `folder`.

    files are visited in sorted path order, one row per file. when the folder
    contains no wav files the result is still an empty frame that carries the
    usual columns, so callers can index df["file_name"] without a KeyError.
    """
    rows = [audio_info(p) for p in sorted(folder.glob("*.wav"))]
    if not rows:
        # a bare pd.DataFrame([]) has no columns at all, which breaks callers
        # that select or merge on "file_name"; keep the schema stable instead
        return pd.DataFrame(
            columns=["file_name", "samplerate", "duration_sec", "channels", "path"]
        )
    return pd.DataFrame(rows)


def parse_hs_ls_filename(fname: str) -> dict:
    """
    split an HS/LS file name into its three underscore-separated codes.

    format:
        gender_soundtype_location.wav
    example:
        F_AF_A.wav   -> gender_code "F", sound_code "AF", loc_code "A"
        F_CC_LLA.wav -> gender_code "F", sound_code "CC", loc_code "LLA"

    returns a dict with the keys gender_code, sound_code and loc_code.
    all three are None when the name does not have exactly three parts.
    """
    # drop only a trailing ".wav" (any letter case); str.replace would also
    # remove ".wav" from the middle of a name and would miss ".WAV"
    suffix = ".wav"
    base = fname[: -len(suffix)] if fname.lower().endswith(suffix) else fname

    parts = base.split("_")
    if len(parts) != 3:
        return {"gender_code": None, "sound_code": None, "loc_code": None}
    gender, sound_code, loc_code = parts
    return {"gender_code": gender, "sound_code": sound_code, "loc_code": loc_code}


In [5]:
def clean_strings(df: pd.DataFrame) -> pd.DataFrame:
    """
    return a copy of `df` with whitespace trimmed from the column names and
    from every object-dtype column.

    each object column is cast with astype(str) before trimming, so values
    that are not strings end up as their text form.
    NOTE(review): depending on the pandas version this cast can turn missing
    values into literal text such as "nan" - confirm this is intended.
    """
    trimmed_names = {name: name.strip() for name in df.columns}
    cleaned = df.rename(columns=trimmed_names)

    text_columns = cleaned.select_dtypes(include="object").columns
    for name in text_columns:
        as_text = cleaned[name].astype(str)
        cleaned[name] = as_text.str.strip()
    return cleaned


In [6]:
def load_hs_ls_meta(path: Path, kind: str) -> pd.DataFrame:
    """
    read HS.csv or LS.csv and add a 'file_name' column.

    kind = 'HS' or 'LS'
    hs.csv: gender, heart sound type, location, heart sound id
    ls.csv: gender, lung sound type, location, lung sound id

    the csv is passed through clean_strings(), file_name is built as
    "<sound id>.wav", and a warning is printed for every Location value that
    is not listed in CHEST_ORDER.

    raises ValueError for any other `kind` (only after the csv has been read).
    """
    meta = clean_strings(pd.read_csv(path))

    if kind not in ("HS", "LS"):
        raise ValueError("kind must be 'HS' or 'LS'.")
    id_col = "Heart Sound ID" if kind == "HS" else "Lung Sound ID"

    meta["file_name"] = meta[id_col].astype(str) + ".wav"
    meta["Location"] = meta["Location"].str.strip()

    # report locations that the per-location tables would silently drop
    known = set(CHEST_ORDER)
    unknown = sorted(loc for loc in set(meta["Location"]) if loc not in known)
    if unknown:
        print(f"[warn] unknown locations in {path.name}: {unknown}")

    return meta


In [7]:
def load_mix_meta(path: Path) -> pd.DataFrame:
    """
    read mix.csv and make sure the result has a trimmed 'file_name' column.

    expected columns include e.g.:
    ['Gender', 'Heart Sound Type', 'Lung Sound Type',
     'Location', 'Heart Sound ID', 'Lung Sound ID', 'Mixed Sound ID']

    whitespace is stripped from the column names, from 'file_name' and, when
    present, from 'Location'. 'file_name' is resolved in this order:
      1. an existing 'file_name' column is used as is
      2. the first column whose lower-cased, space-free name is
         'filename' or 'file_name' is renamed to 'file_name'
      3. otherwise it is derived as "<Mixed Sound ID>.wav"

    raises ValueError when none of the three options applies.
    """
    df = pd.read_csv(path)
    df.columns = [name.strip() for name in df.columns]

    if "file_name" not in df.columns:
        aliases = [
            name
            for name in df.columns
            if name.lower().replace(" ", "") in {"filename", "file_name"}
        ]
        if aliases:
            # some other spelling of the file-name column exists: adopt it
            df = df.rename(columns={aliases[0]: "file_name"})
        elif "Mixed Sound ID" in df.columns:
            # no explicit file-name column: build it from the mixed sound id
            mixed_ids = df["Mixed Sound ID"].astype(str).str.strip()
            df["file_name"] = mixed_ids + ".wav"
        else:
            raise ValueError(
                f"could not find a file-name column in {path} "
                f"(columns={df.columns.tolist()})"
            )

    # normalise the columns the comparisons rely on
    for column in ("file_name", "Location"):
        if column in df.columns:
            df[column] = df[column].astype(str).str.strip()

    return df


In [8]:
def build_from_filenames_hs_ls(dir_path: Path, kind: str) -> "tuple[pd.DataFrame, str]":
    """
    scan an HS or LS folder and decode the metadata embedded in the file names.

    kind == 'HS' selects HS_MAP / 'Heart Sound Type'; any other value selects
    LS_MAP / 'Lung Sound Type'.

    returns (frame, type_col_name):
      frame          one row per wav file: the scan_folder() columns plus
                     gender_code, sound_code, loc_code, sound_full, loc_full
      type_col_name  name of the csv column that sound_full is compared with

    the same 2-tuple is returned when the folder has no wav files (with an
    empty frame that still has all columns), so callers can always unpack it.
    """
    if kind == "HS":
        sound_map, type_col_name = HS_MAP, "Heart Sound Type"
    else:
        sound_map, type_col_name = LS_MAP, "Lung Sound Type"

    derived_cols = ["gender_code", "sound_code", "loc_code", "sound_full", "loc_full"]

    df = scan_folder(dir_path)
    if df.empty:
        print(f"[warn] no wav files found in {dir_path}")
        # keep whatever columns the scan produced; fall back to the usual ones
        columns = list(df.columns) or [
            "file_name", "samplerate", "duration_sec", "channels", "path",
        ]
        columns += [c for c in derived_cols if c not in columns]
        return pd.DataFrame(columns=columns), type_col_name

    # one dict per file -> one frame, aligned on the scan's index
    parsed = pd.DataFrame(
        [parse_hs_ls_filename(name) for name in df["file_name"]],
        index=df.index,
    )
    df = pd.concat([df, parsed], axis=1)

    # human-readable sound type; codes missing from the map become NaN
    df["sound_full"] = df["sound_code"].map(sound_map)
    # location codes without an entry in LOC_MAP are kept unchanged
    df["loc_full"] = df["loc_code"].map(LOC_MAP).fillna(df["loc_code"])

    return df, type_col_name


In [9]:
def compare_hs_ls(kind: str):
    """
    cross-check the HS or LS wav folder against its csv and print a report.

    kind == 'HS' uses DIR_HS / CSV_HS; any other value uses DIR_LS / CSV_LS.

    the report covers, in this order:
      - wav files that have no csv row, and csv rows that have no wav file
      - files present on both sides whose gender, sound type or location
        decoded from the file name differs from the csv
      - number of distinct files per location, file names vs csv,
        restricted to the locations in CHEST_ORDER

    nothing is returned; results are printed / displayed.
    """
    dir_path, csv_path = (DIR_HS, CSV_HS) if kind == "HS" else (DIR_LS, CSV_LS)

    print(f"\n==== {kind}: filename vs csv ====")
    df_files, type_col = build_from_filenames_hs_ls(dir_path, kind)
    meta = load_hs_ls_meta(csv_path, kind=kind)

    # outer join so that rows missing on either side stay visible
    merged = df_files.merge(meta, on="file_name", how="outer", indicator=True)
    origin = merged["_merge"]

    wav_without_row = merged[origin == "left_only"]
    row_without_wav = merged[origin == "right_only"]

    if not wav_without_row.empty:
        print(f"[error] {kind}: {len(wav_without_row)} wav files not found in {csv_path.name}")
        display(wav_without_row[["file_name", "path"]])
    if not row_without_wav.empty:
        print(f"[error] {kind}: {len(row_without_wav)} rows in csv without matching wav")
        display(row_without_wav[["file_name", type_col, "Location"]])

    # field-by-field comparison for files present on both sides
    both = merged[origin == "both"]
    # the filename carries a one-letter gender code: compare it with the
    # first character of the csv value
    gender_differs = both["gender_code"] != both["Gender"].str[0]
    sound_differs = both["sound_full"] != both[type_col]
    location_differs = both["loc_full"] != both["Location"]
    any_differs = gender_differs | sound_differs | location_differs

    if any_differs.any():
        print(f"[warn] {kind}: metadata mismatches found.")
        report_columns = [
            "file_name",
            "gender_code", "Gender",
            "sound_code", "sound_full", type_col,
            "loc_code", "loc_full", "Location",
        ]
        display(both.loc[any_differs, report_columns])
    else:
        print(f"[ok] {kind}: all filename metadata matches {csv_path.name}")

    # distinct files per location, once from the file names and once from the csv
    per_location = {
        "from_filename": df_files.groupby("loc_full")["file_name"].nunique(),
        "from_csv": meta.groupby("Location")["file_name"].nunique(),
    }
    counts_table = pd.concat(
        list(per_location.values()), axis=1, keys=list(per_location.keys())
    )
    counts_table = counts_table.reindex(CHEST_ORDER).fillna(0).astype(int)

    print(f"\n{kind}: counts per location (filename vs csv)")
    print(counts_table)


In [10]:
def compare_mix():
    """
    cross-check the mixed recordings in DIR_MIX against mix.csv and print a report.

    only wav files whose name starts with "M" are compared. the report lists
    files missing from mix.csv, csv rows without a wav file, and the number of
    csv rows per location in CHEST_ORDER (0 for locations that do not occur).

    an empty folder and a csv without a 'Location' column are reported with a
    warning instead of raising. nothing is returned.
    """
    # scan mix folder
    df_mix_files = scan_folder(DIR_MIX)

    # keep only mixed recordings (M...)
    if df_mix_files.empty or "file_name" not in df_mix_files.columns:
        print(f"[warn] no wav files found in {DIR_MIX}")
        files_set = set()
    else:
        file_names = df_mix_files["file_name"].astype(str)
        files_set = set(file_names[file_names.str.startswith("M")])

    # load mix csv (creates file_name from mixed sound id if necessary)
    meta_mix = load_mix_meta(CSV_MIX)
    csv_set = set(meta_mix["file_name"])

    only_in_files = sorted(files_set - csv_set)
    only_in_csv = sorted(csv_set - files_set)

    print("==== mix: m-files vs mix.csv ====")
    if only_in_files:
        print("[error] mix: m wav files not found in mix.csv:")
        print(pd.DataFrame({"file_name": only_in_files}))
    else:
        print("[ok] mix: all m wav files are present in mix.csv")

    if only_in_csv:
        print("[error] mix: rows in mix.csv without m wav:")
        print(pd.DataFrame({"file_name": only_in_csv}))
    else:
        print("[ok] mix: mix.csv has no extra rows without m wav")

    # counts per location from mix.csv; load_mix_meta treats the column as optional
    if "Location" not in meta_mix.columns:
        print("[warn] mix: mix.csv has no 'Location' column, skipping location counts")
        return

    loc_counts = (
        meta_mix["Location"]
        .value_counts()
        .rename_axis("Location")
        .to_frame("count")
        # shared location order; absent locations show as 0 instead of NaN
        .reindex(CHEST_ORDER, fill_value=0)
    )
    print("\nMix: counts per location (from mix.csv)")
    print(loc_counts)


In [11]:
# run all three consistency reports: heart sounds, lung sounds, then mixes
if __name__ == "__main__":
    for sound_kind in ("HS", "LS"):
        compare_hs_ls(sound_kind)
    compare_mix()



==== HS: filename vs csv ====
[ok] HS: all filename metadata matches HS.csv

HS: counts per location (filename vs csv)
      from_filename  from_csv
RUSB              7         7
LUSB             13        13
LLSB             10        10
RC                4         4
LC                6         6
Apex             10        10
RUA               0         0
LUA               0         0
RMA               0         0
LMA               0         0
RLA               0         0
LLA               0         0

==== LS: filename vs csv ====
[ok] LS: all filename metadata matches LS.csv

LS: counts per location (filename vs csv)
      from_filename  from_csv
RUSB              0         0
LUSB              0         0
LLSB              0         0
RC                0         0
LC                0         0
Apex              0         0
RUA               7         7
LUA              11        11
RMA               5         5
LMA               9         9
RLA              10        10
LLA       