In [19]:
from pathlib import Path
import pandas as pd
import numpy as np
import soundfile as sf

In [20]:
# Resolve every input location relative to the project root so that no
# absolute, machine-specific path is hard-coded in the notebook.
CWD = Path().resolve()
# The notebook is expected to run from notebooks/hls_cmds, which puts the
# project root (heart_and_lungsounds) two directory levels above the cwd.
PROJECT_ROOT = CWD.parents[1]   # notebooks/hls_cmds -> heart_and_lungsounds
RAW_ROOT = PROJECT_ROOT / "data" / "raw" / "hls_cmds"

# One folder of wav files per category ...
DIR_HS  = RAW_ROOT / "HS"
DIR_LS  = RAW_ROOT / "LS"
DIR_MIX = RAW_ROOT / "Mix"

# ... and one metadata sheet per category.
CSV_HS  = RAW_ROOT / "HS.csv"
CSV_LS  = RAW_ROOT / "LS.csv"
CSV_MIX = RAW_ROOT / "Mix.csv"

# A kernel started from a different working directory produces paths that do
# not exist; globbing such a folder just returns nothing, so the failure would
# only surface much later. Flag it here, where the cause is obvious.
if not RAW_ROOT.is_dir():
    print("[warn] raw data folder not found (check the kernel working directory):", RAW_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_ROOT    :", RAW_ROOT)
print("DIR_HS      :", DIR_HS)
print("DIR_LS      :", DIR_LS)
print("DIR_MIX     :", DIR_MIX)
print("CSV_HS      :", CSV_HS)
print("CSV_LS      :", CSV_LS)
print("CSV_MIX     :", CSV_MIX)

PROJECT_ROOT: C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds
RAW_ROOT    : C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds
DIR_HS      : C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\HS
DIR_LS      : C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS
DIR_MIX     : C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\Mix
CSV_HS      : C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\HS.csv
CSV_LS      : C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS.csv
CSV_MIX     : C:\Users\MSI\D

In [21]:
# Recording locations in the row order used for the per-location count tables.
# The same list is the reference set for the "Location" sanity checks that are
# run on the HS / LS CSV files.
CHEST_ORDER = [
    "RUSB", "LUSB", "LLSB", "RC", "LC", "Apex",
    "RUA", "LUA", "RMA", "LMA", "RLA", "LLA",
]

# Sound code embedded in an HS file name -> label that is compared against the
# "Heart Sound Type" column of HS.csv.
HS_MAP = {
    "N":   "Normal",
    "LDM": "Late Diastolic Murmur",
    "MSM": "Mid Systolic Murmur",
    "LSM": "Late Systolic Murmur",
    "AF":  "Atrial Fibrillation",
    "S4":  "S4",
    "ESM": "Early Systolic Murmur",
    "S3":  "S3",
    "T":   "Tachycardia",
    "AVB": "AV Block",
}

# Sound code embedded in an LS file name -> label that is compared against the
# "Lung Sound Type" column of LS.csv.
LS_MAP = {
    "N":  "Normal",
    "W":  "Wheezing",
    "FC": "Fine Crackles",
    "R":  "Rhonchi",
    "PR": "Pleural Rub",
    "CC": "Coarse Crackles",
}

# Location codes whose file-name spelling differs from the CSV spelling.
# Codes that are not listed here are used unchanged by the cells below.
LOC_MAP = {"A": "Apex"}  # 'A' in file names corresponds to "Apex" in the CSV


In [22]:
def audio_info(path: Path) -> dict:
    """Describe one audio file using the header data from ``sf.info``.

    Args:
        path: Location of the audio file.

    Returns:
        A dict with the keys ``file_name``, ``samplerate``, ``duration_sec``
        (frame count divided by sample rate), ``channels`` and ``path``
        (the full path as a string).
    """
    header = sf.info(str(path))
    seconds = header.frames / header.samplerate
    return dict(
        file_name=path.name,
        samplerate=header.samplerate,
        duration_sec=seconds,
        channels=header.channels,
        path=str(path),
    )

def scan_folder(folder: Path) -> pd.DataFrame:
    """Collect audio metadata for every ``*.wav`` file directly inside ``folder``.

    Files are visited in sorted path order and each one is described with
    ``audio_info``.

    Args:
        folder: Directory to scan (not searched recursively).

    Returns:
        One row per wav file. When nothing matches (empty or non-existent
        folder) the result is an empty frame that still carries the expected
        columns, so a later ``df["file_name"]`` does not fail with a
        ``KeyError`` far away from the actual cause.
    """
    rows = [audio_info(p) for p in sorted(folder.glob("*.wav"))]
    if not rows:
        # Column names mirror the keys of the dict returned by audio_info.
        return pd.DataFrame(
            columns=["file_name", "samplerate", "duration_sec", "channels", "path"]
        )
    return pd.DataFrame(rows)

def parse_hs_ls_filename(fname: str) -> dict:
    """
    Split an HS/LS recording name into gender, sound and location codes.

    format: gender_soundtype_location.wav
    examples: F_AF_A.wav, F_CC_LLA.wav

    Only a trailing ``.wav`` extension is removed, and it is matched
    case-insensitively. A plain ``str.replace`` would also delete ``.wav``
    in the middle of a name and would leave an upper-case ``.WAV`` attached
    to the location code.

    Returns a dict with the keys ``gender_code``, ``sound_code`` and
    ``loc_code``. All three are ``None`` when the name does not consist of
    exactly three underscore-separated parts.
    """
    suffix = ".wav"
    base = fname[: -len(suffix)] if fname.lower().endswith(suffix) else fname
    parts = base.split("_")
    if len(parts) != 3:
        return {"gender_code": None, "sound_code": None, "loc_code": None}
    gender, sound_code, loc_code = parts
    return {"gender_code": gender, "sound_code": sound_code, "loc_code": loc_code}

def clean_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with whitespace trimmed from headers and text cells.

    Column labels that are strings are stripped; other labels are left alone.
    Every object-dtype column is converted to stripped strings, except for
    missing values: those stay missing instead of turning into the literal
    text ``"nan"``, which ``astype(str)`` alone would produce.

    Args:
        df: Frame to clean. It is not modified.

    Returns:
        The cleaned frame.
    """
    # rename() hands back a new frame, so the column assignments below never
    # touch the caller's object.
    df = df.rename(columns={c: c.strip() for c in df.columns if isinstance(c, str)})
    for col in df.select_dtypes(include="object").columns:
        cells = df[col]
        # where() keeps the original value wherever it is missing and takes
        # the stripped string everywhere else.
        df[col] = cells.where(cells.isna(), cells.astype(str).str.strip())
    return df


In [23]:
# Scan the HS folder and split every file name into its three codes.
df_hs_files = scan_folder(DIR_HS)

# Build the code columns in a single DataFrame construction. The previous
# `.apply(pd.Series)` created one Series per row (slow) and produced no code
# columns at all when there were no rows. The column names are the keys
# returned by parse_hs_ls_filename.
parsed = pd.DataFrame(
    df_hs_files["file_name"].map(parse_hs_ls_filename).tolist(),
    index=df_hs_files.index,
    columns=["gender_code", "sound_code", "loc_code"],
)
df_hs_files = pd.concat([df_hs_files, parsed], axis=1)

# Translate the codes into the spelling used by HS.csv. Location codes without
# an entry in LOC_MAP are kept as they are.
df_hs_files["heart_sound_type_from_fname"] = df_hs_files["sound_code"].map(HS_MAP)
df_hs_files["location_from_fname"] = df_hs_files["loc_code"].map(LOC_MAP).fillna(df_hs_files["loc_code"])
df_hs_files["gender_from_fname"] = df_hs_files["gender_code"]

display(df_hs_files.head())
print("HS wav count:", len(df_hs_files))


Unnamed: 0,file_name,samplerate,duration_sec,channels,path,gender_code,sound_code,loc_code,heart_sound_type_from_fname,location_from_fname,gender_from_fname
0,F_AF_A.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,AF,A,Atrial Fibrillation,Apex,F
1,F_AF_LUSB.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,AF,LUSB,Atrial Fibrillation,LUSB,F
2,F_ESM_LLSB.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,ESM,LLSB,Early Systolic Murmur,LLSB,F
3,F_ESM_LUSB.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,ESM,LUSB,Early Systolic Murmur,LUSB,F
4,F_ESM_RUSB.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,ESM,RUSB,Early Systolic Murmur,RUSB,F


HS wav count: 50


In [24]:
# Read HS.csv and trim stray whitespace from its headers and text cells.
df_hs_meta = clean_strings(pd.read_csv(CSV_HS))

# The wav file belonging to a row is named "<Heart Sound ID>.wav".
df_hs_meta["file_name"] = df_hs_meta["Heart Sound ID"].astype(str) + ".wav"

# Sanity check: report every Location value that is not part of CHEST_ORDER.
unknown = sorted(set(df_hs_meta["Location"]).difference(CHEST_ORDER))
if unknown:
    print("[warn] unknown HS locations in HS.csv:", unknown)

display(df_hs_meta.head())
print("HS csv rows:", df_hs_meta.shape[0])


Unnamed: 0,Gender,Heart Sound Type,Location,Heart Sound ID,file_name
0,F,Normal,RC,F_N_RC,F_N_RC.wav
1,F,Normal,LC,F_N_LC,F_N_LC.wav
2,M,Normal,RUSB,M_N_RUSB,M_N_RUSB.wav
3,F,Normal,LUSB,F_N_LUSB,F_N_LUSB.wav
4,F,Normal,LLSB,F_N_LLSB,F_N_LLSB.wav


HS csv rows: 50


In [25]:
# Outer-join the scanned wav files with the CSV rows on file_name; the
# "_merge" indicator column records on which side each name was found.
hs_merged = pd.merge(df_hs_files, df_hs_meta, how="outer", on="file_name", indicator=True)

only_files = hs_merged.loc[hs_merged["_merge"] == "left_only"]
only_csv   = hs_merged.loc[hs_merged["_merge"] == "right_only"]
both       = hs_merged.loc[hs_merged["_merge"] == "both"].copy()

print("HS only in files:", only_files.shape[0])
print("HS only in csv  :", only_csv.shape[0])
print("HS matched      :", both.shape[0])

# Show the unmatched names (at most 50 per side).
if not only_files.empty:
    display(only_files.loc[:, ["file_name", "path"]].head(50))
if not only_csv.empty:
    display(only_csv.loc[:, ["file_name", "Heart Sound Type", "Location"]].head(50))

# For matched rows, compare what the file name says with what the CSV says.
# Gender is compared on the first character of the CSV value.
both["gender_csv_first"] = both["Gender"].astype(str).str.get(0)
mask_gender = both["gender_from_fname"].ne(both["gender_csv_first"])
mask_sound  = both["heart_sound_type_from_fname"].ne(both["Heart Sound Type"])
mask_loc    = both["location_from_fname"].ne(both["Location"])

mismatch = both.loc[mask_gender | mask_sound | mask_loc]
print("HS metadata mismatches:", mismatch.shape[0])
if not mismatch.empty:
    mismatch_columns = [
        "file_name",
        "gender_from_fname", "Gender",
        "sound_code", "heart_sound_type_from_fname", "Heart Sound Type",
        "loc_code", "location_from_fname", "Location",
    ]
    display(mismatch.loc[:, mismatch_columns].head(100))

# Number of distinct files per location, once from the file names and once
# from the CSV, laid out in CHEST_ORDER with 0 for locations without files.
counts_hs_files = df_hs_files.groupby("location_from_fname")["file_name"].nunique()
counts_hs_csv   = df_hs_meta.groupby("Location")["file_name"].nunique()

hs_counts = pd.concat(
    {"from_filename": counts_hs_files, "from_csv": counts_hs_csv}, axis=1
)
hs_counts = hs_counts.reindex(CHEST_ORDER).fillna(0).astype(int)
display(hs_counts)


HS only in files: 0
HS only in csv  : 0
HS matched      : 50
HS metadata mismatches: 0


Unnamed: 0,from_filename,from_csv
RUSB,7,7
LUSB,13,13
LLSB,10,10
RC,4,4
LC,6,6
Apex,10,10
RUA,0,0
LUA,0,0
RMA,0,0
LMA,0,0


In [26]:
# Scan the LS folder and split every file name into its three codes.
df_ls_files = scan_folder(DIR_LS)

# Build the code columns in a single DataFrame construction. The previous
# `.apply(pd.Series)` created one Series per row (slow) and produced no code
# columns at all when there were no rows. The column names are the keys
# returned by parse_hs_ls_filename.
parsed = pd.DataFrame(
    df_ls_files["file_name"].map(parse_hs_ls_filename).tolist(),
    index=df_ls_files.index,
    columns=["gender_code", "sound_code", "loc_code"],
)
df_ls_files = pd.concat([df_ls_files, parsed], axis=1)

# Translate the codes into the spelling used by LS.csv. Location codes without
# an entry in LOC_MAP are kept as they are.
df_ls_files["lung_sound_type_from_fname"] = df_ls_files["sound_code"].map(LS_MAP)
df_ls_files["location_from_fname"] = df_ls_files["loc_code"].map(LOC_MAP).fillna(df_ls_files["loc_code"])
df_ls_files["gender_from_fname"] = df_ls_files["gender_code"]

display(df_ls_files.head())
print("LS wav count:", len(df_ls_files))


Unnamed: 0,file_name,samplerate,duration_sec,channels,path,gender_code,sound_code,loc_code,lung_sound_type_from_fname,location_from_fname,gender_from_fname
0,F_CC_LLA.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,CC,LLA,Coarse Crackles,LLA,F
1,F_CC_LMA.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,CC,LMA,Coarse Crackles,LMA,F
2,F_CC_LUA.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,CC,LUA,Coarse Crackles,LUA,F
3,F_CC_RLA.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,CC,RLA,Coarse Crackles,RLA,F
4,F_CC_RMA.wav,4000,15.0,1,C:\Users\MSI\Documents\Persoonlijke Projecten\...,F,CC,RMA,Coarse Crackles,RMA,F


LS wav count: 50


In [27]:
# Read LS.csv and trim stray whitespace from its headers and text cells.
df_ls_meta = clean_strings(pd.read_csv(CSV_LS))

# The wav file belonging to a row is named "<Lung Sound ID>.wav".
df_ls_meta["file_name"] = df_ls_meta["Lung Sound ID"].astype(str) + ".wav"

# Sanity check: report every Location value that is not part of CHEST_ORDER.
unknown = sorted(set(df_ls_meta["Location"]).difference(CHEST_ORDER))
if unknown:
    print("[warn] unknown LS locations in LS.csv:", unknown)

display(df_ls_meta.head())
print("LS csv rows:", df_ls_meta.shape[0])


Unnamed: 0,Gender,Lung Sound Type,Location,Lung Sound ID,file_name
0,M,Normal,RUA,M_N_RUA,M_N_RUA.wav
1,F,Normal,LUA,F_N_LUA,F_N_LUA.wav
2,F,Normal,RMA,F_N_RMA,F_N_RMA.wav
3,F,Normal,LMA,F_N_LMA,F_N_LMA.wav
4,M,Normal,RLA,M_N_RLA,M_N_RLA.wav


LS csv rows: 50


In [28]:
# Number of LS.csv rows per lung sound label.
df_ls_meta["Lung Sound Type"].value_counts()

Lung Sound Type
Normal             12
Pleural Rub         9
Coarse Crackles     9
Rhonchi             8
Wheezing            7
Fine Crackles       5
Name: count, dtype: int64

In [29]:
# Number of HS.csv rows per heart sound label.
df_hs_meta["Heart Sound Type"].value_counts()

Heart Sound Type
Normal                   9
Mid Systolic Murmur      7
Late Diastolic Murmur    6
Early Systolic Murmur    6
S3                       5
Late Systolic Murmur     5
Atrial Fibrillation      4
Tachycardia              3
AV Block                 3
S4                       2
Name: count, dtype: int64

In [30]:
# Outer-join the scanned wav files with the CSV rows on file_name; the
# "_merge" indicator column records on which side each name was found.
ls_merged = pd.merge(df_ls_files, df_ls_meta, how="outer", on="file_name", indicator=True)

only_files = ls_merged.loc[ls_merged["_merge"] == "left_only"]
only_csv   = ls_merged.loc[ls_merged["_merge"] == "right_only"]
both       = ls_merged.loc[ls_merged["_merge"] == "both"].copy()

print("LS only in files:", only_files.shape[0])
print("LS only in csv  :", only_csv.shape[0])
print("LS matched      :", both.shape[0])

# Show the unmatched names (at most 50 per side).
if not only_files.empty:
    display(only_files.loc[:, ["file_name", "path"]].head(50))
if not only_csv.empty:
    display(only_csv.loc[:, ["file_name", "Lung Sound Type", "Location"]].head(50))

# For matched rows, compare what the file name says with what the CSV says.
# Gender is compared on the first character of the CSV value.
both["gender_csv_first"] = both["Gender"].astype(str).str.get(0)
mask_gender = both["gender_from_fname"].ne(both["gender_csv_first"])
mask_sound  = both["lung_sound_type_from_fname"].ne(both["Lung Sound Type"])
mask_loc    = both["location_from_fname"].ne(both["Location"])

mismatch = both.loc[mask_gender | mask_sound | mask_loc]
print("LS metadata mismatches:", mismatch.shape[0])
if not mismatch.empty:
    mismatch_columns = [
        "file_name",
        "gender_from_fname", "Gender",
        "sound_code", "lung_sound_type_from_fname", "Lung Sound Type",
        "loc_code", "location_from_fname", "Location",
    ]
    display(mismatch.loc[:, mismatch_columns].head(100))

# Number of distinct files per location, once from the file names and once
# from the CSV, laid out in CHEST_ORDER with 0 for locations without files.
counts_ls_files = df_ls_files.groupby("location_from_fname")["file_name"].nunique()
counts_ls_csv   = df_ls_meta.groupby("Location")["file_name"].nunique()

ls_counts = pd.concat(
    {"from_filename": counts_ls_files, "from_csv": counts_ls_csv}, axis=1
)
ls_counts = ls_counts.reindex(CHEST_ORDER).fillna(0).astype(int)
display(ls_counts)


LS only in files: 0
LS only in csv  : 0
LS matched      : 50
LS metadata mismatches: 0


Unnamed: 0,from_filename,from_csv
RUSB,0,0
LUSB,0,0
LLSB,0,0
RC,0,0
LC,0,0
Apex,0,0
RUA,7,7
LUA,11,11
RMA,5,5
LMA,9,9


In [31]:
# Scan the Mix folder; file names are normalised to stripped strings.
df_mix_files = scan_folder(DIR_MIX)
df_mix_files["file_name"] = df_mix_files["file_name"].astype(str).str.strip()

# The earlier version of this code filtered on the files whose name starts
# with "M"; keep that filter explicit and show how many files it selects.
df_mix_mfiles = df_mix_files.loc[df_mix_files["file_name"].str.startswith("M")].copy()
print("Mix wav total:", df_mix_files.shape[0])
print("Mix wav M*   :", df_mix_mfiles.shape[0])


Mix wav total: 435
Mix wav M*   : 145


In [32]:
# Read Mix.csv and trim stray whitespace from its headers and text cells.
df_mix_meta = clean_strings(pd.read_csv(CSV_MIX))

# Derive file_name from "Mixed Sound ID" when the CSV has no file_name column.
if "file_name" not in df_mix_meta.columns:
    df_mix_meta["file_name"] = df_mix_meta["Mixed Sound ID"].astype(str) + ".wav"

df_mix_meta["file_name"] = df_mix_meta["file_name"].astype(str).str.strip()
df_mix_meta["Location"]  = df_mix_meta["Location"].astype(str).str.strip()

# Compare the "M*" wav files on disk with the file names listed in the CSV.
files_set = set(df_mix_mfiles["file_name"])
csv_set   = set(df_mix_meta["file_name"])

only_in_files = sorted(files_set - csv_set)
only_in_csv   = sorted(csv_set - files_set)

print("Mix only in files:", len(only_in_files))
print("Mix only in csv  :", len(only_in_csv))

# Show the unmatched names (at most 50 per side).
if only_in_files:
    display(pd.DataFrame({"file_name": only_in_files}).head(50))
if only_in_csv:
    display(pd.DataFrame({"file_name": only_in_csv}).head(50))

mix_loc_counts = (
    df_mix_meta["Location"]
      .value_counts()
      .rename_axis("Location")
      .to_frame("count")
      .reindex(CHEST_ORDER)
      # reindex yields NaN (and a float column) for a location without any
      # row; report such locations as 0, the same way hs_counts / ls_counts do.
      .fillna(0).astype(int)
)
display(mix_loc_counts)


Mix only in files: 0
Mix only in csv  : 0


Unnamed: 0_level_0,count
Location,Unnamed: 1_level_1
RUSB,13
LUSB,12
LLSB,12
RC,14
LC,13
Apex,12
RUA,9
LUA,12
RMA,11
LMA,13


In [33]:
# Number of Mix.csv rows per heart sound label.
df_mix_meta["Heart Sound Type"].value_counts()

Heart Sound Type
Late Systolic Murmur     17
Tachycardia              16
S4                       16
S3                       15
Atrial Fibrillation      15
Mid Systolic Murmur      14
Early Systolic Murmur    13
AV Block                 13
Late Diastolic Murmur    13
Normal                   13
Name: count, dtype: int64

In [34]:
# Number of Mix.csv rows per lung sound label.
df_mix_meta["Lung Sound Type"].value_counts()

Lung Sound Type
Normal             28
Wheezing           28
Pleural Rub        25
Rhonchi            23
Fine Crackles      22
Coarse Crackles    19
Name: count, dtype: int64

In [35]:
# HS processed
hs_processed = pd.DataFrame({
    "category": "HS",
    "gender": df_hs_meta["Gender"].astype(str).str[0],
    "location": df_hs_meta["Location"],
    "heart_sound_type": df_hs_meta["Heart Sound Type"],
    "lung_sound_type": pd.NA,
    "heart_sound_id": df_hs_meta["Heart Sound ID"],
    "lung_sound_id": pd.NA,
    "mixed_sound_id": pd.NA,
    "file_name": df_hs_meta["file_name"],
})

# LS processed
ls_processed = pd.DataFrame({
    "category": "LS",
    "gender": df_ls_meta["Gender"].astype(str).str[0],
    "location": df_ls_meta["Location"],
    "heart_sound_type": pd.NA,
    "lung_sound_type": df_ls_meta["Lung Sound Type"],
    "heart_sound_id": pd.NA,
    "lung_sound_id": df_ls_meta["Lung Sound ID"],
    "mixed_sound_id": pd.NA,
    "file_name": df_ls_meta["file_name"],
})

# Mix processed (let op: Mix heeft beide types)
mix_processed = pd.DataFrame({
    "category": "Mix",
    "gender": df_mix_meta["Gender"].astype(str).str[0],
    "location": df_mix_meta["Location"],
    "heart_sound_type": df_mix_meta["Heart Sound Type"],
    "lung_sound_type": df_mix_meta["Lung Sound Type"],
    "heart_sound_id": df_mix_meta["Heart Sound ID"],
    "lung_sound_id": df_mix_meta["Lung Sound ID"],
    "mixed_sound_id": df_mix_meta["Mixed Sound ID"],
    "file_name": df_mix_meta["file_name"],
})

hls_cmds_df = pd.concat([hs_processed, ls_processed, mix_processed], ignore_index=True)

# ordering
cols = ["category","gender","location","heart_sound_type","lung_sound_type",
        "heart_sound_id","lung_sound_id","mixed_sound_id","file_name"]
hls_cmds_df = hls_cmds_df[cols]

display(hls_cmds_df.head())
print("Total rows:", len(hls_cmds_df))
print(hls_cmds_df["category"].value_counts(dropna=False))


Unnamed: 0,category,gender,location,heart_sound_type,lung_sound_type,heart_sound_id,lung_sound_id,mixed_sound_id,file_name
0,HS,F,RC,Normal,,F_N_RC,,,F_N_RC.wav
1,HS,F,LC,Normal,,F_N_LC,,,F_N_LC.wav
2,HS,M,RUSB,Normal,,M_N_RUSB,,,M_N_RUSB.wav
3,HS,F,LUSB,Normal,,F_N_LUSB,,,F_N_LUSB.wav
4,HS,F,LLSB,Normal,,F_N_LLSB,,,F_N_LLSB.wav


Total rows: 245
category
Mix    145
HS      50
LS      50
Name: count, dtype: int64


In [36]:
# Write the combined table to data/processed/hls_cmds/hls_cmds.csv under the
# project root, creating the output folder when it does not exist yet.
OUT_DIR = PROJECT_ROOT.joinpath("data", "processed", "hls_cmds")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV = OUT_DIR.joinpath("hls_cmds.csv")

# index=False: the row index is not written to the file.
hls_cmds_df.to_csv(OUT_CSV, index=False)

print("Wrote:", OUT_CSV)
print("Rows :", hls_cmds_df.shape[0])


Wrote: C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\processed\hls_cmds\hls_cmds.csv
Rows : 245
