In [1]:
from pathlib import Path
import pandas as pd


In [None]:
# Absolute locations of the LS recordings folder and the CSV that is read below.
# NOTE(review): the saved output of this cell shows "LS.csv" although the code
# points at "LS_old.csv", and the cell is marked "In [None]" -- the output looks
# stale; re-run to refresh it.
LS_DIR, LS_CSV = (
    Path(relative).resolve()
    for relative in (
        "../../data/raw/hls_cmds/LS",
        "../../data/raw/hls_cmds/LS_old.csv",
    )
)

print(f"LS_DIR: {LS_DIR}")
print(f"LS_CSV: {LS_CSV}")

LS_DIR: C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS
LS_CSV: C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS.csv


In [3]:
df = pd.read_csv(LS_CSV)

# Collect the non-missing IDs as whitespace-trimmed strings.
csv_ids = [
    lung_id.strip()
    for lung_id in df["Lung Sound ID"].dropna().astype(str)
]

print(f"Aantal Lung Sound ID's in CSV: {len(csv_ids)}")

Aantal Lung Sound ID's in CSV: 50


In [7]:
import re
from pathlib import Path

# Same recordings folder as the LS_DIR assigned in the path-configuration cell; repeated in this cell.
LS_DIR = Path("../../data/raw/hls_cmds/LS").resolve()

def normalize_lung_id(lung_id: str) -> str:
    """Rewrite a CSV lung-sound ID into the type codes used by the WAV names.

    IDs follow the pattern ``Gender_Type_Location`` (e.g. ``F_G_LUA``). For
    IDs whose gender part is ``F`` or ``M``, the type code ``C`` becomes
    ``CC`` and ``G`` becomes ``FC`` (chosen from observed mismatches between
    the CSV and the WAV files). Anything else is returned unchanged apart
    from surrounding whitespace being stripped.

    Args:
        lung_id: The ID to normalise; it is converted with ``str()`` first.

    Returns:
        The stripped, possibly rewritten ID.
    """
    # (pattern, replacement) pairs, applied in order.
    rewrites = (
        (r'^([FM])_C_(.+)$', r'\1_CC_\2'),
        (r'^([FM])_G_(.+)$', r'\1_FC_\2'),
    )
    normalized = str(lung_id).strip()
    for pattern, replacement in rewrites:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

# Trim the IDs but keep genuinely missing values as NaN. A plain astype(str)
# would turn them into the literal string "nan", which the dropna() below
# cannot remove, so "nan" could show up in the comparison as a missing ID.
df["lung_sound_id_raw"] = (
    df["Lung Sound ID"]
    .astype(str)
    .str.strip()
    .where(df["Lung Sound ID"].notna())
)
# na_action="ignore" passes NaN through without calling normalize_lung_id on it.
df["lung_sound_id_norm"] = df["lung_sound_id_raw"].map(
    normalize_lung_id, na_action="ignore"
)

# Re-check the normalised IDs against the WAV file stems.
wav_set = {p.stem.strip() for p in LS_DIR.glob("*.wav")}
norm_set = set(df["lung_sound_id_norm"].dropna())

missing_after = sorted(norm_set - wav_set)  # in the CSV, no matching WAV
extra_after = sorted(wav_set - norm_set)    # WAV present, not in the CSV

print("Missing after normalization:", len(missing_after))
print("Extra after normalization:", len(extra_after))

if missing_after:
    print("\n=== Still missing (after normalization) ===")
    print(*missing_after, sep="\n")

if extra_after:
    print("\n=== Still extra (after normalization) ===")
    print(*extra_after, sep="\n")

Missing after normalization: 6
Extra after normalization: 6

=== Still missing (after normalization) ===
F_FC_LLA
F_FC_LMA
F_FC_RMA
M_CC_RUA
M_FC_LLA
M_FC_LMA

=== Still extra (after normalization) ===
F_CC_LLA
F_CC_LMA
F_CC_RMA
M_CC_LLA
M_CC_LMA
M_FC_RUA


In [8]:
# Explicit corrections, keyed by the (normalised) CSV ID and mapping to the
# ID used by the WAV file.
FORCED_FIXES = {
    # FC in the CSV, CC in the WAV name
    "F_FC_LLA": "F_CC_LLA",
    "F_FC_LMA": "F_CC_LMA",
    "F_FC_RMA": "F_CC_RMA",
    "M_FC_LLA": "M_CC_LLA",
    "M_FC_LMA": "M_CC_LMA",
    # CC in the CSV, FC in the WAV name
    "M_CC_RUA": "M_FC_RUA",
}


def apply_forced_fix(lung_id: str) -> str:
    """Return the hand-corrected ID for ``lung_id`` if one is listed.

    The input is converted with ``str()`` and stripped of surrounding
    whitespace, then looked up in ``FORCED_FIXES``. IDs without an entry are
    returned in their stripped form.
    """
    key = str(lung_id).strip()
    if key in FORCED_FIXES:
        return FORCED_FIXES[key]
    return key

# Run every normalised ID through the manual correction table.
df["lung_sound_id_final"] = df["lung_sound_id_norm"].map(apply_forced_fix)

In [11]:
# Preview the first five rows, including the raw/norm/final helper columns.
df.head()

Unnamed: 0,Gender,Lung Sound Type,Location,Lung Sound ID,lung_sound_id_raw,lung_sound_id_norm,lung_sound_id_final
0,M,Normal,RUA,M_N_RUA,M_N_RUA,M_N_RUA,M_N_RUA
1,F,Normal,LUA,F_N_LUA,F_N_LUA,F_N_LUA,F_N_LUA
2,F,Normal,RMA,F_N_RMA,F_N_RMA,F_N_RMA,F_N_RMA
3,F,Normal,LMA,F_N_LMA,F_N_LMA,F_N_LMA,F_N_LMA
4,M,Normal,RLA,M_N_RLA,M_N_RLA,M_N_RLA,M_N_RLA


In [12]:
# Make the final (corrected) ID the official "Lung Sound ID".
df["Lung Sound ID"] = df["lung_sound_id_final"]

# Helper columns to remove again.
cols_to_drop = [
    "lung_sound_id_raw",
    "lung_sound_id_norm",
    "lung_sound_id_final",
]

# errors="ignore" skips any listed column that is not present in df.
df = df.drop(columns=cols_to_drop, errors="ignore")

In [13]:
# Preview the first five rows after the helper columns were dropped.
df.head()

Unnamed: 0,Gender,Lung Sound Type,Location,Lung Sound ID
0,M,Normal,RUA,M_N_RUA
1,F,Normal,LUA,F_N_LUA
2,F,Normal,RMA,F_N_RMA
3,F,Normal,LMA,F_N_LMA
4,M,Normal,RLA,M_N_RLA


In [14]:
from pathlib import Path

# Output location for the corrected table.
# NOTE(review): this rebinds LS_CSV, which the path-configuration cell set to
# "LS_old.csv"; re-running the load cell after this one would read LS.csv.
LS_CSV = Path("../../data/raw/hls_cmds/LS.csv").resolve()

# Write without the pandas row index.
df.to_csv(path_or_buf=LS_CSV, index=False)

print("LS.csv succesvol overschreven op:", LS_CSV, sep="\n")

LS.csv succesvol overschreven op:
C:\Users\MSI\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\hls_cmds\LS.csv


In [15]:
LS_DIR = Path("../../data/raw/hls_cmds/LS").resolve()

# Final consistency check: the set of IDs in the table must equal the set of
# WAV file stems in LS_DIR.
csv_set = set(df["Lung Sound ID"].dropna())
wav_set = set(wav_path.stem for wav_path in LS_DIR.glob("*.wav"))

# On failure the message lists the differences in both directions.
assert csv_set == wav_set, (
    f"MISMATCH na fix!\n"
    f"Missing: {csv_set - wav_set}\n"
    f"Extra: {wav_set - csv_set}"
)

print("Sanity check OK: LS.csv en wav-bestanden zijn 100% consistent.")

Sanity check OK: LS.csv en wav-bestanden zijn 100% consistent.
