In [1]:
# system / paths
import os
import sys
from pathlib import Path

# numerical / data
import numpy as np
import pandas as pd
import random

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# audio processing
import librosa
import soundfile as sf
# machine learning
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# project paths
project_root = Path(os.getcwd()).parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# CWD is assumed to be notebooks/icbhi CWD = Path().resolve() # heart_and_lungsounds/ PROJECT_ROOT = CWD.parents[1]
CWD = Path().resolve()  # heart_and_lungsounds/
PROJECT_ROOT = CWD.parents[1]

RAW_ROOT = PROJECT_ROOT / "data" / "raw" / "icbhi"
AUDIO_DIR = RAW_ROOT / "audio_and_txt_files"
PATIENT_CSV = RAW_ROOT / "patient_diagnosis.csv"
DEMOGRAPHIC_TXT = RAW_ROOT / "demographic_info.txt"

print("cwd         :", CWD)
print("project_root:", PROJECT_ROOT)
print("raw_root    :", RAW_ROOT)
print("audio_dir   :", AUDIO_DIR)
print("patient_csv :", PATIENT_CSV)

# project modules
from src.plot_style import set_default_style
from src.widgets import make_patient_audio_widget

set_default_style()

# plotting defaults
sns.set(style="whitegrid", context="talk")
plt.rcParams["figure.figsize"] = (8, 5)

# pandas display
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

cwd         : C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\notebooks\icbhi
project_root: C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds
raw_root    : C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\icbhi
audio_dir   : C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\icbhi\audio_and_txt_files
patient_csv : C:\Users\Esmee Werk\Documents\Persoonlijke Projecten\AI-Onderzoeksvragen\research_questions\heart_and_lungsounds\data\raw\icbhi\patient_diagnosis.csv


In [2]:
def audio_file_metadata(path: Path) -> dict:
    info = sf.info(str(path))
    duration_sec = info.frames / info.samplerate
    patient_id = int(path.stem.split("_")[0])
    return {
        "path": str(path),
        "file_name": path.name,
        "samplerate": info.samplerate,
        "frames": info.frames,
        "duration_sec": duration_sec,
        "channels": info.channels,
        "patient_id": patient_id,
    }

def parse_icbhi_filename(file_name: str) -> dict:
    stem = file_name[:-4]  # remove .wav
    parts = stem.split("_")
    chest_location = parts[2] if len(parts) > 2 else None
    mode = parts[3] if len(parts) > 3 else None
    equipment = parts[4] if len(parts) > 4 else None
    return {
        "chest_location": chest_location,
        "mode": mode,
        "equipment": equipment,
    }

demo_cols = [
    "patient_id",
    "age",
    "sex",
    "adult_bmi",
    "child_weight",
    "child_height",
]

In [3]:
demo_df = pd.read_csv(
    DEMOGRAPHIC_TXT,
    sep=r"\s+",
    header=None,
    names=demo_cols,
    engine="python",
)

print("demo_df shape:", demo_df.shape)
demo_df.head()

demo_df shape: (126, 6)


Unnamed: 0,patient_id,age,sex,adult_bmi,child_weight,child_height
0,101,3.0,F,,19.0,99.0
1,102,0.75,F,,9.8,73.0
2,103,70.0,F,33.0,,
3,104,70.0,F,28.47,,
4,105,7.0,F,,32.0,135.0


In [4]:
wav_paths = sorted(AUDIO_DIR.glob("*.wav"))
records = []

for p in wav_paths:
    meta = audio_file_metadata(p)
    parsed = parse_icbhi_filename(p.name)
    records.append({**meta, **parsed})

audio_df = pd.DataFrame(records)
audio_df.head()


Unnamed: 0,path,file_name,samplerate,frames,duration_sec,channels,patient_id,chest_location,mode,equipment
0,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Al_sc_Meditron.wav,44100,882000,20.0,1,101,Al,sc,Meditron
1,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Pr_sc_Meditron.wav,44100,882000,20.0,1,101,Pr,sc,Meditron
2,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,102_1b1_Ar_sc_Meditron.wav,44100,882000,20.0,1,102,Ar,sc,Meditron
3,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,103_2b2_Ar_mc_LittC2SE.wav,44100,882000,20.0,1,103,Ar,mc,LittC2SE
4,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,104_1b1_Al_sc_Litt3200.wav,4000,63424,15.856,1,104,Al,sc,Litt3200


In [5]:
patients_df = pd.read_csv(PATIENT_CSV, header=None, names=["patient_id", "diagnosis"])

df = audio_df.merge(patients_df, on="patient_id", how="left")
print("df shape:", df.shape)
df.head()


df shape: (920, 11)


Unnamed: 0,path,file_name,samplerate,frames,duration_sec,channels,patient_id,chest_location,mode,equipment,diagnosis
0,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Al_sc_Meditron.wav,44100,882000,20.0,1,101,Al,sc,Meditron,URTI
1,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Pr_sc_Meditron.wav,44100,882000,20.0,1,101,Pr,sc,Meditron,URTI
2,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,102_1b1_Ar_sc_Meditron.wav,44100,882000,20.0,1,102,Ar,sc,Meditron,Healthy
3,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,103_2b2_Ar_mc_LittC2SE.wav,44100,882000,20.0,1,103,Ar,mc,LittC2SE,Asthma
4,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,104_1b1_Al_sc_Litt3200.wav,4000,63424,15.856,1,104,Al,sc,Litt3200,COPD


In [6]:
# Merge demographics into df on patient_id
full_df = df.merge(
    demo_df[["patient_id", "age", "sex"]],
    on="patient_id",
    how="left",
    validate="many_to_one",
)

print("Merged full_df shape:", full_df.shape)
full_df.head()

Merged full_df shape: (920, 13)


Unnamed: 0,path,file_name,samplerate,frames,duration_sec,channels,patient_id,chest_location,mode,equipment,diagnosis,age,sex
0,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Al_sc_Meditron.wav,44100,882000,20.0,1,101,Al,sc,Meditron,URTI,3.0,F
1,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Pr_sc_Meditron.wav,44100,882000,20.0,1,101,Pr,sc,Meditron,URTI,3.0,F
2,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,102_1b1_Ar_sc_Meditron.wav,44100,882000,20.0,1,102,Ar,sc,Meditron,Healthy,0.75,F
3,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,103_2b2_Ar_mc_LittC2SE.wav,44100,882000,20.0,1,103,Ar,mc,LittC2SE,Asthma,70.0,F
4,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,104_1b1_Al_sc_Litt3200.wav,4000,63424,15.856,1,104,Al,sc,Litt3200,COPD,70.0,F


In [7]:
def normalize_icbhi_diagnosis(diag: str) -> str | None:
    """
    Combine bronchiectasis en bronchiolitis tot BRON.
    Drop LRTI (return None).
    Laat overige labels intact.
    """
    if pd.isna(diag):
        return None

    diag = str(diag).strip()

    if diag in {"Bronchiectasis", "Bronchiolitis"}:
        return "BRON"

    if diag == "LRTI":
        return None

    return diag


patients_df["diagnosis_norm"] = patients_df["diagnosis"].apply(normalize_icbhi_diagnosis)

# drop de twee LRTI patiënten op diagnoseniveau
patients_df = patients_df[~patients_df["diagnosis_norm"].isna()].copy()

# merge met audio
df = audio_df.merge(
    patients_df[["patient_id", "diagnosis_norm"]],
    on="patient_id",
    how="inner",
)

print("df shape na merge en filter:", df.shape)
df.head()

df shape na merge en filter: (918, 11)


Unnamed: 0,path,file_name,samplerate,frames,duration_sec,channels,patient_id,chest_location,mode,equipment,diagnosis_norm
0,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Al_sc_Meditron.wav,44100,882000,20.0,1,101,Al,sc,Meditron,URTI
1,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,101_1b1_Pr_sc_Meditron.wav,44100,882000,20.0,1,101,Pr,sc,Meditron,URTI
2,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,102_1b1_Ar_sc_Meditron.wav,44100,882000,20.0,1,102,Ar,sc,Meditron,Healthy
3,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,103_2b2_Ar_mc_LittC2SE.wav,44100,882000,20.0,1,103,Ar,mc,LittC2SE,Asthma
4,C:\Users\Esmee Werk\Documents\Persoonlijke Pro...,104_1b1_Al_sc_Litt3200.wav,4000,63424,15.856,1,104,Al,sc,Litt3200,COPD


In [8]:
def parse_annotation_file(txt_path: Path) -> pd.DataFrame:
    """
    Parse één ICBHI .txt bestand naar cycle-level labels.
    Kolommen: start_sec, end_sec, duration_sec, label, record_id
    """
    rows = []
    with open(txt_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 4:
                continue
            start, end, crackle_flag, wheeze_flag = parts
            start = float(start)
            end = float(end)
            crackle_flag = int(crackle_flag)
            wheeze_flag = int(wheeze_flag)

            if crackle_flag == 1 and wheeze_flag == 0:
                label = "crackle"
            elif crackle_flag == 0 and wheeze_flag == 1:
                label = "wheeze"
            elif crackle_flag == 1 and wheeze_flag == 1:
                label = "crackle+wheeze"
            else:
                label = "normal"

            rows.append(
                {
                    "start_sec": start,
                    "end_sec": end,
                    "duration_sec": end - start,
                    "crackle_flag": crackle_flag,
                    "wheeze_flag": wheeze_flag,
                    "label": label,
                }
            )
    df_annot = pd.DataFrame(rows)
    df_annot["record_id"] = txt_path.stem
    return df_annot


def load_all_icbhi_annotations(annotation_dir: Path) -> pd.DataFrame:
    txt_paths = sorted(annotation_dir.glob("*.txt"))
    all_dfs = []
    for p in txt_paths:
        all_dfs.append(parse_annotation_file(p))
    if not all_dfs:
        return pd.DataFrame()
    return pd.concat(all_dfs, ignore_index=True)


annotations_df = load_all_icbhi_annotations(AUDIO_DIR)
print("annotations_df shape:", annotations_df.shape)
annotations_df.head()

annotations_df shape: (6898, 7)


Unnamed: 0,start_sec,end_sec,duration_sec,crackle_flag,wheeze_flag,label,record_id
0,0.036,0.579,0.543,0,0,normal,101_1b1_Al_sc_Meditron
1,0.579,2.45,1.871,0,0,normal,101_1b1_Al_sc_Meditron
2,2.45,3.893,1.443,0,0,normal,101_1b1_Al_sc_Meditron
3,3.893,5.793,1.9,0,0,normal,101_1b1_Al_sc_Meditron
4,5.793,7.521,1.728,0,0,normal,101_1b1_Al_sc_Meditron


In [9]:
# maak record_id kolom in audio df
df["record_id"] = df["file_name"].str.replace(".wav", "", regex=False)

annotations_df = annotations_df.merge(
    df[["record_id", "patient_id", "diagnosis_norm"]],
    on="record_id",
    how="left",
)

print("annotations_df shape na merge:", annotations_df.shape)
annotations_df.head()

annotations_df shape na merge: (6898, 9)


Unnamed: 0,start_sec,end_sec,duration_sec,crackle_flag,wheeze_flag,label,record_id,patient_id,diagnosis_norm
0,0.036,0.579,0.543,0,0,normal,101_1b1_Al_sc_Meditron,101.0,URTI
1,0.579,2.45,1.871,0,0,normal,101_1b1_Al_sc_Meditron,101.0,URTI
2,2.45,3.893,1.443,0,0,normal,101_1b1_Al_sc_Meditron,101.0,URTI
3,3.893,5.793,1.9,0,0,normal,101_1b1_Al_sc_Meditron,101.0,URTI
4,5.793,7.521,1.728,0,0,normal,101_1b1_Al_sc_Meditron,101.0,URTI


In [10]:
# demo_df heeft: patient_id, age, sex, adult_bmi, child_weight, child_height

demo_merged = demo_df.merge(
    patients_df[["patient_id", "diagnosis_norm"]],
    on="patient_id",
    how="inner",
)

print("demo_merged shape:", demo_merged.shape)
demo_merged.head()


demo_merged shape: (124, 7)


Unnamed: 0,patient_id,age,sex,adult_bmi,child_weight,child_height,diagnosis_norm
0,101,3.0,F,,19.0,99.0,URTI
1,102,0.75,F,,9.8,73.0,Healthy
2,103,70.0,F,33.0,,,Asthma
3,104,70.0,F,28.47,,,COPD
4,105,7.0,F,,32.0,135.0,URTI


In [17]:
# keep only needed columns and remove recordings without normalized diagnosis
full_diag = df.merge(
    demo_df[["patient_id", "sex"]],
    on="patient_id",
    how="left",
    validate="many_to_one",
)

# one row per patient, diagnosis, sex
patient_sex_diag = (
    full_diag[["patient_id", "diagnosis_norm", "sex"]]
    .dropna(subset=["diagnosis_norm"])
    .drop_duplicates()
)

# subject counts per diagnosis and sex
subj_counts = (
    patient_sex_diag.groupby(["diagnosis_norm", "sex"])["patient_id"]
    .nunique()
    .unstack(fill_value=0)
)

# recording counts per diagnosis
rec_counts = (
    full_diag.dropna(subset=["diagnosis_norm"])
    .groupby("diagnosis_norm")["file_name"]
    .nunique()
)

# helper for "35 (11F, 24M)" format
def format_subjects(row):
    f = int(row.get("F", 0))
    m = int(row.get("M", 0))
    total = f + m
    return f"{total} ({f}F, {m}M)"

# build final table
table_icbhi = pd.DataFrame(index=subj_counts.index)
table_icbhi["# subjects"] = subj_counts.apply(format_subjects, axis=1)
table_icbhi["# recordings"] = rec_counts

# move diagnosis from index to column and sort
table_icbhi = (
    table_icbhi.reset_index()
    .rename(columns={"diagnosis_norm": "Diagnosis"})
    .sort_values("Diagnosis")
    .reset_index(drop=True)
)

print(table_icbhi)


   Diagnosis     # subjects  # recordings
0     Asthma     1 (1F, 0M)             1
1       BRON    13 (7F, 6M)            29
2       COPD  63 (15F, 48M)           793
3    Healthy  26 (13F, 13M)            35
4  Pneumonia     6 (2F, 4M)            37
5       URTI    14 (8F, 6M)            23
