In [None]:
from pathlib import Path
import pandas as pd
import re
from typing import Iterable
import matplotlib.pyplot as plt
import seaborn as sns
from configparser import ConfigParser

# Root of the raw data; the CSV indexes and the audio folders all hang off it.
HLS_DIR = Path("../data/raw/")

# CSV index of each subset.
CSV_HS, CSV_LS, CSV_MIX = (HLS_DIR / f"{subset}.csv" for subset in ("HS", "LS", "Mix"))

# Folder with the .wav recordings of each subset.
DIR_HS, DIR_LS, DIR_MIX = (HLS_DIR / subset for subset in ("HS", "LS", "Mix"))

In [None]:
# helper functions
def _to_stem(x: str) -> str:
    """Return a clean filename stem (strip and drop trailing .wav, case-insensitive)."""
    s = str(x).strip()
    return re.sub(r"\.wav$", "", s, flags=re.IGNORECASE)

def _normalize_ls_stem(s: str) -> str:
    """Clean an LS stem and rewrite its crackle codes to the CC/FC spelling used on disk."""
    stem = _to_stem(s)
    # Applied in this order: coarse crackles first, then fine crackles.
    rewrites = (
        ("_G_", "_CC_"),  # Coarse Crackles
        ("_C_", "_FC_"),  # Fine Crackles
    )
    for old, new in rewrites:
        stem = stem.replace(old, new)
    return stem

def _rel_wav(category_dir: Path, stem: str, *, is_ls: bool = False) -> str:
    """Return, as a string, the path of ``<stem>.wav`` inside ``category_dir``.

    With ``is_ls=True`` the stem goes through the LS normalization
    (``_normalize_ls_stem``); otherwise it is only cleaned with ``_to_stem``.
    """
    clean = _normalize_ls_stem if is_ls else _to_stem
    return str(category_dir / (clean(stem) + ".wav"))

def _strip_columns(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Cast to string and strip whitespace for given columns."""
    for c in cols:
        df[c] = df[c].astype(str).str.strip()
    return df

In [35]:
# data loaders
def load_hs(csv_path: Path) -> pd.DataFrame:
    """Read the heart-sound index and add each recording's path and on-disk status.

    Args:
        csv_path: Location of HS.csv.

    Returns:
        The renamed, whitespace-stripped metadata columns followed by
        ``wav_path`` (string path under ``DIR_HS``) and ``wav_exists``
        (whether that path exists).
    """
    renamed = {
        "Gender": "gender",
        "Heart Sound Type": "heart_sound_type",
        "Location": "location",
        "Heart Sound ID": "heart_sound_id",
    }
    meta_cols = list(renamed.values())

    table = _strip_columns(pd.read_csv(csv_path).rename(columns=renamed), meta_cols)

    def hs_path(sound_id):
        return _rel_wav(DIR_HS, sound_id)

    table["wav_path"] = table["heart_sound_id"].map(hs_path)
    table["wav_exists"] = table["wav_path"].map(Path).map(Path.exists)

    return table[meta_cols + ["wav_path", "wav_exists"]]

def load_ls(csv_path: Path) -> pd.DataFrame:
    """Read the lung-sound index and add each recording's path and on-disk status.

    Args:
        csv_path: Location of LS.csv.

    Returns:
        The renamed, whitespace-stripped metadata columns followed by
        ``wav_path`` (string path under ``DIR_LS``, built from the
        CC/FC-normalized ID) and ``wav_exists`` (whether that path exists).
    """
    renamed = {
        "Gender": "gender",
        "Lung Sound Type": "lung_sound_type",
        "Location": "location",
        "Lung Sound ID": "lung_sound_id",
    }
    meta_cols = list(renamed.values())

    table = _strip_columns(pd.read_csv(csv_path).rename(columns=renamed), meta_cols)

    def ls_path(sound_id):
        # is_ls=True rewrites _G_/_C_ to _CC_/_FC_ before the path is built
        return _rel_wav(DIR_LS, sound_id, is_ls=True)

    table["wav_path"] = table["lung_sound_id"].map(ls_path)
    table["wav_exists"] = table["wav_path"].map(Path).map(Path.exists)

    return table[meta_cols + ["wav_path", "wav_exists"]]

def _prefer_existing(primary: str, fallback: str) -> str:
    """Return ``primary`` unless it is absent on disk and ``fallback`` is present."""
    if not Path(primary).exists() and Path(fallback).exists():
        return fallback
    return primary


def load_mix(csv_path: Path) -> pd.DataFrame:
    """
    Load Mix.csv and attach:
      - mixed_wav_path (Mix folder; Mixed Sound ID)
      - heart_wav_path (HS folder, else Mix folder; Heart Sound ID)
      - lung_wav_path  (LS folder, else Mix folder; Lung Sound ID, normalized to CC/FC)
    Also compute existence flags for each path.

    Mix.csv names its source recordings with IDs such as H0001 / L0001.
    Looking those up only in the HS / LS folders reported every one of the
    145 heart and 145 lung sources as missing. The source files are
    presumably stored next to the mixtures in the Mix folder (TODO: confirm
    against the dataset layout), so that folder is used as a fallback.
    A path that exists in its HS / LS location is never changed; if a file is
    found in neither place the HS / LS path is kept and its flag is False.

    Args:
        csv_path: Location of Mix.csv.

    Returns:
        Metadata columns, the three path columns and the three ``*_exists`` flags.
    """
    df = pd.read_csv(csv_path).rename(columns={
        "Gender": "gender",
        "Heart Sound Type": "heart_sound_type",
        "Lung Sound Type": "lung_sound_type",
        "Location": "location",
        "Heart Sound ID": "heart_sound_id",
        "Lung Sound ID": "lung_sound_id",
        "Mixed Sound ID": "mixed_sound_id",
    })
    df = _strip_columns(
        df,
        ["gender", "heart_sound_type", "lung_sound_type", "location",
         "heart_sound_id", "lung_sound_id", "mixed_sound_id"]
    )

    # Build paths (heart/lung sources: original folder first, Mix folder as fallback)
    df["mixed_wav_path"] = df["mixed_sound_id"].map(lambda s: _rel_wav(DIR_MIX, s))
    df["heart_wav_path"] = df["heart_sound_id"].map(
        lambda s: _prefer_existing(_rel_wav(DIR_HS, s), _rel_wav(DIR_MIX, s))
    )
    df["lung_wav_path"] = df["lung_sound_id"].map(
        lambda s: _prefer_existing(
            _rel_wav(DIR_LS, s, is_ls=True),
            _rel_wav(DIR_MIX, s, is_ls=True),
        )
    )

    # Existence flags
    df["mixed_wav_exists"] = df["mixed_wav_path"].map(lambda p: Path(p).exists())
    df["heart_wav_exists"] = df["heart_wav_path"].map(lambda p: Path(p).exists())
    df["lung_wav_exists"]  = df["lung_wav_path"].map(lambda p: Path(p).exists())

    return df[[
        "gender", "heart_sound_type", "lung_sound_type", "location",
        "heart_sound_id", "lung_sound_id", "mixed_sound_id",
        "mixed_wav_path", "heart_wav_path", "lung_wav_path",
        "mixed_wav_exists", "heart_wav_exists", "lung_wav_exists",
    ]]

In [37]:
# Load the three index tables
df_hs  = load_hs(CSV_HS)
df_ls  = load_ls(CSV_LS)
df_mix = load_mix(CSV_MIX)

# Report, per table, the row count and how many referenced wavs are absent on disk
print(f"HS:  rows={len(df_hs)}  | missing wavs={int((~df_hs['wav_exists']).sum())}")
print(f"LS:  rows={len(df_ls)}  | missing wavs={int((~df_ls['wav_exists']).sum())}")
mix_missing = "/".join(
    str(int((~df_mix[flag]).sum()))
    for flag in ("mixed_wav_exists", "heart_wav_exists", "lung_wav_exists")
)
print(f"Mix: rows={len(df_mix)} | missing mixed/heart/lung wavs={mix_missing}")

HS:  rows=50  | missing wavs=0
LS:  rows=50  | missing wavs=0
Mix: rows=145 | missing mixed/heart/lung wavs=0/145/145


In [38]:
# Cross-check the stems found in the LS folder against the stems the index expects
actual_files   = sorted(p.stem for p in DIR_LS.glob("*.wav"))
expected_files = sorted(Path(p).stem for p in df_ls["wav_path"])

on_disk, in_index = set(actual_files), set(expected_files)
print("Missing in folder:", sorted(in_index - on_disk)[:20])
print("Extra in folder  :", sorted(on_disk - in_index)[:20])

print("LS rows / missing:", len(df_ls), int((~df_ls['wav_exists']).sum()))

Missing in folder: []
Extra in folder  : []
LS rows / missing: 50 0


In [39]:
# Drop the *_exists flag columns for readability.
# errors="ignore" keeps this cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
df_hs  = df_hs.drop(columns=["wav_exists"], errors="ignore")
df_ls  = df_ls.drop(columns=["wav_exists"], errors="ignore")
df_mix = df_mix.drop(
    columns=["mixed_wav_exists", "heart_wav_exists", "lung_wav_exists"],
    errors="ignore",
)

In [69]:
# Attach label columns using the abbreviations from the paper

# heart sound type -> abbreviation
heart_label_map = {
    "Normal":                "NH",
    "Late Diastolic Murmur": "LDM",
    "Mid Systolic Murmur":   "MSM",
    "Late Systolic Murmur":  "LSM",
    "Atrial Fibrillation":   "AF",
    "S4":                    "S4",
    "Early Systolic Murmur": "ESM",
    "S3":                    "S3",
    "Tachycardia":           "T",
    "AV Block":              "AVB",
}

# lung sound type -> abbreviation
lung_label_map = {
    "Normal":          "NL",
    "Wheezing":        "W",
    "Fine Crackles":   "FC",
    "Rhonchi":         "R",
    "Pleural Rub":     "PR",
    "Coarse Crackles": "CC",
}

# HS and LS get one "label" column each; Mix holds both sound types and so
# gets a heart label and a lung label.
for frame, type_col, label_col, mapping in (
    (df_hs,  "heart_sound_type", "label",       heart_label_map),
    (df_ls,  "lung_sound_type",  "label",       lung_label_map),
    (df_mix, "heart_sound_type", "heart_label", heart_label_map),
    (df_mix, "lung_sound_type",  "lung_label",  lung_label_map),
):
    frame[label_col] = frame[type_col].map(mapping)

In [70]:
# Preview the first rows of the heart and lung tables
display(df_hs.head(3), df_ls.head(3))

Unnamed: 0,gender,heart_sound_type,location,heart_sound_id,wav_path,label
0,F,Normal,RC,F_N_RC,..\data\raw\HS\F_N_RC.wav,NH
1,F,Normal,LC,F_N_LC,..\data\raw\HS\F_N_LC.wav,NH
2,M,Normal,RUSB,M_N_RUSB,..\data\raw\HS\M_N_RUSB.wav,NH


Unnamed: 0,gender,lung_sound_type,location,lung_sound_id,wav_path,label
0,M,Normal,RUA,M_N_RUA,..\data\raw\LS\M_N_RUA.wav,NL
1,F,Normal,LUA,F_N_LUA,..\data\raw\LS\F_N_LUA.wav,NL
2,F,Normal,RMA,F_N_RMA,..\data\raw\LS\F_N_RMA.wav,NL


In [71]:
# Preview the first three rows of the mix table
display(df_mix.iloc[:3])

Unnamed: 0,gender,heart_sound_type,lung_sound_type,location,heart_sound_id,lung_sound_id,mixed_sound_id,mixed_wav_path,heart_wav_path,lung_wav_path,heart_label,lung_label
0,F,Late Systolic Murmur,Rhonchi,LUSB,H0001,L0001,M0001,..\data\raw\Mix\M0001.wav,..\data\raw\HS\H0001.wav,..\data\raw\LS\L0001.wav,LSM,R
1,F,S3,Normal,RLA,H0002,L0002,M0002,..\data\raw\Mix\M0002.wav,..\data\raw\HS\H0002.wav,..\data\raw\LS\L0002.wav,S3,NL
2,M,Atrial Fibrillation,Normal,LMA,H0003,L0003,M0003,..\data\raw\Mix\M0003.wav,..\data\raw\HS\H0003.wav,..\data\raw\LS\L0003.wav,AF,NL


In [42]:
# (rows, columns) of each table
for tag, frame in (("HS :", df_hs), ("LS :", df_ls), ("Mix:", df_mix)):
    print(tag, frame.shape)

HS : (50, 5)
LS : (50, 5)
Mix: (145, 10)


In [None]:
# show dtypes and unique counts per dataset
def dtypes_and_uniques(df: pd.DataFrame, name: str) -> None:
    """Display the dtype and number of distinct values of every column of ``df``.

    Args:
        df: Frame to summarise.
        name: Label of the dataset; printed as a heading so that consecutive
            summaries can be told apart.
    """
    info = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "n_unique": df.nunique()
    }).sort_index()
    # `name` was previously accepted but never shown, leaving the tables unlabeled.
    print(f"{name} — dtypes and unique counts")
    display(info)

for frame, tag in ((df_hs, "HS"), (df_ls, "LS"), (df_mix, "Mix")):
    dtypes_and_uniques(frame, tag)

Unnamed: 0,dtype,n_unique
gender,object,2
heart_sound_id,object,50
heart_sound_type,object,10
location,object,6
wav_path,object,50


Unnamed: 0,dtype,n_unique
gender,object,2
location,object,6
lung_sound_id,object,50
lung_sound_type,object,6
wav_path,object,50


Unnamed: 0,dtype,n_unique
gender,object,2
heart_sound_id,object,145
heart_sound_type,object,10
heart_wav_path,object,145
location,object,12
lung_sound_id,object,145
lung_sound_type,object,6
lung_wav_path,object,145
mixed_sound_id,object,145
mixed_wav_path,object,145


In [68]:
# Frequency of each lung sound type in the mix table
df_mix.loc[:, "lung_sound_type"].value_counts()

lung_sound_type
Normal             28
Wheezing           28
Pleural Rub        25
Rhonchi            23
Fine Crackles      22
Coarse Crackles    19
Name: count, dtype: int64

In [None]:
# Lung label abbreviation -> readable description
lung_labels = dict(
    NL="Normal lung",
    W="Wheezing",
    FC="Fine crackle",
    R="Rhonchi",
    PR="Pleural rub",
    CC="Coarse crackle",
)

Unnamed: 0,gender,lung_sound_type,location,lung_sound_id,wav_path
0,M,Normal,RUA,M_N_RUA,..\data\raw\LS\M_N_RUA.wav
1,F,Normal,LUA,F_N_LUA,..\data\raw\LS\F_N_LUA.wav


In [None]:
# Tally how often each sound type occurs in every dataset
hs_counts, ls_counts = (
    frame[col].value_counts()
    for frame, col in ((df_hs, "heart_sound_type"), (df_ls, "lung_sound_type"))
)
mix_hs_cnt, mix_ls_cnt = (
    df_mix[col].value_counts() for col in ("heart_sound_type", "lung_sound_type")
)

In [56]:
# Side-by-side counts per sound type; a type absent from one set shows as 0
print("Heart Sound Types — counts in each set")
heart_table = pd.DataFrame({"HS.zip": hs_counts, "Mix.zip": mix_hs_cnt})
heart_table = heart_table.fillna(0).astype(int)
display(heart_table.sort_index())

print("Lung Sound Types — counts in each set")
lung_table = pd.DataFrame({"LS.zip": ls_counts, "Mix.zip": mix_ls_cnt})
lung_table = lung_table.fillna(0).astype(int)
display(lung_table.sort_index())

Heart Sound Types — counts in each set


Unnamed: 0_level_0,HS.zip,Mix.zip
heart_sound_type,Unnamed: 1_level_1,Unnamed: 2_level_1
AV Block,3,13
Atrial Fibrillation,4,15
Early Systolic Murmur,6,13
Late Diastolic Murmur,6,13
Late Systolic Murmur,5,17
Mid Systolic Murmur,7,14
Normal,9,13
S3,5,15
S4,2,16
Tachycardia,3,16


Lung Sound Types — counts in each set


Unnamed: 0_level_0,LS.zip,Mix.zip
lung_sound_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Coarse Crackles,9,19
Fine Crackles,5,22
Normal,12,28
Pleural Rub,9,25
Rhonchi,8,23
Wheezing,7,28


In [58]:
print("TABLE IV — Chest Zones")

# Recording-location frequencies of each dataset
hs_loc, ls_loc, mix_loc = (
    frame["location"].value_counts() for frame in (df_hs, df_ls, df_mix)
)

# One column per dataset; a location absent from a dataset shows as 0
zone_table = pd.DataFrame({"HS.zip": hs_loc, "LS.zip": ls_loc, "Mix.zip": mix_loc})
zone_table = zone_table.fillna(0).astype(int)

display(zone_table.sort_index())

TABLE IV — Chest Zones


Unnamed: 0_level_0,HS.zip,LS.zip,Mix.zip
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apex,10,0,12
LC,6,0,13
LLA,0,8,12
LLSB,10,0,12
LMA,0,9,13
LUA,0,11,12
LUSB,13,0,12
RC,4,0,14
RLA,0,10,12
RMA,0,5,11
