In [19]:
import re
import pandas as pd
from pathlib import Path

import sys, os
# Put the directory three levels above the working directory on the import path
# (presumably so the `shared` package imported below resolves — confirm layout).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# dataset locations
ROOT = Path("../data/raw/")

# one audio folder per collection, all directly under ROOT
DIR_HS = ROOT / "HS"
DIR_LS = ROOT / "LS"
DIR_MIX = ROOT / "Mix"

# metadata sheets sit next to the audio folders
CSV_HS = ROOT / "HS.csv"
CSV_LS = ROOT / "LS.csv"
CSV_MIX = ROOT / "Mix.csv"

from shared.utils.plots import generate_hls_spectrogram

In [7]:
# helper functions
def _to_stem(x: str) -> str:
    """Clean filename stem (strip & drop '.wav')."""
    s = str(x).strip()
    return re.sub(r"\.wav$", "", s, flags=re.IGNORECASE)

def _normalize_ls_stem(s: str) -> str:
    """Clean an LS stem and rewrite its crackle tokens.

    After `_to_stem` cleaning, every '_G_' becomes '_CC_' (coarse crackles)
    and every '_C_' becomes '_FC_' (fine crackles), in that order.
    """
    stem = _to_stem(s)
    for old_token, new_token in (("_G_", "_CC_"), ("_C_", "_FC_")):
        stem = stem.replace(old_token, new_token)
    return stem

def _rel_wav(category_dir: Path, stem: str, *, is_ls: bool = False) -> str:
    """Return the string path of '<stem>.wav' inside `category_dir`.

    The stem is cleaned with `_normalize_ls_stem` when `is_ls` is true and
    with `_to_stem` otherwise.
    """
    clean = _normalize_ls_stem if is_ls else _to_stem
    wav_name = clean(stem) + ".wav"
    return str(category_dir / wav_name)

def _strip_columns(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Trim whitespace from specific columns."""
    for c in cols:
        df[c] = df[c].astype(str).str.strip()
    return df

In [8]:
# label mapping
# lower-case heart sound name -> short code
HEART_MAP = {
    "normal": "NH",
    "late diastolic murmur": "LDM",
    "mid systolic murmur": "MSM",
    "late systolic murmur": "LSM",
    "atrial fibrillation": "AF",
    "s4": "S4",
    "early systolic murmur": "ESM",
    "s3": "S3",
    "tachycardia": "T",
    "av block": "AVB",
}

# lower-case lung sound name -> short code
LUNG_MAP = {
    "normal": "NL",
    "wheezing": "W",
    "fine crackles": "FC",
    "rhonchi": "R",
    "pleural rub": "PR",
    "coarse crackles": "CC",
}

def map_label(series: pd.Series, mapping: dict) -> pd.Series:
    """Translate free-text sound names into short codes.

    Values are converted to text, stripped and lower-cased before being looked
    up in `mapping`; anything not found in `mapping` comes back as missing.
    """
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.map(mapping)

# label order (same as in the paper)
HEART_TYPE_ORDER = [
    "NH", "LDM", "MSM", "LSM", "AF",
    "S4", "ESM", "S3", "T", "AVB",
]
LUNG_TYPE_ORDER = ["NL", "W", "FC", "R", "PR", "CC"]

# short code -> display name used in the tables
HEART_TYPE_NAME = {
    "NH": "Normal",
    "LDM": "Late Diastolic Murmur",
    "MSM": "Mid Systolic Murmur",
    "LSM": "Late Systolic Murmur",
    "AF": "Atrial Fibrillation",
    "S4": "S4",
    "ESM": "Early Systolic Murmur",
    "S3": "S3",
    "T": "Tachycardia",
    "AVB": "AV Block",
}
LUNG_TYPE_NAME = {
    "NL": "Normal",
    "W": "Wheezing",
    "FC": "Fine Crackles",
    "R": "Rhonchi",
    "PR": "Pleural Rub",
    "CC": "Coarse Crackles",
}

# auscultation landmark codes, in table row order
HEART_LOCS = ["RUSB", "LUSB", "LLSB", "RC", "LC", "A"]
LUNG_LOCS = ["RUA", "LUA", "RMA", "LMA", "RLA", "LLA"]

# per-landmark lung counts the paper reports for Mix
EXPECTED_MIX_LUNG = {
    "RUA": 12,
    "LUA": 12,
    "RMA": 12,
    "LMA": 11,
    "RLA": 9,
    "LLA": 12,
}

In [9]:
# csv loaders
def load_hs(csv_path: Path) -> pd.DataFrame:
    """Read the heart-sound sheet and enrich it.

    Columns are renamed to snake_case and whitespace-trimmed, then three
    columns are appended: `label` (short code from HEART_MAP), `wav_path`
    (path of the recording under DIR_HS) and `wav_exists` (whether that path
    is present on disk).
    """
    renames = {
        "Gender": "gender",
        "Heart Sound Type": "heart_sound_type",
        "Location": "location",
        "Heart Sound ID": "heart_sound_id",
    }
    hs = pd.read_csv(csv_path).rename(columns=renames)
    hs = _strip_columns(hs, list(renames.values()))
    return hs.assign(
        label=lambda d: map_label(d["heart_sound_type"], HEART_MAP),
        wav_path=lambda d: d["heart_sound_id"].map(lambda stem: _rel_wav(DIR_HS, stem)),
        wav_exists=lambda d: d["wav_path"].map(lambda wav: Path(wav).exists()),
    )

def load_ls(csv_path: Path) -> pd.DataFrame:
    """Read the lung-sound sheet and enrich it.

    Columns are renamed to snake_case and whitespace-trimmed, then three
    columns are appended: `label` (short code from LUNG_MAP), `wav_path`
    (path under DIR_LS, built with the LS-specific stem normalization) and
    `wav_exists` (whether that path is present on disk).
    """
    renames = {
        "Gender": "gender",
        "Lung Sound Type": "lung_sound_type",
        "Location": "location",
        "Lung Sound ID": "lung_sound_id",
    }
    ls = pd.read_csv(csv_path).rename(columns=renames)
    ls = _strip_columns(ls, list(renames.values()))
    return ls.assign(
        label=lambda d: map_label(d["lung_sound_type"], LUNG_MAP),
        wav_path=lambda d: d["lung_sound_id"].map(
            lambda stem: _rel_wav(DIR_LS, stem, is_ls=True)
        ),
        wav_exists=lambda d: d["wav_path"].map(lambda wav: Path(wav).exists()),
    )

def load_mix(csv_path: Path) -> pd.DataFrame:
    """Read the mixed-sound sheet and enrich it.

    Columns are renamed to snake_case and whitespace-trimmed, then three
    columns are appended: `heart_label` (from HEART_MAP), `lung_label` (from
    LUNG_MAP) and `mixed_wav_path` (path of the mixed recording under
    DIR_MIX). No file-existence check is performed here.
    """
    renames = {
        "Gender": "gender",
        "Heart Sound Type": "heart_sound_type",
        "Lung Sound Type": "lung_sound_type",
        "Location": "location",
        "Heart Sound ID": "heart_sound_id",
        "Lung Sound ID": "lung_sound_id",
        "Mixed Sound ID": "mixed_sound_id",
    }
    mix = pd.read_csv(csv_path).rename(columns=renames)
    mix = _strip_columns(mix, list(renames.values()))
    return mix.assign(
        heart_label=lambda d: map_label(d["heart_sound_type"], HEART_MAP),
        lung_label=lambda d: map_label(d["lung_sound_type"], LUNG_MAP),
        mixed_wav_path=lambda d: d["mixed_sound_id"].map(
            lambda stem: _rel_wav(DIR_MIX, stem)
        ),
    )

In [10]:
# load dataframes & normalize
df_hs, df_ls, df_mix = load_hs(CSV_HS), load_ls(CSV_LS), load_mix(CSV_MIX)

# heart sites: use the short code 'A' wherever the sheet spells out 'Apex'
for frame in (df_hs, df_mix):
    frame["location"] = frame["location"].replace({"Apex":"A"})

# list the distinct landmark codes found in each sheet
for heading, frame in (
    ("Unique HS locations:", df_hs),
    ("Unique LS locations:", df_ls),
    ("Unique Mix locations:", df_mix),
):
    print(heading, frame["location"].unique().tolist())

Unique HS locations: ['RC', 'LC', 'RUSB', 'LUSB', 'LLSB', 'A']
Unique LS locations: ['RUA', 'LUA', 'RMA', 'LMA', 'RLA', 'LLA']
Unique Mix locations: ['LUSB', 'RLA', 'LMA', 'A', 'RUSB', 'RC', 'LUA', 'RMA', 'RUA', 'LC', 'LLSB', 'LLA']


In [11]:
# Table III

# Heart sounds: how often each short code occurs in HS.csv and in Mix.csv
hs_counts    = df_hs["label"].value_counts()
mix_h_counts = df_mix["heart_label"].value_counts()

tblIII_heart = pd.DataFrame({
    "Heart Sound Type": [HEART_TYPE_NAME[code] for code in HEART_TYPE_ORDER],
    "Label": HEART_TYPE_ORDER,
    # reindex puts the tallies in paper order and fills absent codes with 0
    "HS.zip":  hs_counts.reindex(HEART_TYPE_ORDER, fill_value=0).astype(int).tolist(),
    "Mix.zip": mix_h_counts.reindex(HEART_TYPE_ORDER, fill_value=0).astype(int).tolist(),
})

# Lung sounds: same tally for LS.csv and Mix.csv
ls_counts    = df_ls["label"].value_counts()
mix_l_counts = df_mix["lung_label"].value_counts()

tblIII_lung = pd.DataFrame({
    "Lung Sound Type": [LUNG_TYPE_NAME[code] for code in LUNG_TYPE_ORDER],
    "Label": LUNG_TYPE_ORDER,
    "LS.zip":  ls_counts.reindex(LUNG_TYPE_ORDER, fill_value=0).astype(int).tolist(),
    "Mix.zip": mix_l_counts.reindex(LUNG_TYPE_ORDER, fill_value=0).astype(int).tolist(),
})

for title, table in (
    ("TABLE III — Heart Sounds", tblIII_heart),
    ("TABLE III — Lung Sounds", tblIII_lung),
):
    print(title)
    display(table)

TABLE III — Heart Sounds


Unnamed: 0,Heart Sound Type,Label,HS.zip,Mix.zip
0,Normal,NH,9,13
1,Late Diastolic Murmur,LDM,6,13
2,Mid Systolic Murmur,MSM,7,14
3,Late Systolic Murmur,LSM,5,17
4,Atrial Fibrillation,AF,4,15
5,S4,S4,2,16
6,Early Systolic Murmur,ESM,6,13
7,S3,S3,5,15
8,Tachycardia,T,3,16
9,AV Block,AVB,3,13


TABLE III — Lung Sounds


Unnamed: 0,Lung Sound Type,Label,LS.zip,Mix.zip
0,Normal,NL,12,28
1,Wheezing,W,7,28
2,Fine Crackles,FC,5,22
3,Rhonchi,R,8,23
4,Pleural Rub,PR,9,25
5,Coarse Crackles,CC,9,19


In [12]:
# table IV

HEART_SET = set(HEART_LOCS)
LUNG_SET  = set(LUNG_LOCS)

# Heart zones: recordings per landmark in HS.csv, and Mix rows tagged with a heart landmark
hs_loc_counts   = df_hs["location"].value_counts()
mix_heart_locs  = df_mix.loc[df_mix["location"].isin(HEART_SET), "location"].value_counts()

tblIV_heart = pd.DataFrame({
    "location": HEART_LOCS,
    "HS.zip":  [int(hs_loc_counts.get(z,0))  for z in HEART_LOCS],
    "Mix.zip": [int(mix_heart_locs.get(z,0)) for z in HEART_LOCS],
})

print("TABLE IV — Heart Auscultation Landmarks")
display(tblIV_heart)

# Lung zones (Mix.csv as-is)
# LS.csv is tallied once here rather than once per landmark inside the table comprehension
ls_loc_counts = df_ls["location"].value_counts()
mix_lung_locs = df_mix.loc[df_mix["location"].isin(LUNG_SET), "location"].value_counts()

tblIV_lung = pd.DataFrame({
    "location": LUNG_LOCS,
    "LS.zip":  [int(ls_loc_counts.get(z,0)) for z in LUNG_LOCS],
    "Mix.zip": [int(mix_lung_locs.get(z,0)) for z in LUNG_LOCS],
})

print("TABLE IV — Lung Auscultation Landmarks (from Mix.csv as-is)")
display(tblIV_lung)

TABLE IV — Heart Auscultation Landmarks


Unnamed: 0,location,HS.zip,Mix.zip
0,RUSB,7,13
1,LUSB,13,12
2,LLSB,10,12
3,RC,4,14
4,LC,6,13
5,A,10,12


TABLE IV — Lung Auscultation Landmarks (from Mix.csv as-is)


Unnamed: 0,location,LS.zip,Mix.zip
0,RUA,7,9
1,LUA,11,12
2,RMA,5,11
3,LMA,9,13
4,RLA,10,12
5,LLA,8,12


In [13]:
ls_lung_ids = df_ls["lung_sound_id"]
mix_lung_ids = df_mix["lung_sound_id"]
print("LS Lung Sound IDs (examples):", ls_lung_ids.head(10).tolist())
print("Mix Lung Sound IDs (examples):", mix_lung_ids.head(10).tolist())

# --- IDs that appear in both sheets ---
overlap = set(ls_lung_ids).intersection(mix_lung_ids)
print("ID overlap (LS vs Mix):", len(overlap), "common IDs (should be >0 if same naming)")

# --- Mix rows whose location is a heart landmark ---
at_heart_site = df_mix["location"].isin(HEART_SET)
suspect_mix = df_mix[at_heart_site]
print(f"\nMix rows recorded at HEART sites: {len(suspect_mix)}")
preview_columns = ["mixed_sound_id","lung_sound_type","location","gender"]
display(suspect_mix[preview_columns].head(20))

LS Lung Sound IDs (examples): ['M_N_RUA', 'F_N_LUA', 'F_N_RMA', 'F_N_LMA', 'M_N_RLA', 'M_N_LLA', 'M_PR_RMA', 'M_PR_LUA', 'F_R_LUA', 'M_W_LUA']
Mix Lung Sound IDs (examples): ['L0001', 'L0002', 'L0003', 'L0004', 'L0005', 'L0006', 'L0007', 'L0008', 'L0009', 'L0010']
ID overlap (LS vs Mix): 0 common IDs (should be >0 if same naming)

Mix rows recorded at HEART sites: 76


Unnamed: 0,mixed_sound_id,lung_sound_type,location,gender
0,M0001,Rhonchi,LUSB,F
3,M0004,Coarse Crackles,A,F
4,M0005,Fine Crackles,RUSB,M
5,M0006,Pleural Rub,RC,F
6,M0007,Pleural Rub,A,M
10,M0011,Normal,A,M
12,M0013,Pleural Rub,RC,F
16,M0017,Coarse Crackles,RC,F
20,M0021,Coarse Crackles,LUSB,M
22,M0023,Normal,LC,M


In [14]:
# --- Compute current vs expected paper counts ---
mix_lung_counts = dict(
    (site, int(mix_lung_locs.get(site, 0))) for site in LUNG_LOCS
)
# positive delta: the paper reports more rows at that landmark than Mix.csv has
delta = dict(
    (site, EXPECTED_MIX_LUNG[site] - mix_lung_counts[site]) for site in LUNG_LOCS
)

print("Current Mix lung counts:", mix_lung_counts)
print("Expected (paper):        ", EXPECTED_MIX_LUNG)
print("Delta (need + / -):      ", delta)

# --- Rows to review: Mix entries whose location is a heart landmark ---
review_columns = ["mixed_sound_id","lung_sound_type","heart_sound_type","location","gender"]
candidates = df_mix.loc[df_mix["location"].isin(HEART_SET), review_columns]
print("\nCandidate Mix rows to review:")
display(candidates.head(30))

# --- Function to patch Mix locations manually ---
def apply_mix_location_patch(df_mix, patch_map):
    """Return a copy of `df_mix` with hand-picked location overrides applied.

    `patch_map` maps a `mixed_sound_id` to the location it should carry. Rows
    whose ID is not a key keep their location, and keys that match no row are
    ignored. The input frame is left untouched.
    """
    corrected = df_mix.copy()
    ids = corrected["mixed_sound_id"]
    needs_fix = ids.isin(list(patch_map))
    corrected.loc[needs_fix, "location"] = ids[needs_fix].map(patch_map)
    return corrected

# Example: fill in any fixes you decide on
manual_patch = {
    # "M0001": "RUA",
    # "M0007": "RUA",
}

# --- Re-tally lung landmarks on the patched copy ---
df_mix_fixed = apply_mix_location_patch(df_mix, manual_patch)
fixed_lung_locations = df_mix_fixed.loc[
    df_mix_fixed["location"].isin(LUNG_SET), "location"
]
mix_fixed_counts = (
    fixed_lung_locations
    .value_counts()
    .reindex(LUNG_LOCS, fill_value=0)
    .to_dict()
)
remaining_delta = dict(
    (site, EXPECTED_MIX_LUNG[site] - mix_fixed_counts.get(site, 0)) for site in LUNG_LOCS
)

print("\nAfter manual patch — Mix lung counts:", mix_fixed_counts)
print("Still needed delta:", remaining_delta)

Current Mix lung counts: {'RUA': 9, 'LUA': 12, 'RMA': 11, 'LMA': 13, 'RLA': 12, 'LLA': 12}
Expected (paper):         {'RUA': 12, 'LUA': 12, 'RMA': 12, 'LMA': 11, 'RLA': 9, 'LLA': 12}
Delta (need + / -):       {'RUA': 3, 'LUA': 0, 'RMA': 1, 'LMA': -2, 'RLA': -3, 'LLA': 0}

Candidate Mix rows to review:


Unnamed: 0,mixed_sound_id,lung_sound_type,heart_sound_type,location,gender
0,M0001,Rhonchi,Late Systolic Murmur,LUSB,F
3,M0004,Coarse Crackles,S3,A,F
4,M0005,Fine Crackles,AV Block,RUSB,M
5,M0006,Pleural Rub,S3,RC,F
6,M0007,Pleural Rub,Tachycardia,A,M
10,M0011,Normal,S4,A,M
12,M0013,Pleural Rub,Atrial Fibrillation,RC,F
16,M0017,Coarse Crackles,S3,RC,F
20,M0021,Coarse Crackles,Late Systolic Murmur,LUSB,M
22,M0023,Normal,Tachycardia,LC,M



After manual patch — Mix lung counts: {'RUA': 9, 'LUA': 12, 'RMA': 11, 'LMA': 13, 'RLA': 12, 'LLA': 12}
Still needed delta: {'RUA': 3, 'LUA': 0, 'RMA': 1, 'LMA': -2, 'RLA': -3, 'LLA': 0}


In [15]:
# Show the heart landmark table (tblIV_heart, built in the Table IV cell) again for reference.
display(tblIV_heart)

Unnamed: 0,location,HS.zip,Mix.zip
0,RUSB,7,13
1,LUSB,13,12
2,LLSB,10,12
3,RC,4,14
4,LC,6,13
5,A,10,12


In [16]:
# Show the lung landmark table (tblIV_lung, built in the Table IV cell) again for reference.
display(tblIV_lung)

Unnamed: 0,location,LS.zip,Mix.zip
0,RUA,7,9
1,LUA,11,12
2,RMA,5,11
3,LMA,9,13
4,RLA,10,12
5,LLA,8,12
