In [2]:
# =========================================================
# HLS-CMDS — Tables III & IV with diagnostics and fix flow
# =========================================================
import re, json
from pathlib import Path
import pandas as pd

# --------------------------
# Paths (EDIT to your setup)
# --------------------------
# Single root of the raw dataset. Every other path below is derived from it,
# so editing ROOT alone relocates the whole layout.
ROOT = Path("../data/raw/")

# Per-category directories used to build the .wav paths.
DIR_HS = ROOT / "HS"
DIR_LS = ROOT / "LS"
DIR_MIX = ROOT / "Mix"

# Metadata tables, one CSV per category, stored directly under ROOT.
CSV_HS = ROOT / "HS.csv"
CSV_LS = ROOT / "LS.csv"
CSV_MIX = ROOT / "Mix.csv"

# --------------------------
# Helpers
# --------------------------
def _to_stem(x: str) -> str:
    """Return *x* as a trimmed string without a trailing ``.wav`` extension.

    The extension is matched case-insensitively; any other text is returned
    unchanged apart from the surrounding whitespace being stripped.
    """
    text = str(x).strip()
    extension = re.search(r"\.wav$", text, flags=re.IGNORECASE)
    if extension is None:
        return text
    # Keep everything in front of the matched extension.
    return text[:extension.start()]

def _normalize_ls_stem(s: str) -> str:
    """Return the stem of *s* with the crackle codes renamed for on-disk files.

    The replacements are applied in the listed order on the extension-less
    stem: ``_G_`` becomes ``_CC_`` and ``_C_`` becomes ``_FC_``.
    """
    code_renames = (
        ("_G_", "_CC_"),  # Coarse Crackles
        ("_C_", "_FC_"),  # Fine Crackles
    )
    stem = _to_stem(s)
    for old_code, new_code in code_renames:
        stem = stem.replace(old_code, new_code)
    return stem

def _rel_wav(category_dir: Path, stem: str, *, is_ls: bool = False) -> str:
    """Build the ``.wav`` path of a recording inside *category_dir*.

    Args:
        category_dir: Directory the file name is joined onto.
        stem: Recording id, with or without a ``.wav`` suffix.
        is_ls: When true, the stem additionally goes through the LS
            crackle-code renaming before the path is built.

    Returns:
        The joined path as a string.
    """
    to_final_stem = _normalize_ls_stem if is_ls else _to_stem
    file_name = to_final_stem(stem) + ".wav"
    return str(category_dir / file_name)

def _strip_columns(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Cast each column in *cols* to string and strip surrounding whitespace.

    The frame is modified in place and the same object is returned so the
    call can be used in an assignment.
    """
    for col_name in cols:
        as_text = df[col_name].astype(str)
        df[col_name] = as_text.str.strip()
    return df

# Label maps: lower-cased sound-type name -> short code.
HEART_MAP = {
    "normal": "NH",
    "late diastolic murmur": "LDM",
    "mid systolic murmur": "MSM",
    "late systolic murmur": "LSM",
    "atrial fibrillation": "AF",
    "s4": "S4",
    "early systolic murmur": "ESM",
    "s3": "S3",
    "tachycardia": "T",
    "av block": "AVB",
}
LUNG_MAP = {
    "normal": "NL",
    "wheezing": "W",
    "fine crackles": "FC",
    "rhonchi": "R",
    "pleural rub": "PR",
    "coarse crackles": "CC",
}
def map_label(series: pd.Series, mapping: dict) -> pd.Series:
    """Translate free-text sound types into short codes.

    Values are cast to string, stripped and lower-cased before being looked
    up in *mapping*; values without an entry come back as missing.
    """
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.map(mapping)

# Pretty names per short code. The insertion order of each dict is the
# canonical row order (as in the paper), so the *_ORDER lists are derived
# from the dict keys.
HEART_TYPE_NAME = {
    "NH": "Normal",
    "LDM": "Late Diastolic Murmur",
    "MSM": "Mid Systolic Murmur",
    "LSM": "Late Systolic Murmur",
    "AF": "Atrial Fibrillation",
    "S4": "S4",
    "ESM": "Early Systolic Murmur",
    "S3": "S3",
    "T": "Tachycardia",
    "AVB": "AV Block",
}
LUNG_TYPE_NAME = {
    "NL": "Normal",
    "W": "Wheezing",
    "FC": "Fine Crackles",
    "R": "Rhonchi",
    "PR": "Pleural Rub",
    "CC": "Coarse Crackles",
}
HEART_TYPE_ORDER = list(HEART_TYPE_NAME)
LUNG_TYPE_ORDER = list(LUNG_TYPE_NAME)

# Location codes, as ordered lists (table rows) and as sets (membership tests).
HEART_LOCS = ["RUSB", "LUSB", "LLSB", "RC", "LC", "A"]
LUNG_LOCS = ["RUA", "LUA", "RMA", "LMA", "RLA", "LLA"]
HEART_SET, LUNG_SET = set(HEART_LOCS), set(LUNG_LOCS)

# Paper-expected (Table IV, Mix lung)
EXPECTED_MIX_LUNG = {
    "RUA": 12,
    "LUA": 12,
    "RMA": 12,
    "LMA": 11,
    "RLA": 9,
    "LLA": 12,
}

# --------------------------
# Loaders
# --------------------------
def load_hs(csv_path: Path) -> pd.DataFrame:
    """Load the heart-sound metadata CSV into a tidy frame.

    The CSV headers are renamed to snake_case, the text columns are stripped,
    the sound type is mapped to its short code (``label``), and each row gets
    the path of its recording under ``DIR_HS`` plus a flag telling whether
    that file exists.

    Args:
        csv_path: Location of the heart-sound CSV.

    Returns:
        A frame with the columns ``gender``, ``heart_sound_type``, ``label``,
        ``location``, ``heart_sound_id``, ``wav_path`` and ``wav_exists``.
    """
    renamed = {
        "Gender": "gender",
        "Heart Sound Type": "heart_sound_type",
        "Location": "location",
        "Heart Sound ID": "heart_sound_id",
    }
    output_columns = [
        "gender", "heart_sound_type", "label", "location",
        "heart_sound_id", "wav_path", "wav_exists",
    ]

    def heart_wav(sound_id):
        return _rel_wav(DIR_HS, sound_id)

    def on_disk(path_text):
        return Path(path_text).exists()

    table = pd.read_csv(csv_path).rename(columns=renamed)
    table = _strip_columns(table, list(renamed.values()))
    table["label"] = map_label(table["heart_sound_type"], HEART_MAP)
    table["wav_path"] = table["heart_sound_id"].map(heart_wav)
    table["wav_exists"] = table["wav_path"].map(on_disk)
    return table[output_columns]

def load_ls(csv_path: Path) -> pd.DataFrame:
    """Load the lung-sound metadata CSV into a tidy frame.

    The CSV headers are renamed to snake_case, the text columns are stripped,
    the sound type is mapped to its short code (``label``), and each row gets
    the path of its recording under ``DIR_LS`` plus a flag telling whether
    that file exists.

    Args:
        csv_path: Location of the lung-sound CSV.

    Returns:
        A frame with the columns ``gender``, ``lung_sound_type``, ``label``,
        ``location``, ``lung_sound_id``, ``wav_path`` and ``wav_exists``.
    """
    renamed = {
        "Gender": "gender",
        "Lung Sound Type": "lung_sound_type",
        "Location": "location",
        "Lung Sound ID": "lung_sound_id",
    }
    output_columns = [
        "gender", "lung_sound_type", "label", "location",
        "lung_sound_id", "wav_path", "wav_exists",
    ]

    def lung_wav(sound_id):
        # NOTE: LS ids look like 'M_N_RUA' rather than 'L0001', so they go
        # through the LS-specific stem normalization (is_ls=True).
        return _rel_wav(DIR_LS, sound_id, is_ls=True)

    def on_disk(path_text):
        return Path(path_text).exists()

    table = pd.read_csv(csv_path).rename(columns=renamed)
    table = _strip_columns(table, list(renamed.values()))
    table["label"] = map_label(table["lung_sound_type"], LUNG_MAP)
    table["wav_path"] = table["lung_sound_id"].map(lung_wav)
    table["wav_exists"] = table["wav_path"].map(on_disk)
    return table[output_columns]

def load_mix(csv_path: Path) -> pd.DataFrame:
    """Load the mixed-sound metadata CSV into a tidy frame.

    The CSV headers are renamed to snake_case, the text columns are stripped,
    and both sound types are mapped to their short codes. Three paths are
    built per row: the mixed recording under ``DIR_MIX``, the heart source
    under ``DIR_HS`` and the lung source under ``DIR_LS``. Only the mixed
    recording is checked for existence.

    NOTE(review): the heart/lung source ids of a Mix row are resolved inside
    ``DIR_HS`` / ``DIR_LS`` without an existence check — confirm the source
    files really live in those directories.

    Args:
        csv_path: Location of the mixed-sound CSV.

    Returns:
        A frame with the columns ``gender``, ``heart_sound_type``,
        ``heart_label``, ``lung_label``, ``lung_sound_type``, ``location``,
        ``heart_sound_id``, ``lung_sound_id``, ``mixed_sound_id``,
        ``mixed_wav_path``, ``heart_wav_path``, ``lung_wav_path`` and
        ``mixed_wav_exists``.
    """
    renamed = {
        "Gender": "gender",
        "Heart Sound Type": "heart_sound_type",
        "Lung Sound Type": "lung_sound_type",
        "Location": "location",
        "Heart Sound ID": "heart_sound_id",
        "Lung Sound ID": "lung_sound_id",
        "Mixed Sound ID": "mixed_sound_id",
    }
    output_columns = [
        "gender", "heart_sound_type", "heart_label", "lung_label",
        "lung_sound_type", "location",
        "heart_sound_id", "lung_sound_id", "mixed_sound_id",
        "mixed_wav_path", "heart_wav_path", "lung_wav_path",
        "mixed_wav_exists",
    ]

    def mixed_wav(sound_id):
        return _rel_wav(DIR_MIX, sound_id)

    def heart_wav(sound_id):
        return _rel_wav(DIR_HS, sound_id)

    def lung_wav(sound_id):
        return _rel_wav(DIR_LS, sound_id, is_ls=True)

    def on_disk(path_text):
        return Path(path_text).exists()

    table = pd.read_csv(csv_path).rename(columns=renamed)
    table = _strip_columns(table, list(renamed.values()))
    table["heart_label"] = map_label(table["heart_sound_type"], HEART_MAP)
    table["lung_label"] = map_label(table["lung_sound_type"], LUNG_MAP)
    table["mixed_wav_path"] = table["mixed_sound_id"].map(mixed_wav)
    table["heart_wav_path"] = table["heart_sound_id"].map(heart_wav)
    table["lung_wav_path"] = table["lung_sound_id"].map(lung_wav)
    table["mixed_wav_exists"] = table["mixed_wav_path"].map(on_disk)
    return table[output_columns]

# --------------------------
# Load all
# --------------------------
df_hs, df_ls, df_mix = load_hs(CSV_HS), load_ls(CSV_LS), load_mix(CSV_MIX)

# --------------------------
# Normalize locations
# --------------------------
# 'Apex' is the only heart-site token that differs from the paper's codes;
# rewrite it to 'A' in both frames that can carry heart sites.
for _frame in (df_hs, df_mix):
    _frame["location"] = _frame["location"].replace({"Apex":"A"})

# --------------------------
# TABLE III (Sound Types)
# --------------------------
# Heart: one row per sound type in canonical order; types that do not occur
# in a frame are reported as 0.
hs_counts = df_hs["label"].value_counts()
mix_h_counts = df_mix["heart_label"].value_counts()
tblIII_heart = pd.DataFrame(
    [
        (HEART_TYPE_NAME[code], code,
         int(hs_counts.get(code, 0)), int(mix_h_counts.get(code, 0)))
        for code in HEART_TYPE_ORDER
    ],
    columns=["Heart Sound Type", "Label", "HS.zip", "Mix.zip"],
)

# Lung: same layout as the heart table.
ls_counts = df_ls["label"].value_counts()
mix_l_counts = df_mix["lung_label"].value_counts()
tblIII_lung = pd.DataFrame(
    [
        (LUNG_TYPE_NAME[code], code,
         int(ls_counts.get(code, 0)), int(mix_l_counts.get(code, 0)))
        for code in LUNG_TYPE_ORDER
    ],
    columns=["Lung Sound Type", "Label", "LS.zip", "Mix.zip"],
)

print("TABLE III — Heart Sounds")
display(tblIII_heart)
print("TABLE III — Lung Sounds")
display(tblIII_lung)

# --------------------------
# TABLE IV (Chest Zones)
# --------------------------
# Heart zones: count recordings per auscultation landmark. The Mix side only
# considers rows whose location is one of the heart sites.
hs_loc_counts   = df_hs["location"].value_counts()
mix_heart_locs  = df_mix[df_mix["location"].isin(HEART_SET)]["location"].value_counts()
tblIV_heart = pd.DataFrame({
    "location": HEART_LOCS,
    "HS.zip":  [int(hs_loc_counts.get(z,0))  for z in HEART_LOCS],
    "Mix.zip": [int(mix_heart_locs.get(z,0)) for z in HEART_LOCS],
})
print("TABLE IV — Heart Auscultation Landmarks")
display(tblIV_heart)

# Lung zones (as-is from Mix location). The LS counts are computed once up
# front, mirroring hs_loc_counts, rather than once per zone inside the
# comprehension.
ls_loc_counts       = df_ls["location"].value_counts()
mix_lung_locs_as_is = df_mix[df_mix["location"].isin(LUNG_SET)]["location"].value_counts()
tblIV_lung_as_is = pd.DataFrame({
    "location": LUNG_LOCS,
    "LS.zip":  [int(ls_loc_counts.get(z,0))       for z in LUNG_LOCS],
    "Mix.zip": [int(mix_lung_locs_as_is.get(z,0)) for z in LUNG_LOCS],
})
print("TABLE IV — Lung Auscultation Landmarks (from Mix.csv as-is)")
display(tblIV_lung_as_is)

# --------------------------
# Diagnostics: why Mix lung counts differ
# --------------------------
print("\nDiagnostics — ID schema")
_ls_lung_ids = df_ls["lung_sound_id"]
_mix_lung_ids = df_mix["lung_sound_id"]
print("LS Lung Sound IDs (examples):", _ls_lung_ids.head(10).tolist())
print("Mix Lung Sound IDs (examples):", _mix_lung_ids.head(10).tolist())
# Ids present in both tables; an empty set means the two id schemes differ.
overlap = set(_ls_lung_ids).intersection(_mix_lung_ids)
print("ID overlap (LS vs Mix):", len(overlap), "common IDs")

# Mix rows whose Location is a HEART site; they never show up in the
# lung-zone counts because those only look at lung locations.
_at_heart_site = df_mix["location"].isin(HEART_SET)
suspect_mix_rows = df_mix.loc[_at_heart_site].copy()
print(f"\nMix rows recorded at HEART sites (these cannot contribute to lung-zone counts): {len(suspect_mix_rows)}")
_preview_columns = ["mixed_sound_id","lung_sound_id","lung_sound_type","location","gender"]
display(suspect_mix_rows[_preview_columns].head(20))

# --------------------------
# OPTION A: If you ever get a mapping to LS, use this authoritative flow:
# --------------------------
def authoritative_mix_lung_counts(df_mix, df_ls, key_in_mix="lung_sound_id",
                                  key_in_ls="lung_sound_id", *, lung_locs=None):
    """Count Mix rows per lung zone, taking the zone from the LS table.

    Each Mix row is matched to LS through a shared key (left join) and the
    LS ``location`` of the match is counted. Mix rows without a match, or
    whose matched location is not one of the lung zones, are not counted.

    The LS lookup is reduced to one row per key before merging. Without
    that, an LS key that occurs several times would duplicate every matching
    Mix row and inflate the counts. When a key occurs more than once, the
    location of its first occurrence in ``df_ls`` is used.

    Args:
        df_mix: Mix metadata; must contain the column ``key_in_mix``.
        df_ls: LS metadata; must contain ``key_in_ls`` and ``location``.
        key_in_mix: Join column on the Mix side.
        key_in_ls: Join column on the LS side.
        lung_locs: Ordered zone codes to report. Defaults to ``LUNG_LOCS``.

    Returns:
        A dict mapping every requested zone (in order) to its count, with 0
        for zones that never occur.
    """
    zones = list(LUNG_LOCS if lung_locs is None else lung_locs)

    # One location per LS key, so the join below is many-to-one and every
    # Mix row contributes at most once.
    lookup = (
        df_ls[[key_in_ls, "location"]]
        .drop_duplicates(subset=[key_in_ls])
        .rename(columns={"location": "lung_loc"})
    )
    merged = df_mix.merge(lookup, left_on=key_in_mix, right_on=key_in_ls, how="left")

    matched_loc = merged["lung_loc"]
    return (
        matched_loc.loc[matched_loc.isin(zones)]
        .value_counts()
        .reindex(zones, fill_value=0)
        .to_dict()
    )

# Attempt the LS-based count; with the current data every zone comes back as
# zero because the LS and Mix ids do not match.
authoritative_try = authoritative_mix_lung_counts(df_mix, df_ls)
print("\nAuthoritative Mix lung counts via LS merge (expected to fail with current IDs):", authoritative_try)

# --------------------------
# OPTION B: Assisted fix workflow (interactive patch)
# --------------------------
# Per lung zone: the as-is Mix count and its distance to the paper's number
# (positive = rows to add, negative = rows to remove).
curr_mix = {}
delta = {}
for _zone in LUNG_LOCS:
    curr_mix[_zone] = int(mix_lung_locs_as_is.get(_zone, 0))
    delta[_zone] = EXPECTED_MIX_LUNG[_zone] - curr_mix[_zone]
print("\nAs-is Mix lung counts:", curr_mix)
print("Expected (paper):     ", EXPECTED_MIX_LUNG)
print("Delta (need to add + / remove -):", delta)

# Rows currently tagged with HEART sites are the candidates a curator could
# reassign to under-represented lung zones to match the paper.
_review_columns = ["mixed_sound_id","lung_sound_id","heart_sound_id","gender",
                   "lung_sound_type","heart_sound_type","location"]
candidates = (
    df_mix.loc[df_mix["location"].isin(HEART_SET), _review_columns]
    .copy()
    .sort_values("mixed_sound_id")
)
print("\nCandidate rows to review (Mix rows with HEART locations):")
display(candidates.head(30))
print(f"... total candidates: {len(candidates)}")

# Helper to apply a manual patch: provide a dict {mixed_sound_id: new_lung_location}
def apply_mix_location_patch(df_mix, patch_map):
    """Return a copy of *df_mix* with selected locations overwritten.

    Args:
        df_mix: Frame with ``mixed_sound_id`` and ``location`` columns.
        patch_map: Dict ``{mixed_sound_id: new_location}``. Ids that do not
            occur in the frame are ignored.

    Returns:
        A new frame; the input frame is left untouched.
    """
    result = df_mix.copy()
    sound_ids = result["mixed_sound_id"]
    needs_patch = sound_ids.isin(patch_map.keys())
    # Only the rows named in the patch receive a new location.
    result.loc[needs_patch, "location"] = sound_ids[needs_patch].map(patch_map)
    return result

# --- EXAMPLE PATCH (empty by default): fill this dict with your corrections ---
# Keys are mixed_sound_id values, values are the new lung location.
manual_patch = {
    # "M0001": "RUA",
    # "M0007": "RUA",
    # ...
}

df_mix_fixed = apply_mix_location_patch(df_mix, manual_patch)

# Recompute the lung-zone counts on the patched Mix and compare them with the
# paper's numbers again.
_in_lung_zone = df_mix_fixed["location"].isin(LUNG_SET)
mix_lung_locs_fixed = df_mix_fixed.loc[_in_lung_zone, "location"].value_counts()
fixed_mix_counts = {z: int(mix_lung_locs_fixed.get(z,0)) for z in LUNG_LOCS}
_still_needed = {z: EXPECTED_MIX_LUNG[z] - fixed_mix_counts.get(z,0) for z in LUNG_LOCS}

print("\nAfter manual patch — Mix lung counts:", fixed_mix_counts)
print("Still needed delta:", _still_needed)

# Optional: emit the fixed Mix for audit
# df_mix_fixed.to_csv(DIR_MIX / "Mix_fixed.csv", index=False)

TABLE III — Heart Sounds


Unnamed: 0,Heart Sound Type,Label,HS.zip,Mix.zip
0,Normal,NH,9,13
1,Late Diastolic Murmur,LDM,6,13
2,Mid Systolic Murmur,MSM,7,14
3,Late Systolic Murmur,LSM,5,17
4,Atrial Fibrillation,AF,4,15
5,S4,S4,2,16
6,Early Systolic Murmur,ESM,6,13
7,S3,S3,5,15
8,Tachycardia,T,3,16
9,AV Block,AVB,3,13


TABLE III — Lung Sounds


Unnamed: 0,Lung Sound Type,Label,LS.zip,Mix.zip
0,Normal,NL,12,28
1,Wheezing,W,7,28
2,Fine Crackles,FC,5,22
3,Rhonchi,R,8,23
4,Pleural Rub,PR,9,25
5,Coarse Crackles,CC,9,19


TABLE IV — Heart Auscultation Landmarks


Unnamed: 0,location,HS.zip,Mix.zip
0,RUSB,7,13
1,LUSB,13,12
2,LLSB,10,12
3,RC,4,14
4,LC,6,13
5,A,10,12


TABLE IV — Lung Auscultation Landmarks (from Mix.csv as-is)


Unnamed: 0,location,LS.zip,Mix.zip
0,RUA,7,9
1,LUA,11,12
2,RMA,5,11
3,LMA,9,13
4,RLA,10,12
5,LLA,8,12



Diagnostics — ID schema
LS Lung Sound IDs (examples): ['M_N_RUA', 'F_N_LUA', 'F_N_RMA', 'F_N_LMA', 'M_N_RLA', 'M_N_LLA', 'M_PR_RMA', 'M_PR_LUA', 'F_R_LUA', 'M_W_LUA']
Mix Lung Sound IDs (examples): ['L0001', 'L0002', 'L0003', 'L0004', 'L0005', 'L0006', 'L0007', 'L0008', 'L0009', 'L0010']
ID overlap (LS vs Mix): 0 common IDs

Mix rows recorded at HEART sites (these cannot contribute to lung-zone counts): 76


Unnamed: 0,mixed_sound_id,lung_sound_id,lung_sound_type,location,gender
0,M0001,L0001,Rhonchi,LUSB,F
3,M0004,L0004,Coarse Crackles,A,F
4,M0005,L0005,Fine Crackles,RUSB,M
5,M0006,L0006,Pleural Rub,RC,F
6,M0007,L0007,Pleural Rub,A,M
10,M0011,L0011,Normal,A,M
12,M0013,L0013,Pleural Rub,RC,F
16,M0017,L0017,Coarse Crackles,RC,F
20,M0021,L0021,Coarse Crackles,LUSB,M
22,M0023,L0023,Normal,LC,M



Authoritative Mix lung counts via LS merge (expected to fail with current IDs): {'RUA': 0, 'LUA': 0, 'RMA': 0, 'LMA': 0, 'RLA': 0, 'LLA': 0}

As-is Mix lung counts: {'RUA': 9, 'LUA': 12, 'RMA': 11, 'LMA': 13, 'RLA': 12, 'LLA': 12}
Expected (paper):      {'RUA': 12, 'LUA': 12, 'RMA': 12, 'LMA': 11, 'RLA': 9, 'LLA': 12}
Delta (need to add + / remove -): {'RUA': 3, 'LUA': 0, 'RMA': 1, 'LMA': -2, 'RLA': -3, 'LLA': 0}

Candidate rows to review (Mix rows with HEART locations):


Unnamed: 0,mixed_sound_id,lung_sound_id,heart_sound_id,gender,lung_sound_type,heart_sound_type,location
0,M0001,L0001,H0001,F,Rhonchi,Late Systolic Murmur,LUSB
3,M0004,L0004,H0004,F,Coarse Crackles,S3,A
4,M0005,L0005,H0005,M,Fine Crackles,AV Block,RUSB
5,M0006,L0006,H0006,F,Pleural Rub,S3,RC
6,M0007,L0007,H0007,M,Pleural Rub,Tachycardia,A
10,M0011,L0011,H0011,M,Normal,S4,A
12,M0013,L0013,H0013,F,Pleural Rub,Atrial Fibrillation,RC
16,M0017,L0017,H0017,F,Coarse Crackles,S3,RC
20,M0021,L0021,H0021,M,Coarse Crackles,Late Systolic Murmur,LUSB
22,M0023,L0023,H0023,M,Normal,Tachycardia,LC


... total candidates: 76

After manual patch — Mix lung counts: {'RUA': 9, 'LUA': 12, 'RMA': 11, 'LMA': 13, 'RLA': 12, 'LLA': 12}
Still needed delta: {'RUA': 3, 'LUA': 0, 'RMA': 1, 'LMA': -2, 'RLA': -3, 'LLA': 0}
