In [None]:
!pip install -q huggingface_hub transformers accelerate

# Authenticate with the Hugging Face Hub before downloading the HeAR weights.
# The token is read from the HF_TOKEN environment variable, or prompted for
# interactively, so that a real credential is never stored in the notebook.
import os
from getpass import getpass

from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token, add_to_git_credential=False)

import torch
from transformers import AutoModel

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

print("Loading HeAR PyTorch model...")
# NOTE(review): trust_remote_code=True allows transformers to execute Python code
# shipped inside the model repository — only keep it for repos you trust.
hear_model = AutoModel.from_pretrained("google/hear-pytorch", trust_remote_code=True)
hear_model = hear_model.to(device).eval()  # move to device, switch to evaluation mode
print("HeAR loaded.")

def get_hear_embedding(audio_np, model, device):
    """
    Embed one audio clip by feeding a standardized log-mel spectrogram to `model`.

    audio_np: numpy array of shape (32000,) at 16kHz
    model:    callable taking a (1, 1, n_mels, frames) float32 tensor plus the
              `return_dict` / `output_hidden_states` keyword arguments
    device:   torch device (or device string) the input tensor is moved to
    Returns: numpy array of shape (512,)

    NOTE(review): this uses 64 mel bins, while the config printed later in this
    notebook reports image_size [192, 128] — confirm this helper is still wanted.
    """
    # Imported here because neither librosa nor numpy is imported before this
    # cell runs; without these the function raised NameError on a fresh kernel.
    import librosa
    import numpy as np

    mel = librosa.feature.melspectrogram(
        y=audio_np, sr=16000,
        n_mels=64, hop_length=250, n_fft=512
    )
    log_mel = librosa.power_to_db(mel, ref=np.max)
    # Per-clip standardization; the epsilon avoids division by zero on a
    # constant spectrogram.
    log_mel = (log_mel - log_mel.mean()) / (log_mel.std() + 1e-8)

    # Shape: (1, 1, 64, frames). With librosa's default centered framing a
    # 32000-sample clip at hop_length=250 yields 129 frames (not 128).
    tensor = torch.tensor(log_mel, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

    with torch.no_grad():
        out = model(tensor, return_dict=True, output_hidden_states=True)
        # Prefer the pooled output; otherwise average the token states.
        if hasattr(out, 'pooler_output') and out.pooler_output is not None:
            emb = out.pooler_output.squeeze(0).cpu().numpy()
        else:
            emb = out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
    return emb


Device: cuda
Loading HeAR PyTorch model...


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

Exception in thread Thread-auto_conversion:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 657, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/httpx/_models.py", line 829, in raise_for_status
    raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Client error '403 Forbidden' for url 'https://huggingface.co/api/models/google/hear-pytorch/discussions?p=0'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/dist-packages/transformers/safetensors_conversion.py",

Loading weights:   0%|          | 0/392 [00:00<?, ?it/s]

HeAR loaded.


In [15]:
# Fix .webm/.ogg decoding — install ffmpeg and soundfile backend
import subprocess
subprocess.run(["apt-get", "install", "-y", "-q", "ffmpeg"], capture_output=True)
!pip install -q soundfile pydub
import warnings
warnings.filterwarnings("ignore")  # suppress all warnings cleanly
print("ffmpeg installed. Warnings suppressed.")




In [16]:
import os, gc, pickle, warnings
import numpy as np
import pandas as pd
import torch
import torchaudio
import torchaudio.transforms as T
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# ── PATHS ─────────────────────────────────────────────────────────────────────
AUDIO_DIR = "/kaggle/input/datasets/orvile/coughvid-v3/public_dataset_v3/coughvid_20211012"
CSV_PATH  = "/kaggle/input/datasets/orvile/coughvid-v3/tabular_form/tabular_form/coughvid_v3.csv"

# ── METADATA ──────────────────────────────────────────────────────────────────
meta = pd.read_csv(CSV_PATH)

# Keep rows whose cough_detected value exceeds 0.6 and whose status is one of
# the three values used for the binary task.
confident_cough = meta['cough_detected'] > 0.6
usable_status   = meta['status'].isin(['healthy', 'symptomatic', 'COVID-19'])
filtered = meta[confident_cough & usable_status].copy()

# 0 = healthy, 1 = symptomatic or COVID-19
filtered['binary_label'] = filtered['status'].ne('healthy').astype('int64')

# Balanced subset: 300 rows per class, drawn with a fixed seed.
h = filtered.loc[filtered['binary_label'] == 0].sample(n=300, random_state=42)
s = filtered.loc[filtered['binary_label'] == 1].sample(n=300, random_state=42)
subset = pd.concat([h, s], ignore_index=True)
print(f"Subset: {len(subset)} rows | {subset['binary_label'].value_counts().to_dict()}")

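# ── HEAR INPUT SPEC ───────────────────────────────────────────────────────────
# hear-pytorch wraps HeAR as a ViT that takes a single-channel mel-spectrogram
# "image" of shape (1, 1, n_mels, n_frames).
# Earlier guesses for that shape — (1, 1, 96, 64) with n_mels=64, n_fft=1024,
# hop_length=320, sr=16000, duration=1 sec, or image_size 96x64 / 192x128 — were
# not verified against the loaded model.
# Safest approach: decode with torchaudio, read image_size from the model config
# below, and build the mel spectrogram to match it exactly.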

# Read the spectrogram "image" size the loaded HeAR model was configured with.
img_size = hear_model.config.image_size
print("\nHeAR model config:")
print(f"  image_size: {img_size}")

# image_size is either a (height, width) pair or a single int for a square
# input; height is used as the mel-bin count and width as the frame count.
n_mels, n_frames = (
    (img_size[0], img_size[1]) if isinstance(img_size, (list, tuple))
    else (img_size, img_size)
)

print(f"  Required mel bins: {n_mels}")
print(f"  Required time frames: {n_frames}")

# n_fft must be large enough for n_mels filters (rule of thumb used here:
# n_fft >= 2 * n_mels to avoid empty filterbanks), so use 2048 for 128+ mel
# bins and 1024 otherwise.
if n_mels >= 128:
    n_fft = 2048
else:
    n_fft = 1024

# Hop chosen so a 32000-sample (2 s @ 16 kHz) clip spans about n_frames hops;
# the embedding function pads/trims to exactly n_frames afterwards.
hop_length = max(1, 32000 // n_frames)

print(f"  Using n_fft={n_fft}, hop_length={hop_length}")

# ── MEL TRANSFORM (correct params) ───────────────────────────────────────────
mel_transform = T.MelSpectrogram(
    sample_rate=16000,
    n_fft=n_fft,
    hop_length=hop_length,
    n_mels=n_mels,
    f_min=60.0,
    f_max=7800.0,
)
mel_transform = mel_transform.to(device)

# Power spectrogram → decibels, clamped to an 80 dB dynamic range.
amplitude_to_db = T.AmplitudeToDB(stype='power', top_db=80)

# ── AUDIO LOADER (torchaudio — handles webm/ogg/wav natively via ffmpeg) ─────
def load_audio(path, target_sr=16000, duration=2.0):
    """
    Decode an audio file into a fixed-length mono clip.

    path:      file to decode with torchaudio.load
    target_sr: sample rate the clip is resampled to
    duration:  clip length in seconds
    Returns float32 numpy of int(target_sr * duration) samples — (32000,) with
    the defaults — or None if torchaudio could not load the file.
    """
    try:
        signal, native_sr = torchaudio.load(path)  # (channels, samples)
    except Exception:
        # Best effort: callers treat None as "skip this file".
        return None

    # Bring the signal to the target sample rate.
    if native_sr != target_sr:
        signal = T.Resample(orig_freq=native_sr, new_freq=target_sr)(signal)

    # Average the channels down to mono.
    n_channels = signal.shape[0]
    if n_channels > 1:
        signal = signal.mean(dim=0, keepdim=True)

    samples = signal.squeeze(0).numpy()  # (samples,)

    # Zero-pad short clips at the end; take the centre window of long ones.
    wanted = int(target_sr * duration)
    missing = wanted - len(samples)
    if missing > 0:
        samples = np.pad(samples, (0, missing))
    else:
        offset = (-missing) // 2
        samples = samples[offset:offset + wanted]

    return samples.astype(np.float32)

# ── EMBEDDING FUNCTION ────────────────────────────────────────────────────────
def get_embedding(audio_np, model, device):
    """
    Turn one audio clip into a model embedding.

    audio_np: float32 numpy (32000,)
    model:    callable taking the spectrogram tensor plus the `return_dict` /
              `output_hidden_states` keyword arguments
    device:   torch device the waveform tensor is moved to
    Returns: float32 numpy (embedding_dim,)

    Relies on the module-level `mel_transform`, `amplitude_to_db` and `n_frames`.
    """
    batch = torch.tensor(audio_np).unsqueeze(0).to(device)  # (1, samples)

    # Mel power spectrogram in dB, then per-clip standardization.
    spec = amplitude_to_db(mel_transform(batch))  # (1, n_mels, time)
    spec = (spec - spec.mean()) / (spec.std() + 1e-8)

    # Right-pad with zeros or trim so the time axis has exactly n_frames.
    frames_short = n_frames - spec.shape[2]
    if frames_short > 0:
        spec = torch.nn.functional.pad(spec, (0, frames_short))
    else:
        spec = spec[:, :, :n_frames]

    # Add the channel axis: (batch=1, channels=1, height=n_mels, width=n_frames)
    pixel_values = spec.unsqueeze(0)

    with torch.no_grad():
        out = model(pixel_values, return_dict=True, output_hidden_states=True)
        # Use the pooled output when present, else the mean over dim 1 of the
        # last hidden state.
        pooled = getattr(out, 'pooler_output', None)
        if pooled is None:
            pooled = out.last_hidden_state.mean(dim=1)
        return pooled.squeeze(0).cpu().numpy()

# ── TEST 3 FILES BEFORE FULL RUN ──────────────────────────────────────────────
# Cheap smoke test: decode and embed the first three rows so path or decoding
# problems show up before the full extraction loop.
print("\nTesting 3 files before full extraction...")
for audio_name in subset['audio_name'].head(3):
    audio = load_audio(os.path.join(AUDIO_DIR, audio_name))
    if audio is None:
        print(f"  FAILED to load: {audio_name}")
    else:
        emb = get_embedding(audio, hear_model, device)
        print(f"  OK: {audio_name} → embedding shape: {emb.shape}")

# ── FULL EXTRACTION ───────────────────────────────────────────────────────────
print(f"\nExtracting embeddings from {len(subset)} files...")
embeddings, labels, skipped = [], [], 0

for i, (_, row) in enumerate(subset.iterrows()):
    path = os.path.join(AUDIO_DIR, row['audio_name'])

    # A missing file and an undecodable file are both counted as skipped.
    audio = load_audio(path) if os.path.exists(path) else None
    if audio is None:
        skipped += 1
        continue

    emb = get_embedding(audio, hear_model, device)
    embeddings.append(emb)
    labels.append(row['binary_label'])

    # Progress line every 100 rows (only reached for rows that were embedded).
    done = i + 1
    if done % 100 == 0:
        print(f"  [{done}/{len(subset)}] Embedded: {len(embeddings)} | Skipped: {skipped}")

print(f"\nFinished — Embedded: {len(embeddings)} | Skipped: {skipped}")

# Too few embeddings means decoding is failing wholesale; stop before training.
if len(embeddings) < 10:
    raise RuntimeError(f"Only {len(embeddings)} embeddings. ffmpeg may not have installed. Re-run install cell.")

# Persist features and labels so the probe can be retrained without re-embedding.
X = np.array(embeddings)
y = np.array(labels)
for out_path, arr in (
    ("/kaggle/working/hear_embeddings.npy", X),
    ("/kaggle/working/hear_labels.npy", y),
):
    np.save(out_path, arr)
print(f"Saved. X={X.shape}, y={y.shape}")

# ── TRAIN PROBE ───────────────────────────────────────────────────────────────
print("\nTraining logistic regression probe...")

# Stratified 80/20 split with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_tr)
X_tr_s = scaler.transform(X_tr)
X_te_s = scaler.transform(X_te)

probe = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced', random_state=42)
probe = probe.fit(X_tr_s, y_tr)

# Hard predictions for the report; class-1 probability for the AUC.
y_pred  = probe.predict(X_te_s)
y_proba = probe.predict_proba(X_te_s)[:, 1]
auc     = roc_auc_score(y_te, y_proba)

report = classification_report(y_te, y_pred, target_names=['Healthy', 'Respiratory'])
print(report)
print(f"AUC-ROC: {auc:.4f}")

# ── SAVE PACKAGE ──────────────────────────────────────────────────────────────
# Bundle the fitted probe and scaler with the preprocessing parameters used to
# produce the embeddings they were trained on.
pkg = dict(
    probe=probe,
    scaler=scaler,
    auc_roc=float(auc),
    embedding_dim=int(X.shape[1]),
    n_mels=n_mels,
    n_frames=n_frames,
    n_fft=n_fft,
    hop_length=hop_length,
    sample_rate=16000,
    clip_duration=2.0,
    label_map={0: "healthy", 1: "respiratory_suspect"},
)

with open("/kaggle/working/hear_tb_probe.pkl", "wb") as f:
    f.write(pickle.dumps(pkg))

print("\n✅ hear_tb_probe.pkl saved.")
print("Download from Output tab → save to aegis_sphere_v3/data/")

# ── FINAL INFERENCE TEST ──────────────────────────────────────────────────────
# End-to-end check on the first class-1 row: decode → embed → scale → score.
# NOTE(review): this row is drawn from `subset`, so it may also be in the
# training split; treat the score as a pipeline check, not an evaluation.
test_row   = subset[subset['binary_label'] == 1].iloc[0]
test_path  = os.path.join(AUDIO_DIR, test_row['audio_name'])
test_audio = load_audio(test_path)

print(f"\nTest sample : {test_row['audio_name']}")
if test_audio is None:
    # load_audio returns None on decode failure; get_embedding cannot take None.
    print("Could not decode test sample — skipping inference test.")
else:
    test_emb = get_embedding(test_audio, hear_model, device)
    risk     = probe.predict_proba(scaler.transform(test_emb.reshape(1, -1)))[0][1]
    # Class 1 covers both 'symptomatic' and 'COVID-19', so print the real status.
    print(f"True label  : {test_row['status']} (1)")
    print(f"Risk score  : {risk:.4f}  ← should be > 0.5")


Subset: 600 rows | {0: 300, 1: 300}

HeAR model config:
  image_size: [192, 128]
  Required mel bins: 192
  Required time frames: 128
  Using n_fft=2048, hop_length=250

Testing 3 files before full extraction...
  OK: 46d30a6e-5667-4ea6-aa72-e12d6f24755e.webm → embedding shape: (512,)
  OK: 3b73344a-d09c-4125-8113-3280f8ad0034.webm → embedding shape: (512,)
  OK: 011e91c2-b6c2-421d-8cfd-4294a5c7be44.webm → embedding shape: (512,)

Extracting embeddings from 600 files...
  [100/600] Embedded: 100 | Skipped: 0
  [200/600] Embedded: 200 | Skipped: 0
  [300/600] Embedded: 300 | Skipped: 0
  [400/600] Embedded: 400 | Skipped: 0
  [500/600] Embedded: 500 | Skipped: 0
  [600/600] Embedded: 600 | Skipped: 0

Finished — Embedded: 600 | Skipped: 0
Saved. X=(600, 512), y=(600,)

Training logistic regression probe...
              precision    recall  f1-score   support

     Healthy       0.53      0.48      0.50        60
 Respiratory       0.52      0.57      0.54        60

    accuracy       

In [3]:
import os, zipfile, shutil, json
import numpy as np
from PIL import Image

OUT = "/kaggle/working/faiss_case_library"
os.makedirs(OUT, exist_ok=True)

# ─────────────────────────────────────────────────────────────────────────────
# CORRECT BASE PATHS (as printed in your output)
# ─────────────────────────────────────────────────────────────────────────────
pulmo_base = "/kaggle/input/datasets/kmader/pulmonary-chest-xray-abnormalities"
tb_base    = "/kaggle/input/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset"
ham_base   = "/kaggle/input/datasets/kmader/skin-cancer-mnist-ham10000"
pcam_base  = "/kaggle/input/datasets/briansajeeved/pcam-col780-a2"

# Report, per dataset, whether it is mounted and how many image files it holds.
print("=== AVAILABLE DATASETS (fixed paths) ===")
for base in (pulmo_base, tb_base, ham_base, pcam_base):
    name = base.split("/")[-1]
    if not os.path.exists(base):
        print(f"  ❌ NOT FOUND: {name}")
        continue
    img_files = [
        os.path.join(folder, fn)
        for folder, _subdirs, names in os.walk(base)
        for fn in names
        if fn.lower().endswith(('.png','.jpg','.jpeg','.tif','.tiff'))
    ]
    print(f"  ✅ {name}: {len(img_files)} images found")

# ─────────────────────────────────────────────────────────────────────────────
# IMAGE EXTRACTION HELPERS
# ─────────────────────────────────────────────────────────────────────────────
IMAGE_EXTS = ('.png', '.jpg', '.jpeg')


def _iter_case_images(base, exts, keyword=None):
    """Yield image paths under `base` in os.walk order.

    base:    dataset root; nothing is yielded if it does not exist
    exts:    tuple of lowercase file extensions to accept
    keyword: if given, only directories whose path *relative to base* contains
             it (case-insensitive) are searched. Matching on the absolute path
             was a bug: the TB dataset folder name itself contains
             "tuberculosis", so every directory (including Normal/) matched.
    """
    if not os.path.exists(base):
        return
    for root, _dirs, files in os.walk(base):
        if keyword is not None and keyword not in os.path.relpath(root, base).lower():
            continue
        for fn in files:
            if fn.lower().endswith(exts):
                yield os.path.join(root, fn)


def _save_case_image(src, dst):
    """Save `src` as a 224x224 RGB JPEG (quality 95) at `dst`."""
    with Image.open(src) as raw:
        raw.convert("RGB").resize((224,224), Image.LANCZOS).save(dst, "JPEG", quality=95)


# ─────────────────────────────────────────────────────────────────────────────
# CASE 1 — TB-positive CXR (from TB dataset)
# ─────────────────────────────────────────────────────────────────────────────
print("\n--- case_1_cxr.jpg (TB-positive CXR) ---")
src = next(_iter_case_images(tb_base, IMAGE_EXTS, keyword="tuberculosis"), None)
if src is not None:
    _save_case_image(src, f"{OUT}/case_1_cxr.jpg")
    print(f"  Saved from: {src}")
else:
    # No folder named 'tuberculosis': fall back to the first image of any class.
    # NOTE(review): this fallback cannot tell TB from normal images.
    src = next(_iter_case_images(tb_base, IMAGE_EXTS), None)
    if src is not None:
        _save_case_image(src, f"{OUT}/case_1_cxr.jpg")
        print(f"  Fallback saved from: {src}")
    else:
        print("  ❌ Could not find any TB CXR in dataset")

# ─────────────────────────────────────────────────────────────────────────────
# CASE 2 — Normal CXR (from TB dataset or pulmonary dataset)
# ─────────────────────────────────────────────────────────────────────────────
print("\n--- case_2_cxr.jpg (Normal CXR) ---")
# Try explicit 'Normal' folder in TB dataset
src = next(_iter_case_images(tb_base, IMAGE_EXTS, keyword="normal"), None)
if src is not None:
    _save_case_image(src, f"{OUT}/case_2_cxr.jpg")
    print(f"  Saved from: {src}")
else:
    # Fallback: first PNG from the pulmonary dataset.
    # NOTE(review): nothing here checks that the image is a normal study.
    src = next(_iter_case_images(pulmo_base, ('.png',)), None)
    if src is not None:
        _save_case_image(src, f"{OUT}/case_2_cxr.jpg")
        print(f"  Fallback saved from: {src}")
    else:
        print("  ❌ Could not find normal CXR; will reuse case_1 as placeholder later")

# ─────────────────────────────────────────────────────────────────────────────
# CASE 5 — Second TB CXR
# ─────────────────────────────────────────────────────────────────────────────
print("\n--- case_5_cxr.jpg (Second TB CXR) ---")
tb_images = _iter_case_images(tb_base, IMAGE_EXTS, keyword="tuberculosis")
next(tb_images, None)          # skip the first match (used for case 1)
src = next(tb_images, None)    # second match, if there is one
if src is not None:
    _save_case_image(src, f"{OUT}/case_5_cxr.jpg")
    print(f"  Saved from: {src}")
elif os.path.exists(f"{OUT}/case_1_cxr.jpg"):
    shutil.copy(f"{OUT}/case_1_cxr.jpg", f"{OUT}/case_5_cxr.jpg")
    print("  Copied case_1_cxr.jpg as placeholder for case_5_cxr.jpg")
else:
    print("  ❌ No TB CXR available to copy")

# ─────────────────────────────────────────────────────────────────────────────
# CASE 3 — Skin lesion (HAM10000)
# ─────────────────────────────────────────────────────────────────────────────
print("\n--- case_3_derm.jpg (Skin lesion) ---")
src = next(_iter_case_images(ham_base, IMAGE_EXTS), None)
if src is not None:
    _save_case_image(src, f"{OUT}/case_3_derm.jpg")
    print(f"  Saved from: {src}")
else:
    print("  ❌ HAM10000 not found; generating synthetic derm image...")
    # Seeded noise background with 30 darker discs — a placeholder, not real data.
    rng = np.random.RandomState(3)
    img = np.zeros((224, 224, 3), dtype=np.uint8)
    img[:,:,0] = rng.randint(120,180,(224,224))
    img[:,:,1] = rng.randint(60,100,(224,224))
    img[:,:,2] = rng.randint(50,90,(224,224))
    for _ in range(30):
        cy,cx = rng.randint(20,204,2)
        r = rng.randint(10,40)
        y,x = np.ogrid[-cy:224-cy,-cx:224-cx]
        mask = x*x+y*y <= r*r
        img[mask] = [rng.randint(60,100),rng.randint(20,50),rng.randint(20,50)]
    Image.fromarray(img).save(f"{OUT}/case_3_derm.jpg")
    print("  Saved synthetic skin lesion")

# ─────────────────────────────────────────────────────────────────────────────
# CASE 4 — Histopathology (synthetic, since pcam has 0 images)
# ─────────────────────────────────────────────────────────────────────────────
print("\n--- case_4_path.jpg (Histopathology) ---")
# Seeded noise background with 80 small darker discs — a placeholder image.
rng = np.random.RandomState(4)
img = np.zeros((224,224,3), dtype=np.uint8)
img[:,:,0] = rng.randint(180,220,(224,224))
img[:,:,1] = rng.randint(100,150,(224,224))
img[:,:,2] = rng.randint(140,180,(224,224))
for _ in range(80):
    cy,cx = rng.randint(10,214,2)
    r = rng.randint(4,10)
    y,x = np.ogrid[-cy:224-cy,-cx:224-cx]
    mask = x*x+y*y <= r*r
    img[mask] = [80+rng.randint(0,30), 30+rng.randint(0,20), 100+rng.randint(0,30)]
Image.fromarray(img).save(f"{OUT}/case_4_path.jpg")
print("  Saved synthetic H&E histopathology")

# ─────────────────────────────────────────────────────────────────────────────
# VERIFY + METADATA + ZIP
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "="*55)
print("VERIFICATION")

# file name → (modality, diagnosis, staging, treatment, hiv_status, cd4)
# NOTE(review): these clinical descriptions are hand-written and are not derived
# from the source images (e.g. case_2 is taken from a 'Normal' folder) — confirm
# that is intended for the demo library.
required = {
    "case_1_cxr.jpg":  ("CXR",            "Pulmonary TB + HIV-associated lymphoma",      "Stage IIB",  "Rifabutin-based TB + CHOP",                True,  85),
    "case_2_cxr.jpg":  ("CXR",            "HIV-associated NHL, pulmonary involvement",   "Stage IVA",  "CHOP + Liposomal Doxorubicin",             True,  120),
    "case_3_derm.jpg": ("Derm",           "Kaposi Sarcoma cutaneous",                    "T1 I0 S0",   "ART intensification + Lipo Doxorubicin",   True,  45),
    "case_4_path.jpg": ("Histopathology", "High-grade B-cell lymphoma, lymph node",      "Stage IIB",  "CHOP + dose-adjusted Etoposide",           True,  200),
    "case_5_cxr.jpg":  ("CXR",            "TB-HIV coinfection, no malignancy",           "N/A",        "Rifabutin + HAART switch to Dolutegravir", True,  310),
}

cases_meta = []
for i, (fname, (mod, diag, stage, tx, hiv, cd4)) in enumerate(required.items(), 1):
    path = os.path.join(OUT, fname)
    if os.path.exists(path):
        kb = os.path.getsize(path)/1024
        # Context manager closes the file handle; it used to be left open.
        with Image.open(path) as img:
            size = img.size
        print(f"  ✅ {fname} — {size}px, {kb:.1f}KB")
    else:
        print(f"  ❌ MISSING: {fname}")
    # A metadata record is written for every case, even if its image is missing.
    cases_meta.append({
        "case_id":    f"CASE_{i:03d}",
        "image_file": fname,
        "modality":   mod,
        "diagnosis":  diag,
        "staging":    stage,
        "treatment":  tx,
        "outcome":    "Reference case",
        "hiv_status": hiv,
        "cd4":        cd4
    })

meta_path = f"{OUT}/case_metadata.json"
with open(meta_path, "w") as f:
    json.dump(cases_meta, f, indent=2)
print(f"\n✅ case_metadata.json written ({len(cases_meta)} cases)")

# Archive whatever exists; missing images are skipped rather than failing.
zip_path = "/kaggle/working/faiss_case_library.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    for fname in list(required.keys()) + ["case_metadata.json"]:
        src = os.path.join(OUT, fname)
        if os.path.exists(src):
            z.write(src, fname)

print(f"✅ faiss_case_library.zip written")
print("Download from Output tab → extract to aegis_sphere_v3/data/faiss_case_library/")
print("Then run locally: python training/build_faiss_index.py")


=== AVAILABLE DATASETS (fixed paths) ===
  ✅ pulmonary-chest-xray-abnormalities: 1076 images found
  ✅ tuberculosis-tb-chest-xray-dataset: 4200 images found
  ✅ skin-cancer-mnist-ham10000: 20030 images found
  ✅ pcam-col780-a2: 0 images found

--- case_1_cxr.jpg (TB-positive CXR) ---
  Saved from: /kaggle/input/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-173.png

--- case_2_cxr.jpg (Normal CXR) ---
  Saved from: /kaggle/input/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Normal/Normal-859.png

--- case_5_cxr.jpg (Second TB CXR) ---
  Saved from: /kaggle/input/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-334.png

--- case_3_derm.jpg (Skin lesion) ---
  Saved from: /kaggle/input/datasets/kmader/skin-cancer-mnist-ham10000/HAM10000_images_part_1/ISIC_0028933.jpg

--- case_4_path.jpg (Histopathology) ---
  Sav

In [5]:
import os, shutil, zipfile, json
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import soundfile as sf

OUT = "/kaggle/working/demo_case"
os.makedirs(OUT, exist_ok=True)

# ─────────────────────────────────────────────────────────────────────────────
# CONFIRMED PATHS (from your earlier diagnostic)
# ─────────────────────────────────────────────────────────────────────────────
AUDIO_DIR  = "/kaggle/input/datasets/orvile/coughvid-v3/public_dataset_v3/coughvid_20211012"
CSV_PATH   = "/kaggle/input/datasets/orvile/coughvid-v3/tabular_form/tabular_form/coughvid_v3.csv"
TB_BASE    = "/kaggle/input/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset"
HAM_BASE   = "/kaggle/input/datasets/kmader/skin-cancer-mnist-ham10000"

# ─────────────────────────────────────────────────────────────────────────────
# FILE 1: consultation.wav — real cough audio from COUGHVID
# Pick a symptomatic/COVID-19 cough with high cough_detected score
# ─────────────────────────────────────────────────────────────────────────────
print("=== FILE 1: consultation.wav ===")

meta = pd.read_csv(CSV_PATH)
symptomatic = meta[
    (meta['status'].isin(['symptomatic', 'COVID-19'])) &
    (meta['cough_detected'] > 0.95) &
    (meta['respiratory_condition'] == True)
].copy()

print(f"Candidate symptomatic coughs: {len(symptomatic)}")

# Find one whose audio file actually exists
found_audio = None
for _, row in symptomatic.iterrows():
    path = os.path.join(AUDIO_DIR, row['audio_name'])
    if os.path.exists(path):
        found_audio = (path, row)
        break

if found_audio:
    src_path, row = found_audio
    dst_wav = f"{OUT}/consultation.wav"
    
    # Convert to WAV using torchaudio (handles webm/ogg natively)
    import torchaudio
    import torchaudio.transforms as T_a
    
    waveform, sr = torchaudio.load(src_path)
    
    # Resample to 16kHz if needed
    if sr != 16000:
        resampler = T_a.Resample(orig_freq=sr, new_freq=16000)
        waveform  = resampler(waveform)
    
    # Convert to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    
    # Save as WAV
    torchaudio.save(dst_wav, waveform, 16000)
    
    dur = waveform.shape[1] / 16000
    print(f"  Saved: consultation.wav")
    print(f"  Source: {src_path}")
    print(f"  Duration: {dur:.2f}s | Sample rate: 16000Hz | Status: {row['status']}")
    print(f"  Cough detected: {row['cough_detected']:.4f}")
    print(f"  Patient: age={row['age']}, gender={row['gender']}, respiratory={row['respiratory_condition']}")
    
    # Save sidecar metadata for the demo runner
    demo_meta = {
        "source_file": row['audio_name'],
        "status":      row['status'],
        "age":         float(row['age']) if pd.notna(row['age']) else None,
        "gender":      row['gender'],
        "respiratory_condition": bool(row['respiratory_condition']),
        "cough_detected": float(row['cough_detected']),
        "note": "Real COUGHVID cough sample — symptomatic/COVID-19 with respiratory condition"
    }
    with open(f"{OUT}/consultation_meta.json", "w") as f:
        json.dump(demo_meta, f, indent=2)
    print(f"  Metadata saved: consultation_meta.json")
else:
    print("  No audio file found — generating synthetic cough WAV...")
    # Synthesize a simple cough-like burst (not medically valid, just for pipeline testing)
    sr = 16000
    duration = 2.0
    t  = np.linspace(0, duration, int(sr * duration))
    # Seeded generator so the placeholder is identical on every run (the noise
    # used to come from the unseeded global NumPy RNG).
    cough_rng = np.random.RandomState(0)
    # Cough-like: decaying noise burst + low-frequency resonance
    burst   = np.exp(-8 * t) * cough_rng.randn(len(t))
    tone    = 0.3 * np.sin(2 * np.pi * 200 * t) * np.exp(-5 * t)
    audio   = (burst + tone).astype(np.float32)
    audio  /= (np.max(np.abs(audio)) + 1e-8)
    sf.write(f"{OUT}/consultation.wav", audio, sr)
    print(f"  Saved synthetic consultation.wav (pipeline test only)")

    # Write the sidecar here too: the verification step lists
    # consultation_meta.json as required, and this branch used to skip it.
    demo_meta = {
        "source_file": None,
        "status":      None,
        "age":         None,
        "gender":      None,
        "respiratory_condition": None,
        "cough_detected": None,
        "note": "Synthetic cough-like burst — pipeline test only, not a real recording"
    }
    with open(f"{OUT}/consultation_meta.json", "w") as f:
        json.dump(demo_meta, f, indent=2)
    print(f"  Metadata saved: consultation_meta.json")

# ─────────────────────────────────────────────────────────────────────────────
# IMAGE HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def _first_image(base, exts, keyword=None):
    """Return the first image path under `base` in os.walk order, or None.

    base:    dataset root; None is returned if it does not exist
    exts:    tuple of lowercase file extensions to accept
    keyword: if given, only directories whose path *relative to base* contains
             it (case-insensitive) are searched. The absolute path must not be
             used: TB_BASE itself contains "tuberculosis", which made the
             previous filter match every directory.
    """
    if not os.path.exists(base):
        return None
    for root, _dirs, files in os.walk(base):
        if keyword is not None and keyword not in os.path.relpath(root, base).lower():
            continue
        for fn in files:
            if fn.lower().endswith(exts):
                return os.path.join(root, fn)
    return None


def _save_demo_image(src, dst):
    """Save `src` as a 224x224 RGB JPEG (quality 95) at `dst`."""
    with Image.open(src) as raw:
        raw.convert("RGB").resize((224, 224), Image.LANCZOS).save(dst, "JPEG", quality=95)


# ─────────────────────────────────────────────────────────────────────────────
# FILE 2: cxr.jpg — TB CXR from TB dataset
# ─────────────────────────────────────────────────────────────────────────────
print("\n=== FILE 2: cxr.jpg (TB CXR) ===")
pulmo = "/kaggle/input/datasets/kmader/pulmonary-chest-xray-abnormalities"

src = _first_image(TB_BASE, ('.png','.jpg','.jpeg'), keyword='tuberculosis')
if src is not None:
    _save_demo_image(src, f"{OUT}/cxr.jpg")
    print(f"  Saved: cxr.jpg")
    print(f"  Source: {src}")
    saved = True
else:
    # Try any CXR from pulmonary dataset
    src = _first_image(pulmo, ('.png',))
    if src is not None:
        _save_demo_image(src, f"{OUT}/cxr.jpg")
        print(f"  Saved: cxr.jpg (from pulmonary dataset)")
        print(f"  Source: {src}")
        saved = True
    else:
        saved = False

if not saved:
    print("  No CXR found — generating synthetic CXR placeholder")
    # Dark background (lung field)
    img = np.full((224, 224, 3), 20, dtype=np.uint8)
    # Bright rib-like arcs
    pil = Image.fromarray(img)
    draw = ImageDraw.Draw(pil)
    for i in range(3):
        y = 60 + i * 40
        draw.arc([20, y, 204, y+60], start=0, end=180, fill=(180, 180, 180), width=3)
    # TB-like opacity patch
    draw.ellipse([80, 40, 140, 90], fill=(140, 140, 140))
    pil.save(f"{OUT}/cxr.jpg", "JPEG", quality=95)
    print("  Saved synthetic CXR placeholder")

# ─────────────────────────────────────────────────────────────────────────────
# FILE 3: derm.jpg — skin lesion from HAM10000
# ─────────────────────────────────────────────────────────────────────────────
print("\n=== FILE 3: derm.jpg (Skin lesion) ===")
src = _first_image(HAM_BASE, ('.jpg','.jpeg','.png'))
saved = src is not None

if saved:
    _save_demo_image(src, f"{OUT}/derm.jpg")
    print(f"  Saved: derm.jpg")
    print(f"  Source: {src}")
else:
    print("  HAM10000 not available — generating synthetic derm placeholder")
    rng = np.random.RandomState(3)
    img = np.zeros((224, 224, 3), dtype=np.uint8)
    img[:,:,0] = rng.randint(180, 220, (224, 224))
    img[:,:,1] = rng.randint(140, 180, (224, 224))
    img[:,:,2] = rng.randint(120, 160, (224, 224))
    # Dark lesion center
    pil = Image.fromarray(img)
    draw = ImageDraw.Draw(pil)
    draw.ellipse([80, 80, 145, 145], fill=(60, 30, 25))
    draw.ellipse([90, 88, 135, 132], fill=(90, 45, 35))
    pil.save(f"{OUT}/derm.jpg", "JPEG", quality=95)
    print("  Saved synthetic derm placeholder")

# ─────────────────────────────────────────────────────────────────────────────
# FILE 4: path_patch.jpg — histopathology (synthetic H&E, PCam has 0 images)
# ─────────────────────────────────────────────────────────────────────────────
print("\n=== FILE 4: path_patch.jpg (Histopathology H&E patch) ===")

rng = np.random.RandomState(4)
img = np.zeros((224, 224, 3), dtype=np.uint8)

# Pink eosin stain background
img[:,:,0] = rng.randint(200, 235, (224, 224))
img[:,:,1] = rng.randint(150, 185, (224, 224))
img[:,:,2] = rng.randint(170, 205, (224, 224))

# Purple-blue hematoxylin nuclei
pil = Image.fromarray(img)
draw = ImageDraw.Draw(pil)
for _ in range(120):
    # Draw order matters: centre x, centre y, radius, then the three colour
    # channels, so the seeded image stays the same from run to run.
    cx = rng.randint(5, 219)
    cy = rng.randint(5, 219)
    r  = rng.randint(4, 10)
    color = (
        int(rng.randint(70, 110)),
        int(rng.randint(30, 60)),
        int(rng.randint(110, 160))
    )
    draw.ellipse([cx-r, cy-r, cx+r, cy+r], fill=color)

pil.save(f"{OUT}/path_patch.jpg", "JPEG", quality=95)
print(f"  Saved: path_patch.jpg (synthetic H&E — PCam has no images in dataset)")

# ─────────────────────────────────────────────────────────────────────────────
# VERIFY ALL FILES
# ─────────────────────────────────────────────────────────────────────────────
print("\n" + "="*55)
print("FINAL VERIFICATION")
required = ["consultation.wav", "cxr.jpg", "derm.jpg", "path_patch.jpg", "consultation_meta.json"]

# all_ok ends up False if any required file is absent from OUT.
all_ok = True
for fname in required:
    path = os.path.join(OUT, fname)
    if not os.path.exists(path):
        print(f"  ❌ MISSING: {fname}")
        all_ok = False
        continue
    kb = os.path.getsize(path) / 1024
    print(f"  ✅ {fname} — {kb:.1f} KB")

# ─────────────────────────────────────────────────────────────────────────────
# ZIP FOR DOWNLOAD
# ─────────────────────────────────────────────────────────────────────────────
# Only files that exist are archived; missing ones are skipped silently.
zip_path = "/kaggle/working/demo_case.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    for fname in required:
        src = os.path.join(OUT, fname)
        if not os.path.exists(src):
            continue
        z.write(src, fname)

zip_kb = os.path.getsize(zip_path)/1024
print(f"\n✅ demo_case.zip written ({zip_kb:.1f} KB)")
print("Download from Output tab → extract to aegis_sphere_v3/data/demo_case/")
print("\nFinal folder structure:")
print("  data/demo_case/")
for fname in required:
    print(f"    ├── {fname}")


=== FILE 1: consultation.wav ===
Candidate symptomatic coughs: 678
  Saved: consultation.wav
  Source: /kaggle/input/datasets/orvile/coughvid-v3/public_dataset_v3/coughvid_20211012/9cc8c2de-8733-4e5b-a18d-5bf9a5166c85.webm
  Duration: 9.90s | Sample rate: 16000Hz | Status: symptomatic
  Cough detected: 0.9777
  Patient: age=45.0, gender=male, respiratory=True
  Metadata saved: consultation_meta.json

=== FILE 2: cxr.jpg (TB CXR) ===
  Saved: cxr.jpg
  Source: /kaggle/input/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-173.png

=== FILE 3: derm.jpg (Skin lesion) ===
  Saved: derm.jpg
  Source: /kaggle/input/datasets/kmader/skin-cancer-mnist-ham10000/HAM10000_images_part_1/ISIC_0028933.jpg

=== FILE 4: path_patch.jpg (Histopathology H&E patch) ===
  Saved: path_patch.jpg (synthetic H&E — PCam has no images in dataset)

FINAL VERIFICATION
  ✅ consultation.wav — 309.5 KB
  ✅ cxr.jpg — 13.3 KB
  ✅ derm.jpg — 19.0 KB
  ✅ pa