In [1]:
## Defining the path for parent directory
DIR = '/kaggle/input/airs-ai-in-respiratory-sounds/'
# Defining all paths
train_path = '/kaggle/input/airs-ai-in-respiratory-sounds/train.csv'
test_path = '/kaggle/input/airs-ai-in-respiratory-sounds/test.csv'
sound_files_path = '/kaggle/input/airs-ai-in-respiratory-sounds/sounds/sounds/'

In [2]:
# ======================================
# 1️⃣ IMPORTS + DATA + WAV2VEC EMBEDDINGS
# ======================================
import os
import librosa
import numpy as np
import pandas as pd
import torch
import torchaudio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# ✅ Device selection
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ✅ Paths
train_csv = "/kaggle/input/airs-ai-in-respiratory-sounds/train.csv"
audio_path = "/kaggle/input/airs-ai-in-respiratory-sounds/sounds/sounds"

# ✅ Load dataset
train_df = pd.read_csv(train_csv)

# ✅ Tabular features
tab_features = [
    'age','gender','tbContactHistory','wheezingHistory','phlegmCough',
    'familyAsthmaHistory','feverHistory','coldPresent','packYears'
]

# ✅ WAV2VEC2 model
bundle = torchaudio.pipelines.WAV2VEC2_BASE
wav2vec_model = bundle.get_model().to(device).eval()
SR = 16000
FIXED_LENGTH = 5 * SR  # 5 seconds audio

# ✅ Map audio files
file_map = {}
for folder in os.listdir(audio_path):
    fpath = os.path.join(audio_path, folder)
    if os.path.isdir(fpath):
        wavs = [f for f in os.listdir(fpath) if f.endswith(".wav")]
        if wavs:
            file_map[folder] = os.path.join(fpath, wavs[0])

print("Total mapped audio files:", len(file_map))


# ✅ Audio loader + augmentation
def load_audio(file_id):
    y, sr = librosa.load(file_map[file_id], sr=SR)
    if len(y) < FIXED_LENGTH:
        y = np.pad(y, (0, FIXED_LENGTH - len(y)))
    else:
        y = y[:FIXED_LENGTH]
    y = y + 0.001 * np.random.randn(len(y))  # light noise
    return y

# ✅ Extract WAV2VEC embeddings
def get_embedding(audio):
    t = torch.tensor(audio, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        feat, _ = wav2vec_model.extract_features(t)
    return torch.mean(feat[-1], dim=1).cpu().numpy().squeeze()


# ✅ Build embedding dataset
audio_embs = []
valid_ids = []

for cid in train_df["candidateID"]:
    if cid not in file_map:
        continue
    emb = get_embedding(load_audio(cid))
    audio_embs.append(emb)
    valid_ids.append(cid)

audio_embs = np.array(audio_embs)

# ✅ Filter rows with audio
df = train_df[train_df["candidateID"].isin(valid_ids)]

# ✅ Extract tabular + target
X_tab = df[tab_features].values
y = df["disease"].values

print("Audio embeddings shape:", audio_embs.shape)


Using device: cuda


Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth
100%|██████████| 360M/360M [00:01<00:00, 320MB/s]


Total mapped audio files: 882
Audio embeddings shape: (546, 768)


In [3]:
# ======================================
# 2️⃣ PREPROCESS — IMPUTE + SCALE + COMBINE
# ======================================

# ✅ Impute missing tabular values
imputer = SimpleImputer(strategy="median")
X_tab = imputer.fit_transform(X_tab)

# ✅ Scale tabular features (audio already normalized)
scaler = StandardScaler()
X_tab = scaler.fit_transform(X_tab)

# ✅ Combine tabular + audio embeddings
X = np.hstack([X_tab, audio_embs])
print("Final feature shape:", X.shape)

# ✅ Train-val split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# ✅ Convert to tensors
import torch
X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)
y_val_t = torch.tensor(y_val, dtype=torch.long).to(device)

# ✅ DataLoaders
from torch.utils.data import TensorDataset, DataLoader
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=32, shuffle=False)

num_classes = len(np.unique(y))


Final feature shape: (546, 777)


In [4]:
# ===========================================================
# ✅ Improved MLP Model for Tabular + Audio Embeddings (Reproducible)
# ===========================================================
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------
# ✅ Set Random Seed for Reproducibility
# -----------------------------
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# -----------------------------
# ✅ Improved MLP Architecture
# -----------------------------
class TabAudioMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 768),
            nn.LayerNorm(768),
            nn.ReLU(),
            nn.Dropout(0.65),

            nn.Linear(768, 384),
            nn.LayerNorm(384),
            nn.ReLU(),
            nn.Dropout(0.55),

            nn.Linear(384, 192),
            nn.LayerNorm(192),
            nn.ReLU(),
            nn.Dropout(0.45),

            nn.Linear(192, num_classes)
        )

    def forward(self, x):
        return self.net(x)

# -----------------------------
# ✅ Initialize Model
# -----------------------------
model = TabAudioMLP(X_train.shape[1], num_classes).to(device)

# ✅ Label smoothing improves generalization
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# ✅ AdamW optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)

# ✅ Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.3, patience=5, verbose=True
)

# -----------------------------
# ✅ Training Loop with Early Stopping
# -----------------------------
epochs = 300
best_acc = 0
patience = 100
patience_counter = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)

        optimizer.step()
        total_loss += loss.item()

    # -----------------------------
    # ✅ Validation
    # -----------------------------
    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in val_loader:
            xb = xb.to(device)
            preds.extend(torch.argmax(model(xb), dim=1).cpu().numpy())

    acc = accuracy_score(y_val, preds)
    scheduler.step(acc)

    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f} | Val Acc: {acc:.4f}")

    # ✅ Early stopping
    if acc > best_acc:
        best_acc = acc
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pth")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("⛔ Early stopping triggered!")
            break

# -----------------------------
# ✅ Final Evaluation
# -----------------------------
print("\n✅ Validation Accuracy:", accuracy_score(y_val, preds))
print("\n✅ Classification Report:\n", classification_report(y_val, preds))
print("\n✅ Confusion Matrix:\n", confusion_matrix(y_val, preds))




Epoch 1/300 | Loss: 17.1951 | Val Acc: 0.4390
Epoch 2/300 | Loss: 16.3533 | Val Acc: 0.4634
Epoch 3/300 | Loss: 15.3853 | Val Acc: 0.5122
Epoch 4/300 | Loss: 15.3712 | Val Acc: 0.6098
Epoch 5/300 | Loss: 15.0652 | Val Acc: 0.6585
Epoch 6/300 | Loss: 14.7121 | Val Acc: 0.6707
Epoch 7/300 | Loss: 14.3255 | Val Acc: 0.7073
Epoch 8/300 | Loss: 13.4680 | Val Acc: 0.8049
Epoch 9/300 | Loss: 12.8879 | Val Acc: 0.8171
Epoch 10/300 | Loss: 12.7930 | Val Acc: 0.8415
Epoch 11/300 | Loss: 11.5749 | Val Acc: 0.8415
Epoch 12/300 | Loss: 11.5616 | Val Acc: 0.8659
Epoch 13/300 | Loss: 11.2595 | Val Acc: 0.8415
Epoch 14/300 | Loss: 10.6639 | Val Acc: 0.8415
Epoch 15/300 | Loss: 10.5287 | Val Acc: 0.8415
Epoch 16/300 | Loss: 10.2679 | Val Acc: 0.8537
Epoch 17/300 | Loss: 10.1708 | Val Acc: 0.8659
Epoch 18/300 | Loss: 9.5893 | Val Acc: 0.8659
Epoch 19/300 | Loss: 9.7361 | Val Acc: 0.8659
Epoch 20/300 | Loss: 9.5750 | Val Acc: 0.8659
Epoch 21/300 | Loss: 9.2983 | Val Acc: 0.8659
Epoch 22/300 | Loss: 9.464

In [5]:
# ===========================================================
# ✅ Prepare Kaggle Submission (Tabular + Audio)
# ===========================================================
import pandas as pd
import numpy as np
import torch
import librosa

# -----------------------------
# 1️⃣ Load test CSV
# -----------------------------
test_csv = "/kaggle/input/airs-ai-in-respiratory-sounds/test.csv"
test_df = pd.read_csv(test_csv)
test_ids = test_df['candidateID'].values

# -----------------------------
# 2️⃣ Extract tabular features
# -----------------------------
tab_features = [
    'age','gender','tbContactHistory','wheezingHistory','phlegmCough',
    'familyAsthmaHistory','feverHistory','coldPresent','packYears'
]
X_test_tab = test_df[tab_features].values

# Impute missing tabular values using training imputer
X_test_tab = imputer.transform(X_test_tab)
# Scale tabular features using training scaler
X_test_tab = scaler.transform(X_test_tab)

# -----------------------------
# 3️⃣ Extract audio embeddings
# -----------------------------
audio_embs_test = []
for cid in test_df['candidateID']:
    if cid in file_map:  # use same mapping as train
        y, _ = librosa.load(file_map[cid], sr=SR)
        if len(y) < FIXED_LENGTH:
            y = np.pad(y, (0, FIXED_LENGTH - len(y)))
        else:
            y = y[:FIXED_LENGTH]
        y = y + 0.001 * np.random.randn(len(y))  # light noise
        # WAV2VEC embedding
        t = torch.tensor(y, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            feat, _ = wav2vec_model.extract_features(t)
        emb = torch.mean(feat[-1], dim=1).cpu().numpy().squeeze()
        audio_embs_test.append(emb)
    else:
        # if no audio, fill zeros
        audio_embs_test.append(np.zeros(audio_embs.shape[1]))

audio_embs_test = np.array(audio_embs_test)

# -----------------------------
# 4️⃣ Combine tabular + audio
# -----------------------------
X_test_combined = np.hstack([X_test_tab, audio_embs_test])
X_test_tensor = torch.tensor(X_test_combined, dtype=torch.float32).to(device)

# -----------------------------
# 5️⃣ Make predictions
# -----------------------------
model.eval()
with torch.no_grad():
    test_preds = torch.argmax(model(X_test_tensor), dim=1).cpu().numpy()

# -----------------------------
# 6️⃣ Prepare submission CSV
# -----------------------------
submission = pd.DataFrame({
    "candidateID": test_ids,
    "target": test_preds
})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created: submission.csv")


✅ Submission file created: submission.csv
