In [None]:
# Mount Google Drive (Colab only): every path used by later cells lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Data Preprocessing

import os
import pandas as pd

DATASET_DIR = "/content/drive/MyDrive/ICBHI_final_database"
data = []

# os.listdir() returns names in arbitrary order. Sorting fixes the row order of
# the label table, so seeded splits of the saved CSV are reproducible across
# runs and machines.
for fname in sorted(os.listdir(DATASET_DIR)):
    if not fname.endswith(".txt"):
        continue

    # Swap only the trailing extension; str.replace would also rewrite any
    # ".txt" that happens to occur earlier in the file name.
    wav_name = os.path.splitext(fname)[0] + ".wav"
    wav_path = os.path.join(DATASET_DIR, wav_name)

    # Skip annotation files that have no matching .wav recording
    if not os.path.exists(wav_path):
        continue

    txt_path = os.path.join(DATASET_DIR, fname)
    wheeze_vals = []
    with open(txt_path, 'r') as f:
        for line in f:
            parts = line.split()
            # Only lines with exactly four whitespace-separated fields are used;
            # the last field is read as the integer wheeze flag.
            if len(parts) != 4:
                continue
            try:
                wheeze_vals.append(int(parts[-1]))
            except ValueError:
                # Last field is not an integer: ignore this line
                continue

    # Recording-level label: 1 if any parsed line carries a non-zero wheeze flag
    has_wheeze = int(any(wheeze_vals))
    data.append((wav_name, has_wheeze))

# Build the label table (one row per recording that has both .txt and .wav)
df = pd.DataFrame(data, columns=["filename", "wheeze_label"])
df['full_path'] = df['filename'].apply(lambda x: os.path.join(DATASET_DIR, x))

# Persist the labels; the training cells read this CSV back
df.to_csv("/content/drive/MyDrive/wheeze_labels_filtered.csv", index=False)

In [None]:
# Define Initial ViT Model

# Validation Accuracy: 0.7011
# F1-score: 0.5926
# ROC-AUC: 0.7276

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import librosa
import timm

# Paths and compute device
AUDIO_DIR = "/content/drive/MyDrive/ICBHI_final_database"
LABEL_PATH = "/content/drive/MyDrive/wheeze_labels_filtered.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label table: read the CSV and rebuild each audio path from its filename
df = pd.read_csv(LABEL_PATH)
df['full_path'] = df['filename'].map(lambda name: os.path.join(AUDIO_DIR, name))

# Stratified 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['wheeze_label'],
)

# Feature extractor
def audio_to_logmel(filepath, sr=16000, n_fft=1024, hop_length=512, n_mels=128):
    """Load an audio file and return its log-scaled mel spectrogram.

    Args:
        filepath: Path of the audio file, handed to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load`` and used for the mel transform.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of mel bands.

    Returns:
        The result of ``librosa.power_to_db`` applied to the clip's mel power
        spectrogram, referenced to the clip's own maximum (``ref=np.max``).
    """
    # Local import: this cell's import block does not bring in numpy, so on a
    # fresh kernel the bare name ``np`` would raise NameError on the first call.
    import numpy as np

    y, _ = librosa.load(filepath, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    log_mel = librosa.power_to_db(mel, ref=np.max)
    return log_mel

# Dataset class
class RespiratoryDataset(Dataset):
    """Map-style dataset that turns each recording into a ViT-sized image tensor.

    Each item is ``(image, label)``: the clip's log-mel spectrogram resized to
    224x224 and copied over three channels, together with the row's
    ``wheeze_label`` as a float32 scalar tensor.

    Args:
        df: Table with ``full_path`` and ``wheeze_label`` columns.
        sr: Sampling rate forwarded to ``audio_to_logmel``.
    """

    def __init__(self, df, sr=16000):
        self.df, self.sr = df, sr

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        target = record['wheeze_label']
        spec = audio_to_logmel(record['full_path'], sr=self.sr)

        # (H, W) -> (1, 1, H, W): interpolate wants batch and channel axes
        image = torch.tensor(spec)[None, None]
        image = torch.nn.functional.interpolate(image, size=(224, 224), mode='bilinear')
        # Drop the batch axis, then tile the single channel -> (3, 224, 224)
        image = image[0].repeat(3, 1, 1)

        return image, torch.tensor(target, dtype=torch.float32)

# Loaders: only the training split is shuffled
train_dataset, val_dataset = RespiratoryDataset(train_df), RespiratoryDataset(val_df)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# Model: pretrained ViT whose head is replaced by a single-logit linear layer.
# There is no Sigmoid in the model; BCEWithLogitsLoss below consumes raw logits.
model = timm.create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, 1)
model.to(device)

# Weighted loss: positives are weighted by the negative/positive count ratio
label_counts = df['wheeze_label'].value_counts()
neg = label_counts[0]
pos = label_counts[1]
pos_weight = torch.tensor([neg / pos]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop (5 epochs), validating after every epoch
for epoch in range(5):
    # Optimisation pass
    model.train()
    train_loss = 0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(X).view(-1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass (no gradients); losses are summed over batches
    model.eval()
    val_loss = 0
    preds_list, probs_list, targets_list = [], [], []
    with torch.no_grad():
        for X, y in val_loader:
            X = X.to(device)
            y = y.to(device)
            preds = model(X).view(-1)
            loss = criterion(preds, y)
            val_loss += loss.item()

            # Logits -> probabilities; 0.5 is the decision threshold
            probs = torch.sigmoid(preds)
            targets_list.extend(y.cpu().numpy())
            probs_list.extend(probs.cpu().numpy())
            preds_list.extend((probs > 0.5).int().cpu().numpy())

    acc = accuracy_score(targets_list, preds_list)
    f1 = f1_score(targets_list, preds_list)
    auc = roc_auc_score(targets_list, probs_list)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")

In [None]:
# Vision Transformer with SpecAugment, Fine-Tuning, and 30 Epoch Training
# SpecAugmentation Hyperparameter
# freq_mask_param = 15
# time_mask_param = 20

# Validation Accuracy: 0.6522
# F1-score: 0.6000
# ROC-AUC: 0.7131

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import timm

# Paths and compute device
AUDIO_DIR = "/content/drive/MyDrive/ICBHI_final_database"
LABEL_PATH = "/content/drive/MyDrive/wheeze_labels_filtered.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label table: read the CSV, rebuild each audio path, then take a stratified
# 80/20 train/validation split with a fixed seed
df = pd.read_csv(LABEL_PATH)
df['full_path'] = df['filename'].map(lambda name: os.path.join(AUDIO_DIR, name))
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['wheeze_label'],
)

# Audio to Log-Mel Conversion
def audio_to_logmel(filepath, sr=16000, n_fft=1024, hop_length=512, n_mels=128):
    """Return the log-scaled mel spectrogram of one audio file.

    The file is loaded with ``librosa.load`` at ``sr``, converted to a mel
    power spectrogram with the given ``n_fft`` / ``hop_length`` / ``n_mels``,
    and passed through ``librosa.power_to_db`` with ``ref=np.max``.
    """
    waveform = librosa.load(filepath, sr=sr)[0]
    mel_kwargs = dict(sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_power = librosa.feature.melspectrogram(y=waveform, **mel_kwargs)
    return librosa.power_to_db(mel_power, ref=np.max)

# Dataset with SpecAugment
class RespiratoryDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs for the ViT.

    The image is the clip's log-mel spectrogram, optionally SpecAugment-masked,
    resized to 224x224 and copied over three channels. The label is the row's
    ``wheeze_label`` as a float32 scalar tensor.

    Args:
        df: Table with ``full_path`` and ``wheeze_label`` columns.
        sr: Sampling rate forwarded to ``audio_to_logmel``.
        augment: When True, apply frequency and time masking to each item.
        freq_mask_param: ``freq_mask_param`` given to ``T.FrequencyMasking``.
        time_mask_param: ``time_mask_param`` given to ``T.TimeMasking``.
    """

    def __init__(self, df, sr=16000, augment=False, freq_mask_param=15, time_mask_param=20):
        self.df = df
        self.sr = sr
        self.augment = augment
        self.freq_mask = T.FrequencyMasking(freq_mask_param=freq_mask_param)
        self.time_mask = T.TimeMasking(time_mask_param=time_mask_param)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filepath = row['full_path']
        label = row['wheeze_label']
        log_mel = audio_to_logmel(filepath, sr=self.sr)
        x = torch.tensor(log_mel).unsqueeze(0)  # (1, H, W)

        if self.augment:
            # audio_to_logmel scales with ref=np.max, so 0 dB is the clip's peak.
            # The torchaudio masks fill with 0.0 unless told otherwise, which
            # would paint masked bands at peak energy; fill with the clip's
            # minimum so masked regions look like silence instead.
            fill = x.min().item()
            x = self.freq_mask(x, mask_value=fill)
            x = self.time_mask(x, mask_value=fill)

        # Add a batch axis for interpolate, resize, drop it again -> (1, 224, 224)
        x = torch.nn.functional.interpolate(x.unsqueeze(0), size=(224, 224), mode='bilinear').squeeze(0)
        x = x.repeat(3, 1, 1)  # (3, 224, 224)
        return x, torch.tensor(label, dtype=torch.float32)

# Loaders: SpecAugment is enabled for the training split only
train_dataset = RespiratoryDataset(train_df, augment=True)
val_dataset = RespiratoryDataset(val_df, augment=False)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# ViT Model + Fine-tuning strategy
model = timm.create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, 1)

# Freeze everything, then re-enable only parameters whose name contains
# "blocks.11" or "head"
for param in model.parameters():
    param.requires_grad = False
for name, param in model.named_parameters():
    if "blocks.11" in name or "head" in name:
        param.requires_grad = True

model.to(device)

# Weighted Loss for imbalance: positives are weighted by the neg/pos count ratio
label_counts = df['wheeze_label'].value_counts()
neg, pos = label_counts[0], label_counts[1]
pos_weight = torch.tensor([neg / pos]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)

# Single source of truth for the epoch count: the loop bound and the progress
# message previously disagreed (30 epochs run, "/50" printed).
NUM_EPOCHS = 30

# Training Loop (30 epochs)
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        preds = model(X).view(-1)
        loss = criterion(preds, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation (losses are summed over batches, not averaged)
    model.eval()
    val_loss = 0
    preds_list, probs_list, targets_list = [], [], []
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            preds = model(X).view(-1)
            loss = criterion(preds, y)
            val_loss += loss.item()

            # Logits -> probabilities; 0.5 is the decision threshold
            probs = torch.sigmoid(preds)
            preds_list.extend((probs > 0.5).int().cpu().numpy())
            probs_list.extend(probs.cpu().numpy())
            targets_list.extend(y.cpu().numpy())

    acc = accuracy_score(targets_list, preds_list)
    f1 = f1_score(targets_list, preds_list)
    auc = roc_auc_score(targets_list, probs_list)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")


In [None]:
# Vision Transformer with SpecAugment, Fine-Tuning, and 50 Epoch Training
# SpecAugmentation Hyperparameter
# freq_mask_param = 20
# time_mask_param = 60

# Validation Accuracy: 0.5707
# F1-score: 0.5093
# ROC-AUC: 0.5766

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import timm

# Paths and compute device
AUDIO_DIR = "/content/drive/MyDrive/ICBHI_final_database"
LABEL_PATH = "/content/drive/MyDrive/wheeze_labels_filtered.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label table: read the CSV, rebuild each audio path, then take a stratified
# 80/20 train/validation split with a fixed seed
df = pd.read_csv(LABEL_PATH)
df['full_path'] = df['filename'].map(lambda name: os.path.join(AUDIO_DIR, name))
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['wheeze_label'],
)

# Audio to Log-Mel Conversion
def audio_to_logmel(filepath, sr=16000, n_fft=1024, hop_length=512, n_mels=128):
    """Return the log-scaled mel spectrogram of one audio file.

    The file is loaded with ``librosa.load`` at ``sr``, converted to a mel
    power spectrogram with the given ``n_fft`` / ``hop_length`` / ``n_mels``,
    and passed through ``librosa.power_to_db`` with ``ref=np.max``.
    """
    waveform = librosa.load(filepath, sr=sr)[0]
    mel_kwargs = dict(sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_power = librosa.feature.melspectrogram(y=waveform, **mel_kwargs)
    return librosa.power_to_db(mel_power, ref=np.max)

# Dataset with SpecAugment
class RespiratoryDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs for the ViT.

    The image is the clip's log-mel spectrogram, optionally SpecAugment-masked,
    resized to 224x224 and copied over three channels. The label is the row's
    ``wheeze_label`` as a float32 scalar tensor.

    Args:
        df: Table with ``full_path`` and ``wheeze_label`` columns.
        sr: Sampling rate forwarded to ``audio_to_logmel``.
        augment: When True, apply frequency and time masking to each item.
        freq_mask_param: ``freq_mask_param`` given to ``T.FrequencyMasking``.
        time_mask_param: ``time_mask_param`` given to ``T.TimeMasking``.
    """

    def __init__(self, df, sr=16000, augment=False, freq_mask_param=20, time_mask_param=60):
        self.df = df
        self.sr = sr
        self.augment = augment
        self.freq_mask = T.FrequencyMasking(freq_mask_param=freq_mask_param)
        self.time_mask = T.TimeMasking(time_mask_param=time_mask_param)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filepath = row['full_path']
        label = row['wheeze_label']
        log_mel = audio_to_logmel(filepath, sr=self.sr)
        x = torch.tensor(log_mel).unsqueeze(0)  # (1, H, W)

        if self.augment:
            # audio_to_logmel scales with ref=np.max, so 0 dB is the clip's peak.
            # The torchaudio masks fill with 0.0 unless told otherwise, which
            # would paint masked bands at peak energy; fill with the clip's
            # minimum so masked regions look like silence instead.
            fill = x.min().item()
            x = self.freq_mask(x, mask_value=fill)
            x = self.time_mask(x, mask_value=fill)

        # Add a batch axis for interpolate, resize, drop it again -> (1, 224, 224)
        x = torch.nn.functional.interpolate(x.unsqueeze(0), size=(224, 224), mode='bilinear').squeeze(0)
        x = x.repeat(3, 1, 1)  # (3, 224, 224)
        return x, torch.tensor(label, dtype=torch.float32)

# Loaders: SpecAugment is enabled for the training split only
train_dataset, val_dataset = (
    RespiratoryDataset(train_df, augment=True),
    RespiratoryDataset(val_df, augment=False),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# ViT backbone with a fresh single-logit head
model = timm.create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, 1)

# Fine-tuning strategy: a parameter stays trainable only if its name contains
# "blocks.11" or "head"; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = ("blocks.11" in name) or ("head" in name)

model.to(device)

# Weighted loss for imbalance: positives are weighted by the neg/pos count ratio
label_counts = df['wheeze_label'].value_counts()
neg = label_counts[0]
pos = label_counts[1]
pos_weight = torch.tensor([neg / pos]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-5)

# Training loop (50 epochs), validating after every epoch
for epoch in range(50):
    # Optimisation pass
    model.train()
    train_loss = 0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(X).view(-1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass (no gradients); losses are summed over batches
    model.eval()
    val_loss = 0
    preds_list, probs_list, targets_list = [], [], []
    with torch.no_grad():
        for X, y in val_loader:
            X = X.to(device)
            y = y.to(device)
            preds = model(X).view(-1)
            loss = criterion(preds, y)
            val_loss += loss.item()

            # Logits -> probabilities; 0.5 is the decision threshold
            probs = torch.sigmoid(preds)
            targets_list.extend(y.cpu().numpy())
            probs_list.extend(probs.cpu().numpy())
            preds_list.extend((probs > 0.5).int().cpu().numpy())

    acc = accuracy_score(targets_list, preds_list)
    f1 = f1_score(targets_list, preds_list)
    auc = roc_auc_score(targets_list, probs_list)

    print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")


In [None]:
# Vision Transformer with SpecAugment, Fine-Tuning, and 50 Epoch Training
# SpecAugmentation Hyperparameter
# freq_mask_param = 0
# time_mask_param = 0

# Validation Accuracy: 0.6413
# F1-score: 0.5769
# ROC-AUC: 0.7134

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import timm

# Paths and compute device
AUDIO_DIR = "/content/drive/MyDrive/ICBHI_final_database"
LABEL_PATH = "/content/drive/MyDrive/wheeze_labels_filtered.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label table: read the CSV, rebuild each audio path, then take a stratified
# 80/20 train/validation split with a fixed seed
df = pd.read_csv(LABEL_PATH)
df['full_path'] = df['filename'].map(lambda name: os.path.join(AUDIO_DIR, name))
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['wheeze_label'],
)

# Audio to Log-Mel Conversion
def audio_to_logmel(filepath, sr=16000, n_fft=1024, hop_length=512, n_mels=128):
    """Return the log-scaled mel spectrogram of one audio file.

    The file is loaded with ``librosa.load`` at ``sr``, converted to a mel
    power spectrogram with the given ``n_fft`` / ``hop_length`` / ``n_mels``,
    and passed through ``librosa.power_to_db`` with ``ref=np.max``.
    """
    waveform = librosa.load(filepath, sr=sr)[0]
    mel_kwargs = dict(sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_power = librosa.feature.melspectrogram(y=waveform, **mel_kwargs)
    return librosa.power_to_db(mel_power, ref=np.max)

# Dataset with SpecAugment
class RespiratoryDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs for the ViT.

    The image is the clip's log-mel spectrogram, passed through the frequency
    and time masks when ``augment`` is True, resized to 224x224 and copied over
    three channels. The label is the row's ``wheeze_label`` as a float32 scalar.

    Both masks are built with a mask parameter of 0 in this cell.
    NOTE(review): presumably that makes them no-ops, i.e. this is the
    no-augmentation baseline; confirm against the torchaudio documentation.
    """

    def __init__(self, df, sr=16000, augment=False):
        self.df, self.sr, self.augment = df, sr, augment
        self.freq_mask = T.FrequencyMasking(freq_mask_param=0)
        self.time_mask = T.TimeMasking(time_mask_param=0)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        target = record['wheeze_label']
        # (H, W) -> (1, H, W)
        spec = torch.tensor(audio_to_logmel(record['full_path'], sr=self.sr))[None]

        if self.augment:
            # Frequency mask first, then time mask
            spec = self.time_mask(self.freq_mask(spec))

        # Resize with a temporary batch axis, then tile to three channels
        image = torch.nn.functional.interpolate(spec[None], size=(224, 224), mode='bilinear')[0]
        image = image.repeat(3, 1, 1)
        return image, torch.tensor(target, dtype=torch.float32)

# Loaders: the augment flag is set for the training split only
train_dataset, val_dataset = (
    RespiratoryDataset(train_df, augment=True),
    RespiratoryDataset(val_df, augment=False),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# ViT backbone with a fresh single-logit head
model = timm.create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, 1)

# Fine-tuning strategy: a parameter stays trainable only if its name contains
# "blocks.11" or "head"; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = ("blocks.11" in name) or ("head" in name)

model.to(device)

# Weighted loss for imbalance: positives are weighted by the neg/pos count ratio
label_counts = df['wheeze_label'].value_counts()
neg = label_counts[0]
pos = label_counts[1]
pos_weight = torch.tensor([neg / pos]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-5)

# Training loop (50 epochs), validating after every epoch
for epoch in range(50):
    # Optimisation pass
    model.train()
    train_loss = 0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(X).view(-1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass (no gradients); losses are summed over batches
    model.eval()
    val_loss = 0
    preds_list, probs_list, targets_list = [], [], []
    with torch.no_grad():
        for X, y in val_loader:
            X = X.to(device)
            y = y.to(device)
            preds = model(X).view(-1)
            loss = criterion(preds, y)
            val_loss += loss.item()

            # Logits -> probabilities; 0.5 is the decision threshold
            probs = torch.sigmoid(preds)
            targets_list.extend(y.cpu().numpy())
            probs_list.extend(probs.cpu().numpy())
            preds_list.extend((probs > 0.5).int().cpu().numpy())

    acc = accuracy_score(targets_list, preds_list)
    f1 = f1_score(targets_list, preds_list)
    auc = roc_auc_score(targets_list, probs_list)

    print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")


In [None]:
# Vision Transformer with SpecAugment, Fine-Tuning, and 50 Epoch Training
# SpecAugmentation Hyperparameter
# freq_mask_param = 3
# time_mask_param = 6

# Validation Accuracy: 0.6169
# F1-score: 0.6169
# ROC-AUC: 0.7468

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import timm

# Paths and compute device
AUDIO_DIR = "/content/drive/MyDrive/ICBHI_final_database"
LABEL_PATH = "/content/drive/MyDrive/wheeze_labels_filtered.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label table: read the CSV, rebuild each audio path, then take a stratified
# 80/20 train/validation split with a fixed seed
df = pd.read_csv(LABEL_PATH)
df['full_path'] = df['filename'].map(lambda name: os.path.join(AUDIO_DIR, name))
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['wheeze_label'],
)

# Audio to Log-Mel Conversion
def audio_to_logmel(filepath, sr=16000, n_fft=1024, hop_length=512, n_mels=128):
    """Return the log-scaled mel spectrogram of one audio file.

    The file is loaded with ``librosa.load`` at ``sr``, converted to a mel
    power spectrogram with the given ``n_fft`` / ``hop_length`` / ``n_mels``,
    and passed through ``librosa.power_to_db`` with ``ref=np.max``.
    """
    waveform = librosa.load(filepath, sr=sr)[0]
    mel_kwargs = dict(sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_power = librosa.feature.melspectrogram(y=waveform, **mel_kwargs)
    return librosa.power_to_db(mel_power, ref=np.max)

# Dataset with SpecAugment
class RespiratoryDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs for the ViT.

    The image is the clip's log-mel spectrogram, optionally SpecAugment-masked,
    resized to 224x224 and copied over three channels. The label is the row's
    ``wheeze_label`` as a float32 scalar tensor.

    Args:
        df: Table with ``full_path`` and ``wheeze_label`` columns.
        sr: Sampling rate forwarded to ``audio_to_logmel``.
        augment: When True, apply frequency and time masking to each item.
        freq_mask_param: ``freq_mask_param`` given to ``T.FrequencyMasking``.
        time_mask_param: ``time_mask_param`` given to ``T.TimeMasking``.
    """

    def __init__(self, df, sr=16000, augment=False, freq_mask_param=3, time_mask_param=6):
        self.df = df
        self.sr = sr
        self.augment = augment
        self.freq_mask = T.FrequencyMasking(freq_mask_param=freq_mask_param)
        self.time_mask = T.TimeMasking(time_mask_param=time_mask_param)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filepath = row['full_path']
        label = row['wheeze_label']
        log_mel = audio_to_logmel(filepath, sr=self.sr)
        x = torch.tensor(log_mel).unsqueeze(0)  # (1, H, W)

        if self.augment:
            # audio_to_logmel scales with ref=np.max, so 0 dB is the clip's peak.
            # The torchaudio masks fill with 0.0 unless told otherwise, which
            # would paint masked bands at peak energy; fill with the clip's
            # minimum so masked regions look like silence instead.
            fill = x.min().item()
            x = self.freq_mask(x, mask_value=fill)
            x = self.time_mask(x, mask_value=fill)

        # Add a batch axis for interpolate, resize, drop it again -> (1, 224, 224)
        x = torch.nn.functional.interpolate(x.unsqueeze(0), size=(224, 224), mode='bilinear').squeeze(0)
        x = x.repeat(3, 1, 1)  # (3, 224, 224)
        return x, torch.tensor(label, dtype=torch.float32)

# Loaders: SpecAugment is enabled for the training split only
train_dataset, val_dataset = (
    RespiratoryDataset(train_df, augment=True),
    RespiratoryDataset(val_df, augment=False),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# ViT backbone with a fresh single-logit head
model = timm.create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, 1)

# Fine-tuning strategy: a parameter stays trainable only if its name contains
# "blocks.11" or "head"; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = ("blocks.11" in name) or ("head" in name)

model.to(device)

# Weighted loss for imbalance: positives are weighted by the neg/pos count ratio
label_counts = df['wheeze_label'].value_counts()
neg = label_counts[0]
pos = label_counts[1]
pos_weight = torch.tensor([neg / pos]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-5)

# Training loop (50 epochs), validating after every epoch
for epoch in range(50):
    # Optimisation pass
    model.train()
    train_loss = 0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(X).view(-1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass (no gradients); losses are summed over batches
    model.eval()
    val_loss = 0
    preds_list, probs_list, targets_list = [], [], []
    with torch.no_grad():
        for X, y in val_loader:
            X = X.to(device)
            y = y.to(device)
            preds = model(X).view(-1)
            loss = criterion(preds, y)
            val_loss += loss.item()

            # Logits -> probabilities; 0.5 is the decision threshold
            probs = torch.sigmoid(preds)
            targets_list.extend(y.cpu().numpy())
            probs_list.extend(probs.cpu().numpy())
            preds_list.extend((probs > 0.5).int().cpu().numpy())

    acc = accuracy_score(targets_list, preds_list)
    f1 = f1_score(targets_list, preds_list)
    auc = roc_auc_score(targets_list, probs_list)

    print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")


In [None]:
# Vision Transformer with SpecAugment, Fine-Tuning, and 50 Epoch Training
# SpecAugmentation Hyperparameter
# freq_mask_param = 8
# time_mask_param = 12

# Validation Accuracy: 0.7174
# F1-score: 0.5938
# ROC-AUC: 0.7134

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import timm

# Paths and compute device
AUDIO_DIR = "/content/drive/MyDrive/ICBHI_final_database"
LABEL_PATH = "/content/drive/MyDrive/wheeze_labels_filtered.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label table: read the CSV, rebuild each audio path, then take a stratified
# 80/20 train/validation split with a fixed seed
df = pd.read_csv(LABEL_PATH)
df['full_path'] = df['filename'].map(lambda name: os.path.join(AUDIO_DIR, name))
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['wheeze_label'],
)

# Audio to Log-Mel Conversion
def audio_to_logmel(filepath, sr=16000, n_fft=1024, hop_length=512, n_mels=128):
    """Return the log-scaled mel spectrogram of one audio file.

    The file is loaded with ``librosa.load`` at ``sr``, converted to a mel
    power spectrogram with the given ``n_fft`` / ``hop_length`` / ``n_mels``,
    and passed through ``librosa.power_to_db`` with ``ref=np.max``.
    """
    waveform = librosa.load(filepath, sr=sr)[0]
    mel_kwargs = dict(sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_power = librosa.feature.melspectrogram(y=waveform, **mel_kwargs)
    return librosa.power_to_db(mel_power, ref=np.max)

# Dataset with SpecAugment
class RespiratoryDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs for the ViT.

    The image is the clip's log-mel spectrogram, optionally SpecAugment-masked,
    resized to 224x224 and copied over three channels. The label is the row's
    ``wheeze_label`` as a float32 scalar tensor.

    The default mask parameters (8 and 12) follow this cell's header; the code
    previously hard-coded 3 and 6, the values of the preceding experiment.
    NOTE(review): confirm which setting produced the metrics quoted in the header.

    Args:
        df: Table with ``full_path`` and ``wheeze_label`` columns.
        sr: Sampling rate forwarded to ``audio_to_logmel``.
        augment: When True, apply frequency and time masking to each item.
        freq_mask_param: ``freq_mask_param`` given to ``T.FrequencyMasking``.
        time_mask_param: ``time_mask_param`` given to ``T.TimeMasking``.
    """

    def __init__(self, df, sr=16000, augment=False, freq_mask_param=8, time_mask_param=12):
        self.df = df
        self.sr = sr
        self.augment = augment
        self.freq_mask = T.FrequencyMasking(freq_mask_param=freq_mask_param)
        self.time_mask = T.TimeMasking(time_mask_param=time_mask_param)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filepath = row['full_path']
        label = row['wheeze_label']
        log_mel = audio_to_logmel(filepath, sr=self.sr)
        x = torch.tensor(log_mel).unsqueeze(0)  # (1, H, W)

        if self.augment:
            # audio_to_logmel scales with ref=np.max, so 0 dB is the clip's peak.
            # The torchaudio masks fill with 0.0 unless told otherwise, which
            # would paint masked bands at peak energy; fill with the clip's
            # minimum so masked regions look like silence instead.
            fill = x.min().item()
            x = self.freq_mask(x, mask_value=fill)
            x = self.time_mask(x, mask_value=fill)

        # Add a batch axis for interpolate, resize, drop it again -> (1, 224, 224)
        x = torch.nn.functional.interpolate(x.unsqueeze(0), size=(224, 224), mode='bilinear').squeeze(0)
        x = x.repeat(3, 1, 1)  # (3, 224, 224)
        return x, torch.tensor(label, dtype=torch.float32)

# Loaders: SpecAugment is enabled for the training split only
train_dataset, val_dataset = (
    RespiratoryDataset(train_df, augment=True),
    RespiratoryDataset(val_df, augment=False),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# ViT backbone with a fresh single-logit head
model = timm.create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, 1)

# Fine-tuning strategy: a parameter stays trainable only if its name contains
# "blocks.11" or "head"; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = ("blocks.11" in name) or ("head" in name)

model.to(device)

# Weighted loss for imbalance: positives are weighted by the neg/pos count ratio
label_counts = df['wheeze_label'].value_counts()
neg = label_counts[0]
pos = label_counts[1]
pos_weight = torch.tensor([neg / pos]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=1e-5)

# Training loop (50 epochs), validating after every epoch
for epoch in range(50):
    # Optimisation pass
    model.train()
    train_loss = 0
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(X).view(-1)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation pass (no gradients); losses are summed over batches
    model.eval()
    val_loss = 0
    preds_list, probs_list, targets_list = [], [], []
    with torch.no_grad():
        for X, y in val_loader:
            X = X.to(device)
            y = y.to(device)
            preds = model(X).view(-1)
            loss = criterion(preds, y)
            val_loss += loss.item()

            # Logits -> probabilities; 0.5 is the decision threshold
            probs = torch.sigmoid(preds)
            targets_list.extend(y.cpu().numpy())
            probs_list.extend(probs.cpu().numpy())
            preds_list.extend((probs > 0.5).int().cpu().numpy())

    acc = accuracy_score(targets_list, preds_list)
    f1 = f1_score(targets_list, preds_list)
    auc = roc_auc_score(targets_list, probs_list)

    print(f"Epoch {epoch+1}/50 | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")
