First we will load all voices into 5 sec chunks and label them healthy or unhealthy

In [1]:
import os
import torch
import torchaudio
import torchaudio.transforms as T

# Paths
healthy_path = "data/healthy"
unhealthy_path = "data/unhealthy"


output_path = "processed_chunks"
os.makedirs(output_path, exist_ok=True)

chunk_duration = 5  # seconds
target_sr = 22050   # sampling rate

def process_folder(input_path, label):
    files = [f for f in os.listdir(input_path) if f.endswith(".wav")]
    print(f"📂 Found {len(files)} files in {input_path}")

    all_chunks = []
    samples_per_chunk = target_sr * chunk_duration

    for file in files:
        file_path = os.path.join(input_path, file)

        # Load audio
        waveform, sr = torchaudio.load(file_path)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample if needed
        if sr != target_sr:
            resampler = T.Resample(sr, target_sr)
            waveform = resampler(waveform)

        total_samples = waveform.shape[1]
        num_chunks = total_samples // samples_per_chunk
        remainder = total_samples % samples_per_chunk

        # Full 5s chunks
        for i in range(num_chunks):
            start = i * samples_per_chunk
            end = start + samples_per_chunk
            chunk = waveform[:, start:end]
            all_chunks.append((chunk, label))

            # Save chunk
            out_file = f"{os.path.splitext(file)[0]}_chunk{i}_{label}.wav"
            torchaudio.save(os.path.join(output_path, out_file), chunk, target_sr)

        # Handle remainder
        if remainder > 0:
            last_chunk = waveform[:, -remainder:]
            pad_size = samples_per_chunk - remainder
            padded = torch.nn.functional.pad(last_chunk, (0, pad_size))
            all_chunks.append((padded, label))

            out_file = f"{os.path.splitext(file)[0]}_lastchunk_{label}.wav"
            torchaudio.save(os.path.join(output_path, out_file), padded, target_sr)

    return all_chunks


# Process both sets
healthy_chunks = process_folder(healthy_path, label=0)     # 0 = healthy
unhealthy_chunks = process_folder(unhealthy_path, label=1) # 1 = unhealthy

# Combine dataset
dataset = healthy_chunks + unhealthy_chunks
print(f"✅ Total chunks created: {len(dataset)}")


📂 Found 43 files in data/healthy




📂 Found 32 files in data/unhealthy
✅ Total chunks created: 2149


In [8]:

from torch.utils.data import Dataset, DataLoader, random_split
import random


class VoiceDataset(Dataset):
    def __init__(self, chunk_dir, n_mels=64, augment=False):
        self.chunk_dir = chunk_dir
        self.files = [f for f in os.listdir(chunk_dir) if f.endswith(".wav")]
        self.augment = augment

        # Mel Spectrogram transform
        self.mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=22050,
            n_fft=1024,
            hop_length=512,
            n_mels=n_mels
        )

    def add_augmentation(self, waveform, sr):
        if random.random() < 0.5:  # Add noise
            noise = torch.randn_like(waveform) * 0.005
            waveform = waveform + noise

        if random.random() < 0.5:  # Random gain (volume up/down)
            gain = random.uniform(0.8, 1.2)
            waveform = waveform * gain

        if random.random() < 0.3:  # Pitch shift
            n_steps = random.uniform(-2, 2)
            waveform = torchaudio.functional.pitch_shift(waveform, sr, n_steps)

        return waveform

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file = self.files[idx]
        file_path = os.path.join(self.chunk_dir, file)

        # Load audio
        waveform, sr = torchaudio.load(file_path)

        # Convert to mono if needed
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Ensure consistent sample rate
        if sr != 22050:
            resampler = torchaudio.transforms.Resample(sr, 22050)
            waveform = resampler(waveform)

        # 🔥 Apply augmentation only in training
        if self.augment:
            waveform = self.add_augmentation(waveform, 22050)

        # Convert to Mel spectrogram
        mel = self.mel_transform(waveform)
        mel_db = torchaudio.functional.amplitude_to_DB(
            mel, multiplier=10.0, amin=1e-10, db_multiplier=0
        )

        # Label from filename (_0 = healthy, _1 = unhealthy)
        label = 1 if "_1" in file else 0

        return mel_db, torch.tensor(label, dtype=torch.long)


# Create dataset
dataset = VoiceDataset("processed_chunks")

# Train-test split (80/20)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# ⚡ Enable augmentation only for train set
train_dataset.dataset.augment = True
test_dataset.dataset.augment = False

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

print(f"✅ Total samples: {len(dataset)}")
print(f"📊 Train: {len(train_dataset)}, Test: {len(test_dataset)}")



✅ Total samples: 2149
📊 Train: 1719, Test: 430


In [9]:
mel, label = dataset[0]
print("Mel shape:", mel.shape)
print("Label:", label)

Mel shape: torch.Size([1, 64, 216])
Label: tensor(0)


In [10]:
import torch.nn as nn
import torch.nn.functional as F


class CNNVoiceClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(CNNVoiceClassifier, self).__init__()

        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool3 = nn.AdaptiveAvgPool2d((4, 4))  # compress to fixed size

        self.fc1 = nn.Linear(64 * 4 * 4, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)  # flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [11]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNVoiceClassifier(num_classes=2).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss, correct, total = 0, 0, 0

    for mel, labels in train_loader:
        mel, labels = mel.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(mel)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = 100. * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {running_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}%")


Epoch [1/20] Loss: 0.6240, Train Acc: 67.25%
Epoch [2/20] Loss: 0.5147, Train Acc: 75.57%
Epoch [3/20] Loss: 0.4482, Train Acc: 76.61%
Epoch [4/20] Loss: 0.4123, Train Acc: 80.63%
Epoch [5/20] Loss: 0.3716, Train Acc: 83.19%
Epoch [6/20] Loss: 0.3274, Train Acc: 83.94%
Epoch [7/20] Loss: 0.3111, Train Acc: 84.82%
Epoch [8/20] Loss: 0.2893, Train Acc: 87.03%
Epoch [9/20] Loss: 0.2521, Train Acc: 88.89%
Epoch [10/20] Loss: 0.2546, Train Acc: 89.47%
Epoch [11/20] Loss: 0.1984, Train Acc: 91.80%
Epoch [12/20] Loss: 0.1914, Train Acc: 92.20%
Epoch [13/20] Loss: 0.1831, Train Acc: 92.03%
Epoch [14/20] Loss: 0.1976, Train Acc: 91.56%
Epoch [15/20] Loss: 0.1458, Train Acc: 93.83%
Epoch [16/20] Loss: 0.1299, Train Acc: 94.53%
Epoch [17/20] Loss: 0.1580, Train Acc: 92.84%
Epoch [18/20] Loss: 0.0985, Train Acc: 96.39%
Epoch [19/20] Loss: 0.1850, Train Acc: 93.25%
Epoch [20/20] Loss: 0.1396, Train Acc: 94.82%


In [12]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for mel, labels in test_loader:
        mel, labels = mel.to(device), labels.to(device)
        outputs = model(mel)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

print(f"✅ Test Accuracy: {100. * correct / total:.2f}%")

✅ Test Accuracy: 91.86%


In [13]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for mel, labels in test_loader:
        mel, labels = mel.to(device), labels.to(device)
        outputs = model(mel)
        _, predicted = outputs.max(1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

# Classification report
print("📊 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=["Healthy", "Unhealthy"]))

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds)
print("🧮 Confusion Matrix:")
print(cm)




📊 Classification Report:
              precision    recall  f1-score   support

     Healthy       0.90      0.97      0.93       258
   Unhealthy       0.95      0.84      0.89       172

    accuracy                           0.92       430
   macro avg       0.93      0.91      0.91       430
weighted avg       0.92      0.92      0.92       430

🧮 Confusion Matrix:
[[251   7]
 [ 28 144]]
