In [11]:
import os
import torch
import torchaudio
from transformers import HubertModel, Wav2Vec2FeatureExtractor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
import pickle

In [2]:
# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [4]:
# Path to dataset
DATASET_PATH = r"C:\Users\srira\OneDrive\Desktop\pride\Final_dataset"

In [5]:
# Load HuBERT feature extractor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

# Load HuBERT model (we use the base model without classification head)
hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(device)
hubert_model.eval()  # Set to eval mode (we won't fine-tune it)

preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

In [None]:
class AudioDataset(Dataset):
    def __init__(self, file_paths, labels):
        self.file_paths = file_paths
        self.labels = labels

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        audio, sr = torchaudio.load(self.file_paths[idx])  # shape: [channels, time]

        # Convert to mono if stereo
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0)
        else:
            audio = audio.squeeze(0)

        # Resample if not 16kHz
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            audio = resampler(audio)

        # Feature extraction using HuBERT
        input_values = feature_extractor(audio.numpy(), sampling_rate=16000, return_tensors="pt").input_values.to(device)
        with torch.no_grad():
            hidden_states = hubert_model(input_values).last_hidden_state  # [batch, time, features]

        # Take mean across time dimension
        features = torch.mean(hidden_states, dim=1).squeeze().cpu()  # shape: [768]

        label = self.labels[idx]
        return features, label


In [18]:
# Get all file paths and labels
file_paths = []
labels = []

for label in os.listdir(DATASET_PATH):
    label_folder = os.path.join(DATASET_PATH, label)
    if os.path.isdir(label_folder):
        for file in os.listdir(label_folder):
            if file.endswith(".wav"):
                file_paths.append(os.path.join(label_folder, file))
                labels.append(label)

# Encode labels to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [19]:
# Save label encoder to file
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# Split dataset
train_paths, val_paths, train_labels, val_labels = train_test_split(
    file_paths, encoded_labels, test_size=0.2, stratify=encoded_labels, random_state=42
)

# Create datasets and dataloaders
train_dataset = AudioDataset(train_paths, train_labels)
val_dataset = AudioDataset(val_paths, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [20]:
# Define classifier
class AudioClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(AudioClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        return self.fc(x)

# Instantiate model
input_dim = 768  # HuBERT base model output size
num_classes = len(label_encoder.classes_)
model = AudioClassifier(input_dim, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [21]:
# Training loop with best model saving
EPOCHS = 10
best_acc = 0.0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    for features, labels in tqdm(train_loader, desc="Training"):
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    train_acc = correct / total
    print(f"Loss: {total_loss:.4f}, Train Acc: {train_acc:.4f}")

    # Validation
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for features, labels in tqdm(val_loader, desc="Validation"):
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)
    val_acc = val_correct / val_total
    print(f"Validation Accuracy: {val_acc:.4f}")

    # Save best model
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")
        print(f"✅ Saved best model with val acc: {best_acc:.4f}")


Epoch 1/10


Training: 100%|██████████| 70/70 [05:43<00:00,  4.91s/it]


Loss: 127.9232, Train Acc: 0.3750


Validation: 100%|██████████| 18/18 [01:20<00:00,  4.47s/it]


Validation Accuracy: 0.6000
✅ Saved best model with val acc: 0.6000

Epoch 2/10


Training: 100%|██████████| 70/70 [06:47<00:00,  5.82s/it]


Loss: 108.7412, Train Acc: 0.6143


Validation: 100%|██████████| 18/18 [01:48<00:00,  6.03s/it]


Validation Accuracy: 0.6929
✅ Saved best model with val acc: 0.6929

Epoch 3/10


Training: 100%|██████████| 70/70 [07:01<00:00,  6.02s/it]


Loss: 91.4651, Train Acc: 0.6804


Validation: 100%|██████████| 18/18 [01:44<00:00,  5.81s/it]


Validation Accuracy: 0.6500

Epoch 4/10


Training: 100%|██████████| 70/70 [07:38<00:00,  6.55s/it]


Loss: 77.3219, Train Acc: 0.7268


Validation: 100%|██████████| 18/18 [01:47<00:00,  5.96s/it]


Validation Accuracy: 0.7000
✅ Saved best model with val acc: 0.7000

Epoch 5/10


Training: 100%|██████████| 70/70 [07:15<00:00,  6.23s/it]


Loss: 68.8170, Train Acc: 0.7286


Validation: 100%|██████████| 18/18 [01:47<00:00,  5.95s/it]


Validation Accuracy: 0.7286
✅ Saved best model with val acc: 0.7286

Epoch 6/10


Training: 100%|██████████| 70/70 [06:46<00:00,  5.81s/it]


Loss: 60.7493, Train Acc: 0.7589


Validation: 100%|██████████| 18/18 [01:03<00:00,  3.53s/it]


Validation Accuracy: 0.7357
✅ Saved best model with val acc: 0.7357

Epoch 7/10


Training: 100%|██████████| 70/70 [04:15<00:00,  3.65s/it]


Loss: 55.1863, Train Acc: 0.7964


Validation: 100%|██████████| 18/18 [01:03<00:00,  3.54s/it]


Validation Accuracy: 0.7643
✅ Saved best model with val acc: 0.7643

Epoch 8/10


Training: 100%|██████████| 70/70 [04:20<00:00,  3.72s/it]


Loss: 50.9710, Train Acc: 0.8018


Validation: 100%|██████████| 18/18 [01:02<00:00,  3.47s/it]


Validation Accuracy: 0.7643

Epoch 9/10


Training: 100%|██████████| 70/70 [10:27<00:00,  8.97s/it]


Loss: 47.7888, Train Acc: 0.8036


Validation: 100%|██████████| 18/18 [02:37<00:00,  8.74s/it]


Validation Accuracy: 0.7786
✅ Saved best model with val acc: 0.7786

Epoch 10/10


Training: 100%|██████████| 70/70 [07:34<00:00,  6.50s/it]


Loss: 43.8976, Train Acc: 0.8214


Validation: 100%|██████████| 18/18 [03:12<00:00, 10.71s/it]

Validation Accuracy: 0.7786





In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Put model in evaluation mode
model.eval()

# Store true and predicted labels
y_true = []
y_pred = []

with torch.no_grad():
    for features, labels in val_loader:  # <-- fixed from test_loader to val_loader
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs, 1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)
class_names = label_encoder.classes_

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.tight_layout()

# Save the confusion matrix image
plt.savefig('confusion_matrix.png')
plt.close()

print("✅ Confusion matrix saved as 'confusion_matrix.png'")


✅ Confusion matrix saved as 'confusion_matrix.png'
