## Install Required Packages

To install the necessary dependencies, run the following command:

In [2]:
# Use the %pip magic explicitly so the packages are installed into the environment
# of the running kernel and the cell does not rely on IPython's automagic setting.
%pip install pandas numpy librosa scikit-learn torch




## Import Required Libraries

The following Python libraries are used in this project:

In [3]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.cuda.amp import autocast, GradScaler



## Setting Paths and Initial Variables

### Define Dataset Paths
Set up paths for the dataset and metadata files:

In [4]:
# Define paths
# The dataset root can be overridden with the AUDIO_DATASET_DIR environment
# variable; when it is unset, the original local directory is used.
dataset_path = os.environ.get("AUDIO_DATASET_DIR", "C:/Users/avixa/Desktop/Madhav Lab")
# Metadata CSV files sit directly under the dataset root
train_metadata_path = os.path.join(dataset_path, "metadata_train.csv")
test_metadata_path = os.path.join(dataset_path, "metadata_test.csv")
# Audio files are kept in one sub-folder per split
train_folder = os.path.join(dataset_path, "train")
test_folder = os.path.join(dataset_path, "test")

## Load Metadata
Read the training and testing metadata from CSV files:

In [5]:
# Read the two metadata tables from disk: training first, then testing
train_metadata = pd.read_csv(filepath_or_buffer=train_metadata_path)
test_metadata = pd.read_csv(filepath_or_buffer=test_metadata_path)

## Define Audio Parameters
Set key parameters for audio processing:

In [6]:
# Audio configuration shared by the datasets
SAMPLE_RATE = 8_000  # sampling rate in Hz
AUDIO_LENGTH = SAMPLE_RATE * 4  # number of samples in a 4-second clip at SAMPLE_RATE

## Encode Class Labels
Use LabelEncoder to convert class labels into numerical values:

In [7]:
# Use LabelEncoder to encode the classes
# The encoder is fitted on the training labels only and then reused for the test
# labels, so both splits share one class-to-integer mapping.
le = LabelEncoder()
# NOTE(review): this appends a new column to train_metadata. AudioDataset looks the
# label up by column position, so the appended column must land where the dataset
# expects it - TODO confirm against the CSV layout.
train_metadata['Class ID Encoded'] = le.fit_transform(train_metadata['Class ID'])
# The test CSV names its label column 'Class_id' whereas the training CSV uses 'Class ID'.
test_metadata['Class_id Encoded'] = le.transform(test_metadata['Class_id'])  # transform only: reuse the mapping fitted on the training labels
# Number of distinct class IDs in the training metadata; passed to AudioClassifier as num_classes.
NUM_CLASSES = train_metadata['Class ID'].nunique()

# Defining the Dataset Class

## AudioDataset Class

The `AudioDataset` class is a custom dataset for loading and preprocessing audio files.

In [9]:
# Define Dataset class
class AudioDataset(Dataset):
    """Dataset that serves the audio clips listed in a metadata table as fixed-length waveforms.

    Each item is a ``(signal, label)`` pair: ``signal`` is a float32 tensor of
    shape ``(1, audio_length)`` and ``label`` is an int64 scalar tensor.

    Args:
        metadata: DataFrame with one row per clip. The file name (relative to
            ``audio_dir``) and the encoded label are read by column position.
        audio_dir: Directory that contains the audio files.
        sample_rate: Rate handed to ``librosa.load`` as ``sr``.
        audio_length: Number of samples every returned clip is padded or
            truncated to.
        is_train: Kept for backward compatibility. Training and test rows are
            read from the same column positions, so it does not change how
            items are loaded.
        filename_col: Positional index of the file-name column (default 0).
        label_col: Positional index of the encoded-label column (default 3).
    """

    def __init__(self, metadata, audio_dir, sample_rate, audio_length, is_train=True,
                 filename_col=0, label_col=3):
        self.metadata = metadata
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.audio_length = audio_length
        self.is_train = is_train
        self.filename_col = filename_col
        self.label_col = label_col

    def __len__(self):
        """Return the number of rows (clips) in the metadata table."""
        return len(self.metadata)

    def _fit_length(self, signal):
        """Zero-pad at the end, or truncate, so ``signal`` has exactly ``audio_length`` samples."""
        missing = self.audio_length - len(signal)
        if missing > 0:
            return np.pad(signal, (0, missing))
        return signal[:self.audio_length]

    def __getitem__(self, idx):
        """Load the clip at row ``idx`` and return ``(signal, label)`` tensors."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Both splits use the same column positions, so no train/test branching is needed.
        audio_path = os.path.join(self.audio_dir, self.metadata.iloc[idx, self.filename_col])
        label = self.metadata.iloc[idx, self.label_col]  # Encoded label

        # The returned sampling rate is not needed: librosa is asked for self.sample_rate.
        signal, _ = librosa.load(audio_path, sr=self.sample_rate)
        signal = self._fit_length(signal)

        signal = torch.tensor(signal, dtype=torch.float32).unsqueeze(0)  # Add channel dimension
        label = torch.tensor(label, dtype=torch.long)

        return signal, label

# Creating Datasets and Data Loaders

This section initializes datasets and data loaders for training, validation, and testing.

## Creating the Datasets

In [10]:
# Build one dataset over the full training metadata and one over the test metadata
full_train_dataset = AudioDataset(
    metadata=train_metadata,
    audio_dir=train_folder,
    sample_rate=SAMPLE_RATE,
    audio_length=AUDIO_LENGTH,
    is_train=True,
)
test_dataset = AudioDataset(
    metadata=test_metadata,
    audio_dir=test_folder,
    sample_rate=SAMPLE_RATE,
    audio_length=AUDIO_LENGTH,
    is_train=False,
)

## Splitting the Training Dataset

In [11]:
# Split the training dataset into training and validation sets (80% / 20%).
# A fixed-seed generator makes the partition identical on every run, so a
# checkpoint reloaded after a kernel restart is validated on the same held-out
# clips instead of on clips it may have been trained on.
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size  # remainder, so the two sizes always sum to the total
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)


## Defining Data Loaders

In [12]:
# Define data loaders
batch_size = 32
# The training and validation loops copy batches to the device with
# non_blocking=True; those copies can only be asynchronous when the batches
# come from pinned host memory, so pinning is enabled whenever CUDA is present.
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          pin_memory=torch.cuda.is_available())
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        pin_memory=torch.cuda.is_available())
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         pin_memory=torch.cuda.is_available())

## Audio Classification Model

This section defines a deep convolutional neural network (CNN) designed for classifying audio signals. The model consists of multiple 1D convolutional layers, batch normalization, activation functions, pooling, dropout layers, and a final fully connected layer.

## Model Architecture

In [13]:
# Define Model
class AudioClassifier(nn.Module):
    """1D convolutional network that turns a single-channel signal into class scores.

    The input is a ``(batch, 1, samples)`` tensor and the output is a
    ``(batch, num_classes)`` tensor of unnormalised scores. Every convolution
    is followed by batch normalisation and ReLU. Stages 1-4 end with a
    max-pool (factor 4) and dropout, stage 5 ends with dropout only, and
    stage 6 feeds a global average pool, a final dropout and a linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stage 1: wide, strided convolution applied directly to the signal
        self._add_conv_unit("1", 1, 64, kernel_size=80, stride=4, padding=39)
        self.pool1 = nn.MaxPool1d(kernel_size=4)
        self.dropout1 = nn.Dropout(0.1)

        # Stage 2: two 64-channel convolutions
        self._add_conv_unit("2_1", 64, 64)
        self._add_conv_unit("2_2", 64, 64)
        self.pool2 = nn.MaxPool1d(kernel_size=4)
        self.dropout2 = nn.Dropout(0.1)

        # Stage 3: widen to 128 channels
        self._add_conv_unit("3_1", 64, 128)
        self._add_conv_unit("3_2", 128, 128)
        self.pool3 = nn.MaxPool1d(kernel_size=4)
        self.dropout3 = nn.Dropout(0.1)

        # Stage 4: three convolutions at 256 channels
        self._add_conv_unit("4_1", 128, 256)
        self._add_conv_unit("4_2", 256, 256)
        self._add_conv_unit("4_3", 256, 256)
        self.pool4 = nn.MaxPool1d(kernel_size=4)
        self.dropout4 = nn.Dropout(0.1)

        # Stage 5: widen to 512 channels (no pooling)
        self._add_conv_unit("5_1", 256, 512)
        self._add_conv_unit("5_2", 512, 512)
        self.dropout5 = nn.Dropout(0.1)

        # Stage 6: last convolution before the classifier head
        self._add_conv_unit("6_1", 512, 512)

        # Head: collapse the time axis, regularise, project to class scores
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout6 = nn.Dropout(0.2)
        self.fc = nn.Linear(512, num_classes)

    def _add_conv_unit(self, tag, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        """Register ``conv<tag>``, ``bn<tag>`` and ``relu<tag>`` as submodules, in that order."""
        setattr(self, f"conv{tag}", nn.Conv1d(in_channels, out_channels,
                                              kernel_size=kernel_size, stride=stride, padding=padding))
        setattr(self, f"bn{tag}", nn.BatchNorm1d(out_channels))
        setattr(self, f"relu{tag}", nn.ReLU())

    def _conv_unit(self, x, tag):
        """Apply convolution, batch norm and ReLU of the unit named ``tag`` to ``x``."""
        conv = getattr(self, f"conv{tag}")
        norm = getattr(self, f"bn{tag}")
        act = getattr(self, f"relu{tag}")
        return act(norm(conv(x)))

    def forward(self, x):
        """Run the network on ``x`` and return one score per class for each sample."""
        x = self._conv_unit(x, "1")
        x = self.dropout1(self.pool1(x))

        x = self._conv_unit(x, "2_1")
        x = self._conv_unit(x, "2_2")
        x = self.dropout2(self.pool2(x))

        x = self._conv_unit(x, "3_1")
        x = self._conv_unit(x, "3_2")
        x = self.dropout3(self.pool3(x))

        x = self._conv_unit(x, "4_1")
        x = self._conv_unit(x, "4_2")
        x = self._conv_unit(x, "4_3")
        x = self.dropout4(self.pool4(x))

        x = self._conv_unit(x, "5_1")
        x = self._conv_unit(x, "5_2")
        x = self.dropout5(x)

        x = self._conv_unit(x, "6_1")

        # (batch, 512, 1) -> (batch, 512) before dropout and the linear layer
        pooled = self.global_avg_pool(x).squeeze(2)
        return self.fc(self.dropout6(pooled))

## Initialize Model, Optimizer, and Loss Function

The following code initializes the audio classification model, selects the appropriate computation device (GPU if available, otherwise CPU), and sets up the optimizer and loss function.

In [14]:
# Choose the compute device (GPU when available, otherwise CPU), build the network
# on it, then create the optimizer over its parameters and the loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioClassifier(NUM_CLASSES).to(device)
optimizer = optim.SGD(params=model.parameters(), lr=1e-3, momentum=0.9)
criterion = nn.CrossEntropyLoss()

## Training and Validation Loop for Audio Classification

The following script trains the `AudioClassifier` model using a combination of techniques such as:

- **cuDNN Benchmarking**: Enables cuDNN optimizations for faster computation when input sizes are constant.
- **Gradient Scaling & Mixed Precision Training**: Uses `torch.cuda.amp` for mixed precision training to improve efficiency.
- **Asynchronous GPU Transfers**: Uses `non_blocking=True` when transferring data to the GPU.
- **Model Checkpointing**: Saves the model if it achieves the best validation loss.

## Training Configuration

In [15]:
# Enable cuDNN benchmarking if input sizes are constant.
torch.backends.cudnn.benchmark = True

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
# val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

num_epochs = 500
best_val_loss = float('inf')

# Initialize the GradScaler for AMP
scaler = GradScaler()


  scaler = GradScaler()


### Training Loop and Validation Loop

In [56]:
# Mixed precision is only used on a CUDA device; on CPU the autocast context is
# disabled so the same loop runs in full precision without warnings.
# `torch.autocast` replaces the deprecated `torch.cuda.amp.autocast`.
use_amp = device.type == "cuda"

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        # Use non_blocking=True for asynchronous GPU transfers
        inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass and loss under autocast for mixed precision
        with torch.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Scale the loss and backpropagate; the scaler steps the optimizer and
        # then updates its scale factor for the next iteration
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        if i % 10 == 9:  # Print the mean loss of the last 10 batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 10:.3f}')
            running_loss = 0.0

    # Validation loop: compute both loss and accuracy
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for data in val_loader:
            inputs, labels = data
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
            with torch.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Compute accuracy.
            # The predicted class is the index of the largest output score (logit)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Mean of the per-batch validation losses
    val_loss /= len(val_loader)
    accuracy = 100.0 * correct / total
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss:.3f}, Validation Accuracy: {accuracy:.2f}%')

    # Save the model if validation loss is the best we've seen so far.
    if val_loss < best_val_loss:
        torch.save(model.state_dict(), "best_model.pth")
        best_val_loss = val_loss
        print("Saved best model")

print('Finished Training')

  with autocast():


[1,    10] loss: 1.866
[1,    20] loss: 1.623
[1,    30] loss: 1.477
[1,    40] loss: 1.410
[1,    50] loss: 1.434
[1,    60] loss: 1.348
[1,    70] loss: 1.244
[1,    80] loss: 1.228
[1,    90] loss: 1.143
[1,   100] loss: 1.096
[1,   110] loss: 1.225
[1,   120] loss: 1.095
[1,   130] loss: 1.147
[1,   140] loss: 1.024
[1,   150] loss: 1.117


  with autocast():


Epoch 1, Validation Loss: 1.026, Validation Accuracy: 60.89%
Saved best model
[2,    10] loss: 1.058
[2,    20] loss: 1.111
[2,    30] loss: 0.965
[2,    40] loss: 0.916
[2,    50] loss: 0.916
[2,    60] loss: 1.022
[2,    70] loss: 0.933
[2,    80] loss: 1.036
[2,    90] loss: 0.941
[2,   100] loss: 0.869
[2,   110] loss: 0.981
[2,   120] loss: 0.940
[2,   130] loss: 0.934
[2,   140] loss: 0.905
[2,   150] loss: 0.829
Epoch 2, Validation Loss: 0.841, Validation Accuracy: 69.95%
Saved best model
[3,    10] loss: 0.818
[3,    20] loss: 0.883
[3,    30] loss: 0.865
[3,    40] loss: 0.941
[3,    50] loss: 0.835
[3,    60] loss: 0.842
[3,    70] loss: 0.818
[3,    80] loss: 0.820
[3,    90] loss: 0.858
[3,   100] loss: 0.820
[3,   110] loss: 0.916
[3,   120] loss: 0.775
[3,   130] loss: 0.745
[3,   140] loss: 0.822
[3,   150] loss: 0.666
Epoch 3, Validation Loss: 0.875, Validation Accuracy: 68.12%
[4,    10] loss: 0.798
[4,    20] loss: 0.799
[4,    30] loss: 0.829
[4,    40] loss: 0.695
[

## Loading the Best Trained Model

After training, we can load the best-performing model (based on validation loss) for further evaluation or inference.

In [16]:
# Load best model
model = AudioClassifier(NUM_CLASSES)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one;
# weights_only=True restricts unpickling to tensor data, which is all a state_dict needs.
model.load_state_dict(torch.load("best_model.pth", map_location=device, weights_only=True))
model.to(device)
# Switch to evaluation mode (as the last expression this also displays the architecture)
model.eval()

AudioClassifier(
  (conv1): Conv1d(1, 64, kernel_size=(80,), stride=(4,), padding=(39,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.1, inplace=False)
  (conv2_1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2_1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2_1): ReLU()
  (conv2_2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2_2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2_2): ReLU()
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (dropout2): Dropout(p=0.1, inplace=False)
  (conv3_1): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn3_1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3_1

## Testing the Model

After training and selecting the best model, we evaluate its performance on the training, validation, and test datasets.

In [17]:
def evaluate_model(model, data_loader, dataset_name="Dataset"):
    """Print the accuracy and a per-class classification report of ``model`` on ``data_loader``.

    Args:
        model: Network whose forward pass returns one score per class for each sample.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.
        dataset_name: Name of the data split, used in the printed messages.

    Returns:
        None. The results are printed. If ``data_loader`` yields no samples, a
        notice is printed and no metrics are computed.
    """
    correct = 0
    total = 0
    all_predicted = []
    all_labels = []

    # Run on the device the model already lives on rather than on a notebook-level
    # global; a model without parameters is evaluated on the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            outputs = model(inputs)
            # Predicted class = index of the largest score in each row
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_predicted.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if total == 0:
        # Nothing to score: avoid dividing by zero and reporting on empty lists.
        print(f'No samples found in {dataset_name}; skipping evaluation.')
        return

    accuracy = 100 * correct / total
    print(f'Accuracy of the network on the {dataset_name}: {accuracy:.2f}%')
    print(classification_report(all_labels, all_predicted))

# Report metrics for each split in turn: training, validation, then test
for split_loader, split_name in (
    (train_loader, "training data"),
    (val_loader, "validation data"),
    (test_loader, "test data"),
):
    evaluate_model(model, split_loader, dataset_name=split_name)


Accuracy of the network on the training data: 97.02%
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1345
           1       0.99      0.94      0.96       515
           2       0.98      0.97      0.97      1435
           3       0.97      0.97      0.97       899
           4       0.96      0.96      0.96       468
           5       0.98      0.92      0.95       194
           6       0.92      0.92      0.92       175

    accuracy                           0.97      5031
   macro avg       0.97      0.95      0.96      5031
weighted avg       0.97      0.97      0.97      5031

Accuracy of the network on the validation data: 97.85%
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       345
           1       0.98      0.96      0.97       117
           2       0.97      0.99      0.98       356
           3       0.98      0.97      0.98       234
           4       0.99 