In [None]:
import librosa
import numpy as np
import scipy.signal
from pathlib import Path
import soundfile as sf
import random

class AudioPreprocessor:
    """Convert audio files into stacks of normalized per-frame feature matrices.

    Pipeline applied to each file by ``process_file``:
    load -> random fixed-length segment -> optional augmentation ->
    mel / spectral-contrast / rolloff features -> robust scaling.
    """

    def __init__(self, sample_rate=22050, segment_duration=2.0, n_mels=64,
                 hop_length=256, n_fft=1024, max_segments=5):
        """Store the preprocessing configuration.

        Args:
            sample_rate: Rate (Hz) every file is resampled to on load.
            segment_duration: Length of each extracted segment, in seconds.
            n_mels: Number of mel bands in the mel spectrogram.
            hop_length: Hop size (samples) shared by all feature extractors.
            n_fft: FFT size used for the mel spectrogram.
            max_segments: Number of random segments drawn per file.
        """
        self.sample_rate = sample_rate
        self.segment_duration = segment_duration
        self.n_mels = n_mels
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.max_segments = max_segments

    def _target_length(self):
        """Return the segment length in samples."""
        return int(self.segment_duration * self.sample_rate)

    def _fix_length(self, audio):
        """Zero-pad at the end or truncate ``audio`` to exactly one segment length."""
        target_length = self._target_length()
        if len(audio) < target_length:
            return np.pad(audio, (0, target_length - len(audio)), mode='constant')
        return audio[:target_length]

    def load_audio(self, file_path):
        """Load an audio file as mono, resampled to ``self.sample_rate``.

        Returns:
            1-D array of samples (the sample rate returned by librosa is dropped).
        """
        audio, _ = librosa.load(file_path, sr=self.sample_rate, mono=True)
        return audio

    def random_segment(self, audio):
        """Return one segment of exactly ``segment_duration`` seconds.

        Audio shorter than a segment is zero-padded at the end; otherwise a
        start offset is drawn uniformly with the ``random`` module.
        """
        target_length = self._target_length()
        if len(audio) < target_length:
            return self._fix_length(audio)
        max_start = len(audio) - target_length
        start = random.randint(0, max_start)
        return audio[start:start + target_length]

    def augment_audio(self, audio):
        """Apply random pitch shift, additive Gaussian noise and time stretch.

        Note:
            The time stretch changes the number of samples, so the returned
            array generally does NOT have the same length as the input.
            ``process_file`` re-fits the result to the segment length.
        """
        # Random pitch shift of up to +/- 2 semitones
        pitch_factor = random.uniform(-2, 2)
        audio = librosa.effects.pitch_shift(audio, sr=self.sample_rate, n_steps=pitch_factor)

        # Additive zero-mean Gaussian noise with a random standard deviation
        noise_level = random.uniform(0.001, 0.005)
        noise = np.random.normal(0, noise_level, len(audio))
        audio = audio + noise

        # Random speed change between 0.8x and 1.2x
        stretch_factor = random.uniform(0.8, 1.2)
        audio = librosa.effects.time_stretch(audio, rate=stretch_factor)

        return audio

    def extract_features(self, audio):
        """Build a (frames, features) matrix from one audio segment.

        Columns are, in order: the dB-scaled mel spectrogram (``n_mels``
        columns), spectral contrast (``n_bands=6``) and spectral rolloff.
        Spectral contrast and rolloff are computed with librosa's default FFT
        size; only the hop length is shared with the mel spectrogram.
        """
        # Mel spectrogram, converted to dB relative to its own maximum
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sample_rate,
            n_mels=self.n_mels,
            n_fft=self.n_fft,
            hop_length=self.hop_length
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Spectral contrast
        contrast = librosa.feature.spectral_contrast(
            y=audio,
            sr=self.sample_rate,
            n_bands=6,
            hop_length=self.hop_length
        )

        # Spectral rolloff
        rolloff = librosa.feature.spectral_rolloff(
            y=audio,
            sr=self.sample_rate,
            hop_length=self.hop_length
        )

        # Transpose each block to (frames, n) and join along the feature axis
        features = np.concatenate([
            mel_spec_db.T,
            contrast.T,
            rolloff.T
        ], axis=1)

        return features

    def normalize_features(self, features):
        """Robust-scale each column: subtract the median, divide by the IQR.

        Columns whose IQR is zero are divided by 1 instead, so constant
        columns map to zero rather than NaN/inf.
        """
        median = np.median(features, axis=0)
        iqr = np.percentile(features, 75, axis=0) - np.percentile(features, 25, axis=0)
        iqr = np.where(iqr == 0, 1, iqr)  # Avoid division by zero
        return (features - median) / iqr

    def process_file(self, file_path, augment=True):
        """Turn one audio file into ``max_segments`` normalized feature matrices.

        Args:
            file_path: Path of the audio file to load.
            augment: When True (default, the original behavior) each segment
                is randomly augmented before feature extraction.

        Returns:
            Array of shape (max_segments, frames, features).
        """
        audio = self.load_audio(file_path)
        segments = []

        for _ in range(self.max_segments):
            segment = self.random_segment(audio)
            if augment:
                # Time stretching alters the sample count; re-fit to the
                # segment length so every feature matrix has the same number
                # of frames and the segments can be stacked.
                segment = self._fix_length(self.augment_audio(segment))
            features = self.extract_features(segment)
            segments.append(self.normalize_features(features))

        return np.array(segments)

    def process_dataset(self, input_dir, output_dir):
        """Process every ``*.wav`` directly inside ``input_dir``.

        Each result is saved as ``processed_<stem>.npy`` in ``output_dir``,
        which is created (including missing parents) if necessary.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Sorted so files are processed in a stable order across runs
        for audio_file in sorted(Path(input_dir).glob("*.wav")):
            processed_data = self.process_file(audio_file)
            output_path = output_dir / f"processed_{audio_file.stem}.npy"
            np.save(output_path, processed_data)

if __name__ == "__main__":
    # Batch-convert a folder of .wav files using the default configuration.
    # Both paths are placeholders and need to point at real directories.
    preprocessor = AudioPreprocessor()
    preprocessor.process_dataset("path/to/audio/files", "path/to/output/processed")

In [None]:
import os
import torch
import librosa
import random
from torch.utils.data import Dataset,  random_split, DataLoader
import numpy as np
from pathlib import Path

# Updated SoundDS class
class SoundDS(Dataset):
    """Dataset yielding (feature segments, class id) pairs for a metadata frame.

    ``df`` must provide the columns ``relative_path`` (audio path relative to
    ``<data_path>/audio``) and ``classID``.
    """

    def __init__(self, df, data_path):
        """
        Args:
            df: Metadata frame with ``relative_path`` and ``classID`` columns.
            data_path: Root directory that contains the ``audio`` folder.
        """
        self.df = df
        self.data_path = str(data_path)
        self.preprocessor = AudioPreprocessor(sample_rate=44100, segment_duration=2.0, n_mels=64)
        # Original clip duration in ms; not read by this class, segment length
        # comes from the preprocessor's segment_duration.
        self.duration = 4000

    def __len__(self):
        """Number of rows (audio clips) in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the ``idx``-th clip.

        ``idx`` is treated as a position (0 .. len-1), which is what samplers
        and ``random_split`` pass in; positional lookup keeps this correct
        even when ``df`` is a filtered frame whose index labels are not
        0 .. len-1.

        Returns:
            Tuple of (float tensor of processed segments, class id).

        Raises:
            FileNotFoundError: If the referenced audio file does not exist.
        """
        relative_path = self.df['relative_path'].iloc[idx].lstrip('/')
        audio_file = os.path.join(self.data_path, 'audio', relative_path)

        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        class_id = self.df['classID'].iloc[idx]

        # Feature extraction (random segments, augmentation, normalization)
        processed_segments = self.preprocessor.process_file(audio_file)

        # NumPy array -> float32 tensor for the model
        processed_segments = torch.from_numpy(processed_segments).float()

        return processed_segments, class_id

In [None]:
# Function to split the dataset
def split_dataset(dataset, train_ratio=0.8, val_ratio=0.2, seed=None):
    """Randomly split ``dataset`` into training and validation subsets.

    Args:
        dataset: Any sized dataset accepted by ``random_split``.
        train_ratio: Fraction of items assigned to the training subset.
        val_ratio: Fraction assigned to validation; must complement
            ``train_ratio`` so the two sum to 1.
        seed: Optional integer. When given, the split is drawn from a
            dedicated generator seeded with it, so the same seed always
            yields the same split. ``None`` keeps the global RNG behavior.

    Returns:
        Tuple ``(train_dataset, val_dataset)``.
    """
    # Tolerance instead of exact float equality, so ratios computed elsewhere
    # (e.g. 1 - train_ratio) are not rejected because of rounding error.
    assert abs(train_ratio + val_ratio - 1.0) < 1e-9, "Ratios must sum to 1.0"
    total_size = len(dataset)
    train_size = int(train_ratio * total_size)
    # Validation takes the remainder so the two sizes always add up exactly
    val_size = total_size - train_size
    if seed is None:
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    else:
        generator = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(
            dataset, [train_size, val_size], generator=generator
        )
    return train_dataset, val_dataset

In [None]:
# Wrap both splits in DataLoaders for mini-batching.
# NOTE: train_dataset / val_dataset are not created in this cell; they must
# already exist in the kernel (presumably from split_dataset - confirm).
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# Sanity check: fetch only the first training batch and report its shapes
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    batch_segments, batch_labels = first_batch
    print(f"Batch segments shape: {batch_segments.shape}, Batch labels shape: {batch_labels.shape}")

In [None]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F

class EnhancedAudioCNN(nn.Module):
    def __init__(self, input_channels, num_classes, max_segments=5, feature_dim=128):
        super(EnhancedAudioCNN, self).__init__()
        
        # Initial convolutional layer
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(32)
        self.dropout1 = nn.Dropout(0.2)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        
        # First residual block
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.3)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        
        # Second residual block
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.3)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        
        # Depthwise separable block
        self.depthwise_conv = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, groups=64, padding=1)
        self.pointwise_conv = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=1)
        self.bn4 = nn.BatchNorm1d(128)
        self.dropout4 = nn.Dropout(0.4)
        init.kaiming_normal_(self.depthwise_conv.weight, a=0.1)
        self.depthwise_conv.bias.data.zero_()
        init.kaiming_normal_(self.pointwise_conv.weight, a=0.1)
        self.pointwise_conv.bias.data.zero_()
        
        # Global average pooling
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        
        # Fully connected layer
        self.fc = nn.Linear(128 * max_segments, num_classes)
        
        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        batch_size, segments, time_steps, channels = x.size()
        
        # Reshape for convolution
        x = x.view(batch_size * segments, channels, time_steps)
        
        # Convolutional layers with residuals
        x = F.relu(self.bn1(self.conv1(x)))
        residual = x
        x = F.relu(self.bn2(self.conv2(x)) + residual)
        x = self.dropout2(x)
        
        residual = x
        x = F.relu(self.bn3(self.conv3(x)) + residual)
        x = self.dropout3(x)
        
        x = F.relu(self.bn4(self.pointwise_conv(self.depthwise_conv(x))))
        x = self.dropout4(x)
        
        # Global average pooling
        x = self.global_pool(x)  # [batch_size * segments, 128, 1]
        x = x.view(batch_size, segments * 128)  # Flatten segments
        
        # Linear layer
        x = self.fc(x)
        
        return x

# Example usage: smoke-test the model on random input
if __name__ == "__main__":
    input_channels = 128
    num_classes = 10
    max_segments = 5
    feature_dim = 128

    model = EnhancedAudioCNN(input_channels=input_channels, num_classes=num_classes, max_segments=max_segments, feature_dim=feature_dim)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Create the input on the same device as the model; a CPU tensor fed to a
    # CUDA model raises a device-mismatch error.
    sample_input = torch.randn(32, max_segments, 100, input_channels, device=device)
    output = model(sample_input)
    print(f"Output shape: {output.shape}")  # Should be [32, num_classes]

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

def train_model(model, train_dl, val_dl, num_epochs, device,
                checkpoint_path='best_model.pth', max_lr=0.001, weight_decay=1e-4):
    """Train ``model`` with Adam + one-cycle LR and checkpoint the best epoch.

    Args:
        model: Module already placed on ``device``.
        train_dl: Sized iterable of ``(inputs, labels)`` training batches.
        val_dl: Sized iterable of ``(inputs, labels)`` validation batches.
            May be empty, in which case validation metrics are reported as
            NaN and no checkpoint is written.
        num_epochs: Number of passes over ``train_dl``.
        device: Device each batch is moved to.
        checkpoint_path: Where the best ``state_dict`` is saved.
        max_lr: Peak learning rate of the one-cycle schedule (also the
            optimizer's nominal learning rate).
        weight_decay: Adam weight decay.

    Returns:
        The same ``model`` object, holding the weights of the LAST epoch (the
        best-validation weights are only in ``checkpoint_path``).

    Raises:
        ValueError: If ``train_dl`` contains no batches.
    """
    if len(train_dl) == 0:
        raise ValueError("train_dl must contain at least one batch")

    # Loss Function, Optimizer, and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=max_lr,
        steps_per_epoch=len(train_dl),
        epochs=num_epochs,
        # OneCycleLR only accepts 'cos' or 'linear'; 'cosine' raises ValueError
        anneal_strategy='cos'
    )

    # Best validation accuracy seen so far (drives checkpointing)
    best_val_acc = 0.0

    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        for inputs, labels in train_dl:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)  # Shape: [batch_size, num_classes]
            loss = criterion(outputs, labels)

            # Backward pass and optimize; the scheduler advances once per batch
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Statistics
            running_loss += loss.item()
            _, prediction = torch.max(outputs, 1)
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]

        # Training epoch metrics (loss is the mean of per-batch losses)
        train_loss = running_loss / len(train_dl)
        train_acc = correct_prediction / total_prediction if total_prediction else float('nan')

        # Validation Phase
        model.eval()
        val_loss_sum = 0.0
        val_batches = 0
        val_correct_prediction = 0
        val_total_prediction = 0

        with torch.no_grad():
            for inputs, labels in val_dl:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss_sum += loss.item()
                val_batches += 1
                _, prediction = torch.max(outputs, 1)
                val_correct_prediction += (prediction == labels).sum().item()
                val_total_prediction += prediction.shape[0]

        # Validation epoch metrics; NaN when there was nothing to validate on,
        # instead of dividing by zero
        val_loss = val_loss_sum / val_batches if val_batches else float('nan')
        val_acc = (val_correct_prediction / val_total_prediction
                   if val_total_prediction else float('nan'))

        # Print epoch stats
        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        # Save best model (a NaN accuracy never compares greater, so nothing
        # is saved when validation was empty)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            print(f"New best model saved with validation accuracy: {best_val_acc:.4f}")

    print('Finished Training')
    return model

# Example usage: train on the loaders built earlier in the notebook
if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # input_channels must equal the feature width produced by
    # AudioPreprocessor.extract_features as configured in SoundDS (n_mels=64):
    # 64 mel bands + 7 spectral-contrast rows (n_bands=6) + 1 rolloff row = 72.
    model = EnhancedAudioCNN(input_channels=64 + 7 + 1, num_classes=10, max_segments=5, feature_dim=128).to(device)

    # Reuse the DataLoaders created in the "Create DataLoaders" cell so this
    # cell does not depend on names that are never defined.
    train_dl, val_dl = train_loader, val_loader

    num_epochs = 20
    trained_model = train_model(model, train_dl, val_dl, num_epochs, device)