In [None]:
import os
import numpy as np
import torch
import soundfile as sf
import joblib
import pandas as pd
from pyannote.audio import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

import torchaudio

# The global backend switch is deprecated in torchaudio 2.x and may be a no-op
# or missing entirely in newer releases, so only call it when the installed
# version still exposes it.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Define the AudioCNN class
class AudioCNN(nn.Module):
    """Small two-layer convolutional classifier for single-channel 2-D inputs.

    Each 3x3 convolution is followed by ReLU and 2x2 max pooling, so the
    spatial size is divided by four overall. The first fully connected layer
    expects 64 channels of 16x16 activations, e.g. inputs of shape
    ``(N, 1, 64, 64)``.

    Args:
        num_classes: Number of output logits. Defaults to 10, the value that
            was previously hard-coded.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 16 * 16, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(N, num_classes)``.

        Raises:
            ValueError: If the pooled feature map does not flatten to the
                size expected by the first fully connected layer.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten per sample. Reshaping with -1 in the batch position would
        # silently merge several wrongly sized samples into one row.
        x = torch.flatten(x, start_dim=1)
        if x.shape[1] != self.fc1.in_features:
            raise ValueError(
                f"AudioCNN expected {self.fc1.in_features} features per sample "
                f"after pooling, got {x.shape[1]}; check the input size."
            )
        x = F.relu(self.fc1(x))
        return self.fc2(x)

def load_artifacts(model_path, scaler_path, label_mapping_path):
    """Load the saved model, feature scaler and label mapping from disk.

    The model is read with ``torch.load`` onto the module-level ``device`` and
    put into evaluation mode. The scaler and the label mapping are read with
    ``joblib.load``; when either read fails, the error is printed and a
    placeholder is returned instead (a new ``StandardScaler()`` or an empty
    dict, respectively).

    Args:
        model_path: Path of the serialized PyTorch model.
        scaler_path: Path of the joblib-serialized scaler.
        label_mapping_path: Path of the joblib-serialized label mapping.

    Returns:
        tuple: ``(model, scaler, label_mapping)``.
    """
    # NOTE(review): torch.load and joblib.load both deserialize pickled
    # objects; only point them at files from a trusted source.
    net = torch.load(model_path, map_location=device)
    net.eval()

    try:
        scaler_obj = joblib.load(scaler_path)
    except Exception as e:
        print(f"Error loading scaler: {e}")
        # NOTE(review): this placeholder has not been fitted - confirm that
        # callers do not rely on calling transform() with it.
        scaler_obj = StandardScaler()

    try:
        mapping = joblib.load(label_mapping_path)
    except Exception as e:
        print(f"Error loading label mapping: {e}")
        mapping = {}

    return net, scaler_obj, mapping

def process_audio_file(audio_path, vad_pipeline, buffer_duration=0.5, snippet_duration=1.1):
    """Cut the detected speech regions of an audio file into fixed-length snippets.

    The file is read with ``soundfile``, passed through ``vad_pipeline`` to
    find speech segments, each segment is widened by ``buffer_duration`` on
    both sides (clamped to the file bounds) and then split into consecutive,
    non-overlapping windows of ``snippet_duration`` seconds. A widened segment
    shorter than one window is zero-padded at the end to the window length;
    the incomplete tail of a longer segment is dropped.

    Args:
        audio_path: Path of the audio file to read.
        vad_pipeline: Callable that receives
            ``{'waveform': tensor, 'sample_rate': int}`` and returns an object
            whose ``get_timeline().support()`` yields segments exposing
            ``start`` and ``end`` in seconds.
        buffer_duration: Seconds of context added around each speech segment.
        snippet_duration: Length of each returned snippet in seconds.

    Returns:
        tuple: ``(snippets, timestamps)`` where ``snippets`` is a list of 1-D
        float32 arrays and ``timestamps`` holds the start time in seconds of
        each snippet within the file.
    """
    y, sr = sf.read(audio_path, dtype='float32')

    # soundfile returns a (frames, channels) array for multi-channel files.
    # Average the channels so the waveform, the slicing and the padding below
    # all operate on a single time axis.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Shape (1, num_samples): one channel, moved to the module-level device.
    waveform = torch.tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    vad = vad_pipeline({'waveform': waveform, 'sample_rate': sr})
    speech_segments = vad.get_timeline().support()

    buffer_samples = int(buffer_duration * sr)
    snippet_samples = int(snippet_duration * sr)
    num_samples = len(y)

    # Widen every speech segment, clamping to the bounds of the recording.
    segments_with_buffer = [
        (
            max(0, int(segment.start * sr) - buffer_samples),
            min(num_samples, int(segment.end * sr) + buffer_samples),
        )
        for segment in speech_segments
    ]

    snippets = []
    timestamps = []
    for start, end in segments_with_buffer:
        if end - start >= snippet_samples:
            # Consecutive full windows; the incomplete remainder is dropped.
            for snippet_start in range(start, end - snippet_samples + 1, snippet_samples):
                snippets.append(y[snippet_start:snippet_start + snippet_samples])
                timestamps.append(snippet_start / sr)
        else:
            # Shorter than one window: zero-pad at the end to the full length.
            snippet = y[start:end]
            snippet = np.pad(snippet, (0, snippet_samples - len(snippet)), mode='constant')
            snippets.append(snippet)
            timestamps.append(start / sr)

    return snippets, timestamps

class AudioDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label.

    Items are returned as ``(float32 tensor, int64 tensor)`` tuples.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        # The dataset length is defined by the number of feature entries.
        return len(self.features)

    def __getitem__(self, idx):
        sample, target = self.features[idx], self.labels[idx]
        return (
            torch.tensor(sample, dtype=torch.float32),
            torch.tensor(target, dtype=torch.long),
        )

def prepare_dataset(scenes_folder, development_scenes_folder, vad_pipeline, scaler, annotations_df):
    """Collect precomputed snippet features and their labels for every scene.

    Each ``.wav`` file in ``scenes_folder`` is segmented with
    ``process_audio_file``. For snippet number ``i`` of a file, the feature
    file ``<name>_snippet_<i>.npy`` is looked up in
    ``development_scenes_folder``; snippets without such a file are skipped.
    Every loaded feature gets the ``command`` value that ``annotations_df``
    lists for the scene's filename.

    Args:
        scenes_folder: Directory containing the scene ``.wav`` files.
        development_scenes_folder: Directory containing the ``.npy`` features.
        vad_pipeline: Voice-activity pipeline forwarded to
            ``process_audio_file``.
        scaler: Object whose ``transform`` is applied to the collected features.
        annotations_df: DataFrame with at least ``filename`` and ``command``
            columns.

    Returns:
        tuple: ``(features, labels)`` - the scaled features and a list with
        one label per feature.

    Raises:
        KeyError: If a scene with features has no entry in ``annotations_df``.
        ValueError: If no feature file was found at all.
    """
    # filename -> command lookup.
    # NOTE(review): if a filename occurs in several annotation rows, only the
    # last row's command is kept - confirm this is intended.
    label_dict = annotations_df.set_index('filename')['command'].to_dict()

    features = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and any seeded split done by the caller) reproducible.
    for filename in sorted(os.listdir(scenes_folder)):
        if not filename.endswith('.wav'):
            continue

        audio_path = os.path.join(scenes_folder, filename)
        snippets, _ = process_audio_file(audio_path, vad_pipeline)

        # Everything before the first dot is used as the prefix of the feature
        # file names. NOTE(review): verify this matches how they were written.
        scene_id = filename.split('.')[0]
        for i in range(len(snippets)):
            feature_path = os.path.join(development_scenes_folder, f"{scene_id}_snippet_{i}.npy")
            if not os.path.exists(feature_path):
                continue
            # Resolve the label first so both lists always grow together.
            label = label_dict[filename]
            features.append(np.load(feature_path))
            labels.append(label)

    if not features:
        raise ValueError(
            f"No snippet features found in {development_scenes_folder!r} "
            f"for the .wav files in {scenes_folder!r}"
        )

    # Scale features
    features = scaler.transform(features)

    return features, labels

def train_model(model, train_loader, test_loader, num_epochs=20, learning_rate=0.001):
    """Train ``model`` with Adam and cross-entropy, evaluating after every epoch.

    Each batch is moved to the module-level ``device`` and given a channel
    dimension (``unsqueeze(1)``) before being passed to the model. After each
    epoch, the mean training loss and the accuracy on ``test_loader`` are
    printed. The model is updated in place; nothing is returned.

    Args:
        model: Network to train; the caller is expected to have placed it on
            ``device`` already.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` evaluation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to Adam.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        # Create a tqdm progress bar
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")

        for num_batches, (inputs, labels) in enumerate(train_loader_tqdm, start=1):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs.unsqueeze(1))
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Show the running mean loss next to the progress bar.
            train_loader_tqdm.set_postfix(loss=running_loss / num_batches)

        # Count the batches actually seen so an empty loader cannot trigger a
        # division by zero.
        if num_batches:
            print(f"Epoch {epoch+1}, Loss: {running_loss / num_batches}")
        else:
            print(f"Epoch {epoch+1}, Loss: n/a (train_loader produced no batches)")

        # Evaluate on the test set
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs.unsqueeze(1))
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        if total:
            print(f"Test Accuracy: {100 * correct / total}%")
        else:
            print("Test Accuracy: n/a (test_loader produced no samples)")
# Example usage
scenes_folder = "data/mlpc24_speech_commands/scenes"
development_scenes_folder = "data/mlpc24_speech_commands/development_scenes"
annotations_path = "data/development_scene_annotations.csv"
scenes_path = "data/development_scenes.csv"
# Never commit access tokens: read the Hugging Face token from the HF_TOKEN
# environment variable. The token that used to be hard-coded here has been
# exposed and should be revoked. When the variable is unset, None is passed
# through to the pipeline loader.
token = os.environ.get("HF_TOKEN")
output_folder = "output_snippets"
vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=token)

model_path = "MLPC-main/models/audio_cnn_model_complete.pth"
scaler_path = "MLPC-main/models/scaler.pkl"
label_mapping_path = "MLPC-main/models/label_mapping.pkl"

# Load the annotations and scenes dataframes
annotations_df = pd.read_csv(annotations_path)
scenes_df = pd.read_csv(scenes_path)

model, scaler, label_mapping = load_artifacts(model_path, scaler_path, label_mapping_path)
features, labels = prepare_dataset(scenes_folder, development_scenes_folder, vad_pipeline, scaler, annotations_df)
# NOTE(review): `labels` holds the raw 'command' values and `label_mapping` is
# never applied, while AudioDataset converts labels to torch.long - confirm
# the commands are already integer-encoded or map them here.

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# NOTE(review): this replaces the model returned by load_artifacts with a
# freshly initialised network - confirm training from scratch is intended.
model = AudioCNN().to(device)
train_model(model, train_loader, test_loader)


The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
torchvision is not available - cannot save figures
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


In [4]:
# Example usage
scenes_folder = "data/mlpc24_speech_commands/scenes"
development_scenes_folder = "data/mlpc24_speech_commands/development_scenes"
annotations_path = "data/development_scene_annotations.csv"
scenes_path = "data/development_scenes.csv"
# Never commit access tokens: read the Hugging Face token from the HF_TOKEN
# environment variable. The token that used to be hard-coded here has been
# exposed and should be revoked. When the variable is unset, None is passed
# through to the pipeline loader.
token = os.environ.get("HF_TOKEN")
output_folder = "output_snippets"
vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=token)

model_path = "MLPC-main/models/audio_cnn_model_complete.pth"
scaler_path = "MLPC-main/models/scaler.pkl"
label_mapping_path = "MLPC-main/models/label_mapping.pkl"

annotations_df = pd.read_csv(annotations_path)
scenes_df = pd.read_csv(scenes_path)

# Print the columns to inspect them
print("Annotations DataFrame Columns:", annotations_df.columns)
print("Scenes DataFrame Columns:", scenes_df.columns)

Annotations DataFrame Columns: Index(['filename', 'command', 'start', 'end'], dtype='object')
Scenes DataFrame Columns: Index(['filename', 'speaker_id'], dtype='object')


In [1]:
# Prints a fixed string.
# NOTE(review): looks like a leftover kernel smoke-test cell - confirm whether it can be removed.
print("HI")

HI
