In [16]:
import os
import numpy as np
import torch
import soundfile as sf
import joblib
import pandas as pd
from pyannote.audio import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

import torchaudio
torchaudio.set_audio_backend("soundfile")

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        # Calculate flatten size dynamically
        self.flatten_size = self._get_flatten_size((1, 1, 17640))
        
        self.fc1 = nn.Linear(128 * 4410, 128)  # Corrected input size for the fully connected layer
        self.fc2 = nn.Linear(128, 10)  # Adjust the output layer to match your number of classes

    def _get_flatten_size(self, input_shape):
        with torch.no_grad():
            x = torch.zeros(input_shape)
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            print(f"Shape after conv1 and pool1: {x.shape}")
            print(f"Shape after conv2 and pool2: {x.shape}")
            return x.numel()

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        print(f"Shape after conv1 and pool1: {x.shape}")
        x = self.pool(F.relu(self.conv2(x)))
        print(f"Shape after conv2 and pool2: {x.shape}")
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


def load_artifacts(model_path, scaler_path, label_mapping_path):
    # Load the PyTorch model
    model = torch.load(model_path, map_location=device)
    model.eval()
    
    # Load the scaler using joblib
    try:
        scaler = joblib.load(scaler_path)
    except Exception as e:
        print(f"Error loading scaler: {e}")
        scaler = StandardScaler()  # Placeholder

    # Load the label mapping using joblib
    try:
        label_mapping = joblib.load(label_mapping_path)
    except Exception as e:
        print(f"Error loading label mapping: {e}")
        label_mapping = {}  # Placeholder
    
    return model, scaler, label_mapping

def process_audio_file(audio_path, vad_pipeline, output_folder):
    # Load the audio file
    y, sr = sf.read(audio_path, dtype='float32')

    # Ensure tensor is of dtype float32
    waveform = torch.tensor(y, dtype=torch.float32).unsqueeze(0).to(device)  # Add a channel dimension and move to GPU

    # Apply VAD to the audio file
    vad = vad_pipeline({'waveform': waveform, 'sample_rate': sr})

    # Get speech segments
    speech_segments = vad.get_timeline().support()

    # Add buffer around speech segments
    buffer_duration = 0.5  # buffer duration in seconds
    buffer_samples = int(buffer_duration * sr)

    segments_with_buffer = []
    for segment in speech_segments:
        start = max(0, int(segment.start * sr) - buffer_samples)
        end = min(len(y), int(segment.end * sr) + buffer_samples)
        segments_with_buffer.append((start, end))

    # Extract 1.1-second snippets
    snippet_duration = 1.1  # snippet duration in seconds
    snippet_samples = int(snippet_duration * sr)

    snippets = []
    timestamps = []
    for start, end in segments_with_buffer:
        segment_duration = end - start
        if segment_duration >= snippet_samples:
            for snippet_start in range(start, end - snippet_samples + 1, snippet_samples):
                snippet = y[snippet_start:snippet_start + snippet_samples]
                snippets.append(snippet)
                timestamps.append(snippet_start / sr)
        else:
            snippet = y[start:end]
            if len(snippet) < snippet_samples:
                snippet = np.pad(snippet, (0, snippet_samples - len(snippet)), mode='constant')
            snippets.append(snippet)
            timestamps.append(start / sr)

    # Save snippets
    base_filename = os.path.splitext(os.path.basename(audio_path))[0]
    snippet_folder = os.path.join(output_folder, f"{base_filename}_output_snippets")
    os.makedirs(snippet_folder, exist_ok=True)

    snippet_paths = []
    for i, snippet in enumerate(snippets):
        snippet_filename = os.path.join(snippet_folder, f'snippet_{i}.wav')
        sf.write(snippet_filename, snippet, sr)
        snippet_paths.append(snippet_filename)

    return snippet_paths, timestamps

def extract_features_and_predict(snippet_paths, model, scaler):
    predictions = []

    for snippet_path in snippet_paths:
        # Load the snippet
        y, sr = sf.read(snippet_path, dtype='float32')
        # Ensure the input shape
        y = y[:17640]  # Truncate or pad the snippet to 17640 samples
        if len(y) < 17640:
            y = np.pad(y, (0, 17640 - len(y)), 'constant')
        # Reshape for CNN input
        y = np.expand_dims(y, axis=0)
        y = np.expand_dims(y, axis=0)
        print(f"Input shape before feeding to model: {y.shape}")  # Debugging print
        # Scale the input
        y = torch.tensor(y, dtype=torch.float32).to(device)
        y = y / torch.max(torch.abs(y))  # Normalization step
        # Predict
        with torch.no_grad():
            output = model(y)
        _, predicted_label = torch.max(output, 1)
        predictions.append(predicted_label.item())

    return predictions

# Example usage
scenes_folder = "data/mlpc24_speech_commands/scenes"
development_scenes_folder = "data/mlpc24_speech_commands/development_scenes"
annotations_path = "data/development_scene_annotations.csv"
scenes_path = "data/development_scenes.csv"
token = "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"  # Replace with your Hugging Face access token
output_folder = "output_snippets"
vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=token)

model_path = "MLPC-main/models/audio_cnn_model_complete.pth"
scaler_path = "MLPC-main/models/scaler.pkl"
label_mapping_path = "MLPC-main/models/label_mapping.pkl"

# Load the annotations and scenes dataframes
annotations_df = pd.read_csv(annotations_path)
scenes_df = pd.read_csv(scenes_path)

# Load artifacts
model, scaler, label_mapping = load_artifacts(model_path, scaler_path, label_mapping_path)

# Prepare dataset and predict
results = []
for filename in os.listdir(scenes_folder):
    if filename.endswith('.wav'):
        audio_path = os.path.join(scenes_folder, filename)
        snippet_paths, timestamps = process_audio_file(audio_path, vad_pipeline, output_folder)
        predictions = extract_features_and_predict(snippet_paths, model, scaler)
        
        # Get the command from annotations
        try:
            command = annotations_df[annotations_df['filename'] == filename]['command'].values[0]
        except IndexError:
            print(f"No command found for {filename}")
            command = None
        
        # Collect results
        for timestamp, prediction in zip(timestamps, predictions):
            results.append([filename, command, timestamp, prediction])

# Save results to a CSV file
results


Input shape before feeding to model: (1, 1, 17640)
Shape after conv1 and pool1: torch.Size([1, 64, 8820])
Shape after conv2 and pool2: torch.Size([1, 128, 4410])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x564480 and 1280x128)

In [4]:
# Example usage
scenes_folder = "data/mlpc24_speech_commands/scenes"
development_scenes_folder = "data/mlpc24_speech_commands/development_scenes"
annotations_path = "data/development_scene_annotations.csv"
scenes_path = "data/development_scenes.csv"
token = "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"  # Replace with your Hugging Face access token
output_folder = "output_snippets"
vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=token)

model_path = "MLPC-main/models/audio_cnn_model_complete.pth"
scaler_path = "MLPC-main/models/scaler.pkl"
label_mapping_path = "MLPC-main/models/label_mapping.pkl"

annotations_df = pd.read_csv(annotations_path)
scenes_df = pd.read_csv(scenes_path)

# Print the columns to inspect them
print("Annotations DataFrame Columns:", annotations_df.columns)
print("Scenes DataFrame Columns:", scenes_df.columns)

Annotations DataFrame Columns: Index(['filename', 'command', 'start', 'end'], dtype='object')
Scenes DataFrame Columns: Index(['filename', 'speaker_id'], dtype='object')


In [1]:
print("HI")

HI
