In [9]:
import torchaudio.transforms as T
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio


def preprocess_audio(waveform, sample_rate, num_frames=160):
    # Resample if not already at 16000 Hz
    if sample_rate != 16000:
        resample_transform = T.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resample_transform(waveform)
    
    # Extract MFCC features
    mfcc_transform = T.MFCC(sample_rate=16000, n_mfcc=13,
                            melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 23, 'center': False})
    mfcc = mfcc_transform(waveform)
    
    # Pad or truncate to fixed length
    mfcc_length = mfcc.shape[2]
    if mfcc_length > num_frames:
        mfcc = mfcc[:, :, :num_frames]  # Truncate
    elif mfcc_length < num_frames:
        pad_size = num_frames - mfcc_length
        mfcc = torch.nn.functional.pad(mfcc, (0, pad_size), "constant", 0)  # Pad with zeros
    
    return mfcc

class VoiceDataset(Dataset):
    def __init__(self, directory):
        self.files = [os.path.join(root, file) for root, _, files in os.walk(directory) for file in files if file.endswith('.wav')]
        
    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.files[idx])
        mfcc = preprocess_audio(waveform, sample_rate)
        return mfcc, 1  # 1 for 'trigger word', 0 for 'no trigger word' or background noise

# Now, your DataLoader should be able to batch the data without the size mismatch error.



In [14]:
import torch.nn as nn

class TriggerWordModel(nn.Module):
    def __init__(self):
        super(TriggerWordModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1)  # Feature channels: 16, Output: [batch_size, 16, 13, 160]
        self.pool = nn.MaxPool2d(2, 2)  # Output dimension: [batch_size, 16, 6, 80] (H and W halved)
        self.lstm = nn.LSTM(6 * 80 * 16, 128, batch_first=True)  # Adjusted for the flattened output from conv + pool
        self.fc = nn.Linear(128, 2)  # Expecting 128 features from LSTM

    def forward(self, x):
        x = x.squeeze(2)  # Removing the extra dimension
        x = torch.relu(self.conv1(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the output from the pooling layer
        x = x.unsqueeze(1)  # Add sequence dimension for LSTM
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])  # Use the output of the last LSTM step
        return torch.log_softmax(x, dim=1)

model = TriggerWordModel()


In [15]:
from torch.optim import Adam
from torch.nn import NLLLoss

optimizer = Adam(model.parameters(), lr=0.001)
loss_function = NLLLoss()

def train_model(model, dataloader, epochs=10):
    model.train()
    for epoch in range(epochs):
        for mfcc, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(mfcc)
            loss = loss_function(outputs, torch.tensor(labels, dtype=torch.long))
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch + 1}: Loss = {loss.item()}')

# Example usage
train_model(model, dataloader)


  loss = loss_function(outputs, torch.tensor(labels, dtype=torch.long))


Epoch 1: Loss = 0.030325978994369507
Epoch 2: Loss = 0.011222492903470993
Epoch 3: Loss = 0.005913855973631144
Epoch 4: Loss = 0.004091701935976744
Epoch 5: Loss = 0.0031207927968353033
Epoch 6: Loss = 0.0025056179147213697
Epoch 7: Loss = 0.0020761380437761545
Epoch 8: Loss = 0.0017584589077159762
Epoch 9: Loss = 0.0015145983779802918
Epoch 10: Loss = 0.0013222293928265572
