In [1]:
!pip install pydub
!apt-get install ffmpeg


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [31]:
import os
from pydub import AudioSegment

# Define the input and output directories
input_dir = "/content/drive/My Drive/audio/marvin"  # Replace with the path to your .m4a files
output_dir = "/content/drive/My Drive/sound/marvin"  # Replace with the path to save .wav files

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Loop over all .m4a files in the input directory and convert them to .wav
for filename in os.listdir(input_dir):
    if filename.endswith(".m4a"):
        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + ".wav")

        # Convert to .wav using pydub
        print(f"Converting {filename} to WAV format...")
        audio = AudioSegment.from_file(input_file, format="m4a")
        audio.export(output_file, format="wav")
        print(f"Saved: {output_file}")

print("All files converted successfully!")


Converting Recording (28).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (28).wav
Converting Recording (27).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (27).wav
Converting Recording (24).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (24).wav
Converting Recording (25).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (25).wav
Converting Recording (26).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (26).wav
Converting Recording (23).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (23).wav
Converting Recording (22).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (22).wav
Converting Recording (2).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (2).wav
Converting Recording (20).m4a to WAV format...
Saved: /content/drive/My Drive/sound/marvin/Recording (20).wav
Converting R

In [52]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Preprocessing to convert waveforms to Mel Spectrograms
class ToMelSpectrogram:
    def __init__(self, n_mels=64, n_fft=1024, hop_length=512, fixed_length=128, sample_rate=16000):
        self.transform = T.MelSpectrogram(n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
        self.fixed_length = fixed_length
        self.sample_rate = sample_rate
        self.max_length = sample_rate  # 1 second of audio

    def __call__(self, waveform):
        # Ensure the waveform is mono (1 channel)
        if waveform.size(0) > 1:  # If the waveform has multiple channels, average them to mono
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Ensure the waveform is at least 1 second (or desired length)
        if waveform.size(1) > self.max_length:
            waveform = waveform[:, :self.max_length]  # Truncate the waveform
        elif waveform.size(1) < self.max_length:
            padding = self.max_length - waveform.size(1)
            waveform = F.pad(waveform, (0, padding))  # Pad the waveform with zeros

        # Convert the waveform to a Mel Spectrogram
        mel_spec = self.transform(waveform)  # [n_mels, time]
        mel_spec = mel_spec.squeeze(0)  # Remove channel dimension if necessary

        # Ensure the spectrogram is exactly fixed_length in time dimension
        if mel_spec.size(1) < self.fixed_length:
            pad_size = self.fixed_length - mel_spec.size(1)
            mel_spec = F.pad(mel_spec, (0, pad_size), "constant", 0)
        else:
            mel_spec = mel_spec[:, :self.fixed_length]

        # Return the spectrogram with the channel dimension [1, n_mels, fixed_length]
        return mel_spec.unsqueeze(0)


# Custom dataset class
class SpeechCommandsDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.commands = sorted(os.listdir(data_dir))
        self.file_paths = []

        for command in self.commands:
            command_dir = os.path.join(data_dir, command)
            if not os.path.isdir(command_dir):
                continue
            for filename in os.listdir(command_dir):
                if filename.endswith('.wav'):
                    self.file_paths.append((os.path.join(command_dir, filename), self.commands.index(command)))

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path, label = self.file_paths[idx]
        waveform, sample_rate = torchaudio.load(file_path)
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, label


# Define the CNN model
class CNNClassifier(nn.Module):
    def __init__(self, num_classes):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Use adaptive pooling to handle variable input sizes
        self.adaptive_pool = nn.AdaptiveAvgPool2d((16, 8))

        # Calculate the flattened size automatically
        self.fc1 = nn.Linear(32 * 16 * 8, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        # Use adaptive pooling to ensure the output size is always the same
        x = self.adaptive_pool(x)
        x = x.view(x.size(0), -1)  # Flatten the output
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Training function
def train_model(model, train_loader, num_epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        running_loss = 0.0
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            outputs = model(inputs)  # Pass the input to the CNN
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

    # Save the trained model weights after training
    torch.save(model.state_dict(), 'model_weights.pth')
    print("Model weights saved to 'model_weights.pth'")


# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Accuracy: {accuracy * 100:.2f}%')


# Load and preprocess the dataset
data_dir = '/content/drive/MyDrive/sound'
transform = ToMelSpectrogram()
dataset = SpeechCommandsDataset(data_dir, transform=transform)

# Split dataset into training and testing sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

if len(dataset) > 0:
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Initialize and train the model
    model = CNNClassifier(num_classes=len(dataset.commands))
    train_model(model, train_loader)

    # Evaluate the model
    evaluate_model(model, test_loader)
else:
    print("Dataset is empty. Please check the directory and files.")


Epoch 1, Loss: 2.788423458735148
Epoch 2, Loss: 2.768233080705007
Epoch 3, Loss: 2.7543912331263223
Epoch 4, Loss: 2.702907125155131
Epoch 5, Loss: 2.6357186436653137
Epoch 6, Loss: 2.5578250686327615
Epoch 7, Loss: 2.4688090682029724
Epoch 8, Loss: 2.4048871199289956
Epoch 9, Loss: 2.3478829065958657
Epoch 10, Loss: 2.307185490926107
Model weights saved to 'model_weights.pth'
Accuracy: 20.83%


In [53]:
import hashlib

# Path to the saved model weights
model_weights_path = 'model_weights.pth'

# Calculate the MD5 checksum
def calculate_md5(file_path):
    with open(file_path, 'rb') as file:
        data = file.read()
        return hashlib.md5(data).hexdigest()

# Get the checksum of the model weights file
md5_checksum = calculate_md5(model_weights_path)
print(f'MD5 Checksum for model_weights.pth: {md5_checksum}')


MD5 Checksum for model_weights.pth: 672421f308a6cdef999632ebe4edcb3c


In [54]:
import hashlib

# Path to the saved model weights
model_weights_path = 'model_weights.pth'

# Calculate the MD5 checksum
def calculate_md5(file_path):
    with open(file_path, 'rb') as file:
        data = file.read()
        return hashlib.md5(data).hexdigest()

# Expected checksum value (this should be copied from your initial checksum calculation)
expected_md5_checksum = '672421f308a6cdef999632ebe4edcb3c'

# Get the current checksum of the model weights file
current_md5_checksum = calculate_md5(model_weights_path)

# Verify the checksum
if current_md5_checksum == expected_md5_checksum:
    print(f'Checksum verification passed. MD5 Checksum: {current_md5_checksum}')
else:
    print(f'Checksum verification failed. MD5 Checksum: {current_md5_checksum}')


Checksum verification passed. MD5 Checksum: 672421f308a6cdef999632ebe4edcb3c
