### Import models


In [54]:
import torch
import torch.nn as nn
import torchaudio
import sounddevice as sd
import torchvision.transforms as transforms
from torchvision.models import resnet18
from torch.autograd import Variable
from IPython.display import Audio


try:
    # MULTI GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = resnet18(pretrained=False)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(512, 10)
    model = nn.DataParallel(model)  # Add this line
    model.load_state_dict(torch.load('ResNet18_Best.pth', map_location=device))
    model = model.to(device)
    model = model.eval()

    state_dict = torch.load('ResNet18_Best.pth', map_location=device)
    new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
    model.load_state_dict(new_state_dict)
except:
    #One GPU or CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = resnet18(pretrained=False)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(512, 10)
    state_dict = torch.load('ResNet18_Best.pth', map_location=device)
    new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
    model.load_state_dict(new_state_dict)
    model = model.to(device)
    model = model.eval()



In [55]:
SAMPLE_RATE = 44100

class MonoToColor(nn.Module):
    def __init__(self, num_channels=3):
        super(MonoToColor, self).__init__()
        self.num_channels = num_channels

    def forward(self, tensor):
        return tensor.repeat(self.num_channels, 1, 1)

# Apply the same transformation as used during training
transformation = transforms.Compose([
    torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=128),
    torchaudio.transforms.AmplitudeToDB(stype='power', top_db=80),
    MonoToColor()
])

In [63]:
def predict_sound(model, device, transformation, sample_rate, target_sample_rate):
    # Record a 3 seconds mono audio at the specified sample rate
    duration = 0.5  # seconds
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()
    
    print(recording.shape)#(22050, 1)


    # Convert to PyTorch tensor and apply transformations
    recording = torch.from_numpy(recording).float()
    
    print(recording.shape)#torch.Size([22050, 1])

    # Resample if necessary
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
        recording = resampler(recording)
        
    print(recording.shape)#torch.Size([22050, 1])

    # Mix down if necessary
    if recording.shape[0] > 1:
        recording = torch.mean(recording, dim=0, keepdim=True)
    print(recording.shape)#torch.Size([1, 1])

    # Cut or pad if necessary
    if recording.shape[1] > target_sample_rate:
        recording = recording[:, :target_sample_rate]
    elif recording.shape[1] < target_sample_rate:
        num_missing_samples = target_sample_rate - recording.shape[1]
        last_dim_padding = (0, num_missing_samples)
        recording = nn.functional.pad(recording, last_dim_padding)

    # Apply transformation
    recording = transformation(recording)
    recording = recording.unsqueeze(0)

    # Make the prediction
    recording = recording.to(device)
    outputs = model(recording[None, ...])
    _, predicted = torch.max(outputs, 1)

    return predicted.item()

# Define class labels
class_labels = ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling','engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']

# Make a prediction
predicted_index = predict_sound(model, device, transformation, SAMPLE_RATE, SAMPLE_RATE)
predicted_label = class_labels[predicted_index]
print(f"The predicted class is: {predicted_label}")


(22050, 1)
torch.Size([22050, 1])
torch.Size([22050, 1])
torch.Size([1, 1])


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 1, 3, 128, 221]

In [20]:
# Set up audio recording parameters
SAMPLE_RATE = 22050
DURATION = 1  # in seconds
CHANNELS = 1  # Mono audio
