# In [None]:
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchaudio
from torchaudio.transforms import MelSpectrogram
from torchvision.models import resnet18
import torch.nn as nn
import torch.optim as optim

# Define paths for datasets
# Root directory of the siren audio clips; passed as `root` to
# datasets.DatasetFolder, which is configured to pick up .wav and .mp3 files.
SIREN_SOUND_DATASET_PATH = "./data/siren_sounds/"
# Root directory of the ambulance images; passed as `root` to
# datasets.ImageFolder.
AMBULANCE_IMAGE_DATASET_PATH = "./data/ambulance_images/"

# Data preprocessing for audio
def audio_preprocessing(audio_path, n_mels=128, num_frames=128):
    """Load an audio file and return a fixed-size, mono mel spectrogram.

    The output always has the same shape so that clips of different length
    and channel count can be stacked into one batch, and so that it matches
    the 128x128 single-channel input SirenSoundClassifier is sized for.

    Args:
        audio_path: Path of the audio file, decoded with ``torchaudio.load``.
        n_mels: Number of mel bands (spectrogram height).
        num_frames: Number of time frames (spectrogram width). Shorter clips
            are zero-padded at the end; longer clips keep only their first
            ``num_frames`` frames.

    Returns:
        A tensor of shape ``(1, n_mels, num_frames)``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio.load returns (channels, time); average the channels so
    # stereo files yield the same single-channel layout as mono files.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Build the mel filterbank for the file's real sample rate instead of
    # MelSpectrogram's 16 kHz default.
    mel_spectrogram = MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)(waveform)

    # Force a constant width: zero-pad short clips, crop long ones.
    frames = mel_spectrogram.size(-1)
    if frames < num_frames:
        mel_spectrogram = nn.functional.pad(mel_spectrogram, (0, num_frames - frames))
    else:
        mel_spectrogram = mel_spectrogram[..., :num_frames]
    return mel_spectrogram

# Data preprocessing for images
# Resize to the 224x224 input used here, convert to a float tensor in [0, 1],
# then apply the ImageNet channel statistics: the pretrained torchvision
# ResNet weights loaded by AmbulanceClassifier were trained on inputs
# normalized this way.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load datasets
# Audio clips are decoded and converted by audio_preprocessing at load time;
# only .wav and .mp3 files under the root are considered.
audio_dataset = datasets.DatasetFolder(
    SIREN_SOUND_DATASET_PATH,
    loader=audio_preprocessing,
    extensions=(".wav", ".mp3"),
)

# Images go through image_transform after being opened by ImageFolder.
image_dataset = datasets.ImageFolder(AMBULANCE_IMAGE_DATASET_PATH, transform=image_transform)

# Data loaders
# Both loaders share the same settings: shuffled mini-batches of 32 samples.
_loader_options = {"batch_size": 32, "shuffle": True}
audio_loader = DataLoader(audio_dataset, **_loader_options)
image_loader = DataLoader(image_dataset, **_loader_options)

# Define AI model for image classification
class AmbulanceClassifier(nn.Module):
    """Pretrained ResNet-18 whose final layer is replaced by a 2-logit head."""

    def __init__(self):
        super().__init__()
        backbone = resnet18(pretrained=True)
        # Binary classification: swap the backbone's original fully connected
        # layer for a fresh one with the same input width and two outputs.
        backbone.fc = nn.Linear(backbone.fc.in_features, 2)
        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the backbone and return its output logits."""
        logits = self.model(x)
        return logits

# Define AI model for siren sound detection
class SirenSoundClassifier(nn.Module):
    """One 3x3 convolution, a ReLU, and a linear layer producing 2 logits.

    The linear layer is sized for single-channel 128x128 inputs, so
    ``forward`` expects a batch shaped ``(batch, 1, 128, 128)``.
    """

    def __init__(self):
        super().__init__()
        # A 3x3 kernel with stride 1 and padding 1 keeps the spatial size.
        self.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=16,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.relu = nn.ReLU()
        # Assuming 128x128 spectrograms: 16 feature maps of 128 * 128 values.
        flattened_features = 16 * 128 * 128
        self.fc1 = nn.Linear(flattened_features, 2)

    def forward(self, x):
        """Return the two class logits for every sample in ``x``."""
        activations = self.relu(self.conv1(x))
        batch_size = activations.size(0)
        # Collapse channels and spatial dimensions into one feature vector.
        flat = activations.view(batch_size, -1)
        return self.fc1(flat)

# Training function
def train_model(model, data_loader, criterion, optimizer, epochs=10):
    """Train ``model`` in place and report the mean batch loss of each epoch.

    Args:
        model: Module to optimize; it is switched to training mode.
        data_loader: Iterable yielding ``(inputs, labels)`` batches. It is
            iterated once per epoch.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of passes over ``data_loader``.

    Returns:
        A list with one float per epoch: the unweighted mean of that epoch's
        per-batch losses, or ``nan`` if ``data_loader`` yielded no batches.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in data_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Report the epoch average rather than whichever batch came last,
        # and avoid touching an unbound ``loss`` when there were no batches.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")
    return epoch_losses

# Initialize models, loss function, and optimizers
image_model = AmbulanceClassifier()
sound_model = SirenSoundClassifier()
criterion = nn.CrossEntropyLoss()  # one loss object shared by both runs
image_optimizer = optim.Adam(image_model.parameters(), lr=1e-3)
sound_optimizer = optim.Adam(sound_model.parameters(), lr=1e-3)

# Train models: the image classifier first, then the siren classifier
_training_jobs = (
    (image_model, image_loader, image_optimizer),
    (sound_model, audio_loader, sound_optimizer),
)
for _model, _loader, _optimizer in _training_jobs:
    train_model(_model, _loader, criterion, _optimizer)

# Save models: only the learned parameters (state_dict) are written
_checkpoints = {
    "ambulance_model.pth": image_model,
    "siren_sound_model.pth": sound_model,
}
for _filename, _model in _checkpoints.items():
    torch.save(_model.state_dict(), _filename)

print("Training complete. Models saved.")
