In [1]:
import numpy as np

In [2]:
human1_deconv = np.load('data/LivingRoom_preprocessed_hack/Human1/deconvoled_trim.npy')
human1_deconv

array([[[ 9.07240337e-06,  1.71257434e-05,  1.22597794e-05, ...,
          8.86178623e-06,  1.40686043e-05,  1.06616626e-05],
        [-8.23853679e-06, -8.23373648e-06, -8.74563466e-06, ...,
          2.87394778e-06, -8.90485808e-06, -3.52464713e-06],
        [ 6.62780906e-07, -1.40201587e-06,  1.83228190e-06, ...,
         -1.06218545e-06,  8.53497477e-06,  2.28155182e-06],
        [ 8.11119719e-07, -2.47451499e-06,  5.82140547e-06, ...,
          7.73804004e-06,  2.29692650e-05,  1.10566034e-05]],

       [[ 1.64466037e-05,  1.53698293e-05,  1.91599720e-05, ...,
          1.76548729e-05,  1.70955263e-05,  1.41169767e-05],
        [ 1.05826217e-07,  9.13145777e-06,  1.76140716e-07, ...,
          1.21369967e-05,  1.55958387e-05,  1.23088676e-05],
        [ 7.06836090e-06,  3.26818258e-06,  7.15508941e-06, ...,
          4.64980394e-06,  8.96262554e-06,  6.55734584e-06],
        [-3.18089551e-05, -8.93297238e-06, -2.51026504e-05, ...,
         -7.02741545e-07,  1.34268867e-05,  3.24307

In [3]:
human1_deconv.shape

(1000, 4, 667200)

In [4]:
human1_deconv_2mic = human1_deconv[:, :2, :]
human1_deconv_2mic.shape

(1000, 2, 667200)

In [5]:
human1_centroid = np.load('data/LivingRoom_preprocessed_hack/Human1/centroid.npy')
human1_centroid

array([[-3231.3293467 , -1127.87771457],
       [-3198.54107875,  -744.5100656 ],
       [-3192.9776274 ,  -248.26678827],
       ...,
       [-1717.89923578, -3166.59648491],
       [-1808.60337549, -2779.13038427],
       [   44.43741322,   106.48353609]])

In [6]:
human1_centroid.shape

(1000, 2)

In [7]:
from sklearn.model_selection import train_test_split
import torch

X_train, X_test, y_train, y_test = train_test_split(human1_deconv_2mic, human1_centroid, test_size=0.2, random_state=42)

X_train = torch.Tensor(X_train).cuda()
X_test = torch.Tensor(X_test).cuda()

y_train = torch.Tensor(y_train).cuda()
y_test = torch.Tensor(y_test).cuda()

In [8]:
X_train.shape

torch.Size([800, 2, 667200])

In [9]:
import torchaudio
import torchaudio.transforms as T

def audio_to_melspectrogram(audio, sample_rate=44100, n_mels=64, n_fft=2048, hop_length=512):
    audio = torch.tensor(audio, dtype=torch.float32).cuda()
    if audio.ndim == 1:
        audio = audio.unsqueeze(0)
    transformer = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels).cuda()
    melspectrogram = transformer(audio)
    return melspectrogram.squeeze(0)



In [10]:
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    def __init__(self, audio_data, labels, transform=None):
        self.audio_data = audio_data
        _max = torch.Tensor([500, 2000]).cuda()
        _min = torch.Tensor([-4000, -4000]).cuda()
        self.labels = (labels - _min) / (_max - _min)
        self.transform = transform

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        audio_signal = self.audio_data[idx]
        label = self.labels[idx]
        if self.transform:
            audio_signal = self.transform(audio_signal)
        return audio_signal, label

def transform_to_melspectrogram(audio_signal):
    # Convert the raw audio signal to a Mel spectrogram (or any other preprocessing)
    melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32).cuda())
    return melspec


In [11]:
train_dataset = AudioDataset(X_train, y_train, transform=transform_to_melspectrogram)
test_dataset = AudioDataset(X_test, y_test, transform=transform_to_melspectrogram)

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [12]:
import torch.nn as nn
import torch.nn.functional as F

class VGGishModel(nn.Module):
    def __init__(self):
        super(VGGishModel, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(2, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        ).cuda()
        # The output of features will depend on the input size, which should be calculated
        self.classifier = nn.Sequential(
            nn.Linear(165888, 4096),  # This dimension might need adjustment
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 2),  # Assume 1000 classes; adjust as needed
            nn.Sigmoid()
        ).cuda()
    
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1).cuda()
        x = self.classifier(x)
        return x


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGGishModel().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training and validation loop
for epoch in range(100):
    model.train()
    running_loss = 0.0
    for spectrograms, coordinates in train_loader:
        spectrograms = spectrograms.to(device)
        coordinates = coordinates.to(device)

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, coordinates)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    validation_loss = 0.0
    with torch.no_grad():
        for spectrograms, coordinates in test_loader:
            spectrograms = spectrograms.to(device)
            coordinates = coordinates.to(device)

            outputs = model(spectrograms)
            loss = criterion(outputs, coordinates)
            validation_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {running_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}")


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32).cuda())
  audio = torch.tensor(audio, dtype=torch.float32).cuda()


Epoch 1, Training Loss: 0.05572466768324375, Validation Loss: 0.056741717629707776
Epoch 2, Training Loss: 0.05536507040262222, Validation Loss: 0.05619623454717489
Epoch 3, Training Loss: 0.055435036793351176, Validation Loss: 0.05651582261690727
Epoch 4, Training Loss: 0.05547938644886017, Validation Loss: 0.05638990150048183
Epoch 5, Training Loss: 0.055176672488451005, Validation Loss: 0.05686721864801187
Epoch 6, Training Loss: 0.055147481486201286, Validation Loss: 0.056249238264102205
Epoch 7, Training Loss: 0.05510280638933182, Validation Loss: 0.0568136263352174
Epoch 8, Training Loss: 0.05509075425565243, Validation Loss: 0.056613756773563534
Epoch 9, Training Loss: 0.053678588941693306, Validation Loss: 0.04379569667463119
Epoch 10, Training Loss: 0.041486500650644305, Validation Loss: 0.033153234622799434
Epoch 11, Training Loss: 0.034487190283834936, Validation Loss: 0.032092575843517594
Epoch 12, Training Loss: 0.03319721169769764, Validation Loss: 0.029935273700035535
Ep

In [15]:
model.eval()

_max = np.array([500, 2000])
_min = np.array([-4000, -4000])

def unnormalize(x):
    return (x + 1) / 2 * (_max - _min) + _min

distance_errors = []

for spectrograms, actual_coordinates in test_loader:
    spectrograms = spectrograms.to(device)
    actual_coordinates = actual_coordinates.to(device)
    
    with torch.no_grad():
        predicted_coordinates = model(spectrograms)
        predicted_coordinates = predicted_coordinates.cpu().numpy()
        actual_coordinates = actual_coordinates.cpu().numpy()
    
    for pred, actual in zip(predicted_coordinates, actual_coordinates):
        pred_unnorm = unnormalize(pred)
        actual_unnorm = unnormalize(actual)
        
        distance = np.linalg.norm(pred_unnorm - actual_unnorm)
        distance_errors.append(distance)

average_distance_error = np.mean(distance_errors)

print("Average Distance Error:", average_distance_error)


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32).cuda())
  audio = torch.tensor(audio, dtype=torch.float32).cuda()


Average Distance Error: 202.3763176243583
