In [1]:
import numpy as np

## Loading the human1 data

In [2]:
human1_deconv = np.load('data/LivingRoom_preprocessed_hack/Human1/deconvoled_trim.npy')
human1_deconv

array([[[ 9.07240337e-06,  1.71257434e-05,  1.22597794e-05, ...,
          8.86178623e-06,  1.40686043e-05,  1.06616626e-05],
        [-8.23853679e-06, -8.23373648e-06, -8.74563466e-06, ...,
          2.87394778e-06, -8.90485808e-06, -3.52464713e-06],
        [ 6.62780906e-07, -1.40201587e-06,  1.83228190e-06, ...,
         -1.06218545e-06,  8.53497477e-06,  2.28155182e-06],
        [ 8.11119719e-07, -2.47451499e-06,  5.82140547e-06, ...,
          7.73804004e-06,  2.29692650e-05,  1.10566034e-05]],

       [[ 1.64466037e-05,  1.53698293e-05,  1.91599720e-05, ...,
          1.76548729e-05,  1.70955263e-05,  1.41169767e-05],
        [ 1.05826217e-07,  9.13145777e-06,  1.76140716e-07, ...,
          1.21369967e-05,  1.55958387e-05,  1.23088676e-05],
        [ 7.06836090e-06,  3.26818258e-06,  7.15508941e-06, ...,
          4.64980394e-06,  8.96262554e-06,  6.55734584e-06],
        [-3.18089551e-05, -8.93297238e-06, -2.51026504e-05, ...,
         -7.02741545e-07,  1.34268867e-05,  3.24307

In [3]:
human1_deconv.shape

(1000, 4, 667200)

In [4]:
human1_centroid = np.load('data/LivingRoom_preprocessed_hack/Human1/centroid.npy')
human1_centroid

array([[-3231.3293467 , -1127.87771457],
       [-3198.54107875,  -744.5100656 ],
       [-3192.9776274 ,  -248.26678827],
       ...,
       [-1717.89923578, -3166.59648491],
       [-1808.60337549, -2779.13038427],
       [   44.43741322,   106.48353609]])

In [5]:
human1_centroid.shape

(1000, 2)

## Splitting the data into train and test sets

In [6]:
from sklearn.model_selection import train_test_split
import torch

X_train, X_test, y_train, y_test = train_test_split(human1_deconv, human1_centroid, test_size=0.2, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

X_train = torch.Tensor(X_train)
X_test = torch.Tensor(X_test)
X_valid = torch.Tensor(X_valid)

y_train = torch.Tensor(y_train)
y_test = torch.Tensor(y_test)
y_valid = torch.Tensor(y_valid)

## Preprocessing the data

### Featurization with the spectrogram

In [7]:
import torchaudio
import torchaudio.transforms as T

def audio_to_spectrogram(audio, sample_rate=44100, n_fft=2048, hop_length=512):
    audio = torch.tensor(audio, dtype=torch.float32)
    if audio.ndim == 1:
        audio = audio.unsqueeze(0)
    transformer = torchaudio.transforms.Spectrogram(n_fft=n_fft, hop_length=hop_length)
    spectrogram = transformer(audio)
    return spectrogram.squeeze(0)



### Class to load the data into pytorch with DataLoader

In [8]:
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    def __init__(self, audio_data, labels, transform=None):
        self.audio_data = audio_data
        _max = torch.Tensor([500, 2000])
        _min = torch.Tensor([-4000, -4000])
        self.labels = (labels - _min) / (_max - _min)
        self.transform = transform

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        audio_signal = self.audio_data[idx]
        label = self.labels[idx]
        if self.transform:
            audio_signal = self.transform(audio_signal)
        return audio_signal, label

def transform_to_spectrogram(audio_signal):
    spectrogram = audio_to_spectrogram(torch.tensor(audio_signal, dtype=torch.float32))
    return spectrogram


In [9]:
train_dataset = AudioDataset(X_train, y_train, transform=transform_to_spectrogram)
test_dataset = AudioDataset(X_test, y_test, transform=transform_to_spectrogram)
valid_dataset = AudioDataset(X_valid, y_valid, transform=transform_to_spectrogram)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

## VGGish Model

In [10]:
import torch.nn as nn
import torch.nn.functional as F

class VGGishModel(nn.Module):
    def __init__(self):
        super(VGGishModel, self).__init__()
        self.featurization = nn.Sequential(
            nn.Conv2d(4, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.xy = nn.Sequential(
            nn.Linear(2654208, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 2),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        x = self.featurization(x)
        x = torch.flatten(x, 1)
        x = self.xy(x)
        return x


### Training the model

In [11]:
_max = np.array([500, 2000])
_min = np.array([-4000, -4000])

def unnormalize(x):
    return (x + 1) / 2 * (_max - _min) + _min

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGGishModel().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(100):
    model.train()
    running_loss = 0.0
    for spectrograms, coordinates in train_loader:
        spectrograms = spectrograms.to(device)
        coordinates = coordinates.to(device)

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, coordinates)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    validation_loss = 0.0
    with torch.no_grad():
        for spectrograms, coordinates in valid_loader:
            spectrograms = spectrograms.to(device)
            coordinates = coordinates.to(device)

            outputs = model(spectrograms)
            loss = criterion(outputs, coordinates)
            validation_loss += loss.item()

        for spectrograms, coordinates in test_loader:
            spectrograms = spectrograms.to(device)
            coordinates = coordinates.to(device)

            outputs = model(spectrograms)
            outputs = outputs.cpu().numpy()
            coordinates = coordinates.cpu().numpy()

            for pred, actual in zip(outputs, coordinates):
                pred_unnorm = unnormalize(pred)
                actual_unnorm = unnormalize(actual)
                distance = np.linalg.norm(pred_unnorm - actual_unnorm)
                test_distances.append(distance)

    average_distance_error = np.mean(test_distances)
    std_distance_error = np.std(test_distances)

    print(f"Epoch {epoch+1}, Training Loss: {running_loss / len(train_loader)}, "
          f"Validation Loss: {validation_loss / len(valid_loader)}, "
          f"Average Test Distance Error: {average_distance_error}, "
          f"STD of Test Distance Errors: {std_distance_error}")

  spectrogram = audio_to_spectrogram(torch.tensor(audio_signal, dtype=torch.float32))
  audio = torch.tensor(audio, dtype=torch.float32)


OutOfMemoryError: CUDA out of memory. Tried to allocate 40.50 GiB. GPU 0 has a total capacity of 79.11 GiB of which 27.76 GiB is free. Process 1930158 has 51.34 GiB memory in use. Of the allocated memory 50.26 GiB is allocated by PyTorch, and 379.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Evaluating the model

In [None]:
model.eval()

distance_errors = []

for spectrograms, actual_coordinates in test_loader:
    spectrograms = spectrograms.to(device)
    actual_coordinates = actual_coordinates.to(device)
    
    with torch.no_grad():
        predicted_coordinates = model(spectrograms)
        predicted_coordinates = predicted_coordinates.cpu().numpy()
        actual_coordinates = actual_coordinates.cpu().numpy()
    
    for pred, actual in zip(predicted_coordinates, actual_coordinates):
        pred_unnorm = unnormalize(pred)
        actual_unnorm = unnormalize(actual)
        
        distance = np.linalg.norm(pred_unnorm - actual_unnorm)
        distance_errors.append(distance)

average_distance_error = np.mean(distance_errors)

print("Average Distance Error:", average_distance_error)

standard_deviation = np.std(distance_errors)

print("Standard Deviation of Distance Errors:", standard_deviation)

Average Distance Error: 525.2892631156277
Standard Deviation of Distance Errors: 310.2712909710867


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32))
  audio = torch.tensor(audio, dtype=torch.float32)


## Loading the human2 dataset

In [None]:
human2_deconv = np.load('data/LivingRoom_preprocessed_hack/Human2/deconvoled_trim.npy')
human2_deconv

array([[[ 1.32263867e-05,  4.04477396e-06,  1.72964337e-05, ...,
          1.42459985e-05,  7.06920127e-06,  1.52228458e-05],
        [ 1.66781626e-06,  4.13932912e-06,  4.92448726e-06, ...,
          1.69338909e-05,  1.40092038e-06,  1.19383767e-05],
        [ 1.15825042e-05,  4.49341405e-06,  1.40825741e-05, ...,
          1.12802718e-05,  1.38609530e-05,  6.46868511e-06],
        [-2.10553353e-05, -2.40086247e-05, -1.06697380e-05, ...,
         -2.20970142e-05, -2.00329578e-05, -2.09568789e-05]],

       [[-7.21401693e-06,  1.12978867e-07, -4.36823848e-06, ...,
         -1.81246889e-06, -8.53981510e-06, -1.84635439e-06],
        [ 1.36644203e-05,  2.30385322e-05,  1.24044136e-05, ...,
          9.66903553e-06,  1.00616371e-05,  1.03869361e-05],
        [ 2.78378029e-05,  1.38622945e-05,  2.71665067e-05, ...,
          1.33699305e-05,  1.06378029e-05,  1.05457875e-05],
        [ 1.15678056e-04,  1.64297759e-04,  5.92336910e-05, ...,
          5.16751425e-05,  3.32121781e-05,  1.38982

In [None]:
human2_deconv.shape

(104, 4, 667200)

In [None]:
human2_centroid = np.load('data/LivingRoom_preprocessed_hack/Human2/centroid.npy')
human2_centroid

array([[-2406.85413008, -2443.62906004],
       [-1923.16230038, -2504.30576997],
       [-1615.29307549, -2525.71107555],
       [-1234.84729435, -2581.21590946],
       [ -906.89320474, -2648.96828851],
       [ -579.82451164, -2563.38746474],
       [ -258.89043633, -2501.99822189],
       [ -237.00812692, -2957.79113999],
       [ -608.16773853, -2894.26845589],
       [ -920.67771947, -2892.68441764],
       [-1177.38289287, -3050.12473983],
       [-1690.95162773, -3023.00826184],
       [-1968.49135428, -3048.59689339],
       [-2401.44182176, -2925.40384273],
       [-2626.65068051, -2353.52639219],
       [-2945.10567004, -2270.29924636],
       [-3288.80884616, -2179.77519901],
       [-3543.43725461, -2122.72140672],
       [-3818.97939345, -2074.06640841],
       [-3787.00508742, -1745.78914635],
       [-3440.47071217, -1786.24662486],
       [-3126.89926719, -1778.7456592 ],
       [-2780.8277654 , -1806.35696377],
       [-2443.03492952, -1839.64542774],
       [-2359.37

In [None]:
human2_centroid.shape

(104, 2)

In [None]:
from sklearn.model_selection import train_test_split
import torch

X_train, X_test, y_train, y_test = train_test_split(human2_deconv, human2_centroid, test_size=0.2, random_state=42)

X_train = torch.Tensor(X_train)
X_test = torch.Tensor(X_test)

y_train = torch.Tensor(y_train)
y_test = torch.Tensor(y_test)

In [None]:
train_dataset = AudioDataset(X_train, y_train, transform=transform_to_spectrogram)
test_dataset = AudioDataset(X_test, y_test, transform=transform_to_spectrogram)

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Training the model on the Human 2 dataset

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGGishModel().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(100):
    model.train()
    running_loss = 0.0
    for spectrograms, coordinates in train_loader:
        spectrograms = spectrograms.to(device)
        coordinates = coordinates.to(device)

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, coordinates)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    validation_loss = 0.0
    with torch.no_grad():
        for spectrograms, coordinates in test_loader:
            spectrograms = spectrograms.to(device)
            coordinates = coordinates.to(device)

            outputs = model(spectrograms)
            loss = criterion(outputs, coordinates)
            validation_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {running_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}")


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32))
  audio = torch.tensor(audio, dtype=torch.float32)


Epoch 1, Training Loss: 0.06113321209947268, Validation Loss: 0.046385763213038445
Epoch 2, Training Loss: 0.05050892817477385, Validation Loss: 0.04715699516236782
Epoch 3, Training Loss: 0.05222000243763129, Validation Loss: 0.045040188357234
Epoch 4, Training Loss: 0.04746897146105766, Validation Loss: 0.0447651743888855
Epoch 5, Training Loss: 0.04852274184425672, Validation Loss: 0.044496551156044006
Epoch 6, Training Loss: 0.04723503440618515, Validation Loss: 0.044571490958333015
Epoch 7, Training Loss: 0.047738206262389817, Validation Loss: 0.0445479191839695
Epoch 8, Training Loss: 0.047862776244680084, Validation Loss: 0.0453879963606596
Epoch 9, Training Loss: 0.045510398844877877, Validation Loss: 0.044871823862195015
Epoch 10, Training Loss: 0.04944590417047342, Validation Loss: 0.04463179595768452
Epoch 11, Training Loss: 0.04504664366443952, Validation Loss: 0.045054176822304726
Epoch 12, Training Loss: 0.04469514482965072, Validation Loss: 0.044978370890021324
Epoch 13,

## Evaluating on the Human 2 dataset

In [None]:
model.eval()

_max = np.array([500, 2000])
_min = np.array([-4000, -4000])

def unnormalize(x):
    return (x + 1) / 2 * (_max - _min) + _min

distance_errors = []

for spectrograms, actual_coordinates in test_loader:
    spectrograms = spectrograms.to(device)
    actual_coordinates = actual_coordinates.to(device)
    
    with torch.no_grad():
        predicted_coordinates = model(spectrograms)
        predicted_coordinates = predicted_coordinates.cpu().numpy()
        actual_coordinates = actual_coordinates.cpu().numpy()
    
    for pred, actual in zip(predicted_coordinates, actual_coordinates):
        pred_unnorm = unnormalize(pred)
        actual_unnorm = unnormalize(actual)
        
        distance = np.linalg.norm(pred_unnorm - actual_unnorm)
        distance_errors.append(distance)

average_distance_error = np.mean(distance_errors)

print("Average Distance Error:", average_distance_error)


Average Distance Error: 479.9898824158726


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32))
  audio = torch.tensor(audio, dtype=torch.float32)
