In [9]:
import numpy as np

In [10]:
human1_deconv = np.load('data/LivingRoom_preprocessed_hack/Human1/deconvoled_trim.npy')
human1_deconv

array([[[ 9.07240337e-06,  1.71257434e-05,  1.22597794e-05, ...,
          8.86178623e-06,  1.40686043e-05,  1.06616626e-05],
        [-8.23853679e-06, -8.23373648e-06, -8.74563466e-06, ...,
          2.87394778e-06, -8.90485808e-06, -3.52464713e-06],
        [ 6.62780906e-07, -1.40201587e-06,  1.83228190e-06, ...,
         -1.06218545e-06,  8.53497477e-06,  2.28155182e-06],
        [ 8.11119719e-07, -2.47451499e-06,  5.82140547e-06, ...,
          7.73804004e-06,  2.29692650e-05,  1.10566034e-05]],

       [[ 1.64466037e-05,  1.53698293e-05,  1.91599720e-05, ...,
          1.76548729e-05,  1.70955263e-05,  1.41169767e-05],
        [ 1.05826217e-07,  9.13145777e-06,  1.76140716e-07, ...,
          1.21369967e-05,  1.55958387e-05,  1.23088676e-05],
        [ 7.06836090e-06,  3.26818258e-06,  7.15508941e-06, ...,
          4.64980394e-06,  8.96262554e-06,  6.55734584e-06],
        [-3.18089551e-05, -8.93297238e-06, -2.51026504e-05, ...,
         -7.02741545e-07,  1.34268867e-05,  3.24307

In [11]:
human1_deconv.shape

(1000, 4, 667200)

In [12]:
import platform
platform.platform()

'Linux-6.1.0-16-amd64-x86_64-with-glibc2.35'

In [13]:
human1_centroid = np.load('data/LivingRoom_preprocessed_hack/Human1/centroid.npy')
human1_centroid

array([[-3231.3293467 , -1127.87771457],
       [-3198.54107875,  -744.5100656 ],
       [-3192.9776274 ,  -248.26678827],
       ...,
       [-1717.89923578, -3166.59648491],
       [-1808.60337549, -2779.13038427],
       [   44.43741322,   106.48353609]])

In [14]:
human1_centroid.shape

(1000, 2)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(human1_deconv, human1_centroid, test_size=0.2, random_state=42)

X_train = torch.Tensor(X_train)
X_test = torch.Tensor(X_test)

y_train = torch.Tensor(y_train)
y_test = torch.Tensor(y_test)

In [17]:
import torch
import torchaudio
import torchaudio.transforms as T

def audio_to_melspectrogram(audio, sample_rate=44100, n_mels=64, n_fft=2048, hop_length=512):
    audio = torch.tensor(audio, dtype=torch.float32)
    if audio.ndim == 1:
        audio = audio.unsqueeze(0)
    transformer = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    melspectrogram = transformer(audio)
    return melspectrogram.squeeze(0)



In [18]:
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    def __init__(self, audio_data, labels, transform=None):
        self.audio_data = audio_data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        audio_signal = self.audio_data[idx]
        label = self.labels[idx]
        if self.transform:
            audio_signal = self.transform(audio_signal)
        return audio_signal, label

def transform_to_melspectrogram(audio_signal):
    # Convert the raw audio signal to a Mel spectrogram (or any other preprocessing)
    melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32))
    return melspec


In [19]:
train_dataset = AudioDataset(X_train, y_train, transform=transform_to_melspectrogram)
test_dataset = AudioDataset(X_test, y_test, transform=transform_to_melspectrogram)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)


In [20]:
import torch.nn as nn
import torch.nn.functional as F

class VGGishModel(nn.Module):
    def __init__(self):
        super(VGGishModel, self).__init__()
        self.conv1 = nn.Conv2d(4, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(165888, 4096)
        self.fc2 = nn.Linear(4096, 1000)
        self.fc3 = nn.Linear(1000, 2)  # Outputs the coordinates

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)
        x = F.max_pool2d(F.relu(self.conv4(x)), 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VGGishModel(nn.Module):
    def __init__(self):
        super(VGGishModel, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(4, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # The output of features will depend on the input size, which should be calculated
        self.classifier = nn.Sequential(
            nn.Linear(165888, 4096),  # This dimension might need adjustment
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 2),  # Assume 1000 classes; adjust as needed
            nn.Sigmoid()
        )
    
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGGishModel().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training and validation loop
for epoch in range(10):
    model.train()
    running_loss = 0.0
    for spectrograms, coordinates in train_loader:
        spectrograms = spectrograms.to(device)
        coordinates = coordinates.to(device)

        optimizer.zero_grad()
        outputs = model(spectrograms)
        loss = criterion(outputs, coordinates)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    validation_loss = 0.0
    with torch.no_grad():
        for spectrograms, coordinates in test_loader:
            spectrograms = spectrograms.to(device)
            coordinates = coordinates.to(device)

            outputs = model(spectrograms)
            loss = criterion(outputs, coordinates)
            validation_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {running_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}")


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32))
  audio = torch.tensor(audio, dtype=torch.float32)


Epoch 1, Training Loss: 3863751.9625, Validation Loss: 4224984.9625


In [None]:
model.eval()  # Set the model to evaluation mode

# Assuming your train_loader is already created and ready to use
# Retrieve the first batch from the DataLoader
first_batch = next(iter(train_loader))
spectrograms, actual_coordinates = first_batch

# Extract only the first element from the batch
first_spectrogram = spectrograms[0].unsqueeze(0).to(device)  # Add a batch dimension
first_actual_coordinate = actual_coordinates[0].to(device)
with torch.no_grad():  # We do not need to track gradients here
    predicted_coordinate = model(first_spectrogram)
    predicted_coordinate = predicted_coordinate.cpu().numpy()  # Move data to cpu and convert to numpy array for easy handling

print("Predicted Coordinate:", predicted_coordinate)
print("Actual Coordinate:", first_actual_coordinate.cpu().numpy())  # Also convert actual coordinates to numpy array for consistency
print("Distance", (predicted_coordinate ** 2).sum() - (first_actual_coordinate.cpu().numpy() ** 2).sum())


Predicted Coordinate: [[-2153.3108 -1000.0165]]
Actual Coordinate: [-2082.7336   -561.10864]
Distance 984158.0


  melspec = audio_to_melspectrogram(torch.tensor(audio_signal, dtype=torch.float32))
  audio = torch.tensor(audio, dtype=torch.float32)
