# Utils

In [2]:
import os
import glob
import librosa
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split


# Loading Dataset

In [6]:
# Verzeichnispfad anpassen
audio_dir = "data"
# Liste aller Audiodateien
filepaths = glob.glob(os.path.join(audio_dir, '**/*.wav'), recursive=True)

# # Daten und Labels speichern
data = np.array([])
labels = []

for filepath in filepaths:
  # Audio laden und in NumPy-Array konvertieren
  y, sr = librosa.load(filepath, sr=None)
  print(y.shape, filepath)
  # Label aus Dateiname extrahieren (z.B. "knock_001.wav" -> 1)
  label =  1 if os.path.basename(filepath).split("_")[0] == "knock" else 0

  # Daten und Labels hinzufügen
  if data.size == 0:
    data = y
  else:
    data = np.vstack((data, y))
  labels.append(label)

np.save("data/npy/data.npy", data)
np.save("data/npy/labels.npy", labels)


(88200,) data\knocks\knock_1.wav
(88200,) data\knocks\knock_10.wav
(88200,) data\knocks\knock_10_var0.wav
(88200,) data\knocks\knock_10_var1.wav
(88200,) data\knocks\knock_10_var2.wav
(88200,) data\knocks\knock_10_var3.wav
(88200,) data\knocks\knock_10_var4.wav
(88200,) data\knocks\knock_10_var5.wav
(88200,) data\knocks\knock_10_var6.wav
(88200,) data\knocks\knock_10_var7.wav
(88200,) data\knocks\knock_10_var8.wav
(88200,) data\knocks\knock_10_var9.wav
(88200,) data\knocks\knock_11.wav
(88200,) data\knocks\knock_11_var0.wav
(88200,) data\knocks\knock_11_var1.wav
(88200,) data\knocks\knock_11_var2.wav
(88200,) data\knocks\knock_11_var3.wav
(88200,) data\knocks\knock_11_var4.wav
(88200,) data\knocks\knock_11_var5.wav
(88200,) data\knocks\knock_11_var6.wav
(88200,) data\knocks\knock_11_var7.wav
(88200,) data\knocks\knock_11_var8.wav
(88200,) data\knocks\knock_11_var9.wav
(88200,) data\knocks\knock_1_var0.wav
(88200,) data\knocks\knock_1_var1.wav
(88200,) data\knocks\knock_1_var2.wav
(8820

## testing loaded Data

In [7]:
print(data[0], labels[0])
print(type(data))

[ 1.2207031e-04  0.0000000e+00 -6.1035156e-05 ... -6.1035156e-05
 -9.1552734e-05  0.0000000e+00] 1
<class 'numpy.ndarray'>


In [18]:
# Daten in Tensoren konvertieren
data_tensor = torch.from_numpy(np.array(data)).float()

import torchvision.transforms as transforms

# Define the mean and standard deviation for normalization
mean = data_tensor.mean()
std = data_tensor.std()

np.save('data/npy/mean.npy', mean)
np.save('data/npy/std.npy', std)



# Create the transform
transform = transforms.Normalize(mean=mean, std=std)

# Reshape the data_tensor to have the shape (batch_size, channels, height, width)
data_tensor_for_normalisazion = data_tensor.view(-1, 1, 1, data_tensor.shape[1])
# Apply the transform to the data_tensor
normalized_data = transform(data_tensor_for_normalisazion)

labels_tensor = torch.from_numpy(np.array(labels)).long()

# Teilen der Daten in Trainings- und Testdatensätze
X_train, X_test, y_train, y_test = train_test_split(data_tensor, labels_tensor, test_size=0.2)


# Training Model

In [16]:

class AudioClassifier(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(AudioClassifier, self).__init__()
    self.lstm = nn.LSTM(input_size, hidden_size)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    # x: (batch_size, sequence_length, input_size)
    x, _ = self.lstm(x)
    # x: (batch_size, sequence_length, hidden_size)
    out = self.fc(x[:, -1, :])  # Zugriff auf den letzten Output des LSTMs
    # out: (batch_size, num_classes)
    return F.log_softmax(out, dim=1)  # Log-Softmax für Klassifikation


In [21]:
# Hyperparameter
input_size = 88200  # Länge der Audiodaten
hidden_size = 64
num_classes = 2  # Klopfen (0) und Geräusche (1)

# Modell initialisieren
model = AudioClassifier(input_size, hidden_size, num_classes)

# Optimierer und Verlustfunktion definieren
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

# Trainingsdaten laden (muss angepasst werden)
# ...

# Set the model to training mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
X_train = X_train.to(device)
y_train = y_train.to(device)
model.train()

# Train the model
for epoch in range(10):
  # Clear the gradients
  optimizer.zero_grad()

  # Forward pass
  outputs = model(X_train)

  # Calculate the loss
  loss = criterion(outputs, y_train)

  # Backward pass
  loss.backward()

  # Update the weights
  optimizer.step()

  # Print the loss for each epoch
  print(f"Epoch {epoch+1}: Loss = {loss.item()}")

# Save the trained model
torch.save(model.state_dict(), "audio_classifier.pt")


IndexError: too many indices for tensor of dimension 2