In [108]:
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
import os
from sklearn.metrics import f1_score

In [90]:
# Dataset location and event annotations.
# `data_dir` is the folder that audio filenames from the annotations are joined onto.
data_dir = "./applications/data/TUT-sound-events-2017-development"
# Later cells read the 'filename', 'events' and 'event_type' columns of this table.
annotations = pd.read_csv(filepath_or_buffer='output.csv')

In [91]:
# Audio / feature-extraction settings shared by the cells below.
all_keys = [
    'people walking',
    'car',
    'large vehicle',
    'brakes squeaking',
    'people speaking',
    'children',
]                  # event classes; list position is the one-hot index
n_mels = 128       # used by the model definition (Input / Reshape sizes)
hop_length = 512   # hop, in samples, passed to the librosa feature calls
duration = 1.0     # not referenced by the cells visible in this notebook
sr = 44_100        # sample rate requested from librosa.load

In [117]:
# Extract one tempogram (feature) and one one-hot vector (label) per annotated event.
#
# Outputs of this cell:
#   features     - list of 2-D tempogram arrays, one per annotation row
#   labels       - list of one-hot vectors of length len(all_keys), aligned with `features`
#   max_segments - largest number of annotation rows found for a single audio file
#
# NOTE(review): `max_segments` counts annotation rows per file, yet the padding
# cell below uses it as the target number of tempogram *frames*. The two
# quantities look unrelated - confirm whether the maximum frame count was intended.
features = []
labels = []
max_segments = 0
audio_files = annotations['filename'].unique()
for audio_file in audio_files:
    file_path = os.path.join(data_dir, audio_file)
    # Load at the configured sample rate so the sample offsets below line up.
    audio, _ = librosa.load(file_path, sr=sr)
    segments = annotations[annotations['filename'] == audio_file]
    for _, row in segments.iterrows():
        # 'events' holds "start,end"; the division by 1000 suggests the values
        # are in milliseconds - TODO confirm against how output.csv was produced.
        event_time = row['events'].split(',')
        start = int(float(event_time[0]) * sr/1000)
        end = int(float(event_time[1]) * sr/1000)
        segment = audio[start:end]

        # Tempogram of the onset-strength envelope of this event's audio.
        oenv = librosa.onset.onset_strength(y=segment, sr=sr, hop_length=hop_length)
        tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
                                              hop_length=hop_length)
        features.append(tempogram)

        # One-hot label sized from the class list instead of a hard-coded 6,
        # so adding or removing a class only requires editing `all_keys`.
        label = np.zeros(len(all_keys))
        label[all_keys.index(row['event_type'])] = 1
        labels.append(label)
    num_segments = segments.shape[0]
    max_segments = max(max_segments, num_segments)

  return f(*args, **kwargs)


In [116]:
# Spot-check the shape of a single extracted feature array.
sample_feature = features[3]
sample_feature.shape

(288,)

In [118]:
# Bring every feature matrix to exactly `max_segments` columns.
def _fit_width(matrix, width):
    """Return `matrix` with its second axis zero-padded on the right, or cut, to `width` columns."""
    shortfall = width - matrix.shape[1]
    if shortfall <= 0:
        # Already wide enough: keep only the first `width` columns.
        return matrix[:, :width]
    # Too narrow: append `shortfall` zero columns; rows are left untouched.
    return np.pad(matrix, ((0, 0), (0, shortfall)))


padded_features = [_fit_width(feat_seq, max_segments) for feat_seq in features]


In [119]:
# Assemble model inputs and targets.
# Labels become one array with a row per example.
Y = np.array(labels)
# Features are stacked along a new leading axis, then given a trailing
# singleton axis (the Conv2D layers in the model expect a channel dimension).
X = tf.expand_dims(np.stack(padded_features, axis=0), axis=-1)


In [120]:
# Define event detection model: two Conv2D/MaxPooling stages, an LSTM over the
# flattened feature maps, temporal average pooling and a softmax classifier.
#
# The input shape is read from the stacked feature tensor `X` rather than being
# declared as (n_mels, max_segments, 1). The features are tempograms, whose
# height is set by librosa's tempogram window (384 by default per the librosa
# docs), not by `n_mels`, so the previous declaration did not describe the data
# actually passed to fit().
input_shape = tuple(X.shape[1:])
# Output width follows the class list instead of a hard-coded 6.
num_classes = len(all_keys)

model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    # Flatten the conv feature maps into a sequence of n_mels-wide vectors for the LSTM.
    # NOTE(review): n_mels is only a chunk size here and is not tied to the
    # feature-map layout - confirm this is the intended sequence axis.
    tf.keras.layers.Reshape((-1, n_mels)),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])


In [121]:
# Hold out the trailing 20% of examples for validation (first 80% for training).
# NOTE(review): the split is positional with no shuffling, so the validation set
# is whatever was extracted last - confirm its class balance is representative.
num_examples = X.shape[0]
train_size = int(0.8 * num_examples)

train_features, val_features = X[:train_size], X[train_size:]
train_labels, val_labels = Y[:train_size], Y[train_size:]

In [122]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 10 epochs in batches of 32, evaluating on the validation split each epoch.
model.fit(
    x=train_features,
    y=train_labels,
    batch_size=32,
    epochs=10,
    validation_data=(val_features, val_labels),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25436e05e10>

In [123]:
# Loss and accuracy of the trained model on the validation split.
loss, accuracy = model.evaluate(val_features, val_labels)

# Reduce one-hot targets and predicted class probabilities to class indices.
y_true = np.argmax(val_labels, axis=1)
y_pred = np.argmax(model.predict(val_features), axis=1)

# Macro-averaged F1: unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true, y_pred, average='macro')

# Report all three metrics with four decimal places.
for template, value in (
    ("Validation set loss: {:.4f}", loss),
    ("Validation set accuracy: {:.4f}", accuracy),
    ("Validation set F1 score: {:.4f}", f1),
):
    print(template.format(value))

Validation set loss: 1.3334
Validation set accuracy: 0.6439
Validation set F1 score: 0.1567
