# Read Data

In [1]:
import os
import sys
sys.path.append(os.pardir)
from helper.load_data import load_data

In [2]:
# Locate `src/data`, assuming the kernel's working directory sits directly
# inside `src` (so that its parent directory is `src` itself).
cwd = os.getcwd()
src_dir = os.path.join(cwd, os.pardir)
data_dir = os.path.join(cwd, os.pardir, 'data')

# `load_data` is a project helper; its result is later sliced and iterated as
# a sequence of file paths.
data_file_paths = load_data(data_dir)

# Feature Matrix & Label

In [3]:
import librosa
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import pandas as pd
from scipy.fftpack import fft

# Silence all warnings, but only when the interpreter was started without
# explicit -W options, so a warning policy chosen on the command line wins.
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter('ignore')

In [4]:
def get_feature(waveform, sample_rate):
    """Return the discrete Fourier transform of ``waveform`` as its feature vector.

    ``sample_rate`` is accepted but not used by the FFT feature; it is only
    needed by the alternative MFCC feature,
    ``librosa.feature.mfcc(y=waveform, sr=sample_rate)``.

    NOTE(review): for real-valued input the spectrum is conjugate-symmetric,
    so a real FFT would halve the feature size. Not changed here because it
    alters the feature layout.
    """
    spectrum = fft(waveform)
    return spectrum

def get_label(label_str):
    """Map a label string to a class id: -1 for 'No_Whistle', 1 for anything else."""
    if label_str == 'No_Whistle':
        return -1
    return 1

In [5]:
# Build one feature vector and one label per labelled segment.
# Both lists are reset here so that re-running the cell does not append duplicates.
feature_matrix = []
labels = []

# Upper bound on the number of *paths* taken from `data_file_paths`. The cap is
# applied before filtering, so entries that are not labelled `.flac` recordings
# still count towards it.
max_file_count = 10

for file in data_file_paths[:max_file_count]:
    base, extension = os.path.splitext(file)
    label_path = base + '.csv'

    # Only recordings with a label file of the same base name can be used.
    if extension != '.flac' or not os.path.isfile(label_path):
        continue

    file_name = os.path.basename(file)
    print(file_name)
    label_df = pd.read_csv(label_path)

    for index, row in label_df.iterrows():
        start = max(0, row['start'])  # clamp negative start times to the file start
        end = row['end']
        duration = end - start

        # `not (duration > 0)` rejects zero/negative durations and also a NaN
        # duration (e.g. a missing `end` value), which `duration <= 0` would
        # let through to `librosa.load`.
        if not (duration > 0):
            # Report the base name only, so no absolute local path ends up in
            # the saved notebook output.
            print(f'Skipping row {index} of {file_name}: invalid segment ({start} to {end})')
            continue

        # Load just the labelled segment instead of the whole recording.
        waveform, sample_rate = librosa.load(
            file,
            sr=None,    # do not resample file
            mono=True,
            offset=start,
            duration=duration)

        feature_matrix.append(get_feature(waveform, sample_rate))
        labels.append(get_label(row['label']))

CometInterceptor_3,4_1.flac
Genesis_7_2.flac
DeepImpact_2,3_2.flac
Helios_2_2.flac
Helios_2,3_1.flac
Phoenix_5_2.flac
Dawn_3,5_1.flac
/Users/nico/Documents/HTWK/INM/2024 SoSe/MUS Mustererkennung/Projekt/src/src/data/Dawn_3,5_1.flac
In row 6


In [6]:
def pad_array(array, length):
    """Append zeros to the end of ``array`` until it has ``length`` entries."""
    missing = length - len(array)
    return np.pad(array, (0, missing), mode='constant')

def pad_matrix(matrix):
    """Zero-pad every row of ``matrix`` to the length of its longest row.

    Parameters:
        matrix: iterable of sized sequences (the rows).

    Returns:
        A list with one padded row per input row, in the original order.
        An empty input yields an empty list.
    """
    # Materialise once: the rows are traversed twice (length scan, then
    # padding), which would silently yield nothing for a one-shot iterator.
    rows = list(matrix)

    # `default=0` keeps an empty matrix from raising inside `max()`.
    max_length = max((len(row) for row in rows), default=0)

    return [pad_array(row, max_length) for row in rows]

In [9]:
# Bring all rows to a common length, then stack them into a single array.
# NOTE(review): the rows are spectra returned by `get_feature`, so this pads
# in the frequency domain; for segments of different durations the same column
# then refers to different frequencies. Confirm this is intended.
feature_matrix = np.array(pad_matrix(feature_matrix))

labels = np.array(labels)

In [10]:
# Sanity check: the feature rows and the labels should agree in their first
# dimension, and the first three rows should share one padded length.
for array in (feature_matrix, labels):
    print(array.shape)

for row_index in range(3):
    print(len(feature_matrix[row_index]))

(94, 9369320)
(94,)
9369320
9369320
9369320


# Train Support Vector Classifier

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the segments for testing. The classifier is given the
# magnitude of each (complex) spectrum; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    np.abs(feature_matrix),
    labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Show the shape of the training and the test split.
for split in (X_train, X_test):
    print(split.shape)

(75, 9369320)
(19, 9369320)


In [13]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default hyper-parameters.
model = SVC()
# Fit on the training split. As the last expression of the cell, the value
# returned by `fit` is also what the notebook displays.
model.fit(X_train, y_train)

## Save Model

See the [scikit-learn model persistence guide](https://scikit-learn.org/stable/model_persistence.html#skops-io) for background on the `skops` format used below.

In [15]:
import skops.io as sio

# Persist the fitted classifier in the skops format, relative to the
# working directory.
model_path = 'svc_pure_10.skops'
sio.dump(model, model_path)

# Test Model

In [16]:
from sklearn import metrics

In [17]:
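# Shortcut kept for reference: for a classifier, `model.score` reports the
# mean accuracy, i.e. the same number `metrics.accuracy_score` yields for the
# predictions computed below.
# accuracy = model.score(X_test, y_test)
# print(f'Model accuracy: {accuracy}')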

In [18]:
# Predict a label for every held-out sample; the metrics cell compares these
# predictions against `y_test`.
y_test_predict = model.predict(X_test)

In [21]:
# Score the held-out predictions first, then report all metrics together.
# Labels are -1 / 1 (see `get_label`).
accuracy = metrics.accuracy_score(y_test, y_test_predict)
precision = metrics.precision_score(y_test, y_test_predict)
recall = metrics.recall_score(y_test, y_test_predict)
f1 = metrics.f1_score(y_test, y_test_predict)

print(f'Model accuracy: {accuracy:.4f}')
print(f'Model precision: {precision:.4f}')
print(f'Model recall: {recall:.4f}')
print(f'Model F1: {f1:.4f}')

Model accuracy: 0.8421
Model precision: 0.7000
Model recall: 1.0000
Model F1: 0.8235
