---
---

# Modelling

---
---

In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import sys
sys.path.append("../src")
from load_config import load_constants_from_yaml
from sklearn.preprocessing import LabelEncoder

Define constants

In [48]:
constants = load_constants_from_yaml('../constants.yml')

SAMPLING_RATING = constants["SAMPLING_RATING"]
FRAME_LENGTH_ENERGY = constants["FRAME_LENGTH_ENERGY"]
THRESHOLD_PERCENTAGE = constants["THRESHOLD_PERCENTAGE"]
MIN_SILENCE_DURATION = constants["MIN_SILENCE_DURATION"]
HOP_LENGTH = constants["HOP_LENGTH"]
TEST_SIZE = 0.2
FIRST_LAYER_NEURONS = 128
SECOND_LAYER_NEURONS = 64
RANDOM_STATE = 42
processed_data_path = "../data/processed/"

Load data

In [49]:
df = pd.read_csv(processed_data_path+"df_transformed.csv").drop("Unnamed: 0", axis = 1)
#segmented_mfccs = np.load(processed_data_path+"segmented_mfccs.npy")
#onehot_labels = np.load(processed_data_path+"segmented_onehot_labels.npy")

In [50]:
df.columns[0]

'labels'

In [51]:
df.head(5)

Unnamed: 0,labels,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13
0,english,-637.64417,28.056667,25.881245,22.61752,18.732693,14.74716,11.137844,8.256512,6.279685,5.19876,4.849034,4.967811,5.265425
1,english,-645.1791,17.923923,17.226955,16.14312,14.777039,13.253267,11.699085,10.227979,8.926468,7.846347,7.002743,6.378356,5.931872
2,english,-655.1502,4.062725,4.061807,4.060277,4.058136,4.055384,4.052021,4.048049,4.043466,4.038274,4.032475,4.026067,4.019054
3,english,-655.1509,4.061788,4.060869,4.059341,4.057201,4.054449,4.051086,4.047115,4.042531,4.037343,4.031544,4.025139,4.018128
4,english,-655.15125,4.061337,4.06042,4.05889,4.056751,4.053999,4.050637,4.046666,4.042084,4.036895,4.031097,4.024692,4.017683


Split data set

In [52]:
# Assuming y_train is your array of labels
label_encoder = LabelEncoder()

X = df.drop("labels", axis=1)
y = df["labels"]
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

### Model architecture

Define the model

In [53]:
model = keras.Sequential([
    # Input shape based on MFCC features
    layers.Input(shape=X_train.shape[1:]),
    layers.Flatten(),  # Flatten the input
    # Dense layer with ReLU activation
    layers.Dense(FIRST_LAYER_NEURONS, activation='relu'),
    # Dense layer with ReLU activation
    layers.Dense(SECOND_LAYER_NEURONS, activation='relu'),
    # Output layer with sigmoid activation for binary classification
    layers.Dense(1, activation='sigmoid')
])

Compile the model

In [54]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

### Train the model

In [55]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m11238/11240[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.6070 - loss: 0.0000e+00

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m11240/11240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 7ms/step - accuracy: 0.6070 - loss: 0.0000e+00 - val_accuracy: 0.6064 - val_loss: 0.0000e+00
Epoch 2/10
[1m11240/11240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 8ms/step - accuracy: 0.6078 - loss: 0.0000e+00 - val_accuracy: 0.6064 - val_loss: 0.0000e+00
Epoch 3/10
[1m11240/11240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 7ms/step - accuracy: 0.6063 - loss: 0.0000e+00 - val_accuracy: 0.6064 - val_loss: 0.0000e+00
Epoch 4/10
[1m11240/11240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 5ms/step - accuracy: 0.6075 - loss: 0.0000e+00 - val_accuracy: 0.6064 - val_loss: 0.0000e+00
Epoch 5/10
[1m11240/11240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 8ms/step - accuracy: 0.6085 - loss: 0.0000e+00 - val_accuracy: 0.6064 - val_loss: 0.0000e+00
Epoch 6/10
[1m11240/11240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 16ms/step - accuracy: 0.6086 - loss: 0.0000e+00 - v

<keras.src.callbacks.history.History at 0x7e467d5019c0>

### Evaluate the model on training set

In [56]:
# Evaluate the model on training set
train_loss, train_accuracy = model.evaluate(X_train, y_train)
print(f"Train loss: {train_loss}")
print(f"Train accuracy: {train_accuracy}")

[1m14050/14050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 4ms/step - accuracy: 0.6071 - loss: 0.0000e+00
Train loss: 0.0
Train accuracy: 0.6072508692741394


### Evaluate the model on the testing set

In [57]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


[1m3513/3513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6098 - loss: 0.0000e+00
Test Loss: 0.0
Test Accuracy: 0.6092995405197144
