In [2]:
import os, pathlib, random, math
import numpy as np, pandas as pd, matplotlib.pyplot as plt, tensorflow as tf, tensorflow_hub as hub, soundfile as sf
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler          # handles class imbalance


In [3]:
SR             = 16_000           # YAMNet expects 16 kHz[55]
DURATION       = 3.0              # seconds (clips shorter than this are zero-padded)
BATCH_SIZE     = 32
VAL_SPLIT      = 0.15
TEST_SPLIT     = 0.15
EPOCHS         = 20


In [4]:
!pip install tqdm -q

In [5]:
from tqdm.auto import tqdm
import io
import librosa

# --- Function Definitions ---
yamnet_model    = hub.load('https://tfhub.dev/google/yamnet/1')   # returns scores, embeddings, mel-spec[55]
class_map_path = yamnet_model.class_map_path().numpy()
# Explicitly name the columns since the CSV doesn't have a header
yamnet_classes  = np.array(pd.read_csv(io.StringIO(class_map_path.decode('utf-8')), header=None, names=['index', 'name', 'display_name'])['display_name'])


def load_wav(file_path):
    wav, sr = sf.read(file_path)
    if wav.ndim > 1:  # Check if audio is stereo
        wav = np.mean(wav, axis=1) # Convert to mono
    if sr != SR:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=SR)
    wav = wav[: int(SR * DURATION)]                     # truncate
    pad_len = max(0, int(SR*DURATION) - len(wav))
    if pad_len: wav = np.pad(wav, (0, pad_len))
    return wav.astype('float32')

@tf.function
def yamnet_embed(waveform):
    _, embedding, _ = yamnet_model(waveform)
    return tf.reduce_mean(embedding, axis=0)            # 1024-D vector[55]


# --- CONFIGURATION ---
# ⚠️ Please update this path to point to your main data directory.
# This script assumes that inside this directory, there are subdirectories
# for each of your audio classes (e.g., 'data/dog', 'data/cat').
data_dir = pathlib.Path('/content/drive/MyDrive/scream_dataset/Converted_Separately')
# ---------------------


if not data_dir.exists():
  print(f"Error: The directory '{data_dir}' was not found.")
  print("Please make sure you have uploaded your data and the path is correct.")
else:
  # Find all .wav files and extract their parent directory name as the label
  filepaths = list(data_dir.glob('*/*.wav'))

  if not filepaths:
    print(f"Error: No .wav files were found in the subdirectories of '{data_dir}'.")
    print("Please check your directory structure.")
  else:
    labels = [path.parent.name for path in filepaths]

    # Create a DataFrame
    files_df = pd.DataFrame({
        'filepath': filepaths,
        'label': labels
    })

    print(f"Found {len(files_df)} audio files from {len(files_df['label'].unique())} classes.")
    print("\nClass distribution:")
    print(files_df['label'].value_counts())

    # Generate embeddings for each file
    print("\nGenerating embeddings... (This may take a while)")
    all_embeddings = []
    for filepath in tqdm(files_df['filepath']):
        try:
            waveform = load_wav(filepath)
            embedding = yamnet_embed(waveform)
            all_embeddings.append(embedding.numpy())
        except Exception as e:
            print(f"Error processing {filepath}: {e}")
            all_embeddings.append(None) # Add a placeholder for the failed file

    files_df['embedding'] = all_embeddings

    # Remove rows where embedding extraction failed
    files_df.dropna(subset=['embedding'], inplace=True)

    print("\nDone. Here's a preview of the DataFrame with embeddings:")
    display(files_df.head())

Found 3128 audio files from 2 classes.

Class distribution:
label
scream        1583
non_scream    1545
Name: count, dtype: int64

Generating embeddings... (This may take a while)


  0%|          | 0/3128 [00:00<?, ?it/s]


Done. Here's a preview of the DataFrame with embeddings:


Unnamed: 0,filepath,label,embedding
0,/content/drive/MyDrive/scream_dataset/Converte...,non_scream,"[0.6920425, 0.14002092, 0.33974504, 0.12877707..."
1,/content/drive/MyDrive/scream_dataset/Converte...,non_scream,"[0.6920425, 0.13090713, 0.37201968, 0.02745604..."
2,/content/drive/MyDrive/scream_dataset/Converte...,non_scream,"[0.6920425, 0.13090713, 0.43269512, 0.43751755..."
3,/content/drive/MyDrive/scream_dataset/Converte...,non_scream,"[0.6920425, 0.1420183, 0.44168654, 0.17926262,..."
4,/content/drive/MyDrive/scream_dataset/Converte...,non_scream,"[0.6920425, 0.13711627, 0.36012766, 0.15117149..."


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Prepare Data for Modeling ---

# 1. Extract features (X) and labels (y) from the DataFrame
X = np.array(files_df['embedding'].tolist())
y = files_df['label']

# 2. Encode string labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Print the mapping from string to integer
print("Label mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"- {class_name}: {i}")


# 3. Create the initial train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=TEST_SPLIT, random_state=42, stratify=y_encoded
)

# 4. Create the validation split from the training data
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VAL_SPLIT/(1-TEST_SPLIT), random_state=42, stratify=y_train
)

print(f"\nDataset splits:")
print(f"- Training examples:   {len(X_train)}")
print(f"- Validation examples: {len(X_val)}")
print(f"- Test examples:       {len(X_test)}")


# --- Create TensorFlow Datasets ---

# 5. Oversample the training data to handle class imbalance
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print(f"\nAfter oversampling, training examples: {len(X_train_res)}")


# 6. Create tf.data.Dataset objects
train_ds = tf.data.Dataset.from_tensor_slices((X_train_res, y_train_res)).shuffle(buffer_size=len(X_train_res)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

print("\nTensorFlow Datasets created successfully.")

Label mapping:
- non_scream: 0
- scream: 1

Dataset splits:
- Training examples:   2188
- Validation examples: 470
- Test examples:       470

After oversampling, training examples: 2214

TensorFlow Datasets created successfully.


In [8]:
inputs  = tf.keras.Input(shape=(1024,), name='yamnet_embedding')
x       = tf.keras.layers.Dense(512, activation='relu')(inputs)
x       = tf.keras.layers.Dropout(0.4)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs, outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()


In [9]:
callbacks=[
    tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('scream_detector.h5', save_best_only=True)
]

history = model.fit(train_ds,
                    epochs=EPOCHS,
                    validation_data=val_ds,
                    callbacks=callbacks)


Epoch 1/20
[1m69/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.6819 - loss: 0.5981



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.6844 - loss: 0.5956 - val_accuracy: 0.9213 - val_loss: 0.3387
Epoch 2/20
[1m69/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.8892 - loss: 0.3415



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8895 - loss: 0.3410 - val_accuracy: 0.9468 - val_loss: 0.2465
Epoch 3/20
[1m66/70[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 8ms/step - accuracy: 0.9116 - loss: 0.2658



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9118 - loss: 0.2649 - val_accuracy: 0.9511 - val_loss: 0.2050
Epoch 4/20
[1m67/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.9119 - loss: 0.2329



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9124 - loss: 0.2326 - val_accuracy: 0.9553 - val_loss: 0.1838
Epoch 5/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9243 - loss: 0.2041 - val_accuracy: 0.9106 - val_loss: 0.1942
Epoch 6/20
[1m68/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.9341 - loss: 0.1901



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9343 - loss: 0.1899 - val_accuracy: 0.9340 - val_loss: 0.1723
Epoch 7/20
[1m68/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.9431 - loss: 0.1691



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9428 - loss: 0.1693 - val_accuracy: 0.9489 - val_loss: 0.1532
Epoch 8/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9434 - loss: 0.1591 - val_accuracy: 0.8979 - val_loss: 0.2038
Epoch 9/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9518 - loss: 0.1542



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9517 - loss: 0.1541 - val_accuracy: 0.9447 - val_loss: 0.1522
Epoch 10/20
[1m68/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.9548 - loss: 0.1513



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9546 - loss: 0.1513 - val_accuracy: 0.9553 - val_loss: 0.1304
Epoch 11/20
[1m68/70[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.9548 - loss: 0.1500



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9547 - loss: 0.1498 - val_accuracy: 0.9574 - val_loss: 0.1283
Epoch 12/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9492 - loss: 0.1503 - val_accuracy: 0.9404 - val_loss: 0.1389
Epoch 13/20
[1m66/70[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 11ms/step - accuracy: 0.9595 - loss: 0.1236



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9592 - loss: 0.1237 - val_accuracy: 0.9596 - val_loss: 0.1187
Epoch 14/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9516 - loss: 0.1317 - val_accuracy: 0.9511 - val_loss: 0.1275
Epoch 15/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9523 - loss: 0.1198 - val_accuracy: 0.9553 - val_loss: 0.1234
Epoch 16/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9493 - loss: 0.1489



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9494 - loss: 0.1486 - val_accuracy: 0.9574 - val_loss: 0.1095
Epoch 17/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9627 - loss: 0.1097 - val_accuracy: 0.9489 - val_loss: 0.1212
Epoch 18/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9527 - loss: 0.1243



[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9527 - loss: 0.1243 - val_accuracy: 0.9596 - val_loss: 0.1041
Epoch 19/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9633 - loss: 0.1111 - val_accuracy: 0.9426 - val_loss: 0.1223
Epoch 20/20
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9644 - loss: 0.1101 - val_accuracy: 0.9447 - val_loss: 0.1255


In [10]:
y_true, y_pred = [], []
for X,y in test_ds:
    y_true.extend(y.numpy())
    y_pred.extend((model.predict(X) > 0.5).astype(int).ravel())

print(classification_report(y_true, y_pred, target_names=['non-scream','scream']))
print(confusion_matrix(y_true, y_pred))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37

In [12]:
# Convert the Keras model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the TFLite model to a file
with open('scream_detector.tflite', 'wb') as f:
    f.write(tflite_model)

print("Model converted and saved as 'scream_detector.tflite'")

# You can verify the file size
import os
model_size = os.path.getsize('scream_detector.tflite')
print(f"TFLite model size: {model_size / 1024:.2f} KB")

Saved artifact at '/tmp/tmpetkmhd9j'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 1024), dtype=tf.float32, name='yamnet_embedding')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  139509727612560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139509727612944: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139509727615440: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139509727612368: TensorSpec(shape=(), dtype=tf.resource, name=None)
Model converted and saved as 'scream_detector.tflite'
TFLite model size: 2053.66 KB


In [13]:
# A generator function that provides a sample of the training data
def representative_data_gen():
  # Use a sample of 100 from the training dataset
  for input_value, _ in train_ds.take(100):
    yield [input_value]

# Create a new converter from the original Keras model
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Enable optimizations, which will trigger quantization
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Provide the representative dataset to the converter
converter.representative_dataset = representative_data_gen

# To ensure full integer quantization for maximum compatibility
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

# Convert the model
tflite_model_quant = converter.convert()

# Save the quantized model to a new file
quantized_model_path = 'scream_detector_quantized.tflite'
with open(quantized_model_path, 'wb') as f:
  f.write(tflite_model_quant)

print(f"Quantized model saved to: {quantized_model_path}")

# Compare the file sizes
original_size = os.path.getsize('scream_detector.tflite')
quantized_size = os.path.getsize(quantized_model_path)

print(f"\nOriginal TFLite model size:    {original_size / 1024:.2f} KB")
print(f"Quantized TFLite model size:   {quantized_size / 1024:.2f} KB")
print(f"Size reduction of approximately: {original_size / quantized_size:.2f}x")

Saved artifact at '/tmp/tmp2kii00lt'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 1024), dtype=tf.float32, name='yamnet_embedding')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  139509727612560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139509727612944: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139509727615440: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139509727612368: TensorSpec(shape=(), dtype=tf.resource, name=None)




Quantized model saved to: scream_detector_quantized.tflite

Original TFLite model size:    2053.66 KB
Quantized TFLite model size:   528.44 KB
Size reduction of approximately: 3.89x
