In [None]:
!pip install noisereduce

# Table of Contents
1. Preprocessing the dataset
2. Training the model

# Preprocessing the dataset

**Reducing noise from the audio files**

In [None]:
import os
import librosa
import noisereduce as nr
import soundfile as sf
from tqdm import tqdm  #for progress bar

# Source dataset and the destination tree for the cleaned copies
input_dir = "/kaggle/input/ravdess-emotional-speech-audio"
output_dir = "/kaggle/working/denoised_ravdess_audio"

os.makedirs(output_dir, exist_ok=True)

# Walk the dataset and mirror its folder layout under output_dir
for root, dirs, files in os.walk(input_dir):
    relative_path = os.path.relpath(root, input_dir)
    output_folder = os.path.join(output_dir, relative_path)
    os.makedirs(output_folder, exist_ok=True)

    for file in tqdm(files, desc=f"Processing {relative_path}"):
        # Only .wav files are denoised; anything else is ignored
        if not file.endswith(".wav"):
            continue

        input_file_path = os.path.join(root, file)
        output_file_path = os.path.join(output_folder, file)

        try:
            # sr=None: load at the file's own sampling rate
            y, sr = librosa.load(input_file_path, sr=None)

            # The first half second of the clip is used as the noise sample
            noise_profile = y[:int(sr * 0.5)]

            y_denoised = nr.reduce_noise(y=y, sr=sr, y_noise=noise_profile)

            # Write the cleaned signal under the same file name and rate
            sf.write(output_file_path, y_denoised, sr)
        except Exception as e:
            # Report the failure and carry on with the remaining files
            print(f"Error processing {input_file_path}: {e}")

print(f"✅ Noise reduction complete! Denoised files are saved in: {output_dir}")


**Creating greyscale mel spectrograms of the denoised audio files, applying noise masking, and then resizing each spectrogram to 224×224 so it can be sent as input to the ResNet50 model**

In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image


# Input: the denoised audio tree written by the noise-reduction step above.
input_dir = "/kaggle/working/denoised_ravdess_audio"  
# Output: root folder for the spectrogram images (one sub-folder per actor folder).
output_dir = "/kaggle/working/spectrograms"


# Create the output root up front; exist_ok makes re-running the cell safe.
os.makedirs(output_dir, exist_ok=True)

# Function to apply noise masking
def apply_mask(spectrogram_db):
    """Floor the quietest bins of a spectrogram to its global minimum.

    Every value strictly below the 5th percentile of ``spectrogram_db`` is
    replaced by the array's minimum value; all other values are kept.

    Args:
        spectrogram_db: Array-like of spectrogram values (any shape).

    Returns:
        A new ``numpy.ndarray`` with the same shape as the input. The input
        array is left untouched, so callers can keep using the unmasked data.
        An empty input yields an empty copy.
    """
    spectrogram_db = np.asarray(spectrogram_db)
    if spectrogram_db.size == 0:
        # Percentile/min are undefined for empty arrays; nothing to mask.
        return spectrogram_db.copy()

    threshold = np.percentile(spectrogram_db, 5)
    floor_value = np.min(spectrogram_db)
    # np.where builds a fresh array instead of writing into the caller's data.
    return np.where(spectrogram_db < threshold, floor_value, spectrogram_db)

print("Started processing audio files...")

# Scratch image that each clip is rendered to before being resized.
temp_image_path = "temp_spectrogram.png"

# Iterating through each actor's folder
for actor_folder in os.listdir(input_dir):
    actor_path = os.path.join(input_dir, actor_folder)
    if not os.path.isdir(actor_path):
        continue

    # Creating corresponding output directory
    output_actor_dir = os.path.join(output_dir, actor_folder)
    os.makedirs(output_actor_dir, exist_ok=True)

    # Processing each audio file in the actor's folder
    for file in os.listdir(actor_path):
        if not file.endswith(".wav"):
            continue

        file_path = os.path.join(actor_path, file)
        try:
            y, sr = librosa.load(file_path, sr=None)

            # Creating a Mel spectrogram (power) and converting it to dB
            spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
            spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

            # Applying noise masking
            spectrogram_db = apply_mask(spectrogram_db)

            # Plotting the spectrogram without axes. The figure is closed in
            # `finally` so a failed render does not leave figures accumulating
            # in memory across the loop.
            fig, ax = plt.subplots(figsize=(10, 10))
            try:
                ax.set_axis_off()
                # Draw on the explicit axes rather than the implicit current one.
                librosa.display.specshow(
                    spectrogram_db, sr=sr, x_axis=None, y_axis=None,
                    cmap='gray_r', fmax=8000, ax=ax,
                )
                fig.savefig(temp_image_path, bbox_inches='tight', pad_inches=0, dpi=100)
            finally:
                plt.close(fig)

            # Resizing for ResNet50 (224x224). The context manager releases the
            # handle on the scratch file before the next clip overwrites it.
            with Image.open(temp_image_path) as rendered:
                img = rendered.convert("L")  # Converting to greyscale
                img = img.resize((224, 224), Image.Resampling.LANCZOS)

            # Saving the spectrogram image in the corresponding output folder
            output_path = os.path.join(output_actor_dir, f"{os.path.splitext(file)[0]}_spectrogram.png")
            img.save(output_path)

            print(f"Spectrogram saved: {output_path}")
        except Exception as e:
            # Report the failure and continue with the remaining clips
            print(f"Error processing {file_path}: {e}")

print("✅ All spectrograms have been processed and saved.")


In [None]:
import shutil

folder_to_zip = "/kaggle/working/spectrograms"
output_zip = "/kaggle/working/spectrograms.zip"

# make_archive adds the archive extension itself, so it is given the target
# path with ".zip" stripped.
archive_base = output_zip.replace(".zip", "")
shutil.make_archive(base_name=archive_base, format='zip', root_dir=folder_to_zip)

print("ZIP file created:", output_zip)

**Making positive and negative pairs: a positive pair contains two spectrograms of the same actor, while a negative pair contains spectrograms of two different actors. This is done so that the model learns the different ways in which the same actor can say the same dialogue.**

In [None]:
import os
import random
import itertools


# Root folder holding one sub-folder of spectrogram PNGs per actor.
# NOTE(review): this points at a Kaggle input dataset, not at the
# /kaggle/working/spectrograms folder written earlier in this notebook —
# confirm it contains the same images.
input_dir = "/kaggle/input/spectogramsss"
# Text file that will receive one "path1,path2,label" row per pair.
output_file = "/kaggle/working/pairs.txt"

# Every sub-directory of input_dir is treated as one actor.
actors = [actor for actor in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, actor))]

# Function to create pairs
def create_pairs(actors, num_positive, num_negative):
    """Build same-actor (positive) and cross-actor (negative) spectrogram pairs.

    Spectrograms are the ``.png`` files found in ``input_dir/<actor>`` where
    ``input_dir`` is the module-level root folder.

    Args:
        actors: Names of the actor sub-folders under ``input_dir``.
        num_positive: Maximum number of positive pairs to return.
        num_negative: Maximum number of negative pairs to return.

    Returns:
        ``(positive_pairs, negative_pairs)``: two lists of ``(path_a, path_b)``
        tuples. Positive pairs are drawn evenly across actors first
        (``num_positive // len(actors)`` each); if that leaves fewer than
        ``num_positive`` pairs, the shortfall is filled from the remaining
        same-actor pairs. Fewer pairs are returned only when not enough exist.
    """
    # List each actor folder once; the negative-pair loop below would
    # otherwise re-read every folder for every actor combination.
    spectrograms_by_actor = {}
    for actor in actors:
        actor_path = os.path.join(input_dir, actor)
        spectrograms_by_actor[actor] = [
            os.path.join(actor_path, file)
            for file in os.listdir(actor_path)
            if file.endswith(".png")
        ]

    # Creating positive pairs
    positive_pairs = []
    spare_pairs = []  # same-actor pairs beyond the per-actor quota
    per_actor_quota = num_positive // len(actors) if actors else 0
    for actor in actors:
        spectrograms = spectrograms_by_actor[actor]
        if len(spectrograms) > 1:
            # Generating all possible pairs of spectrograms for the same actor
            actor_pairs = list(itertools.combinations(spectrograms, 2))
            random.shuffle(actor_pairs)
            positive_pairs.extend(actor_pairs[:per_actor_quota])  # Limit pairs per actor
            spare_pairs.extend(actor_pairs[per_actor_quota:])

    # Integer division (and actors with too few files) can leave the total
    # short of num_positive; top it up from the unused same-actor pairs.
    shortfall = num_positive - len(positive_pairs)
    if shortfall > 0 and spare_pairs:
        random.shuffle(spare_pairs)
        positive_pairs.extend(spare_pairs[:shortfall])

    # Creating negative pairs
    negative_pairs = []
    for actor1, actor2 in itertools.combinations(actors, 2):
        spectrograms1 = spectrograms_by_actor[actor1]
        spectrograms2 = spectrograms_by_actor[actor2]
        if spectrograms1 and spectrograms2:
            # Creating pairs between spectrograms of two different actors
            negative_pairs.extend(itertools.product(spectrograms1, spectrograms2))

    random.shuffle(negative_pairs)
    return positive_pairs[:num_positive], negative_pairs[:num_negative]

# Requesting up to 20,000 same-actor and 20,000 cross-actor pairs
print("Generating pairs...")
positive_pairs, negative_pairs = create_pairs(actors, 20000, 20000)

# Each row is "path1,path2,label": label 1 marks a positive pair, 0 a negative one
print("Saving pairs to file...")
with open(output_file, "w") as f:
    f.writelines(f"{first},{second},1\n" for first, second in positive_pairs)
    f.writelines(f"{first},{second},0\n" for first, second in negative_pairs)

print(f"✅ Pairs saved to {output_file}.")


**Randomly shuffling the contents of the pairs.txt file**

In [None]:
import random

file_path = "/kaggle/working/pairs.txt"

# Pull every row into memory (each entry keeps its trailing newline)
with open(file_path, "r") as f:
    lines = list(f)

# Randomise the row order in place
random.shuffle(lines)

# Overwrite the file with the rows in their new order
with open(file_path, "w") as f:
    f.write("".join(lines))

print("File shuffled successfully!")


# Training the model

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import CSVLogger, Callback
import numpy as np

# Enable on-demand GPU memory growth so TensorFlow does not reserve all GPU
# memory up front. This does not force GPU usage; it only configures the GPUs
# that TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("✅ GPU is available and will be used.")
    except RuntimeError as e:
        # Report the configuration error instead of aborting the notebook.
        print(e)
else:
    # Make the CPU fallback visible instead of staying silent.
    print("⚠️ No GPU detected; training will run on CPU.")


batch_size = 32
# Assumes pairs.txt holds the 20,000 positive + 20,000 negative pairs requested
# when the pairs were generated.
total_samples = 40000
steps_per_epoch = total_samples // batch_size
# One fifth of the sample count is used to size each validation pass.
validation_samples = total_samples // 5  
validation_steps = validation_samples // batch_size  


def preprocess_image(image_path):
    """Load a PNG from disk as a 224x224, 3-channel tensor scaled by 1/255."""
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    return resized / 255.0

def pair_generator(txt_file, batch_size=32, repeat=False):
    """
    Generator to yield batches of image pairs and labels from a text file.

    Each non-empty line of ``txt_file`` must read ``path1,path2,label``.
    Lines are shuffled on every pass over the file.

    Args:
        txt_file: Path of the pairs file.
        batch_size: Maximum number of lines consumed per batch.
        repeat: When True, keep cycling over the file; when False, stop
            after one pass.

    Yields:
        ``((images1, images2), labels)`` where the three tensors always have
        the same leading dimension. Malformed lines are reported and skipped
        as a whole, so a batch can be smaller than ``batch_size``.
    """
    while True:
        # Read everything first so the file is not held open while the
        # generator is suspended between batches.
        with open(txt_file, "r") as f:
            lines = f.readlines()
        np.random.shuffle(lines)  # Shuffling data before each epoch

        yielded_any = False
        for i in range(0, len(lines), batch_size):
            images1, images2, labels = [], [], []
            for line in lines[i:i + batch_size]:
                line = line.strip()
                if not line:
                    continue  # Skipping empty lines
                try:
                    file1, file2, label = line.split(",")
                    # Parse the label before loading images so a bad label
                    # costs no image decoding.
                    label_value = float(label)
                    image1 = preprocess_image(file1)
                    image2 = preprocess_image(file2)
                except ValueError:
                    print(f"Skipping invalid line: {line}")
                    continue
                # Append only once the whole row is valid, keeping the three
                # lists aligned.
                images1.append(image1)
                images2.append(image2)
                labels.append(label_value)

            if images1:
                yielded_any = True
                yield (tf.convert_to_tensor(images1), tf.convert_to_tensor(images2)), tf.convert_to_tensor(labels, dtype=tf.float32)

        # Stop after one pass unless repeating; also stop when a pass produced
        # nothing, otherwise repeat=True would loop forever without yielding.
        if not repeat or not yielded_any:
            break


txt_file_path = "/kaggle/working/pairs.txt"

# Feeding the same pairs file to both datasets would make the validation
# metrics a measurement on training data. Split the rows once into two
# disjoint files instead: every 5th non-empty row goes to validation (~20%),
# the rest to training. A stride split is deterministic across runs.
# NOTE: the split is at pair level; an individual spectrogram may still occur
# in pairs on both sides.
train_txt_path = "/kaggle/working/pairs_train.txt"
val_txt_path = "/kaggle/working/pairs_val.txt"

with open(txt_file_path, "r") as f:
    pair_rows = [row for row in f if row.strip()]

with open(train_txt_path, "w") as train_f, open(val_txt_path, "w") as val_f:
    for row_index, row in enumerate(pair_rows):
        target = val_f if row_index % 5 == 0 else train_f
        # Normalise the line ending so rows never run together.
        target.write(row.rstrip("\n") + "\n")

# Both datasets yield ((images1, images2), labels) batches of variable size.
pair_signature = (
    (tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32),
     tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32)),
    tf.TensorSpec(shape=(None,), dtype=tf.float32)
)

# Training data cycles indefinitely; one epoch is bounded by steps_per_epoch.
train_dataset = tf.data.Dataset.from_generator(
    lambda: pair_generator(train_txt_path, batch_size=batch_size, repeat=True),
    output_signature=pair_signature
).prefetch(tf.data.AUTOTUNE)

# Validation data makes a single pass over the held-out rows per iteration.
val_dataset = tf.data.Dataset.from_generator(
    lambda: pair_generator(val_txt_path, batch_size=batch_size, repeat=False),
    output_signature=pair_signature
).prefetch(tf.data.AUTOTUNE)

# Defining the Siamese ResNet50 model
# The backbone is built without its classification head (include_top=False)
# and its weights are loaded from a local .h5 file.
base_model = ResNet50(
    weights="/kaggle/input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5",
    include_top=False,
    input_shape=(224, 224, 3)
)

# Feature extraction for each image in the pair
input1 = tf.keras.Input(shape=(224, 224, 3))
input2 = tf.keras.Input(shape=(224, 224, 3))

# The same base_model instance processes both inputs, so both branches share
# one set of weights. It is called with training=False on each branch.
x1 = base_model(input1, training=False)
x2 = base_model(input2, training=False)

# Global average pooling
x1 = GlobalAveragePooling2D()(x1)
x2 = GlobalAveragePooling2D()(x2)

# Concatenating the feature vectors
merged = tf.keras.layers.Concatenate()([x1, x2])

# Fully connected layers: a 128-unit ReLU layer followed by a single sigmoid
# unit that outputs the pair score.
dense1 = Dense(128, activation="relu")(merged)
output = Dense(1, activation="sigmoid")(dense1)

siamese_model = Model(inputs=[input1, input2], outputs=output)

# Compiling the model
siamese_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Custom Callback to Print Metrics After Every Epoch
class EpochLogger(Callback):
    """Print a short metric summary and a few sample predictions per epoch.

    Sample predictions are made on the first batch of the module-level
    ``val_dataset`` using the model this callback is attached to.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print train/validation metrics and five sample predictions.

        Args:
            epoch: Zero-based epoch index (printed one-based).
            logs: Metric dict supplied by Keras; may be None or lack the
                validation keys, in which case ``nan`` is printed.
        """
        logs = logs or {}
        missing = float("nan")  # placeholder for metrics absent from logs

        print(f"\nEpoch {epoch + 1} Summary:")
        print(f"  Loss: {logs.get('loss', missing):.4f}, Accuracy: {logs.get('accuracy', missing):.4f}")
        print(f"  Validation Loss: {logs.get('val_loss', missing):.4f}, Validation Accuracy: {logs.get('val_accuracy', missing):.4f}")

        for (img1, img2), labels in val_dataset.take(1):
            # self.model is the model being trained; verbose=0 keeps the
            # predict progress bar out of the epoch summary.
            predictions = self.model.predict([img1, img2], verbose=0)
            print(f"  Sample True Label: {labels.numpy()[:5]}")
            print(f"  Sample Predicted: {predictions[:5].flatten()}")

# Callbacks: append per-epoch metrics to a CSV file and print an epoch summary
csv_logger = CSVLogger('/kaggle/working/training_log.csv', append=True)
epoch_logger = EpochLogger()

# Fit arguments gathered in one place; step counts bound each epoch
fit_options = dict(
    validation_data=val_dataset,
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[csv_logger, epoch_logger],
)

# Run training under the first GPU device scope
with tf.device('/GPU:0'):
    history = siamese_model.fit(train_dataset, **fit_options)


# Persist the trained model to the working directory
siamese_model.save("/kaggle/working/siamese_model_v1.h5")
print("✅ Model training complete and saved!")
