In [1]:
#-------------------------------------------------------------------------------------JUPYTER NOTEBOOK SETTINGS-------------------------------------------------------------------------------------
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# NOTE: from here on the name `display` refers to the `IPython.display` *module*
# (it shadows the function imported above), e.g. `display.display(obj)`.
import IPython.display as display

  from IPython.core.display import display, HTML


In [None]:
import os
import gc
import re
import librosa
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Layer, Input, Conv1D, MaxPooling1D, Dropout, Flatten, Dense, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping 
from tensorflow.keras import mixed_precision

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Ask TensorFlow for the visible GPUs once, report how many there are,
# and leave the device list as the cell's displayed value.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
gpu_devices

In [None]:
# Set up mixed precision policy
# The policy is global; in this notebook it is set before the model-building cell
# further down, so the layers created there are built under 'mixed_float16'.
mixed_precision.set_global_policy('mixed_float16')

### Data Loading and Processing

In [None]:
def calculate_frames_directory(directory):
    """Return the largest MFCC frame count of any ``.wav`` file under ``directory``.

    ``directory`` is scanned one level deep: every sub-directory is treated as a
    label, and every file in it whose name ends in ``.wav`` is loaded at 16 kHz
    and converted to MFCCs with the same settings the data loader uses.

    Args:
        directory: Path of the dataset root (one sub-directory per label).

    Returns:
        The maximum ``mfccs.shape[1]`` seen, or ``0`` when no ``.wav`` file exists.
    """
    max_length = 0
    labels = [label for label in os.listdir(directory) if os.path.isdir(os.path.join(directory, label))]

    # Finding the maximum number of frames among all samples with a progress bar
    for label in tqdm(labels, desc="Finding Max Length"):
        label_path = os.path.join(directory, label)
        wav_files = [os.path.join(label_path, file) for file in os.listdir(label_path) if file.endswith('.wav')]

        # Process each file in the label directory
        for wav_file in tqdm(wav_files, desc=f"Processing {label}", leave=False):
            signal, sr = librosa.load(wav_file, sr=16000)
            mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13, n_fft=256, hop_length=160, n_mels=32, fmin=0, fmax=8000)
            max_length = max(max_length, mfccs.shape[1])

    return max_length

# Dataset root scanned by calculate_frames_directory (sub-folders are labels, files are .wav clips).
directory = "/Users/ciprian/Desktop/Projects/Smart Plant Pot/Audio/Voice Recognition/Prototype 3"
# The function requires the directory to scan; calling it without arguments raises a TypeError.
max_window_length = calculate_frames_directory(directory)

In [None]:
# Augmentation functions
# Masking / warping primitives; the strength is set per call via width, num_masks and warping_factor
def time_masking(mfccs, width=10, num_masks=2):
    """Zero out randomly placed bands of frames (axis 1) in a copy of ``mfccs``.

    Args:
        mfccs: 2-D array; columns are the units being masked.
        width: Number of consecutive columns zeroed per mask.
        num_masks: How many masks to draw (they may overlap).

    Returns:
        A new array; the input is left untouched.
    """
    augmented = np.array(mfccs, copy=True)
    # Exclusive upper bound for a mask's first column. Clamped to 1 so that a
    # start of 0 can always be drawn, even when `width` exceeds the frame count.
    start_limit = max(1, augmented.shape[1] - width)
    for _ in range(num_masks):
        first = np.random.randint(0, start_limit)
        last = first + width
        augmented[:, first:last] = 0
    return augmented

def frequency_masking(mfccs, width=5, num_masks=2):
    """Zero out randomly placed bands of coefficients (axis 0) in a copy of ``mfccs``.

    Args:
        mfccs: 2-D array; rows are the units being masked.
        width: Number of consecutive rows zeroed per mask.
        num_masks: How many masks to draw (they may overlap).

    Returns:
        A new array; the input is left untouched.
    """
    augmented = np.array(mfccs, copy=True)
    # Exclusive upper bound for a mask's first row. Clamped to 1 so that a
    # start of 0 can always be drawn, even when `width` exceeds the row count.
    start_limit = max(1, augmented.shape[0] - width)
    for _ in range(num_masks):
        first = np.random.randint(0, start_limit)
        last = first + width
        augmented[first:last, :] = 0
    return augmented

def time_warping(signal, sr, warping_factor=0.8):
    """Stretch the leading part of ``signal`` back to the signal's full length.

    ``n_steps = int(signal.size * (1 - warping_factor))`` trailing samples are
    dropped and the remaining samples are linearly interpolated onto the
    original number of sample positions, so the output has the same length as
    the input.

    Args:
        signal: 1-D NumPy array of samples.
        sr: Sample rate; accepted for signature symmetry but not used here.
        warping_factor: Fraction of the signal that is kept before stretching.

    Returns:
        The warped array, or ``signal`` itself (same object) when fewer than one
        or all samples would be dropped, e.g. for ``warping_factor >= 1``.
    """
    total = signal.size
    n_steps = int(total * (1 - warping_factor))
    if not (1 <= n_steps < total):
        return signal  # Return original signal if warping is not feasible
    kept = total - n_steps
    source_positions = np.linspace(0, total, num=kept)
    return np.interp(np.arange(total), source_positions, signal[:kept])

def _warp_and_reextract(signal, sr, mfccs, low, high, mfcc_params):
    """Time-warp ``signal`` and, if it actually changed, recompute the MFCCs from it."""
    warped = time_warping(signal, sr, warping_factor=np.random.uniform(low, high))
    if warped is signal:
        # time_warping returns its input untouched when the drawn factor is not usable.
        return signal, mfccs
    # Without this step the warp would only alter the waveform, while the
    # features handed back to the caller would still describe the original audio.
    features = librosa.feature.mfcc(y=np.asarray(warped, dtype=signal.dtype), sr=sr, **mfcc_params)
    return warped, features


def apply_augmentation(signal, sr, mfccs, intensity='medium', mfcc_params=None):
    """Apply specified augmentation level to signal and MFCCs.

    Args:
        signal: 1-D waveform the MFCCs were extracted from.
        sr: Sample rate of ``signal``.
        mfccs: 2-D MFCC array (coefficients x frames) computed from ``signal``.
        intensity: ``'high'`` (always warp, wide masks), ``'medium'`` (warp with
            probability 0.5, medium masks) or ``'low'`` (one narrow time *or*
            frequency mask). Any other value leaves both inputs unchanged.
        mfcc_params: Keyword arguments for ``librosa.feature.mfcc`` used when the
            features have to be re-extracted after a warp. Defaults to the
            settings used by the loaders in this notebook, with ``n_mfcc`` taken
            from ``mfccs.shape[0]``.

    Returns:
        ``(signal, mfccs)`` after augmentation. When the waveform was warped, the
        returned MFCCs are computed from the warped waveform before masking.
    """
    if mfcc_params is None:
        # Must mirror the extraction settings used when `mfccs` was first computed.
        mfcc_params = {'n_mfcc': mfccs.shape[0], 'n_fft': 256, 'hop_length': 160,
                       'n_mels': 32, 'fmin': 0, 'fmax': 8000}

    if intensity == 'high':
        signal, mfccs = _warp_and_reextract(signal, sr, mfccs, 0.7, 1.3, mfcc_params)
        mfccs = time_masking(mfccs, width=15, num_masks=2)
        mfccs = frequency_masking(mfccs, width=10, num_masks=2)
    elif intensity == 'medium':
        if np.random.rand() < 0.5:
            signal, mfccs = _warp_and_reextract(signal, sr, mfccs, 0.8, 1.2, mfcc_params)
        mfccs = time_masking(mfccs, width=10, num_masks=2)
        mfccs = frequency_masking(mfccs, width=5, num_masks=2)
    elif intensity == 'low':
        choice = np.random.choice(['time', 'freq'])
        if choice == 'time':
            mfccs = time_masking(mfccs, width=8, num_masks=1)
        else:
            mfccs = frequency_masking(mfccs, width=4, num_masks=1)
    return signal, mfccs

def load_and_augment_data(directory, augment=True, max_length=332):
    """Load every ``.wav`` under ``directory`` as a fixed-size MFCC matrix.

    Each sub-directory of ``directory`` is a label. Files are loaded at 16 kHz,
    converted to 13 MFCCs and forced to exactly ``max_length`` frames: shorter
    clips are zero-padded at the end, longer clips are truncated (a negative pad
    width would otherwise make ``np.pad`` raise).

    Labels and files are visited in sorted order so that, for a fixed random
    seed, the returned arrays and any seeded split of them are reproducible.

    Args:
        directory: Dataset root (one sub-directory per label).
        augment: When true, each clip independently gets one of the levels
            ``'none'``, ``'low'``, ``'medium'``, ``'high'`` with equal probability.
        max_length: Number of frames every returned sample has.

    Returns:
        ``(x, y, genders)``: ``x`` is a float32 array of shape
        ``(n_samples, 13, max_length)``, ``y`` the label names, and ``genders``
        is ``-1`` for the labels ``'noise'``/``'silence'``, ``1`` when the file
        path contains ``'pitched'`` (case-insensitive) and ``0`` otherwise.
    """
    labels = sorted(label for label in os.listdir(directory) if os.path.isdir(os.path.join(directory, label)))
    x, y, genders = [], [], []

    for label in tqdm(labels, desc="Loading and Padding Data"):
        label_path = os.path.join(directory, label)
        wav_files = sorted(os.path.join(label_path, file) for file in os.listdir(label_path) if file.endswith('.wav'))

        for wav_file in tqdm(wav_files, desc=f"Padding {label}", leave=False):
            signal, sr = librosa.load(wav_file, sr=16000)
            mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13, n_fft=256, hop_length=160, n_mels=32, fmin=0, fmax=8000)

            if augment:
                intensity = np.random.choice(['none', 'low', 'medium', 'high'], p=[0.25, 0.25, 0.25, 0.25])
                if intensity != 'none':
                    # Only the features are kept; the augmented waveform is not needed further.
                    _, mfccs = apply_augmentation(signal, sr, mfccs, intensity)

            # Truncate first so the pad width below can never become negative.
            mfccs = mfccs[:, :max_length]
            pad_width = max_length - mfccs.shape[1]
            mfccs_padded = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            x.append(mfccs_padded)
            y.append(label)

            if label in ['noise', 'silence']:
                genders.append(-1)  # No gender
            elif 'pitched' in wav_file.lower():
                genders.append(1)  # Female
            else:
                genders.append(0)  # Male

    return np.array(x, dtype=np.float32), np.array(y), np.array(genders)

# Load data and split
# NOTE(review): load_and_augment_data is called with its default augment=True, so
# augmentation happens before the split below and the validation/test subsets
# also contain augmented samples — confirm this is intended.
directory = "/Users/ciprian/Desktop/Projects/Smart Plant Pot/Audio/Voice Recognition/Prototype 3"
x, y, genders = load_and_augment_data(directory)
# First split: 70 % train / 30 % held out, stratified on the label.
x_train, x_temp, y_train, y_temp, genders_train, genders_temp = train_test_split(x, y, genders, test_size=0.3, random_state=42, stratify=y)
# Second split: the held-out 30 % is halved into validation and test, again stratified.
x_val, x_test, y_val, y_test, genders_val, genders_test = train_test_split(x_temp, y_temp, genders_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [None]:
# Quick sanity report on the split. x_train is assumed to hold per-sample MFCC matrices.
if len(x_train) <= 1:
    print("Not enough samples in x_train to display features.\n")
else:
    print(f"The number of features extracted from one of the samples is: {len(x_train[1])}")
    print(f"Number of frames (windows) in the first sample: {x_train[0].shape[1]}\n")

# Sample and label counts must match within each subset.
print(f"The number of samples for training is {len(x_train)}, with the number of labels {len(y_train)}")
print(f"The number of samples for validation is {len(x_val)}, with the number of labels {len(y_val)}")
print(f"The number of samples for testing is {len(x_test)}, with the number of labels {len(y_test)}")
print(f"The data for gender validation is {len(genders_val)}, with the testing data {len(genders_test)}, with training data {len(genders_train)}")

In [None]:
# Preview table: for (at most) the first 25 samples, the first 25 values of the
# flattened MFCC matrix. flatten() is row-major, so for a (coefficients, frames)
# matrix these values all come from the first row.
n_preview = min(25, len(x))  # avoids an IndexError on datasets with fewer than 25 samples
x_flattened_for_display = [x[i].flatten()[:25] for i in range(n_preview)]

# Create a DataFrame with the flattened MFCC features and name its columns
df = pd.DataFrame(x_flattened_for_display)
df.columns = [f'Feature_{i+1}' for i in range(df.shape[1])]

# Add the labels as the last column
df['Label'] = y[:n_preview]

# Display the DataFrame in Jupyter Notebook
df

In [None]:
# joblib does not create missing parent folders; make sure the target exists first.
os.makedirs('saved_data', exist_ok=True)

# Persist each split (features + labels) and the three gender arrays.
dump((x_train, y_train), 'saved_data/train_data.joblib')
dump((x_val, y_val), 'saved_data/val_data.joblib')
dump((x_test, y_test), 'saved_data/test_data.joblib')
dump((genders_train, genders_val, genders_test), 'saved_data/genders_data.joblib')
print("All extracted features from the samples have been saved properly!")

### CNN Setup and Training

In [None]:
# Reload the splits written by the saving cell above, so this section can be run
# without repeating feature extraction.
# NOTE: joblib files are pickle-based; only load files you created yourself.
x_train, y_train = load('saved_data/train_data.joblib')
x_val, y_val = load('saved_data/val_data.joblib')
x_test, y_test = load('saved_data/test_data.joblib')
# Stored as one tuple in the order (train, validation, test).
genders_train, genders_val, genders_test = load('saved_data/genders_data.joblib') 

In [None]:
# x = np.concatenate((x_train, x_val, x_test))
# y = np.concatenate((y_train, y_val, y_test))

In [None]:
# Every array produced by the split, keyed by the name used in the report below.
named_arrays = {
    "train": x_train,
    "validation": x_val,
    "test": x_test,
    "gender train": genders_train,
    "gender validation": genders_val,
    "gender test": genders_test,
}

# Shape/dtype summary (the gender arrays are summarised rather than dumped in full).
for name, values in named_arrays.items():
    print(f"{name.capitalize()} data:", values.shape, values.dtype)

# Check for any NaN or inf values in your dataset
for name, values in named_arrays.items():
    print(f"NaNs in {name}:", np.isnan(values).any())

for name, values in named_arrays.items():
    print(f"Infs in {name}:", np.isinf(values).any())

In [None]:
# ONEHOT ENCODING THE LABELS
# The encoder is fitted on the training labels only and reused for the other splits.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Convert labels to one-hot encoding
# num_classes is passed explicitly: without it to_categorical infers the width
# from the largest index present, which can differ between splits.
num_classes = len(label_encoder.classes_)
y_train_onehot = to_categorical(y_train_encoded, num_classes=num_classes)
y_val_onehot = to_categorical(y_val_encoded, num_classes=num_classes)
y_test_onehot = to_categorical(y_test_encoded, num_classes=num_classes)

In [None]:
# CONVOLUTIONAL NEURAL NETWORK SETUP AND TRAINING
def load_latest_weights(weights_dir, file_pattern):
    """Find the most recently modified weights file in ``weights_dir``.

    Despite its name, this function does not load anything into a model; it only
    returns the path so the caller can pass it to ``model.load_weights``.

    Args:
        weights_dir: Directory to search (not recursive).
        file_pattern: Substring a file name must contain to be considered.

    Returns:
        Path of the matching file with the newest modification time, or ``None``
        when the directory does not exist or holds no matching file.
    """
    # A missing directory simply means nothing has been saved yet.
    if not os.path.isdir(weights_dir):
        print("No weights file found.")
        return None

    # List all files in the directory that match the pattern
    all_weights = [os.path.join(weights_dir, f) for f in os.listdir(weights_dir) if file_pattern in f]
    # Find the most recent file by sorting based on modification time
    latest_weights = max(all_weights, key=os.path.getmtime, default=None)
    if latest_weights:
        print(f"Loading weights from {latest_weights}")
        return latest_weights
    else:
        print("No weights file found.")
        return None

# Gradient Reversal Layer
class GradientReversalLayer(Layer):
    """Identity in the forward pass; scales the gradient by ``-lambda_`` in the backward pass.

    Args:
        lambda_: Factor applied (with flipped sign) to the incoming gradient.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
            They are required for the layer to be rebuilt from a saved config.

    When reloading a saved model, this class has to be supplied to the loader,
    e.g. ``load_model(path, custom_objects={'GradientReversalLayer': GradientReversalLayer})``.
    """

    def __init__(self, lambda_=1.0, **kwargs):
        super().__init__(**kwargs)
        self.lambda_ = lambda_

    def call(self, x):
        # The custom gradient wraps a plain function of the tensor only, which
        # keeps `self` out of the arguments seen by tf.custom_gradient.
        @tf.custom_gradient
        def _reverse(inputs):
            def grad(dy):
                return -self.lambda_ * dy
            return tf.identity(inputs), grad

        return _reverse(x)

    def get_config(self):
        """Return the constructor arguments needed to recreate this layer."""
        config = super().get_config()
        config.update({'lambda_': self.lambda_})
        return config

# Domain Classifier
def create_domain_classifier(feature_extractor_output, lambda_=1.0):
    """Build the adversarial (domain) head on top of the shared features.

    The features pass through a gradient reversal layer, a 128-unit ReLU layer
    with dropout, and a single sigmoid unit named ``'domain_output'``.

    Args:
        feature_extractor_output: Tensor produced by the shared feature extractor.
        lambda_: Gradient reversal strength handed to ``GradientReversalLayer``.

    Returns:
        The ``domain_output`` tensor.
    """
    x = GradientReversalLayer(lambda_=lambda_)(feature_extractor_output)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    # The notebook sets a global 'mixed_float16' policy; the output layer is kept
    # in float32 so the sigmoid and the loss are not computed in half precision.
    domain_output = Dense(1, activation='sigmoid', dtype='float32', name='domain_output')(x)
    return domain_output

# Primary model
# Shared feature extractor: two Conv1D/BatchNorm pairs + pooling + dropout, twice.
# NOTE(review): Conv1D slides along axis 1 of the input, so with shape (13, 332)
# the convolution runs over the 13 coefficients and treats the 332 frames as
# channels — confirm this is intended rather than a (332, 13) layout.
input_shape = (13, 332)
input_layer = Input(shape=input_shape)
x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(input_layer)
x = BatchNormalization()(x)
x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.25)(x)
x = Conv1D(64, kernel_size=2, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = Conv1D(64, kernel_size=2, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.25)(x)
feature_extractor_output = Flatten()(x)

# Task-specific classifier
task_output = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(feature_extractor_output)
task_output = Dropout(0.5)(task_output)
# The notebook sets a global 'mixed_float16' policy; the softmax output is kept
# in float32 so the probabilities and the loss are not computed in half precision.
task_output = Dense(y_train_onehot.shape[1], activation='softmax', dtype='float32', name='task_output')(task_output)

# Domain classifier
domain_output = create_domain_classifier(feature_extractor_output)

# Create the combined model
model = Model(inputs=input_layer, outputs=[task_output, domain_output])

# Compile the model with two losses (keys are the output layer names)
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss={'task_output': 'categorical_crossentropy', 'domain_output': 'binary_crossentropy'},
              metrics={'task_output': 'accuracy', 'domain_output': 'accuracy'})

# Callbacks
# Both watch the total validation loss ('val_loss').
# NOTE(review): both use patience=20, so the learning-rate reduction may only
# kick in around the time early stopping ends the run — confirm this is intended.
early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
reduce_lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0.0000001, verbose=1)

class SaveWeightsCallback(Callback):
    """Write the model weights to disk every ``save_freq`` epochs.

    Args:
        save_freq: Interval, in epochs, between two saves.
        filepath: Target path template; ``{epoch}`` is replaced by ``epoch + 1``.
    """

    def __init__(self, save_freq, filepath):
        super().__init__()
        self.filepath = filepath
        self.save_freq = save_freq

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights when the number of finished epochs is a multiple of ``save_freq``."""
        finished_epochs = epoch + 1
        if finished_epochs % self.save_freq != 0:
            return
        target = self.filepath.format(epoch=finished_epochs)
        self.model.save_weights(target)

# Checkpoint the weights every 50 epochs; `{epoch}` in the path is filled in by the callback.
weights_saver = SaveWeightsCallback(save_freq=50, filepath='saved_data/adversarial-training_custom-cnn_weights_epoch_{epoch}.weights.h5')

# Train the model
num_epochs_per_stage = 50
total_epochs = 500
current_epoch = 0
all_history = []

# Checkpoints are read from and written to this folder during training.
os.makedirs('saved_data', exist_ok=True)


def _domain_targets_and_weights(genders):
    """Turn gender labels into binary targets plus a 0/1 weight muting unlabeled (-1) samples."""
    genders = np.asarray(genders)
    has_gender = genders != -1
    targets = np.where(has_gender, genders, 0).astype(np.float32)
    return targets, has_gender.astype(np.float32)


# Samples without a gender label (-1) are not valid targets for the binary
# domain loss, but they are still needed to train the task classifier. They
# therefore stay in the batch and get a domain sample weight of 0 instead of
# being dropped.
# NOTE: the 'domain_output' accuracy is configured via `metrics`, not
# `weighted_metrics`, so it may still count the muted samples.
domain_train, domain_train_weight = _domain_targets_and_weights(genders_train)
domain_val, domain_val_weight = _domain_targets_and_weights(genders_val)
train_sample_weight = {'task_output': np.ones(len(x_train), dtype=np.float32), 'domain_output': domain_train_weight}
val_sample_weight = {'task_output': np.ones(len(x_val), dtype=np.float32), 'domain_output': domain_val_weight}

while current_epoch < total_epochs:
    try:
        # Resume from the newest checkpoint on disk, if there is one.
        latest_weights_file = load_latest_weights('saved_data', '.weights.h5')
        if latest_weights_file:
            model.load_weights(latest_weights_file)

        history = model.fit(
            x_train,
            {'task_output': y_train_onehot, 'domain_output': domain_train},
            sample_weight=train_sample_weight,
            epochs=current_epoch + num_epochs_per_stage,
            batch_size=2048,
            validation_data=(x_val, {'task_output': y_val_onehot, 'domain_output': domain_val}, val_sample_weight),
            callbacks=[weights_saver, early_stopping_monitor, reduce_lr_on_plateau],
            initial_epoch=current_epoch,
            verbose=1
        )

        all_history.append(history.history)
        # Early stopping can end a stage before all of its epochs have run.
        current_epoch += len(history.history['loss'])
        gc.collect()

        if early_stopping_monitor.stopped_epoch > 0:
            print(f"Early stopping triggered at epoch {current_epoch}")
            break

    except Exception as e:
        # Best effort: keep the history gathered so far and stop training.
        print("An error occurred during training:", e)
        break

In [None]:
# Persist the trained model in the native Keras format.
model.save('saved_data/adversarial-training_custom-cnn_final_model.keras')

if not all_history:
    print("No training history was recorded.")
else:
    # Stitch the per-stage histories together into one series per metric.
    final_history = {}
    for key in all_history[0]:
        final_history[key] = np.concatenate([seg[key] for seg in all_history])
    dump(final_history, 'saved_data/adversarial-training_custom-cnn_training_history.joblib')