In [None]:
#-------------------------------------------------------------------------------------JUPYTER NOTEBOOK SETTINGS-------------------------------------------------------------------------------------
# Stretch the notebook container to the full browser width.
# `display` is bound to the IPython.display *module* (as it was at the end of this cell before),
# so the HTML snippet is rendered through `display.display(...)`. Importing from the public
# `IPython.display` module avoids the deprecated `IPython.core.display` import path.
import IPython.display as display
from IPython.display import HTML
display.display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import os
import numpy as np
import librosa
import json
import concurrent.futures
import joblib
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import Parallel, delayed
from scipy.stats import randint, uniform
from tqdm import tqdm

In [None]:
class MFCCFeatureExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer: WAV file paths -> fixed-length flattened MFCC vectors.

    Every file is loaded with ``librosa.load`` at ``sr``, converted to an MFCC
    matrix with ``librosa.feature.mfcc``, padded or truncated along axis 1 (the
    frame axis) to exactly ``window_size`` columns, and flattened to 1-D.

    Parameters
    ----------
    sr : int
        Sample rate every file is loaded at.
    n_mfcc, n_fft, hop_length, n_mels, fmin, fmax
        Forwarded unchanged to ``librosa.feature.mfcc``.
    window_size : int or None
        Number of frames kept per file. When ``None``, ``fit`` replaces it with
        the largest frame count found among the files it is given.
    """

    def __init__(self, sr=16000, n_mfcc=13, n_fft=512, hop_length=320, n_mels=32, fmin=80, fmax=8000, window_size=None):
        self.sr = sr
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.window_size = window_size

    def fit(self, X, y=None):
        """Learn ``window_size`` from the files in ``X`` unless it was given explicitly.

        Note: the learned value is stored on the ``window_size`` constructor
        parameter itself (callers read ``extractor.window_size`` after fitting),
        so a second ``fit`` on the same instance keeps the first value.
        """
        if self.window_size is None:
            self.window_size = self._calculate_max_window_size(X)
        return self

    def transform(self, X):
        """Return a 2-D array with one flattened MFCC vector per path in ``X``.

        Raises
        ------
        ValueError
            If ``window_size`` is still ``None`` (neither passed in nor learned by ``fit``).
        """
        # Fail early with a clear message instead of a None/int comparison error
        # raised from inside a worker thread.
        if self.window_size is None:
            raise ValueError("window_size is not set: call fit() first or pass window_size to the constructor.")
        features = Parallel(n_jobs=-1, backend='threading')(
            delayed(self._extract_mfcc)(wav_file) for wav_file in tqdm(X, desc="Extracting MFCC features")
        )
        return np.array(features)

    def _extract_mfcc(self, wav_file):
        """Load one file and return its MFCCs, fixed to ``window_size`` frames and flattened."""
        signal, sr = librosa.load(wav_file, sr=self.sr)
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=self.n_mfcc, n_fft=self.n_fft,
                                     hop_length=self.hop_length, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax)
        if mfccs.shape[1] < self.window_size:
            # Short clip: extend the frame axis with the mean over the whole matrix.
            pad_width = self.window_size - mfccs.shape[1]
            pad_value = mfccs.mean()
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant', constant_values=pad_value)
        else:
            # Long (or exact-length) clip: keep only the first window_size frames.
            mfccs = mfccs[:, :self.window_size]
        return mfccs.flatten()

    def _frame_count(self, n_samples):
        """Number of frames librosa yields for ``n_samples`` samples.

        ``librosa.feature.mfcc`` relies on the STFT default ``center=True``, which
        pads ``n_fft // 2`` samples on both sides before framing. The previous
        formula (``(n_samples - n_fft) // hop_length + 1``) describes un-padded
        framing and therefore under-counted the frames actually produced.
        """
        n_fft = int(self.n_fft)
        hop_length = int(self.hop_length)
        padded_length = int(n_samples) + 2 * (n_fft // 2)
        return 1 + (padded_length - n_fft) // hop_length

    def _calculate_max_window_size(self, files):
        """Return the largest MFCC frame count over ``files`` (0 when ``files`` is empty)."""
        max_window_size = 0
        for wav_file in tqdm(files, desc="Calculating max window size"):
            signal, _ = librosa.load(wav_file, sr=self.sr)
            max_window_size = max(max_window_size, self._frame_count(len(signal)))
        return max_window_size

def load_files_and_labels(root_dir):
    """Collect every ``.wav`` file under ``root_dir`` together with its label.

    The label of a file is the name of the directory that directly contains it.
    Directories and files are visited in sorted order so the returned lists are
    identical on every run and every filesystem (plain ``os.walk`` order is
    arbitrary, which made index-based subsampling of the result irreproducible).

    Parameters
    ----------
    root_dir : str
        Directory to search recursively.

    Returns
    -------
    (files, labels) : tuple of two lists of equal length
        ``files[i]`` is a path, ``labels[i]`` the containing directory's name.
    """
    files = []
    labels = []

    for subdir, dirnames, subfiles in os.walk(root_dir):
        # Sorting in place controls the order in which os.walk descends.
        dirnames.sort()
        for file in sorted(subfiles):
            if file.endswith('.wav'):
                files.append(os.path.join(subdir, file))
                labels.append(os.path.basename(subdir))  # Use the final subdirectory as the label

    return files, labels

if __name__ == "__main__":
    root_dir = '/Users/ciprian/Desktop/Projects/Smart Plant Pot/Audio/Voice Recognition/Prototype 4'  

    # Load all files and labels from all subdirectories
    files, labels = load_files_and_labels(root_dir)
    if not files:
        raise FileNotFoundError(f"No .wav files found under {root_dir!r}")
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)
    
    # Subsample data for parameter tuning.
    # Capped at the dataset size (sampling without replacement fails otherwise) and
    # drawn from a seeded generator so the tuning run can be reproduced.
    subsample_size = min(10000, len(files))  # Adjust based on available computational resources
    rng = np.random.default_rng(42)
    subsample_indices = rng.choice(len(files), size=subsample_size, replace=False)
    files_subsample = [files[i] for i in subsample_indices]
    labels_subsample = [labels_encoded[i] for i in subsample_indices]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(files_subsample, labels_subsample, test_size=0.2, random_state=42)

    # Define the pipeline
    pipeline = Pipeline([
        ('mfcc', MFCCFeatureExtractor()),
        ('classifier', KNeighborsClassifier())
    ])
    
    sample_rate = 16000
    nyquist = sample_rate / 2  # mel filters above this frequency have no FFT bins to cover

    # Define the parameter distributions for RandomizedSearchCV.
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], NOT [low, high]:
    # the previous uniform(4000, 8000) drew fmax from 4000-12000 Hz, beyond Nyquist.
    param_dist = {
        'mfcc__sr': [sample_rate],
        'mfcc__n_mfcc': randint(10, 30),
        'mfcc__n_fft': randint(128, 512),
        'mfcc__hop_length': randint(80, 320),
        'mfcc__n_mels': randint(20, 50),
        'mfcc__fmin': uniform(0, 100),  # Search fmin in the range 0 to 100 Hz
        'mfcc__fmax': uniform(4000, nyquist - 4000)  # Search fmax in the range 4000 Hz to Nyquist
    }
    
    # Perform random search
    random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=50, cv=3, n_jobs=-1, verbose=2, random_state=42)
    random_search.fit(X_train, y_train)
    
    # Get the best parameters from the random search
    best_params_random = random_search.best_params_
    print(f"Best Parameters from Random Search: {best_params_random}")

    def _candidates(center, step, lower=None, upper=None):
        """Return [center - step, center, center + step], clamped to the bounds and de-duplicated."""
        values = []
        for value in (center - step, center, center + step):
            if lower is not None:
                value = max(lower, value)
            if upper is not None:
                value = min(upper, value)
            if value not in values:  # clamping can collapse neighbours; avoid redundant fits
                values.append(value)
        return values

    # Define a narrower grid based on the random search results for GridSearchCV
    param_grid = {
        'mfcc__sr': [sample_rate],
        'mfcc__n_mfcc': _candidates(best_params_random['mfcc__n_mfcc'], 2),
        'mfcc__n_fft': _candidates(best_params_random['mfcc__n_fft'], 64),
        'mfcc__hop_length': _candidates(best_params_random['mfcc__hop_length'], 40),
        'mfcc__n_mels': _candidates(best_params_random['mfcc__n_mels'], 5),
        'mfcc__fmin': _candidates(best_params_random['mfcc__fmin'], 10, lower=0),
        'mfcc__fmax': _candidates(best_params_random['mfcc__fmax'], 1000, upper=nyquist)
    }
    
    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Best parameters and score
    best_params_grid = grid_search.best_params_
    best_score_grid = grid_search.best_score_
    
    print(f"Best Parameters from Grid Search: {best_params_grid}")
    print(f"Best Cross-Validation Score from Grid Search: {best_score_grid}")

    # Evaluate on test set
    test_score = grid_search.score(X_test, y_test)
    print(f"Test Set Score: {test_score}")

    # Window size for the best parameters.
    # GridSearchCV (refit=True by default) has already refit the winning pipeline on
    # X_train, so its MFCC step holds exactly the value a fresh extractor fitted on
    # X_train would compute -- no need to scan every audio file again.
    mfcc_extractor = grid_search.best_estimator_.named_steps['mfcc']
    best_window_size = int(mfcc_extractor.window_size)
    print(f"Best Window Size: {best_window_size}")

### Save the best parameters and score

In [None]:
# Save the values to a JSON file
# NOTE: relies on best_params_grid / best_score_grid / test_score / best_window_size
# having been produced by the tuning cell earlier in this kernel session.
results = {
    'best_params_grid': best_params_grid,
    'best_score_grid': best_score_grid,
    'test_score': test_score,
    'best_window_size': best_window_size
}


def _to_builtin(value):
    """json.dump fallback: turn numpy scalars (e.g. np.int64) into plain Python numbers."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# open(..., 'w') does not create missing parent directories.
os.makedirs('saved_data', exist_ok=True)
with open('saved_data/best_results.json', 'w') as f:
    json.dump(results, f, indent=4, default=_to_builtin)

### Test specific upper frequency levels

In [None]:
# Define the fmax and n_mfcc values to compare
fmax_values = [4567.87, 4568, 8000]
n_mfcc_values = [13, 22]

# Prepare data
root_dir = '/Users/ciprian/Desktop/Projects/Smart Plant Pot/Audio/Voice Recognition/Prototype 4'  
files, labels = load_files_and_labels(root_dir)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Subsample data for the experiment.
# Capped at the dataset size (sampling without replacement fails otherwise) and
# seeded so that the accuracies printed below can be reproduced.
subsample_size = min(10000, len(files))
rng = np.random.default_rng(42)
subsample_indices = rng.choice(len(files), size=subsample_size, replace=False)
files_subsample = [files[i] for i in subsample_indices]
labels_subsample = [labels_encoded[i] for i in subsample_indices]

# Split data
X_train, X_test, y_train, y_test = train_test_split(files_subsample, labels_subsample, test_size=0.2, random_state=42)

# Function to create pipeline with given fmax and n_mfcc
def create_pipeline(fmax, n_mfcc):
    """Build an MFCC -> k-NN pipeline; every setting except fmax and n_mfcc is fixed."""
    # NOTE: needs the scikit-learn style MFCCFeatureExtractor (the one with fit/transform).
    # A later cell defines another class under the same name; re-run the transformer
    # cell first if that later cell has already been executed in this kernel.
    return Pipeline([
        ('mfcc', MFCCFeatureExtractor(fmax=fmax, sr=16000, n_mfcc=n_mfcc, n_fft=404, hop_length=119, n_mels=35, fmin=0)),
        ('classifier', KNeighborsClassifier())
    ])

# Compare performance for different fmax and n_mfcc values
fitted_extractors = {}  # (fmax, n_mfcc) -> MFCC step already fitted on X_train
for fmax in fmax_values:
    for n_mfcc in n_mfcc_values:
        pipeline = create_pipeline(fmax, n_mfcc)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy with fmax={fmax}, n_mfcc={n_mfcc}: {accuracy}")
        fitted_extractors[(fmax, n_mfcc)] = pipeline.named_steps['mfcc']

# Output the window size for each fmax and n_mfcc value.
# The extractor inside each fitted pipeline has already learned its window size from
# X_train, so it is reused instead of fitting a fresh extractor (a full pass over the
# audio) for every combination.
for fmax in fmax_values:
    for n_mfcc in n_mfcc_values:
        mfcc_extractor = fitted_extractors[(fmax, n_mfcc)]
        best_window_size = mfcc_extractor.window_size
        print(f"Best Window Size with fmax={fmax}, n_mfcc={n_mfcc}: {best_window_size}")

### Extract features from wav samples

In [None]:
class MFCCFeatureExtractor:
    """Plain (non scikit-learn) helper that extracts a fixed-width MFCC matrix from a WAV file.

    NOTE(review): this definition replaces the scikit-learn transformer of the same
    name defined earlier in the notebook once this cell has run. It also pads short
    clips with numpy's ``'mean'`` mode, whereas the earlier transformer pads with a
    single mean taken over the whole matrix -- confirm the difference is intended.
    """

    def __init__(self, sr=16000, n_mfcc=13, n_fft=256, hop_length=160, n_mels=32, fmin=0, fmax=8000):
        # Loading sample rate and the settings forwarded to librosa.feature.mfcc.
        self.sr, self.n_mfcc = sr, n_mfcc
        self.n_fft, self.hop_length = n_fft, hop_length
        self.n_mels = n_mels
        self.fmin, self.fmax = fmin, fmax

    def extract_features_from_file(self, wav_file, window_size):
        """Return the 2-D MFCC matrix of ``wav_file`` with exactly ``window_size`` columns.

        Clips with fewer frames are padded on the right (numpy ``'mean'`` mode);
        longer ones are cut after ``window_size`` frames. Any exception raised
        while loading or processing is printed and ``None`` is returned instead.
        """
        try:
            audio, rate = librosa.load(wav_file, sr=self.sr)
            coefficients = librosa.feature.mfcc(
                y=audio,
                sr=rate,
                n_mfcc=self.n_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                n_mels=self.n_mels,
                fmin=self.fmin,
                fmax=self.fmax,
            )
            frame_count = coefficients.shape[1]
            if frame_count >= window_size:
                # Enough frames: keep the leading window only.
                return coefficients[:, :window_size]
            # Too few frames: extend the frame axis up to the window.
            missing = window_size - frame_count
            return np.pad(coefficients, ((0, 0), (0, missing)), mode='mean')
        except Exception as exc:
            print(f"Error processing file {wav_file}: {exc}")
            return None

def process_directory(root_dir, window_size, mfcc_extractor, pbar):
    """Extract MFCC features for every ``.wav`` file found below ``root_dir``.

    For each WAV file the audio label is the name of the directory containing it
    and the gender label is the name of that directory's parent. Files for which
    ``mfcc_extractor.extract_features_from_file`` returns ``None`` are left out.
    ``pbar.update(1)`` is called once for every file encountered, WAV or not.

    Returns
    -------
    tuple of three numpy arrays: (mfcc_features, labels, gender_labels)
    """
    records = []  # one (mfcc, audio_label, gender_label) triple per usable file

    for current_dir, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith('.wav'):
                wav_path = os.path.join(current_dir, filename)
                parent_dir = os.path.dirname(wav_path)
                grandparent_dir = os.path.dirname(parent_dir)

                mfcc = mfcc_extractor.extract_features_from_file(wav_path, window_size)
                if mfcc is not None:
                    records.append((mfcc, os.path.basename(parent_dir), os.path.basename(grandparent_dir)))
            pbar.update(1)  # Update the progress bar

    mfcc_features = [record[0] for record in records]
    labels = [record[1] for record in records]
    gender_labels = [record[2] for record in records]
    return np.array(mfcc_features), np.array(labels), np.array(gender_labels)

def main(root_dir, window_size, mfcc_extractor):
    """Extract MFCC features for the whole dataset, one worker task per top-level folder.

    Each immediate subdirectory of ``root_dir`` is handed to ``process_directory``
    on a thread pool. Files located directly in ``root_dir`` are not processed.

    Parameters
    ----------
    root_dir : str
        Dataset root.
    window_size : int
        Forwarded to ``process_directory``.
    mfcc_extractor
        Forwarded to ``process_directory``.

    Returns
    -------
    tuple of three numpy arrays: (mfcc_features, labels, gender_labels)
    """
    # Sorted so tasks are submitted -- and their results concatenated -- in a stable order.
    subdirs = sorted(
        os.path.join(root_dir, d)
        for d in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, d))
    )

    # process_directory advances the bar once per file of ANY type below each
    # subdirectory, so the total is counted the same way; otherwise the bar
    # finishes early or overshoots.
    total_files = sum(len(files) for subdir in subdirs for _, _, files in os.walk(subdir))

    mfcc_features = []
    labels = []
    gender_labels = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        with tqdm(total=total_files, desc="Processing files") as pbar:
            futures = [
                executor.submit(process_directory, subdir, window_size, mfcc_extractor, pbar)
                for subdir in subdirs
            ]

            # Collect in submission order rather than completion order so the row
            # order does not depend on thread timing. result() re-raises any
            # exception that occurred in the worker.
            for future in futures:
                mfccs, lbls, g_lbls = future.result()
                mfcc_features.extend(mfccs)
                labels.extend(lbls)
                gender_labels.extend(g_lbls)

    return np.array(mfcc_features), np.array(labels), np.array(gender_labels)

if __name__ == "__main__":
    # Dataset root that is walked for .wav files.
    root_dir = '/Users/ciprian/Desktop/Projects/Smart Plant Pot/Audio/Voice Recognition/Prototype 4' 

    # Reload the tuning results stored in the JSON file.
    with open('saved_data/best_results.json', 'r') as f:
        loaded_results = json.load(f)

    best_params_grid, best_score_grid, test_score, best_window_size = (
        loaded_results[key]
        for key in ('best_params_grid', 'best_score_grid', 'test_score', 'best_window_size')
    )

    print(f"Loaded Best Parameters from Grid Search: {best_params_grid}")
    print(f"Loaded Best Cross-Validation Score from Grid Search: {best_score_grid}")
    print(f"Loaded Test Set Score: {test_score}")
    print(f"Loaded Best Window Size: {best_window_size}")

    # Build the extractor from the stored pipeline parameters (keys carry the 'mfcc__' step prefix).
    params = best_params_grid
    mfcc_extractor = MFCCFeatureExtractor(
        sr=params['mfcc__sr'],
        n_mfcc=params['mfcc__n_mfcc'],
        n_fft=params['mfcc__n_fft'],
        hop_length=params['mfcc__hop_length'],
        n_mels=params['mfcc__n_mels'],
        fmin=params['mfcc__fmin'],
        fmax=params['mfcc__fmax'],
    )

    mfcc_features, labels, gender_labels = main(root_dir, best_window_size, mfcc_extractor)

    # Persist the three arrays next to the notebook.
    for array, target in (
        (mfcc_features, 'mfcc_features.joblib'),
        (labels, 'labels.joblib'),
        (gender_labels, 'gender_labels.joblib'),
    ):
        joblib.dump(array, target)