In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
import librosa
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
def compute_audio_features(audio_file, coeff_count=20):
    signal, rate = librosa.load(audio_file, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=coeff_count)
    delta = librosa.feature.delta(coefficients)
    delta2 = librosa.feature.delta(coefficients, order=2)
    merged_features = np.vstack((coefficients, delta, delta2))
    return np.hstack([np.mean(merged_features, axis=1), np.std(merged_features, axis=1)])


In [3]:
def collect_voice_data(data_root):
    voice_features, speaker_ids = [], []
    for speaker_dir in os.listdir(data_root):
        dir_path = os.path.join(data_root, speaker_dir)
        if os.path.isdir(dir_path):
            for audio_file in os.listdir(dir_path):
                if audio_file.endswith(".wav"):
                    file_path = os.path.join(dir_path, audio_file)
                    mfcc_data = compute_audio_features(file_path)
                    voice_features.append(mfcc_data)
                    speaker_ids.append(speaker_dir)
    return np.array(voice_features), np.array(speaker_ids)



In [4]:
def display_voice_data(data_root):
    for speaker_dir in os.listdir(data_root):
        dir_path = os.path.join(data_root, speaker_dir)
        if os.path.isdir(dir_path):
            for audio_file in os.listdir(dir_path):
                if audio_file.endswith(".wav"):
                    file_path = os.path.join(dir_path, audio_file)
                    signal, rate = librosa.load(file_path, sr=None)
                    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=20)
                    plt.figure(figsize=(10, 4))
                    librosa.display.specshow(coefficients, x_axis='time')
                    plt.colorbar()
                    plt.title(f'MFCC Spectrogram: {speaker_dir} - {audio_file}')
                    plt.tight_layout()
                    plt.show()


In [5]:
voice_data_path = "Data\\Input\\Training"
features, labels = collect_voice_data(voice_data_path)
normalizer = StandardScaler()
normalized_Dimension = normalizer.fit_transform(features)

cross_validator = KFold(n_splits=5, shuffle=True, random_state=42)
performance_scores = []
ignore_threshold = True

In [6]:
for train_idx, test_idx in cross_validator.split(normalized_Dimension):
    X_train, X_test = normalized_Dimension[train_idx], normalized_Dimension[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    gmm_models_of_entry = {}
    for speaker in np.unique(y_train):
        speaker_data = X_train[y_train == speaker]
        data_points = speaker_data.shape[0]
        mixture_count = min(32, max(2, data_points // 2))
        gmm = GaussianMixture(n_components=mixture_count, covariance_type='full', max_iter=200, random_state=42)
        gmm.fit(speaker_data)
        gmm_models_of_entry[speaker] = gmm
    correct_count = 0
    for i in range(len(X_test)):
        test_sample = X_test[i].reshape(1, -1)
        true_speaker = y_test[i]
        best_score = float('-inf')
        identified_speaker = None
        for speaker, model in gmm_models_of_entry.items():
            likelihood = model.score(test_sample)
            #print(f"Likelihood for {speaker}: {likelihood}")
            if likelihood > best_score:
                best_score = likelihood
                identified_speaker = speaker
        if ignore_threshold or best_score > -50:
            if identified_speaker == true_speaker:
                correct_count += 1
    fold_accuracy = correct_count / len(X_test)
    performance_scores.append(fold_accuracy)

In [7]:
def identify_speaker(audio_sample, model_dict, feature_scaler, score_threshold=-50):
    sample_features = compute_audio_features(audio_sample)
    normalized_features = feature_scaler.transform([sample_features])
    highest = float('-inf')
    probable_voice = None
    for speaker, model in model_dict.items():
        likelihood = model.score(normalized_features)
        print(f"Likelihood for {speaker}: {likelihood}")
        if likelihood > highest:
            probable_voice = speaker
            highest = likelihood
    return probable_voice


In [8]:
average_accuracy = np.mean(performance_scores)
print(f"Average Cross-Validation Accuracy: {average_accuracy:.2f}")

Average Cross-Validation Accuracy: 0.90


You can change the path for different test sample and it working Quite good on that as well

In [9]:
test_path = "Data\\Input\\Test\\ShivamY_audio2.wav"
recognized_speaker = identify_speaker(test_path, gmm_models_of_entry, normalizer)
print(f"Recognized Speaker: {recognized_speaker}")

Likelihood for Abhay-001: -112674144.86172944
Likelihood for Eknath-002: -107693945.97117543
Likelihood for Rg-003: -109217035.22017416
Likelihood for Rishika-004: -115834562.05809252
Likelihood for ShivamY-006: -58678419.400855795
Likelihood for Vaibhav-005: -93513428.93072116
Recognized Speaker: ShivamY-006
