In [None]:
import librosa
import numpy as np
from scipy.spatial.distance import cosine
from IPython.display import Audio, display

def preprocess_audio(audio_path, sr=16000):
    """Load an audio file and strip the silence at its edges.

    Parameters
    ----------
    audio_path : path handed to ``librosa.load``.
    sr : sample rate requested from ``librosa.load`` (default 16000).

    Returns
    -------
    tuple ``(samples, sample_rate)``: the trimmed waveform and the sample
    rate reported by ``librosa.load``.
    """
    signal, sample_rate = librosa.load(audio_path, sr=sr)
    # top_db=25: anything more than 25 dB below the reference level counts
    # as silence for trimming purposes; the returned index interval is unused.
    trimmed = librosa.effects.trim(signal, top_db=25)[0]
    return trimmed, sample_rate

def extract_mfcc_vector(file_path, n_mfcc=20):
    """Summarise an audio file as a single time-averaged MFCC vector.

    The file is loaded and silence-trimmed by ``preprocess_audio`` (called
    with its default sample rate), MFCCs are computed with a 512-sample FFT
    window and a 160-sample hop, and every coefficient is averaged over all
    frames.

    Parameters
    ----------
    file_path : path forwarded to ``preprocess_audio``.
    n_mfcc : int, number of MFCC coefficients to compute (default 20).

    Returns
    -------
    numpy.ndarray with one entry per coefficient: the mean of that
    coefficient across time.
    """
    y, sr = preprocess_audio(file_path)
    mfcc = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=512,
        hop_length=160
    )
    # Deliberately NO per-utterance mean subtraction before pooling.
    # Centering each coefficient over time and then averaging over the same
    # axis yields a vector that is zero up to floating-point rounding, so any
    # distance computed on it is numerical noise rather than a voice
    # comparison. Pool the raw coefficients instead.
    # NOTE(review): decision thresholds tuned against the old (degenerate)
    # output need to be re-calibrated on real data.
    return np.mean(mfcc, axis=1)

def verify_voice(vec1, vec2, threshold=0.3):
    """Decide whether two feature vectors belong to the same voice.

    Both vectors are scaled to (approximately) unit length, then compared
    with SciPy's cosine distance.

    Parameters
    ----------
    vec1, vec2 : numpy arrays to compare.
    threshold : distances strictly below this value count as a match
        (default 0.3).

    Returns
    -------
    tuple ``(label, distance)`` where ``label`` is ``"SAME"`` when
    ``distance < threshold`` and ``"DIFFERENT"`` otherwise.
    """
    eps = 1e-10  # keeps the division finite when a vector has zero norm
    unit_a = vec1 / (np.linalg.norm(vec1) + eps)
    unit_b = vec2 / (np.linalg.norm(vec2) + eps)
    distance = cosine(unit_a, unit_b)

    label = "SAME" if distance < threshold else "DIFFERENT"
    return label, distance

In [None]:
# Example 1: Different speakers
speakerA = "data/native_speaker/01_native.wav"
speakerB = "data/learner/01_learner.wav"

# Play both clips before comparing them.
for caption, clip in (("Speaker A:", speakerA), ("Speaker B:", speakerB)):
    print(caption)
    display(Audio(clip))

vector_a = extract_mfcc_vector(speakerA)
vector_b = extract_mfcc_vector(speakerB)
result, distance = verify_voice(vector_a, vector_b, threshold=0.3)

print(result, distance)

# Example 2: Same speaker
speakerC = "data/native_speaker/02_native.wav"
speakerD = "data/native_speaker/03_native.wav"

# Play both clips before comparing them.
for caption, clip in (("Speaker C:", speakerC), ("Speaker D:", speakerD)):
    print(caption)
    display(Audio(clip))

vector_c = extract_mfcc_vector(speakerC)
vector_d = extract_mfcc_vector(speakerD)
result, distance = verify_voice(vector_c, vector_d, threshold=0.3)

print(result, distance)

Speaker A:


Speaker B:


DIFFERENT 1.9898791890134935
Speaker C:


Speaker D:


SAME 0.05729727324523781
