In [1]:
import glob
import os
import librosa
import torch
import numpy as np
from pyannote.audio import Inference
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get Hugging Face token from environment (os.getenv returns None when the variable is unset)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Report the missing token here instead of letting it surface later as an
    # opaque failure when the model is downloaded.
    print("Warning: HF_TOKEN is not set; authenticated Hugging Face downloads may fail.")


In [2]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
# `device` is read by the model construction and by both embedding helpers below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

🔍 Using device: cpu


In [None]:
# Build the pyannote embedding model wrapper shared by the extraction and
# comparison cells below, authenticated with HF_TOKEN and placed on `device`.
# NOTE(review): window="whole" presumably yields one embedding per clip passed
# in — confirm against the pyannote.audio Inference documentation.
inference = Inference("pyannote/embedding",
                      use_auth_token=HF_TOKEN,
                      window="whole",
                      device=device)
# Clip length in seconds: extract_embeddings_from_dir truncates or zero-pads
# every waveform to exactly this duration before embedding it.
MAX_DURATION = 5.0

C:\Users\WAGHMARE\AppData\Roaming\Python\Python311\site-packages\pytorch_lightning\utilities\migration\migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\WAGHMARE\.cache\torch\pyannote\models--pyannote--embedding\snapshots\4db4899737a38b2d618bbd74350915aa10293cb2\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.6.0+cpu. Bad things might happen unless you revert torch to 1.x.


C:\Users\WAGHMARE\AppData\Roaming\Python\Python311\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['loss_func.W']


In [4]:
def extract_embeddings_from_dir(directory):
    """Compute one embedding per ``.wav`` file found under a directory.

    Every ``*.wav`` file below ``directory`` (searched recursively) is loaded
    as 16 kHz mono, truncated or zero-padded to ``MAX_DURATION`` seconds, and
    passed to the module-level ``inference`` model on ``device``.

    Args:
        directory: Root folder to search recursively for ``.wav`` files.

    Returns:
        tuple: ``(embeddings, file_labels)`` — two parallel lists holding the
        flattened embedding and the source path of every file that was
        processed successfully. A file that raises is reported on stdout and
        skipped, so the lists may be shorter than the number of files found.
    """
    # glob's result order depends on the file system; sort so the returned
    # lists are ordered identically on every run and machine.
    wav_files = sorted(glob.glob(os.path.join(directory, "**", "*.wav"), recursive=True))
    print(f"\n Found {len(wav_files)} audio files in {directory}")

    embeddings = []
    file_labels = []

    # normpath strips a trailing separator, which would otherwise make
    # basename return "" and leave the progress bar unlabeled.
    dir_name = os.path.basename(os.path.normpath(directory))

    for path in tqdm(wav_files, desc=f"Processing {dir_name}"):
        try:
            waveform, sr = librosa.load(path, sr=16000, mono=True)

            # Force every clip to exactly MAX_DURATION seconds.
            target_len = int(MAX_DURATION * sr)
            if len(waveform) > target_len:
                waveform = waveform[:target_len]
            else:
                waveform = np.pad(waveform, (0, target_len - len(waveform)), 'constant')

            inputs = {
                # unsqueeze(0) adds a leading axis in front of the sample axis.
                "waveform": torch.tensor(waveform, dtype=torch.float32).unsqueeze(0).to(device),
                "sample_rate": sr
            }

            with torch.no_grad():
                embedding = inference(inputs)

            # Same output handling as compare_audio_files: unwrap a dict result
            # and move tensors to host memory so callers always get numpy arrays.
            if isinstance(embedding, dict):
                embedding = embedding["embedding"]
            if isinstance(embedding, torch.Tensor):
                embedding = embedding.detach().cpu().numpy()

            embeddings.append(embedding.flatten())
            file_labels.append(path)
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the batch.
            print(f"Error processing {path}: {e}")

    return embeddings, file_labels

In [5]:
# Root folders handed to extract_embeddings_from_dir in the next two cells.
# The names suggest English and Hindi clips respectively — TODO confirm.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
ENGLISH_DIR = r"C:\Users\WAGHMARE\Desktop\Research Project\processed_en"
HINDI_DIR = r"C:\Users\WAGHMARE\Desktop\Research Project\processed_hi"

In [6]:
# Embed every .wav file under ENGLISH_DIR; labels_en holds the matching file paths.
embeddings_en, labels_en = extract_embeddings_from_dir(ENGLISH_DIR)


📁 Found 125 audio files in C:\Users\WAGHMARE\Desktop\Research Project\processed_en


🔄 Processing processed_en: 100%|████████████████████████████████████████████████████| 125/125 [00:22<00:00,  5.49it/s]


In [7]:
# Embed every .wav file under HINDI_DIR; labels_hi holds the matching file paths.
embeddings_hi, labels_hi = extract_embeddings_from_dir(HINDI_DIR)


📁 Found 1891 audio files in C:\Users\WAGHMARE\Desktop\Research Project\processed_hi


🔄 Processing processed_hi: 100%|██████████████████████████████████████████████████| 1891/1891 [07:21<00:00,  4.28it/s]


In [8]:
def normalize_embedding(embedding):
    """Standardize an embedding to zero mean and unit standard deviation.

    The mean and standard deviation are taken over all elements of
    ``embedding`` (no axis is specified).

    Args:
        embedding: Numeric array to standardize.

    Returns:
        ``(embedding - mean) / std``. When the standard deviation is exactly
        zero (all elements equal), the centered, all-zero array is returned
        instead of dividing by zero and producing NaN/inf values.
    """
    centered = embedding - np.mean(embedding)
    spread = np.std(embedding)
    if spread == 0:
        # A constant vector has no variation to scale; avoid 0/0 -> NaN.
        return centered
    return centered / spread

def compare_audio_files(file1, file2, inference_model, device, duration=5.0):
    """Embed two audio files and report the cosine similarity between them.

    Each file is loaded as 16 kHz mono, truncated or zero-padded to
    ``duration`` seconds, embedded with ``inference_model``, standardized with
    ``normalize_embedding``, and the two results are compared with
    ``cosine_similarity``. The first ten values of each standardized embedding
    and the final score are printed.

    Args:
        file1: Path of the first audio file.
        file2: Path of the second audio file.
        inference_model: Callable that takes a
            ``{"waveform": tensor, "sample_rate": int}`` dict and returns an
            embedding (array, tensor, or dict with an ``"embedding"`` key).
        device: Torch device the waveform tensor is moved to.
        duration: Clip length in seconds every file is forced to.

    Returns:
        The cosine similarity of the two standardized embeddings.
    """
    def _embed(audio_path):
        """Load one file, fix its length, and return a flat embedding."""
        samples, rate = librosa.load(audio_path, sr=16000, mono=True)

        # Cut long clips; right-pad short ones with zeros.
        if len(samples) > duration * rate:
            samples = samples[:int(duration * rate)]
        else:
            missing = int(duration * rate) - len(samples)
            samples = np.pad(samples, (0, missing), 'constant')

        # unsqueeze(0) adds a leading axis in front of the sample axis.
        batch = torch.tensor(samples, dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            result = inference_model({"waveform": batch, "sample_rate": rate})

        # Reduce whatever the model returned to a flat array.
        if isinstance(result, dict):
            result = result["embedding"]
        if isinstance(result, torch.Tensor):
            result = result.detach().cpu().numpy()
        return result.flatten()

    # Embed both files first, then standardize both.
    raw_first, raw_second = [_embed(audio_path) for audio_path in (file1, file2)]
    emb1_normalized = normalize_embedding(raw_first)
    emb2_normalized = normalize_embedding(raw_second)

    # Show a short preview of each standardized embedding.
    print(f"Embedding 1 (First 10 values): {emb1_normalized[:10]}")
    print(f"Embedding 2 (First 10 values): {emb2_normalized[:10]}")

    # cosine_similarity works on 2-D inputs, so wrap each vector in a list and
    # take the single entry of the resulting matrix.
    similarity = cosine_similarity([emb1_normalized], [emb2_normalized])[0][0]

    print(f"\n🎙️ Similarity between:\n'{file1}'\nand\n'{file2}' ➜ {similarity:.4f}")
    return similarity

In [9]:
# Both clips sit in the same processed_hi sub-folder — presumably the same
# speaker (TODO confirm the folder name identifies the speaker).
file_path_1 = r"C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\0e97e303991e6646c845049579a4bea7a25ea775f6b8b44a09e0efd0ae9661ec984c393c92d2522e3158c8b0ca206ad60fd9e136713e18c21dffd72811a88b88\common_voice_hi_38108255.wav"
file_path_2 = r"C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\0e97e303991e6646c845049579a4bea7a25ea775f6b8b44a09e0efd0ae9661ec984c393c92d2522e3158c8b0ca206ad60fd9e136713e18c21dffd72811a88b88\common_voice_hi_38112196.wav"
similarity_score = compare_audio_files(file_path_1, file_path_2, inference, device)

Embedding 1 (First 10 values): [ 2.5484045   0.09778892  1.5351144   0.78593045  0.98404956  1.7880502
 -0.27377176  0.2508088  -0.2613017  -0.01528185]
Embedding 2 (First 10 values): [ 1.544618   -1.5865766  -0.23792142 -0.81557417  0.55871034  0.6480172
 -0.7324481   1.0108807  -1.2180551  -0.31522748]

🎙️ Similarity between:
'C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\0e97e303991e6646c845049579a4bea7a25ea775f6b8b44a09e0efd0ae9661ec984c393c92d2522e3158c8b0ca206ad60fd9e136713e18c21dffd72811a88b88\common_voice_hi_38108255.wav'
and
'C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\0e97e303991e6646c845049579a4bea7a25ea775f6b8b44a09e0efd0ae9661ec984c393c92d2522e3158c8b0ca206ad60fd9e136713e18c21dffd72811a88b88\common_voice_hi_38112196.wav' ➜ 0.5653


In [10]:
# Both clips come from the A07_F folder — presumably the same speaker (TODO confirm).
file_path_1 = r"C:\Users\WAGHMARE\Desktop\RP\data\English\A07_F\en_A07_F_01.wav"
file_path_2 = r"C:\Users\WAGHMARE\Desktop\RP\data\English\A07_F\en_A07_F_04.wav"
similarity_score = compare_audio_files(file_path_1, file_path_2, inference, device)

Embedding 1 (First 10 values): [ 3.177615   -2.0175655  -0.9942628  -0.35604155 -0.09130063 -0.10804775
 -0.31795043  0.46639934 -0.37669405  0.7474587 ]
Embedding 2 (First 10 values): [ 0.21342622 -0.19962388  1.0655699  -0.18776372 -1.0166669   0.7234538
 -1.234155   -0.8617688  -2.2636068   0.10148074]

🎙️ Similarity between:
'C:\Users\WAGHMARE\Desktop\RP\data\English\A07_F\en_A07_F_01.wav'
and
'C:\Users\WAGHMARE\Desktop\RP\data\English\A07_F\en_A07_F_04.wav' ➜ 0.2848


In [11]:
# Both clips come from the Speaker_0001 folder — presumably the same speaker (TODO confirm).
file_path_1 = r"C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker_0001\Speaker_0001_00002.wav"
file_path_2 = r"C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker_0001\Speaker_0001_00004.wav"
similarity_score = compare_audio_files(file_path_1, file_path_2, inference, device)

Embedding 1 (First 10 values): [ 0.06538545 -1.8003957  -0.41515714  1.1346269   1.5001671  -0.87405515
 -0.7823379   0.04286908 -1.2332816  -0.5366544 ]
Embedding 2 (First 10 values): [ 0.7052868  -2.4616885  -0.05391465  0.8576792   1.5719087  -1.9978913
  0.23450336  0.3896977  -0.926255   -0.9599938 ]

🎙️ Similarity between:
'C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker_0001\Speaker_0001_00002.wav'
and
'C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker_0001\Speaker_0001_00004.wav' ➜ 0.8249


In [12]:
# Clips from two different folders (Speaker0050 vs Speaker0039) — presumably
# different speakers (TODO confirm).
file_path_1 = r"C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker0050\Speaker0050_002.wav"
file_path_2 = r"C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker0039\Speaker0039_001.wav"
similarity_score = compare_audio_files(file_path_1, file_path_2, inference, device)


Embedding 1 (First 10 values): [ 1.8196362e-01 -2.2389858e+00  1.5574834e+00  1.1474992e-01
 -6.8704617e-01  8.9194947e-01  2.2660670e-01  1.0181997e-03
  9.2620069e-01  4.6025634e-01]
Embedding 2 (First 10 values): [ 0.47011063 -0.4088314  -0.17669976  1.3997208   0.48099512 -0.3109015
 -0.45904306 -1.2318515  -2.5297658   0.3463721 ]

🎙️ Similarity between:
'C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker0050\Speaker0050_002.wav'
and
'C:\Users\WAGHMARE\Desktop\Research Project\50_speakers_audio_data\Speaker0039\Speaker0039_001.wav' ➜ -0.0407


In [14]:
# Clips from two different processed_hi sub-folders — presumably different
# speakers (TODO confirm the folder name identifies the speaker).
file_path_1 = r"C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\1ab26689a68e6aeb9ee56daf140f5d65c19295d6b65eaef9f51f22cbd867b635758194eb9640790ea7d37a0de2bd5d0f235fb7662cc80dda56261f92c3cb4475\common_voice_hi_25037973.wav"
file_path_2 = r"C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\0a073b2441b86a13d80728f1a14413d473c4091d1053a3a53982f2c2698abb2a31f35efaefaad9b678d4cf8b27aecc058d4f664d005474a90b8af287ff58c303\common_voice_hi_23849286.wav"
similarity_score = compare_audio_files(file_path_1, file_path_2, inference, device)

Embedding 1 (First 10 values): [ 2.1595752  -0.30694318  1.4633367   1.8988129   0.76514125 -1.6562616
 -1.2709295   0.38310316 -0.6176218   0.40029523]
Embedding 2 (First 10 values): [ 1.7465259   0.2329593  -1.184087    0.40510708  0.76387626  0.44100273
 -1.0300514   0.61136824  0.07249195 -0.7768629 ]

🎙️ Similarity between:
'C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\1ab26689a68e6aeb9ee56daf140f5d65c19295d6b65eaef9f51f22cbd867b635758194eb9640790ea7d37a0de2bd5d0f235fb7662cc80dda56261f92c3cb4475\common_voice_hi_25037973.wav'
and
'C:\Users\WAGHMARE\Desktop\Research Project\processed_hi\0a073b2441b86a13d80728f1a14413d473c4091d1053a3a53982f2c2698abb2a31f35efaefaad9b678d4cf8b27aecc058d4f664d005474a90b8af287ff58c303\common_voice_hi_23849286.wav' ➜ 0.2714


In [18]:
# Clips from two different folders (A01_M vs A08_F) — presumably different
# speakers (TODO confirm).
file_path_1 = r"C:\Users\WAGHMARE\Desktop\RP\data\English\A01_M\en_A01_M_02.wav"
file_path_2 = r"C:\Users\WAGHMARE\Desktop\RP\data\English\A08_F\en_A08_F_05.wav"
similarity_score = compare_audio_files(file_path_1, file_path_2, inference, device)

Embedding 1 (First 10 values): [ 1.7303016  -0.92037207  0.07880109 -0.01700118  0.23779546  1.046638
 -2.1089869   1.0774442   0.7817927   1.2765217 ]
Embedding 2 (First 10 values): [ 0.61642873 -1.1677047   1.7057757   0.4080145  -0.8218774   0.6297684
  0.16025929 -0.540241   -0.6675733  -0.78814995]

🎙️ Similarity between:
'C:\Users\WAGHMARE\Desktop\RP\data\English\A01_M\en_A01_M_02.wav'
and
'C:\Users\WAGHMARE\Desktop\RP\data\English\A08_F\en_A08_F_05.wav' ➜ 0.0462
