<a href="https://colab.research.google.com/github/ElyasBelkhir/VocalVault/blob/main/VoiceAuthentication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets transformers speechbrain

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/521.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting speechbrain
  Downloading speechbrain-0.5.15-py3-none-any.whl (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.8/553.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiproces

In [None]:
# Mount Google Drive at /content/drive; the audio files compared later in this
# notebook are addressed by paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Create a function to extract speaker embeddings

In [None]:
import torch
import numpy as np
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor
from transformers import WavLMForXVector

# Tunable configuration for the speaker-verification pipeline.
WAVLM_CHECKPOINT = "microsoft/wavlm-base-plus-sv"  # shared by extractor and model
N_SECONDS_SEGMENT = 4    # shorter clips are zero-padded up to this many seconds
SAMPLING_RATE = 16000    # sampling rate (Hz) used for loading and feature extraction

# Load the pre-trained WavLM speaker-verification model once; it is used to
# extract an embedding vector for each audio sample.
device = "cuda" if torch.cuda.is_available() else "cpu"
feature_extractor_wav2vec = Wav2Vec2FeatureExtractor.from_pretrained(
    WAVLM_CHECKPOINT)
model_wav_lm = WavLMForXVector.from_pretrained(WAVLM_CHECKPOINT).to(device)

def extract_embeddings(model, feature_extractor, data, device):
    """Compute one L2-normalized speaker embedding per audio segment.

    Each segment is zero-padded at its end up to
    ``SAMPLING_RATE * N_SECONDS_SEGMENT`` samples (longer segments are passed
    through unchanged), run through ``feature_extractor`` and ``model`` one at
    a time with gradients disabled, and the model's ``.embeddings`` output is
    normalized along the last dimension.

    Args:
        model: Callable that accepts the feature extractor's output as keyword
            arguments and returns an object with an ``embeddings`` tensor
            (this notebook passes a ``WavLMForXVector``).
        feature_extractor: Callable that turns a waveform into model inputs
            supporting ``.to(device)``.
        data: Sized, indexable collection of waveform arrays. Assumed to be
            1-D and sampled at ``SAMPLING_RATE`` -- TODO confirm for new callers.
        device: Torch device the model inputs are moved to.

    Returns:
        A CPU tensor with one normalized embedding row per model output row.
        When ``data`` is empty, an empty tensor of shape ``(0, 0)`` is
        returned instead of letting ``torch.stack`` fail on an empty list.
    """
    if len(data) == 0:
        return torch.empty((0, 0))

    min_samples = SAMPLING_RATE * N_SECONDS_SEGMENT
    normalized_rows = []
    for segment in tqdm(data):
        # Zero-pad short clips so every input covers at least the minimum segment length.
        missing = max(0, min_samples - segment.shape[0])
        padded_segment = np.pad(segment, (0, missing), 'constant')

        inputs = feature_extractor(
            padded_segment,
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt",
            padding="longest"
        ).to(device)
        with torch.no_grad():
            embeddings = model(**inputs).embeddings

        # Normalize once, on the CPU copy; iterating the 2-D result adds one
        # row per batch item to the list.
        normalized_rows.extend(
            torch.nn.functional.normalize(embeddings.cpu(), dim=-1))

    return torch.stack(normalized_rows)

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/58.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/wavlm-base-plus-sv were not used when initializing WavLMForXVector: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForXVector were not initialized from the model checkpoint at microsoft/wavlm-base-plus-sv and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a d

Create a function to load a WAV file

In [None]:
import librosa

def load_wav_file(file_path, sampling_rate):
    """Decode the audio file at `file_path` with librosa and return its samples.

    `sampling_rate` is forwarded to `librosa.load` as `sr`; the sample rate
    that librosa reports back is discarded.
    """
    decoded = librosa.load(file_path, sr=sampling_rate)
    waveform = decoded[0]  # librosa.load yields a (samples, sample_rate) pair
    return waveform

Check whether two WAV files were spoken by the same speaker

In [None]:
def compare_audio(wav_1, wav_2, threshold=0.86):
    """Decide whether two audio files were spoken by the same speaker.

    Both files are loaded at ``SAMPLING_RATE``, embedded with the module-level
    model and feature extractor via ``extract_embeddings``, and compared by
    cosine similarity. The verdict is printed and also returned.

    Args:
        wav_1: Path to the first audio file.
        wav_2: Path to the second audio file.
        threshold: Minimum cosine similarity for the two recordings to count
            as the same speaker. Defaults to 0.86; the optimal threshold is
            dataset-dependent.

    Returns:
        bool: ``True`` when the similarity is at least ``threshold``,
        ``False`` otherwise.
    """
    # Load both recordings at the sampling rate the feature extractor is given.
    waveforms = [load_wav_file(path, SAMPLING_RATE) for path in (wav_1, wav_2)]

    embeddings = extract_embeddings(
        model_wav_lm, feature_extractor_wav2vec, waveforms, device)

    similarity = torch.nn.functional.cosine_similarity(
        embeddings[0], embeddings[1], dim=-1).item()

    # Accept only on an explicit `>=`: a NaN similarity compares False and is
    # therefore rejected instead of being reported as a match.
    same_speaker = similarity >= threshold
    if same_speaker:
        print("\nSpeakers are the same")
    else:
        print("\nSpeakers are not the same")
    return same_speaker

In [None]:
# Compare two recordings stored on the mounted Drive.
compare_audio(
    "/content/drive/MyDrive/Audio/ibby2.wav",
    "/content/drive/MyDrive/Audio/ibby.wav",
)

100%|██████████| 2/2 [00:08<00:00,  4.48s/it]


Speakers are the same



