# **INSTALLING MODULES**

In [1]:
!pip install speechbrain==0.5.16
!pip install faster_whisper
!pip install pyannote.audio
!pip install whisper

Collecting speechbrain==0.5.16
  Using cached speechbrain-0.5.16-py3-none-any.whl.metadata (23 kB)
Using cached speechbrain-0.5.16-py3-none-any.whl (630 kB)
Installing collected packages: speechbrain
  Attempting uninstall: speechbrain
    Found existing installation: speechbrain 1.0.3
    Uninstalling speechbrain-1.0.3:
      Successfully uninstalled speechbrain-1.0.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyannote-audio 3.3.2 requires speechbrain>=1.0.0, but you have speechbrain 0.5.16 which is incompatible.[0m[31m
[0mSuccessfully installed speechbrain-0.5.16
Collecting speechbrain>=1.0.0 (from pyannote.audio)
  Using cached speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Using cached speechbrain-1.0.3-py3-none-any.whl (864 kB)
Installing collected packages: speechbrain
  Attempting uninstall: speechbrain
    Found existing installation: 

In [2]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Read variables from a .env file into the process environment so the access
# token does not have to be hard-coded in the notebook.
load_dotenv()  # Automatically loads .env file from current directory
# Authenticate against the Hugging Face Hub with the HUGGINGFACE_HUB_TOKEN value.
# NOTE(review): os.getenv returns None when the variable is unset; confirm
# what login(token=None) should do in a non-interactive run.
login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))

# **IMPORT NECESSARY LIBRARIES**

In [3]:
import librosa
import traceback
from faster_whisper import WhisperModel
import torch
import whisper
import datetime
from pathlib import Path
import pandas as pd
import re
import time
import os
import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import speechbrain
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# !tar -xvf wavs.tar.gz

# **Defining Helper Functions**

In [5]:
def convert_time(secs):
  """Turn a time offset in seconds into a whole-second ``datetime.timedelta``.

  Args:
    secs: Time in seconds (int or float).

  Returns:
    A ``datetime.timedelta`` whose length is ``secs`` rounded with the
    built-in ``round``. Note that this is a timedelta object, not a string;
    call ``str()`` on it for an ``h:mm:ss`` style rendering.
  """
  whole_seconds = round(secs)
  return datetime.timedelta(seconds=whole_seconds)

In [6]:
def transcribe_audio(audio_file, model_name="base", language="en", beam_size=5, best_of=5):
  """Transcribe an audio file with faster-whisper and return its segments.

  Args:
    audio_file: Audio input handed straight to ``WhisperModel.transcribe``.
    model_name: Whisper model size/name to load (e.g. "base").
    language: Language code passed to the decoder.
    beam_size: Beam size passed to the decoder.
    best_of: Number of candidates passed to the decoder.

  Returns:
    A list of dicts, one per transcribed segment, each with the keys
    ``'start'``, ``'end'`` and ``'text'``.

  Raises:
    RuntimeError: If transcription or segment conversion fails; the original
      exception is attached as ``__cause__``.
  """
  # Load the model with int8 compute type.
  model = WhisperModel(model_name, compute_type='int8')

  try:
    # Forward the caller's decoding options instead of hard-coded values.
    segments_raw, _info = model.transcribe(
        audio_file,
        task='transcribe',
        language=language,
        beam_size=beam_size,
        best_of=best_of,
    )

    # Convert back to the original openai-style list of dicts. The iteration
    # stays inside the try block so failures raised while consuming
    # segments_raw are wrapped as well.
    segments = [
        {'start': chunk.start, 'end': chunk.end, 'text': chunk.text}
        for chunk in segments_raw
    ]
  except Exception as e:
    raise RuntimeError(f"Error converting audio to transcribe.{e}") from e

  return segments


In [7]:
def extract_segment_embedding(audio_file, segment, total_duration,
                              embedding_model):
  """Compute the speaker embedding for one time window of an audio file.

  Args:
    audio_file: Audio source passed to ``Audio.crop``.
    segment: Mapping with ``'start'`` and ``'end'`` times.
    total_duration: Upper bound for the window end; ``segment['end']`` is
      clamped to this value.
    embedding_model: Callable applied to the cropped waveform with an extra
      leading axis added.

  Returns:
    The model output with singleton dimensions squeezed away.

  Raises:
    RuntimeError: On any failure; the traceback is printed first.
  """
  try:
    loader = Audio()
    # The window may not extend past the end of the recording.
    window = Segment(segment['start'], min(total_duration, segment['end']))
    # Cut the waveform for just this window out of the file.
    samples, _rate = loader.crop(audio_file, window)
    # Add a leading axis before calling the model, then squeeze the result.
    return embedding_model(samples[None]).squeeze()
  except Exception as e:
    traceback.print_exc()
    raise RuntimeError(f"Error during segment embedding {e}")

In [8]:
def compute_segment_embeddings(audio_file, segments, embedding_model):
  """Build the embedding matrix for all transcribed segments of a file.

  Args:
    audio_file: Path of the audio file; loaded as mono at 16 kHz to measure
      its duration.
    segments: Iterable of segment dicts with ``'start'`` and ``'end'``.
    embedding_model: Model forwarded to ``extract_segment_embedding``.

  Returns:
    A NumPy array with one stacked embedding per segment, with NaNs replaced
    via ``np.nan_to_num``.
  """
  samples, rate = librosa.load(audio_file, mono=True, sr=16000)
  # Duration in seconds, used to clamp each segment's end time.
  duration_s = len(samples)/rate

  per_segment = [
      extract_segment_embedding(audio_file, seg, duration_s, embedding_model)
      for seg in segments
  ]

  return np.nan_to_num(np.stack(per_segment))

In [9]:
def cluster_embeddings(embeddings, n_clusters):
  """Group segment embeddings into ``n_clusters`` clusters with KMeans.

  Args:
    embeddings: Array of embeddings passed to ``KMeans.fit``.
    n_clusters: Number of clusters to form.

  Returns:
    Tuple ``(labels, centroids)`` taken from the fitted model's ``labels_``
    and ``cluster_centers_`` attributes.
  """
  # A fixed random_state keeps the clustering reproducible across runs.
  model = KMeans(n_clusters=n_clusters, random_state=42)
  model.fit(embeddings)
  return model.labels_, model.cluster_centers_

In [10]:
def compute_cluster_averages(embeddings, labels, n_clusters):
  """Average the embeddings of each cluster into one profile vector.

  Args:
    embeddings: Array of embeddings, indexed by a boolean mask over rows.
    labels: Array of cluster ids, one per embedding.
    n_clusters: Number of clusters; ids ``0 .. n_clusters - 1`` are averaged.

  Returns:
    Dict mapping each cluster id to the mean (over axis 0) of the embeddings
    carrying that label.
  """
  return {
      cluster_id: np.mean(embeddings[labels == cluster_id], axis=0)
      for cluster_id in range(n_clusters)
  }

In [11]:
def load_known_speaker_embeddings(known_speaker_files, embedding_model):
  """Compute one reference embedding per known-speaker audio file.

  Args:
    known_speaker_files: Iterable of audio file paths, one per known speaker.
    embedding_model: Callable applied to the full-file waveform with an extra
      leading axis added.

  Returns:
    Dict mapping a speaker label to that file's squeezed embedding. The label
    is the file name without its directory and final extension (for example
    ``'./speaker_A.wav'`` -> ``'speaker_A'``), so files sharing a stem
    overwrite each other.
  """
  known_speaker = {}
  # One loader is enough for every file.
  audio = Audio()
  for f_name in known_speaker_files:
    # Load as mono at 16 kHz only to measure the file's duration in seconds.
    audio_data, sample_rate = librosa.load(f_name, mono=True, sr=16000)
    total_duration = len(audio_data)/sample_rate
    clip = Segment(0, total_duration)

    waveform, sample_rate = audio.crop(f_name, clip)

    embeddings = embedding_model(waveform[None])
    embeddings = embeddings.squeeze()
    # Path.stem strips only the final suffix, so dots elsewhere in the path
    # ('./x.wav', 'spk.v2.wav') no longer truncate or empty the label.
    known_speaker[Path(f_name).stem]=embeddings

  return known_speaker

In [12]:
def assign_speaker_labels(cluster_avg_embeddings, known_speaker_embeddings, similarity_threshold=0.7):
  """Label every cluster with its most similar known speaker.

  For each cluster, the cosine similarity between the cluster's average
  embedding and every known speaker embedding is computed. The cluster takes
  the best-matching speaker's label if that similarity is strictly greater
  than ``similarity_threshold``, and ``'Unknown'`` otherwise.

  Args:
    cluster_avg_embeddings: Dict mapping cluster id -> average embedding.
    known_speaker_embeddings: Dict mapping speaker label -> embedding.
    similarity_threshold: Minimum (exclusive) cosine similarity for a match.

  Returns:
    Dict with one entry per cluster id, mapping it to a speaker label or
    ``'Unknown'``. Several clusters may receive the same speaker label.
  """
  speaker_labels = {}
  for cluster_id, avg_emb in cluster_avg_embeddings.items():
    max_siml = -1
    corr_label = 'Unknown'
    for spk_label, spk_emb in known_speaker_embeddings.items():
      # cosine_similarity expects 2D inputs, so reshape each 1D vector to a
      # single-row matrix and read the only entry of the result.
      cos_siml = cosine_similarity(avg_emb.reshape(1, -1), spk_emb.reshape(1, -1))[0][0]
      if cos_siml > max_siml:
        max_siml = cos_siml
        corr_label = spk_label

    # Accept the best match only when it clears the threshold.
    if max_siml > similarity_threshold:
      speaker_labels[cluster_id] = corr_label
    else:
      speaker_labels[cluster_id] = 'Unknown'
  return speaker_labels

In [13]:
def run_pipeline(audio_file, embedding_model, known_speaker_embeddings, n_clusters=3, whisper_model_name="base", similarity_threshold=0.7):
  """Run transcription, diarization and speaker identification on one file.

  Steps: transcribe the audio into segments, embed every segment, cluster the
  embeddings, average each cluster, match the cluster averages against the
  known speakers, then build one table row per segment.

  Args:
    audio_file: Path of the audio file to process.
    embedding_model: Speaker embedding model used for every segment.
    known_speaker_embeddings: Dict mapping speaker label -> embedding.
    n_clusters: Number of clusters to form.
    whisper_model_name: Whisper model name forwarded to ``transcribe_audio``.
    similarity_threshold: Threshold forwarded to ``assign_speaker_labels``.

  Returns:
    Tuple ``(df, n_segments, centroids)``: the per-segment DataFrame (which is
    also printed), the number of transcribed segments, and the cluster
    centroids returned by ``cluster_embeddings``.
  """
  print('#'*40)
  print("Started Diarization and Identification Pipeline")
  # Get the segments of the audio, using the Whisper model the caller asked for
  segments = transcribe_audio(audio_file, model_name=whisper_model_name)
  # Get the segment embedding matrix
  embedding_vec = compute_segment_embeddings(audio_file, segments, embedding_model)
  # Get the labels and centroids for each cluster
  labels, centroids = cluster_embeddings(embedding_vec, n_clusters)
  # Compute cluster averages
  cluster_avg_embeddings = compute_cluster_averages(embedding_vec, labels, n_clusters)
  # Get the labels for the speakers in the cluster
  speaker_labels = assign_speaker_labels(cluster_avg_embeddings, known_speaker_embeddings, similarity_threshold)

  # Annotate each segment with its cluster ID and assigned speaker ID.
  # NOTE(review): the 'Test' column holds the segment text and looks like a
  # typo for 'Text'; kept as-is so the returned frame's schema is unchanged.
  results = [
      {
          'Start': convert_time(segment['start']),
          'End': convert_time(segment['end']),
          'Test': segment['text'],
          'Cluster': cluster_id,
          'Speaker_ID': speaker_labels.get(cluster_id)
      }
      for segment, cluster_id in zip(segments, labels)
  ]

  df = pd.DataFrame(results)
  print(df)
  return df, len(segments), centroids

In [14]:
# Load the pretrained "speechbrain/spkrec-ecapa-voxceleb" speaker-embedding
# model through pyannote's PretrainedSpeakerEmbedding wrapper. It runs on the
# GPU when CUDA is available and falls back to the CPU otherwise.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


In [15]:
# Audio files used to enroll known speakers; each file yields one reference embedding.
# NOTE(review): 'sample.wav' and 'sample_noisy.wav' are also the recordings
# passed to run_pipeline below. Confirm they are meant to be enrolled as
# known speakers alongside speaker_A..speaker_E.
known_speaker_files = ['sample.wav', 'sample_noisy.wav', 'speaker_A.wav', 'speaker_B.wav', 'speaker_C.wav', 'speaker_D.wav', 'speaker_E.wav']

In [None]:
# Build the reference embeddings: a dict with one entry per enrolled audio file.
known_speakers = load_known_speaker_embeddings(known_speaker_files, embedding_model)

: 

In [None]:
# Run the full pipeline on the clean recording. The positional arguments after
# known_speakers are n_clusters=3, whisper_model_name='base' and
# similarity_threshold=0.7.
clean_audio = 'sample.wav'
df_clean, n_segments_clean, centroids_clean = run_pipeline(clean_audio, embedding_model, known_speakers, 3, 'base', 0.7)

########################################
Started Diarization and Identification Pipeline


In [None]:
# Preview the first 10 annotated segments of the clean recording.
df_clean.head(10)

In [None]:
# Run the same pipeline, with the same settings as the clean run, on the noisy recording.
noisy_audio = 'sample_noisy.wav'
df_noisy, n_segments_noisy, centroids_noisy = run_pipeline(noisy_audio, embedding_model, known_speakers, 3, 'base', 0.7)

In [None]:
# Preview the first 10 annotated segments of the noisy recording.
df_noisy.head(10)