**Imports & Setup**

In [None]:
# Install modules:
!pip install resemblyzer

In [2]:
# Make Google Drive available inside the Colab runtime:
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [3]:
# Import modules:
import zipfile
import os
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path

**Load Dataset**

In [4]:
# Folder on Google Drive that holds the dataset archives:
base_path = '/content/drive/MyDrive/FVML/'

# Known dataset archives:
available_zip_files = [
    'mavceleb_v1_train.zip',
    'mavceleb_v1_test.zip',
    'mavceleb_v2_train.zip',
    'mavceleb_v2_test.zip'
]

# Only the train archives are extracted. Deriving the selection from the full
# list keeps a single source of truth instead of two competing assignments.
zip_files = [name for name in available_zip_files if 'train' in name]

# Base directory for extraction:
base_extract_to = '/content/src'

# Create base directory if it doesn't exist:
os.makedirs(base_extract_to, exist_ok=True)

# Process each zip file:
for zip_filename in zip_files:
    # Construct the full paths; splitext strips the extension regardless of
    # its length, so each archive gets a subdirectory named after it:
    zip_path = os.path.join(base_path, zip_filename)
    extract_to = os.path.join(base_extract_to, os.path.splitext(zip_filename)[0])

    # Create a subdirectory for each zip file:
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Keep only the archive members whose path contains 'voices/':
        voice_files = [f for f in zip_ref.namelist() if 'voices/' in f]

        # Extract the selected members one by one so tqdm can report progress:
        for file in tqdm(voice_files, desc=f"Extracting {zip_filename}"):
            zip_ref.extract(member=file, path=extract_to)


Extracting mavceleb_v2_train.zip:   0%|          | 0/21215 [00:00<?, ?it/s]

**Embeddings**

Voice embeddings are computed with Resemblyzer (GitHub: https://github.com/resemble-ai/Resemblyzer).

In [5]:
# Instantiate the Resemblyzer voice encoder once; it is reused for every file:
encoder = VoiceEncoder()

def get_embedding(audio_path, encoder):
    """Return the utterance embedding for the audio file at `audio_path`.

    The file is loaded through `preprocess_wav` and the result is handed to
    `encoder.embed_utterance`; that call's return value is passed back as is.
    """
    return encoder.embed_utterance(preprocess_wav(audio_path))

# Base directory where the extracted voices are located
base_directory = '/content/src'
results_directory = '/content/res'

# Ensure the results directory exists:
os.makedirs(results_directory, exist_ok=True)

# Metadata columns that precede the embedding dimensions in each CSV:
TRAIN_META_COLUMNS = ['id', 'language', 'speaker_id', 'audio_id']
TEST_META_COLUMNS = ['audio_id']


def _save_embeddings(rows, meta_columns, csv_path):
    """Write `rows` to `csv_path` as a CSV with metadata + `dim_i` columns.

    Each row is a list made of the metadata values followed by the embedding
    values. Nothing is written (and a message is printed) when `rows` is
    empty, because the number of embedding columns is taken from the first row.
    """
    if not rows:
        print(f'No .wav files found; nothing written to {csv_path}')
        return

    n_dims = len(rows[0]) - len(meta_columns)
    columns = list(meta_columns) + [f'dim_{i}' for i in range(n_dims)]

    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    pd.DataFrame(rows, columns=columns).to_csv(csv_path, index=False)
    print(f'Saved embeddings to {csv_path}')


def _embed_train_language(voice_dir_base, language, encoder):
    """Embed every .wav under `voice_dir_base/<id>/<language>/<speaker_id>/`.

    Returns a list of rows `[id, language, speaker_id, audio_id, *embedding]`.
    Entries that are not directories at the id/speaker level are skipped, as
    are ids that have no sub-directory for `language`.
    """
    rows = []
    # Sorted listings make the row order of the CSV reproducible.
    for id_dir in tqdm(sorted(os.listdir(voice_dir_base))):
        id_path = os.path.join(voice_dir_base, id_dir, language)
        if not os.path.isdir(id_path):
            continue

        for speaker_id in sorted(os.listdir(id_path)):
            speaker_path = os.path.join(id_path, speaker_id)
            if not os.path.isdir(speaker_path):
                continue

            for filename in sorted(os.listdir(speaker_path)):
                if not filename.endswith('.wav'):
                    continue
                embedding = get_embedding(os.path.join(speaker_path, filename), encoder)
                # The audio id is the part of the file name before the first dot.
                audio_id = filename.split('.')[0]
                rows.append([id_dir, language, speaker_id, audio_id] + list(embedding))
    return rows


def _embed_test_language(voice_dir, encoder):
    """Embed every .wav directly inside `voice_dir`.

    Returns a list of rows `[audio_id, *embedding]`.
    """
    rows = []
    for filename in tqdm(sorted(os.listdir(voice_dir)), desc=f"Processing {voice_dir}"):
        if not filename.endswith('.wav'):
            continue
        embedding = get_embedding(os.path.join(voice_dir, filename), encoder)
        audio_id = filename.split('.')[0]
        rows.append([audio_id] + list(embedding))
    return rows


# List all dataset directories:
dataset_directories = sorted(
    os.path.join(base_directory, d)
    for d in os.listdir(base_directory)
    if os.path.isdir(os.path.join(base_directory, d))
)

for dataset_dir in dataset_directories:
    # Determine split and version from the directory name (e.g. 'mavceleb_v2_train'):
    dataset_name = os.path.basename(dataset_dir)
    is_train = 'train' in dataset_name
    is_v1 = 'v1' in dataset_name
    languages = ['English', 'Urdu'] if is_v1 else ['English', 'Hindi']

    for language in languages:
        # Case 1: Train set (both versions; the version only changes the voices root)
        if is_train:
            # NOTE(review): the v1 root ('voices') is taken from the original
            # code path that was never reached; confirm against the v1 archive.
            voice_dir_base = os.path.join(dataset_dir, 'voices' if is_v1 else 'v2/voices')
            if not os.path.isdir(voice_dir_base):
                print(f'Skipping {dataset_name} ({language}): {voice_dir_base} not found')
                continue

            print(voice_dir_base)
            rows = _embed_train_language(voice_dir_base, language, encoder)
            csv_path = os.path.join(results_directory, dataset_name, f'{language}.csv')
            _save_embeddings(rows, TRAIN_META_COLUMNS, csv_path)

        # Case 2: Test set:
        else:
            voice_dir = os.path.join(dataset_dir, 'voices', language)
            if not os.path.isdir(voice_dir):
                print(f'Skipping {dataset_name} ({language}): {voice_dir} not found')
                continue

            rows = _embed_test_language(voice_dir, encoder)
            csv_path = os.path.join(results_directory, dataset_name, 'voices', language, f'{language}.csv')
            _save_embeddings(rows, TEST_META_COLUMNS, csv_path)

Loaded the voice encoder model on cuda in 0.50 seconds.
/content/src/mavceleb_v2_train/v2/voices


  0%|          | 0/78 [00:00<?, ?it/s]

Saved embeddings to /content/res/mavceleb_v2_train/English.csv
/content/src/mavceleb_v2_train/v2/voices


  0%|          | 0/78 [00:00<?, ?it/s]

Saved embeddings to /content/res/mavceleb_v2_train/Hindi.csv


In [6]:
# Copy the files to Google Drive:
!cp -r /content/res/mavceleb_v2_train /content/drive/MyDrive/MyDriveData