In [1]:
import jiwer
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [34]:
# Report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

# Device details are only queried when a GPU is actually present
if cuda_available:
    print("CUDA Device Name:", torch.cuda.get_device_name(0))
    print("CUDA Device Count:", torch.cuda.device_count())
    print("Current CUDA Device:", torch.cuda.current_device())

CUDA Available: True
CUDA Device Name: NVIDIA GeForce RTX 3070 Laptop GPU
CUDA Device Count: 1
Current CUDA Device: 0


In [2]:
# Hugging Face checkpoint identifier used for speech recognition
ASR_PRETRAINED_MODEL = "facebook/wav2vec2-large-960h-lv60-self"

def load_asr():
    """Load the ASR network and its tokenizer from ``ASR_PRETRAINED_MODEL``.

    Returns:
        dict: ``{"model": ..., "tokenizer": ...}`` where the model has been
        moved to CUDA when available and to the CPU otherwise.
    """
    use_cuda = torch.cuda.is_available()
    target_device = torch.device("cuda" if use_cuda else "cpu")
    return {
        "model": Wav2Vec2ForCTC.from_pretrained(ASR_PRETRAINED_MODEL).to(target_device),
        "tokenizer": Wav2Vec2Tokenizer.from_pretrained(ASR_PRETRAINED_MODEL),
    }

In [4]:
def wav_to_text(model, wav):
    """Transcribe a waveform with the ASR bundle produced by ``load_asr``.

    Args:
        model: Dict with a ``"model"`` entry (the CTC network) and a
            ``"tokenizer"`` entry (callable that prepares the inputs and
            exposes ``batch_decode``).
        wav: Waveform handed directly to the tokenizer; it is declared to the
            tokenizer as 16 kHz audio.

    Returns:
        The decoded text of the first item in the batch.
    """
    asr_model = model["model"]
    tokenizer = model["tokenizer"]

    # Tokenize the input
    inputs = tokenizer(wav, sampling_rate=16000, return_tensors="pt", padding="longest")

    # Squeeze out the extra dimension the tokenizer adds
    input_values = inputs.input_values.squeeze(1)

    # Follow the model's own device instead of re-deriving it from CUDA
    # availability, so inputs and weights can never end up on different devices.
    device = next(asr_model.parameters()).device
    input_values = input_values.to(device)

    # Inference only: disabling autograd avoids building a graph and holding
    # activations in memory for a backward pass that never happens.
    with torch.no_grad():
        logits = asr_model(input_values).logits

    # Greedy decoding: most likely token id per frame
    predicted_ids = torch.argmax(logits, dim=-1)

    # batch_decode returns a list, so take the first (only) transcription
    return tokenizer.batch_decode(predicted_ids)[0]

In [5]:
# Build the ASR bundle once; later cells pass `models` to wav_to_text
models = load_asr()

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

In [6]:
# Load your wav file using torchaudio
def load_wav_file(file_path, target_sample_rate=16000):
    """Load an audio file and resample it to ``target_sample_rate`` if needed.

    Args:
        file_path: Path handed to ``torchaudio.load``.
        target_sample_rate: Sample rate the returned waveform should have.
            Defaults to 16000, the rate the ASR cells declare to the tokenizer.

    Returns:
        The waveform tensor from ``torchaudio.load``, resampled only when the
        file's native rate differs from ``target_sample_rate``.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # Skip the resampler entirely when the file is already at the target rate
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    return waveform

In [7]:
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists
wav = load_wav_file(r"C:\Users\tuanp\Desktop\DDDM-VC\DDDM-VC\converted\src_to_tar.wav")

In [8]:
# Bare last expression: displays the repr of the loaded waveform tensor
wav

tensor([[0.0096, 0.0097, 0.0093,  ..., 0.0042, 0.0042, 0.0039]])

In [9]:
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists
wav2 = load_wav_file(r"C:\Users\tuanp\Desktop\DDDM-VC\DDDM-VC\preprocess\workspace\ha0\data\src.wav")

In [10]:
# Transcribe the waveform loaded from the converted file and display the text
convert = wav_to_text(models, wav)
convert

  attn_output = torch.nn.functional.scaled_dot_product_attention(


"THOUGHT KILLS ME THAT I AM NOT THOUGHT TO LEAP LARGE LENGTHS OF MILES WHEN THOU ART GONE BUT THAT SO MUCH OF EARTH AND WATER WROUGHT I MUST ATTEND TIME'S LEISURE WITH MY MOAN RECEIVING NOT BY ELEMENTS SO SLOW BUT HEAVY TEARS BADGES OF EITHER'S WOE"

In [11]:
# Transcribe the source waveform and display *its* transcript; showing
# `convert` here would just repeat the previous cell's output.
original = wav_to_text(models, wav2)
original

"THOUGHT KILLS ME THAT I AM NOT THOUGHT TO LEAP LARGE LENGTHS OF MILES WHEN THOU ART GONE BUT THAT SO MUCH OF EARTH AND WATER WROUGHT I MUST ATTEND TIME'S LEISURE WITH MY MOAN RECEIVING NOT BY ELEMENTS SO SLOW BUT HEAVY TEARS BADGES OF EITHER'S WOE"

In [12]:
# jiwer.cer takes (reference, hypothesis): the source transcript is the
# reference and the converted transcript is the hypothesis. The order matters
# because the error count is normalised by the reference length.
jiwer.cer(original, convert)

0.0

### Measure speaker similarity

In [37]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, WavLMForCTC
from scipy.spatial.distance import cosine

# Pick the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Processor prepares raw audio for the network; the WavLM CTC model is then
# loaded from the same checkpoint and moved onto the chosen device
processor = Wav2Vec2Processor.from_pretrained(
    'patrickvonplaten/wavlm-libri-clean-100h-base-plus'
)
model = WavLMForCTC.from_pretrained(
    'patrickvonplaten/wavlm-libri-clean-100h-base-plus'
)
model = model.to(device)

# Read an audio file without any resampling
def load_wav_file1(file_path):
    """Return ``(waveform, sample_rate)`` exactly as ``torchaudio.load`` yields them."""
    audio, rate = torchaudio.load(file_path)
    return (audio, rate)

# Extract WavLM embeddings
def get_wavlm_embedding(file_path):
    """Turn an audio file into a single vector by time-averaging the model's logits.

    Args:
        file_path: Path to an audio file readable by ``load_wav_file1``.

    Returns:
        CPU tensor: ``model(...).logits`` averaged over dim 1, then squeezed.

    NOTE(review): the vector is an average of CTC logits (per-frame vocabulary
    scores), so it may reflect spoken content more than speaker identity —
    confirm this is the intended similarity measure.
    """
    waveform, sample_rate = load_wav_file1(file_path)

    # Resample to 16kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Drop singleton dimensions before handing the audio to the processor
    waveform = waveform.squeeze()

    # Process the waveform using the feature extractor
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)

    with torch.no_grad():
        # Pass the tensor positionally: `model(**input_values)` tries to unpack
        # a tensor as keyword arguments and raises TypeError.
        outputs = model(input_values)
        logits = outputs.logits

        # Average over time steps to collapse the sequence into one vector
        embedding = logits.mean(dim=1).squeeze()

    return embedding.cpu()

# Cosine similarity between two embedding tensors
def compute_similarity(embedding1, embedding2):
    """Return ``1 - cosine distance`` of the two embeddings (converted via ``.numpy()``)."""
    vec_a, vec_b = (emb.numpy() for emb in (embedding1, embedding2))
    distance = cosine(vec_a, vec_b)
    return 1 - distance




Some weights of the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-base-plus were not used when initializing WavLMForCTC: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForCTC were not initialized from the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-base-plus and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probab

In [43]:
import torch
import torchaudio
# WavLMForCTC is the class instantiated below, so it must be imported here;
# otherwise this cell only works when an earlier cell happened to import it.
from transformers import Wav2Vec2Processor, WavLMForCTC, WavLMModel
from scipy.spatial.distance import cosine

# Load pre-trained WavLM model and processor
processor = Wav2Vec2Processor.from_pretrained('patrickvonplaten/wavlm-libri-clean-100h-base-plus')
model = WavLMForCTC.from_pretrained('patrickvonplaten/wavlm-libri-clean-100h-base-plus')

# Check if CUDA is available and move the model to GPU if it is
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # Move the model to GPU

# Extract WavLM logits
def get_wavlm_logits(file_path):
    """Turn an audio file into a single vector by time-averaging the model's logits.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        CPU tensor: ``model(...).logits`` averaged over dim 1, then squeezed.
    """
    # Read the audio directly instead of through a module-level `load_wav_file`:
    # re-declaring that name with a (waveform, sample_rate) return shadows the
    # earlier tensor-returning loader that the ASR cells call, which breaks
    # them when cells are re-run out of order.
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample to 16kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Squeeze the waveform to remove any extra dimensions
    waveform = waveform.squeeze()

    # Process the waveform using the feature extractor
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Keep the inputs on the same device as the model
    input_values = inputs.input_values.to(device)

    # Inference only, so autograd is switched off
    with torch.no_grad():
        outputs = model(input_values)
        logits = outputs.logits

        # Average over time steps to collapse the sequence into one vector
        embedding = logits.mean(dim=1).squeeze()

    return embedding.cpu()  # Move embedding back to CPU for further processing

# Cosine similarity between two embedding tensors
def compute_similarity(embedding1, embedding2):
    """Return ``1 - cosine distance`` of the two embeddings (converted via ``.numpy()``)."""
    first = embedding1.numpy()
    second = embedding2.numpy()
    cosine_distance = cosine(first, second)
    return 1 - cosine_distance


Some weights of the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-base-plus were not used when initializing WavLMForCTC: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForCTC were not initialized from the model checkpoint at patrickvonplaten/wavlm-libri-clean-100h-base-plus and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probab

In [44]:
# Embed the converted file and the source file, in that order
embedding1, embedding2 = (
    get_wavlm_embedding(audio_path)
    for audio_path in (
        r"C:\Users\tuanp\Desktop\DDDM-VC\DDDM-VC\converted\src_to_tar.wav",
        r"C:\Users\tuanp\Desktop\DDDM-VC\DDDM-VC\preprocess\workspace\ha0\data\src.wav",
    )
)

# Score how close the two embeddings are and report it
similarity = compute_similarity(embedding1, embedding2)
print(f"Similarity: {similarity}")

Similarity: 0.9993569850921631
