In [1]:
import jiwer
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [32]:
ASR_PRETRAINED_MODEL = "facebook/wav2vec2-large-960h-lv60-self"

def load_asr():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Wav2Vec2ForCTC.from_pretrained(ASR_PRETRAINED_MODEL).to(device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(ASR_PRETRAINED_MODEL)
    models = {"model": model, "tokenizer": tokenizer}
    return models

In [50]:
def wav_to_text(model, wav):
    # Tokenize the input
    inputs = model["tokenizer"](wav, sampling_rate=16000, return_tensors="pt", padding="longest")

    # Fix input shape if necessary
    input_values = inputs.input_values.squeeze(1)  # Squeeze out the extra dimension

    # Move tensors to the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_values = input_values.to(device)

    # Get the model predictions (logits)
    logits = model["model"](input_values).logits
    
    # Get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)
    
    # Decode the predicted IDs into the text (batch_decode returns a list, so we take [0])
    result = model["tokenizer"].batch_decode(predicted_ids)[0]

    return result

In [33]:
models = load_asr()

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

In [47]:
# Load your wav file using torchaudio
def load_wav_file(file_path):
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample if necessary (Wav2Vec2 expects 16kHz)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    return waveform

In [48]:
wav = load_wav_file(r"C:\Users\tuanp\Desktop\DDDM-VC\DDDM-VC\converted\src_to_tar.wav")

In [49]:
wav

tensor([[0.0096, 0.0097, 0.0093,  ..., 0.0042, 0.0042, 0.0039]])

In [52]:
wav2 = load_wav_file(r"C:\Users\tuanp\Desktop\DDDM-VC\DDDM-VC\preprocess\workspace\ha0\data\src.wav")

In [58]:
convert = wav_to_text(models, wav)
convert

"THOUGHT KILLS ME THAT I AM NOT THOUGHT TO LEAP LARGE LENGTHS OF MILES WHEN THOU ART GONE BUT THAT SO MUCH OF EARTH AND WATER WROUGHT I MUST ATTEND TIME'S LEISURE WITH MY MOAN RECEIVING NOT BY ELEMENTS SO SLOW BUT HEAVY TEARS BADGES OF EITHER'S WOE"

In [59]:
original = wav_to_text(models, wav2)
convert

"THOUGHT KILLS ME THAT I AM NOT THOUGHT TO LEAP LARGE LENGTHS OF MILES WHEN THOU ART GONE BUT THAT SO MUCH OF EARTH AND WATER WROUGHT I MUST ATTEND TIME'S LEISURE WITH MY MOAN RECEIVING NOT BY ELEMENTS SO SLOW BUT HEAVY TEARS BADGES OF EITHER'S WOE"

In [57]:
jiwer.cer(convert, original)

0.0