In [None]:
# !pip install transformers datasets jiwer PrettyTable openai-whisper ffmpeg pydub

# Check CUDA

In [1]:
import torch
import warnings

print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device: {cuda_id}")

print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")

warnings.filterwarnings("ignore")

Is CUDA supported by this system? True
CUDA version: 11.8
ID of current CUDA device: 0
Name of current CUDA device: NVIDIA GeForce RTX 3050 Laptop GPU


# Load model

In [2]:
from transformers import pipeline

#### Whisper model

In [3]:
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device='cuda')

#### PhoWhisper model

In [4]:
pho_whisper = pipeline("automatic-speech-recognition", model="vinai/PhoWhisper-large", device='cuda')

#### Wav2Vec model

In [5]:
wav2vec = pipeline("automatic-speech-recognition", model="nguyenvulebinh/wav2vec2-base-vietnamese-250h", device='cuda')

Some weights of the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

# Load Data

In [6]:
import os
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv('valTest.csv')
valTest = []
for index, row in df.iterrows():
  obj = dict()
  obj['text'] = row['text']
  obj['file'] = row['file']
  obj['path'] = row['path']
  valTest.append(obj)

In [8]:
len(valTest)

6210

# Evaluate

In [9]:
from prettytable import PrettyTable
from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [10]:
result = []
for test in tqdm(valTest):
  row = []
  row.append(test['text'])
  row.append(whisper(test['path'])['text'])
  row.append(pho_whisper(test['path'])['text'])
  row.append(wav2vec(test['path'])['text'])
  result.append(row)

  0%|          | 0/6210 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  0%|          | 10/6210 [41:03<468:35:14, 272.08s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize e

In [None]:
df = pd.DataFrame(result, columns=['Actual', "Whisper", "PhoWhisper", "Wav2Vec"])
df.to_csv('results.csv', index=False)