In [None]:
# !pip install transformers datasets jiwer PrettyTable openai-whisper ffmpeg pydub

# Check CUDA

In [1]:
import torch
import warnings

# Report the CUDA environment. The device queries are guarded because
# torch.cuda.current_device() raises when no CUDA device is usable, which
# would otherwise abort the notebook on a CPU-only machine.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system? {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

if cuda_available:
  cuda_id = torch.cuda.current_device()
  print(f"ID of current CUDA device: {cuda_id}")

  print(f"Name of current CUDA device: {torch.cuda.get_device_name(cuda_id)}")
else:
  # Keep `cuda_id` defined so any later reference does not hit a NameError.
  cuda_id = None
  print("No CUDA device available; skipping device queries.")

# Install a global "ignore" filter so Python warnings are suppressed from here on.
warnings.filterwarnings("ignore")

Is CUDA supported by this system? True
CUDA version: 11.8
ID of current CUDA device: 0
Name of current CUDA device: NVIDIA GeForce RTX 3050 Laptop GPU


# Log in to the Hugging Face Hub

In [2]:
from huggingface_hub import notebook_login
# Open the Hugging Face Hub login widget inside the notebook
# (it renders as the VBox output shown below this cell).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load models

In [3]:
from transformers import pipeline

#### Whisper model

In [4]:
# ASR pipeline backed by the openai/whisper-large-v3 checkpoint, placed on the GPU.
whisper = pipeline(
  task="automatic-speech-recognition",
  model="openai/whisper-large-v3",
  device='cuda',
)

#### PhoWhisper model

In [5]:
# ASR pipeline backed by the vinai/PhoWhisper-large checkpoint, placed on the GPU.
pho_whisper = pipeline(
  task="automatic-speech-recognition",
  model="vinai/PhoWhisper-large",
  device='cuda',
)

#### Wav2Vec model

In [6]:
# ASR pipeline backed by the nguyenvulebinh/wav2vec2-base-vietnamese-250h
# checkpoint, placed on the GPU.
wav2vec = pipeline(
  task="automatic-speech-recognition",
  model="nguyenvulebinh/wav2vec2-base-vietnamese-250h",
  device='cuda',
)

Some weights of the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You

# Load Data

In [7]:
import os
import numpy as np
import pandas as pd

In [8]:
# Load the evaluation manifest and keep one plain dict per sample with the
# 'text', 'file' and 'path' columns.
df = pd.read_csv('valTest.csv')
# to_dict('records') yields the same list of per-row dicts as a manual
# iterrows() loop, without building a Series for every row.
valTest = df[['text', 'file', 'path']].to_dict('records')

In [10]:
# Sanity check: number of samples held in `valTest`.
len(valTest)

6210

# Evaluate

In [11]:
from prettytable import PrettyTable
from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Transcribe every sample with all three pipelines.
# Each entry of `result` is [reference text, Whisper, PhoWhisper, Wav2Vec].
# An explicit loop (rather than a comprehension) keeps the rows gathered so
# far in `result` if the run is interrupted part-way.
result = []
for sample in tqdm(valTest):
  audio_path = sample['path']
  result.append([
    sample['text'],
    whisper(audio_path)['text'],
    pho_whisper(audio_path)['text'],
    wav2vec(audio_path)['text'],
  ])

In [None]:
# Persist the transcripts. Every row of `result` holds four values in the order
# they were appended: reference text, Whisper, PhoWhisper, Wav2Vec. The column
# list must match that length and order, otherwise the DataFrame constructor
# raises ValueError and the model labels end up on the wrong outputs.
df = pd.DataFrame(result, columns=['Actual', 'Whisper', 'PhoWhisper', 'Wav2Vec'])
df.to_csv('results.csv', index=False)

In [None]:
# Side-by-side view of the reference text and each model's transcript.
table = PrettyTable()
# One field per value in a `result` row, in append order:
# reference text, Whisper, PhoWhisper, Wav2Vec. PrettyTable rejects rows whose
# length differs from the number of field names.
table.field_names = ["Truth", "Whisper", "PhoWhisper", "Wav2vec"]
table.align = "l"
table.max_table_width = 90
table.vrules = True
for row in result:
  table.add_row(row)
  # Dashed filler row as a visual separator between samples; it needs one
  # cell per column, like every other row.
  table.add_row(["-" * 18] * 4)
# Slicing keeps the first 50 table rows (samples and separators interleaved).
print(table[:50])

In [None]:
# Display the table object as this cell's output.
# NOTE(review): unlike the sliced print, this shows every row held by `table`;
# consider `table[:50]` if the full output is too large — confirm intent.
table