# Data-Processing

In [1]:
import pandas as pd

transcript_csv = "/kaggle/input/fosd-vietnamese/transcript.csv"
df = pd.read_csv(transcript_csv, encoding="utf-8")

# The transcript names .mp3 files; swap that text for .wav (literal match, not a regex).
wav_names = df["file_name"].str.replace(".mp3", ".wav", regex=False)
df["file_name"] = wav_names
df.head()

Unnamed: 0,file_name,caption,tag
0,FPTOpenSpeechData_Set001_V0.1_000001.wav,cách để đi,0.00000-1.27298
1,FPTOpenSpeechData_Set001_V0.1_000002.wav,họ đã xét nghiệm máu cho cheng nhưng mọi thứ v...,0.00000-3.79298
2,FPTOpenSpeechData_Set001_V0.1_000003.wav,anh có thể gọi tôi không,0.00000-2.52098
3,FPTOpenSpeechData_Set001_V0.1_000004.wav,có rất nhiều yếu tố may rủi ở đây,0.00000-3.43298
4,FPTOpenSpeechData_Set001_V0.1_000005.wav,ai là chúa nói dối,0.00000-3.93698


In [2]:
import re
import unicodedata

def clean_text(text):
    """Lower-case a transcript and keep only Vietnamese letters and single spaces.

    Args:
        text: Raw transcript. Any non-``str`` value (e.g. NaN from pandas)
            yields an empty string.

    Returns:
        The text in NFC form, lower-cased, with every character outside
        ``a-z``, the accented Vietnamese letters and the space replaced by a
        space, whitespace runs collapsed, and leading/trailing spaces removed.
    """
    if not isinstance(text, str):
        return ""
    # Compose decomposed accents first so "a" + combining grave matches "à" below.
    text = unicodedata.normalize("NFC", text).lower()
    # A single negated class: the previous pattern was two classes glued
    # together by string concatenation, so it only matched a punctuation mark
    # followed by another disallowed character and left lone punctuation in.
    # Replacing with a space (not "") keeps "a,b" from becoming "ab".
    text = re.sub(r"[^a-zàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễ"
                  r"ìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữ"
                  r"ỳýỵỷỹđ ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
def remove_punctuation(text, fold_d_stroke=False):
    """Strip punctuation and diacritics from a transcript and lower-case it.

    Args:
        text: Raw transcript. Any non-``str`` value (e.g. NaN from pandas)
            yields an empty string.
        fold_d_stroke: When True, also map "đ" to "d". NFD does not decompose
            that letter, so with the default (False) it is left as-is, exactly
            as before.

    Returns:
        The cleaned text with whitespace runs collapsed to single spaces and
        no leading/trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    # Drop everything that is not a letter, digit or whitespace.
    text = re.sub(r'[^a-zA-ZÀ-ỹ0-9\s]', '', text)
    # Decompose so that tone/accent marks become separate combining characters.
    text = unicodedata.normalize('NFD', text)
    # Remove the combining marks (Unicode category Mn).
    text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')
    text = text.lower()
    if fold_d_stroke:
        text = text.replace('đ', 'd')
    # Collapse inner whitespace runs (including tabs/newlines) as well as the
    # edges; previously only the edges were stripped.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise every caption, write the result back, then preview the frame.
normalised_captions = df["caption"].apply(remove_punctuation)
df["caption"] = normalised_captions
df.head()

Unnamed: 0,file_name,caption,tag
0,FPTOpenSpeechData_Set001_V0.1_000001.wav,cach đe đi,0.00000-1.27298
1,FPTOpenSpeechData_Set001_V0.1_000002.wav,ho đa xet nghiem mau cho cheng nhung moi thu v...,0.00000-3.79298
2,FPTOpenSpeechData_Set001_V0.1_000003.wav,anh co the goi toi khong,0.00000-2.52098
3,FPTOpenSpeechData_Set001_V0.1_000004.wav,co rat nhieu yeu to may rui o đay,0.00000-3.43298
4,FPTOpenSpeechData_Set001_V0.1_000005.wav,ai la chua noi doi,0.00000-3.93698


In [3]:
import pandas as pd
import numpy as np
import os

audio_folder = r'/kaggle/input/fosd-vietnamese/wav'

def find_audio_path(file_name):
    """Return the path of ``file_name`` inside ``audio_folder`` if it exists.

    When the file is missing, a "not find" line with the attempted path is
    printed and ``None`` is returned.
    """
    candidate = os.path.join(audio_folder, file_name)
    if not os.path.exists(candidate):
        print("not find", candidate)
        return None
    return candidate

# One lookup per row; rows whose file is missing get None (see find_audio_path).
resolved_paths = df['file_name'].apply(find_audio_path)
df['audio_path'] = resolved_paths
df.head()

Unnamed: 0,file_name,caption,tag,audio_path
0,FPTOpenSpeechData_Set001_V0.1_000001.wav,cach đe đi,0.00000-1.27298,/kaggle/input/fosd-vietnamese/wav/FPTOpenSpeec...
1,FPTOpenSpeechData_Set001_V0.1_000002.wav,ho đa xet nghiem mau cho cheng nhung moi thu v...,0.00000-3.79298,/kaggle/input/fosd-vietnamese/wav/FPTOpenSpeec...
2,FPTOpenSpeechData_Set001_V0.1_000003.wav,anh co the goi toi khong,0.00000-2.52098,/kaggle/input/fosd-vietnamese/wav/FPTOpenSpeec...
3,FPTOpenSpeechData_Set001_V0.1_000004.wav,co rat nhieu yeu to may rui o đay,0.00000-3.43298,/kaggle/input/fosd-vietnamese/wav/FPTOpenSpeec...
4,FPTOpenSpeechData_Set001_V0.1_000005.wav,ai la chua noi doi,0.00000-3.93698,/kaggle/input/fosd-vietnamese/wav/FPTOpenSpeec...


# Pre-Processing Data

### Use model

In [4]:
from datasets import load_dataset, Audio, Dataset, DatasetDict
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor, AutoTokenizer
import torchaudio

#load dataset
# dataset = load_dataset(
#     "csv",
#     data_files={
#         "train": "train.csv",
#         "validation": "val.csv"
#     }
# )
#dataset = load_dataset("csv", data_files="all_data.csv")

# 90% of the rows go to train; the held-out 10% is then halved into
# validation and test. Both splits use seed 42.
data = Dataset.from_pandas(df)
dataset_split = data.train_test_split(test_size=0.1, seed=42)
held_out = dataset_split["test"]
train_val_split = held_out.train_test_split(test_size=0.5, seed=42)

splits = {
    "train": dataset_split["train"],
    "validation": train_val_split["train"],  # first half of the held-out rows
    "test": train_val_split["test"],
}
final_dataset = DatasetDict(splits)

print(final_dataset)

2025-08-27 08:50:36.112464: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756284636.345822      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756284636.414693      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


DatasetDict({
    train: Dataset({
        features: ['file_name', 'caption', 'tag', 'audio_path'],
        num_rows: 23328
    })
    validation: Dataset({
        features: ['file_name', 'caption', 'tag', 'audio_path'],
        num_rows: 1296
    })
    test: Dataset({
        features: ['file_name', 'caption', 'tag', 'audio_path'],
        num_rows: 1297
    })
})


In [5]:
# from torch.utils.data import Dataset
# import torchaudio

# class ASRDataset(Dataset):
#     def __init__(self, audio_paths, transcripts, processor, sampling_rate=16000):
#         self.audio_paths = audio_paths
#         self.transcripts = transcripts
#         self.processor = processor
#         self.sampling_rate = sampling_rate

#     def __getitem__(self, idx):
#         speech_array, sr = torchaudio.load(self.audio_paths[idx])
#         if sr != self.sampling_rate:
#             speech_array = torchaudio.transforms.Resample(sr, self.sampling_rate)(speech_array)
#         speech_array = speech_array.squeeze()

#         # Xử lý audio & transcript cùng lúc
#         inputs = self.processor(
#             speech_array,
#             sampling_rate=self.sampling_rate,
#             text=self.transcripts[idx]
#         )
#         return {k: v.squeeze() for k, v in inputs.items()}

#     def __len__(self):
#         return len(self.audio_paths)


# class TokenizerDataset(Dataset):
#     def __init__(self, audio_paths, transcripts, processor, max_len=128):
#         self.audio_paths = audio_paths
#         self.transcripts = transcripts
#         self.processor = processor
#         self.max_len = max_len

#     def __getitem__(self, idx):
#         # ---- Load audio ----
#         speech_array, sampling_rate = torchaudio.load(self.audio_paths[idx])
#         speech_array = speech_array.squeeze().numpy()

#         # Extract features (input cho encoder Wav2Vec2)
#         inputs = self.processor.feature_extractor(
#             speech_array,
#             sampling_rate=sampling_rate,
#             return_tensors="pt"
#         )

#         # ---- Tokenize transcript ----
#         labels = self.processor.tokenizer(
#             self.transcripts[idx],
#             padding="max_length",
#             truncation=True,
#             max_length=self.max_len,
#             return_tensors="pt"
#         ).input_ids.squeeze(0)

#         return {
#             "input_values": inputs.input_values.squeeze(0),  # (T, feature_dim)
#             "labels": labels                                # (max_len,)
#         }

#     def __len__(self):
#         return len(self.audio_paths)


In [6]:
# Turn the "audio_path" column into an Audio feature requested at 16 kHz.
audio_feature = Audio(sampling_rate=16000)
dataset = final_dataset.cast_column("audio_path", audio_feature)

# Audio feature extractor and text tokenizer
# ("facebook/bart-base" was noted as an alternative tokenizer).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
text_tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Size of the decoder's output layer / embedding table.
vocab_size = len(text_tokenizer)


# Pre-processing (extract feature + tokenize text)
def prepare_dataset(batch):
    """Convert one example into model inputs.

    Reads ``batch["audio_path"]`` (a decoded audio dict with ``"array"`` and
    ``"sampling_rate"``) and ``batch["caption"]``; writes ``"input_values"``
    (the feature extractor's output for this clip) and ``"labels"`` (token ids
    of the caption, truncated to at most 128 tokens).

    Labels are deliberately NOT padded here. Padding to a fixed length at this
    stage stores pad ids as if they were real tokens, so the collator's
    attention mask comes out as all ones and no position is turned into -100:
    the loss would then be trained on padding. Padding is left to the data
    collator, which pads per batch and masks what it added.
    """
    # Audio -> input_values
    audio = batch["audio_path"]
    extracted = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    batch["input_values"] = extracted.input_values[0]

    # Text -> labels (variable length, truncated only)
    batch["labels"] = text_tokenizer(batch["caption"],
                                     truncation=True,
                                     max_length=128).input_ids
    return batch

# Keep only what prepare_dataset adds; the original columns are dropped.
raw_columns = dataset["train"].column_names
dataset = dataset.map(prepare_dataset, remove_columns=raw_columns)

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/23328 [00:00<?, ? examples/s]

Map:   0%|          | 0/1296 [00:00<?, ? examples/s]

Map:   0%|          | 0/1297 [00:00<?, ? examples/s]

In [7]:
# For ASR (Wav2Vec2, Whisper, or any CTC model), audio inputs and text labels
# have different lengths, so each batch is padded by a custom collator.

from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorSeq2SeqWithPadding:
    """Pad audio inputs and text labels of a batch independently.

    ``feature_extractor.pad`` pads the ``"input_values"`` entries and
    ``tokenizer.pad`` pads the ``"labels"`` entries; both are called with
    ``padding=self.padding`` and ``return_tensors="pt"``. Label positions that
    are padding are replaced by -100 so a loss with ``ignore_index=-100`` skips
    them.
    """

    # Object exposing ``pad(list_of_dicts, padding=..., return_tensors=...)`` for audio.
    feature_extractor: Any
    # Object exposing the same ``pad`` method for ``input_ids``; its
    # ``pad_token_id`` (if any) is also masked in the labels.
    tokenizer: Any
    # Forwarded unchanged to both ``pad`` calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with "input_values" and "labels") into one batch."""
        # Pad audio
        input_features = [{"input_values": f["input_values"]} for f in features]
        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Pad text labels
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        label_ids = labels_batch["input_ids"]
        # Positions padded by the call above.
        ignore = labels_batch["attention_mask"].ne(1)
        # Labels that were already padded upstream look like real tokens to
        # ``pad`` (attention mask of ones), so also mask the pad id itself.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            ignore = ignore | label_ids.eq(pad_id)

        batch["labels"] = label_ids.masked_fill(ignore, -100)
        return batch
        
# Single collator instance, built from the extractor and tokenizer loaded above.
collator_kwargs = {
    "feature_extractor": feature_extractor,
    "tokenizer": text_tokenizer,
    "padding": True,
}
data_collator = DataCollatorSeq2SeqWithPadding(**collator_kwargs)

In [8]:
from torch.utils.data import DataLoader

train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Same batch size and collator everywhere; only the training loader shuffles.
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Early Stopping

In [9]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the model.
    A loss counts as an improvement when it is at most ``best_loss - delta``
    (the first finite loss always counts). On improvement the model's
    ``state_dict`` is saved to ``path`` and the patience counter is reset;
    otherwise the counter is incremented, and ``early_stop`` becomes True once
    it reaches ``patience``.

    A NaN loss is never an improvement: it would otherwise replace
    ``best_loss`` and the checkpoint, after which every comparison against NaN
    is False and each later epoch would look like an improvement.

    Args:
        patience: Number of consecutive non-improving calls before stopping.
        delta: Minimum decrease of the loss that counts as an improvement.
        path: File the best ``state_dict`` is written to.
    """

    def __init__(self, patience=5, delta=0.0, path='best_model.pt'):
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.path = path

    def __call__(self, val_loss, model):
        """Record ``val_loss``; checkpoint ``model`` if it is the best so far."""
        is_nan = val_loss != val_loss  # NaN is the only value unequal to itself
        improved = (not is_nan) and (
            self.best_loss is None or val_loss <= self.best_loss - self.delta
        )
        if improved:
            self.best_loss = val_loss
            self.save_checkpoint(model)
            self.counter = 0
            return

        self.counter += 1
        print(f"EarlyStopping counter: {self.counter}/{self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, model):
        """Write ``model.state_dict()`` to ``self.path`` and report the best loss."""
        # NOTE(review): if ``model`` is wrapped (e.g. nn.DataParallel) the saved
        # keys carry the wrapper's prefix; load into the same wrapping.
        torch.save(model.state_dict(), self.path)
        print(f"Saved best model with val_loss: {self.best_loss:.4f}")

# Encoder

In [10]:
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import Wav2Vec2ForCTC, Wav2Vec2Model
from transformers import get_scheduler
from tqdm import tqdm

In [11]:
class Wav2Vec2Encoder(nn.Module):
    """Wrap a pretrained ``Wav2Vec2Model`` and expose its last hidden states."""

    def __init__(self, model_name="facebook/wav2vec2-large-xlsr-53"):
        super().__init__()
        # Plain Wav2Vec2Model is loaded here (Wav2Vec2ForCTC was the noted alternative).
        self.encoder = Wav2Vec2Model.from_pretrained(model_name)

    def forward(self, input_values, attention_mask=None):
        """Run the wrapped model and return ``last_hidden_state`` ([B, T, hidden])."""
        encoded = self.encoder(input_values, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state
        return hidden_states

# Positional Encoding

RuntimeError: CUDA error: device-side assert triggered CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

In [12]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a ``[B, T, d_model]`` tensor.

    The table is stored as a non-trainable buffer ``pe`` of shape
    ``[1, max_len, d_model]``: even feature indices hold sines and odd indices
    hold cosines.

    Args:
        d_model: Feature size of the inputs. Odd sizes are supported.
        max_len: Longest sequence length the table covers.
        device: Device the table is created on. Because it is a registered
            buffer, it also follows later ``.to(...)`` calls on the module.
    """

    def __init__(self, d_model, max_len=5000, device='cpu'):
        super().__init__()
        # Build the table directly on ``device`` so no cross-device copy is needed.
        position = torch.arange(0, max_len, device=device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, device=device) *
                             (-torch.log(torch.tensor(10000.0, device=device)) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model, device=device)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the cosine block to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence is longer than the table (``max_len``).
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]


# Decoder

In [13]:
class TransformerDecoder(nn.Module):
    """Token-level Transformer decoder that attends over encoder features.

    Args:
        vocab_size: Size of the token embedding table and of the output layer.
        d_model: Decoder width.
        encoder_hidden_size: Feature size of the encoder output; it is
            projected to ``d_model`` before cross-attention.
        num_layers: Number of decoder layers.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of each layer's feed-forward block.
        pad_token_id: Id substituted for -100 entries of ``tgt_tokens`` before
            the embedding lookup (-100 is not a valid embedding index).
    """

    def __init__(self, vocab_size, d_model=768, encoder_hidden_size=1024, num_layers=6, nhead=8,
                 dim_feedforward=2048, pad_token_id=0):
        super().__init__()
        self.pad_token_id = pad_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)

        self.enc_to_dec = nn.Linear(encoder_hidden_size, d_model)

        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt_tokens, memory, tgt_mask=None, memory_mask=None):
        """Return vocabulary logits of shape ``[B, T, vocab_size]``.

        Args:
            tgt_tokens: ``[B, T]`` token ids; -100 entries are treated as padding.
            memory: ``[B, S, encoder_hidden_size]`` encoder output.
            tgt_mask: Optional ``[T, T]`` self-attention mask. When omitted, a
                causal mask is used so position ``i`` cannot see positions
                ``> i``; without it, teacher-forced training could read the
                very tokens it is asked to predict. Pass an all-zero float
                mask to attend without restriction.
            memory_mask: Optional cross-attention mask, forwarded unchanged.
        """
        # Replace the loss-ignore marker by a real id so the embedding lookup is valid.
        tgt_tokens_for_emb = tgt_tokens.masked_fill(tgt_tokens == -100, self.pad_token_id)

        tgt_emb = self.embedding(tgt_tokens_for_emb)  # [B, T, d_model]
        tgt_emb = self.pos_encoding(tgt_emb)

        memory = self.enc_to_dec(memory)

        if tgt_mask is None:
            # Built here (not by the caller) so it is created per replica and on
            # the right device instead of being passed in as a batch argument.
            seq_len = tgt_tokens.size(1)
            tgt_mask = torch.triu(
                torch.full((seq_len, seq_len), float("-inf"),
                           dtype=tgt_emb.dtype, device=tgt_emb.device),
                diagonal=1,
            )

        # The decoder layers are sequence-first, hence the transposes.
        output = self.transformer_decoder(tgt_emb.transpose(0, 1), memory.transpose(0, 1),
                                          tgt_mask=tgt_mask, memory_mask=memory_mask)
        return self.fc_out(output.transpose(0, 1))

# ASR Model (Encoder + Decoder)

In [14]:
class ASRModel(nn.Module):
    """Encoder-decoder wrapper: encode the audio, then decode tokens against it.

    Args:
        encoder: Module called as ``encoder(input_values)`` or
            ``encoder(input_values, attention_mask=...)``, returning the memory.
        decoder: Module called as ``decoder(tgt_tokens, memory, tgt_mask=...)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_values, tgt_tokens, tgt_mask=None, attention_mask=None):
        """Return the decoder output for ``tgt_tokens`` given the encoded audio.

        Args:
            input_values: Audio batch passed to the encoder.
            tgt_tokens: Decoder input token ids.
            tgt_mask: Optional decoder self-attention mask, forwarded as-is.
            attention_mask: Optional mask over ``input_values`` marking real
                (non-padded) samples. It is forwarded to the encoder only when
                given, so encoders without that keyword keep working.
        """
        if attention_mask is None:
            memory = self.encoder(input_values)
        else:
            memory = self.encoder(input_values, attention_mask=attention_mask)
        # NOTE(review): the decoder still attends to every encoder frame; a
        # memory padding mask is not derived from ``attention_mask`` here.
        return self.decoder(tgt_tokens, memory, tgt_mask=tgt_mask)

# Training

In [15]:
def train_one_epoch(model, train_loader, optimizer, criterion, device, lr_scheduler):
    """Run one teacher-forced training pass and return the mean loss per sample.

    For each batch the label sequence is split into decoder inputs
    (all but the last position) and targets (all but the first position); the
    optimizer and the LR scheduler are both stepped once per batch. The
    returned value is the batch losses averaged with batch-size weights.

    NOTE(review): the first label token is only ever used as decoder input,
    never as a target — confirm that is intended for the tokenizer in use.
    """
    model.train()
    running_loss = 0
    seen = 0
    for batch in tqdm(train_loader, leave=True):
        audio = batch["input_values"].to(device)
        targets = batch["labels"][:, 1:].to(device)
        decoder_input_ids = batch["labels"][:, :-1].to(device)

        optimizer.zero_grad()
        logits = model(audio, decoder_input_ids)
        vocab_dim = logits.size(-1)
        # Flatten [B, T, V] / [B, T] so the criterion sees one row per token.
        loss = criterion(logits.reshape(-1, vocab_dim), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        batch_size = audio.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size

    return running_loss / seen

def validate(model, val_loader, criterion, device):
    """Return the mean validation loss per sample, without gradient tracking.

    Uses the same teacher-forcing split as training: decoder inputs are the
    labels without their last position and targets are the labels without
    their first position. ``model`` returns raw logits, so the loss is computed
    here with ``criterion`` (there is no ``.loss`` attribute to read).

    Args:
        model: Called as ``model(input_values, decoder_input_ids)``; expected
            to return logits of shape ``[B, T, V]``.
        val_loader: Iterable of batches with "input_values" and "labels".
        criterion: Loss taking ``([N, V] logits, [N] targets)``.
        device: Device the batch tensors are moved to.
    """
    model.eval()
    total_loss, total_samples = 0, 0
    with torch.no_grad():
        loop = tqdm(val_loader, leave=True)
        for batch in loop:
            input_values = batch["input_values"].to(device)
            decoder_input_ids = batch["labels"][:, :-1].to(device)
            labels = batch["labels"][:, 1:].to(device)

            logits = model(input_values, decoder_input_ids)
            # Flatten [B, T, V] / [B, T] so the criterion sees one row per token.
            loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

            total_loss += loss.item() * input_values.size(0)
            total_samples += input_values.size(0)

    avg_loss = total_loss / total_samples
    return avg_loss

In [None]:
encoder = Wav2Vec2Encoder()
decoder = TransformerDecoder(vocab_size)
model = ASRModel(encoder, decoder)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.DataParallel(model).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# This custom encoder-decoder returns raw logits rather than an output object
# carrying a built-in loss, so the loss is computed explicitly with this
# criterion. Label positions set to -100 are ignored.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)


num_epochs = 30
# The scheduler is stepped once per batch, so its horizon is epochs * batches.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

early_stopper = EarlyStopping(patience=3)

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader,  optimizer, criterion, device, lr_scheduler)
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")

    val_loss = validate(model, val_loader, criterion, device)
    print(f"Epoch [{epoch+1}/{num_epochs}], Val Loss: {val_loss:.4f}")

    early_stopper(val_loss, model)
    if early_stopper.early_stop:
        print("Early stopping")
        break

# Training ends with the last epoch's weights in memory; reload the best
# checkpoint so later evaluation uses the model early stopping selected.
# ``best_loss`` is only set once a checkpoint has been written in this run.
if early_stopper.best_loss is not None and os.path.exists(early_stopper.path):
    model.load_state_dict(torch.load(early_stopper.path, map_location=device))