In [None]:
# Log in to the Hugging Face Hub with a token read from Kaggle's secret store,
# so the token itself is never written in the notebook.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

secret_label = "hugging_face_key"
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)

login(secret_value)

In [None]:
# Upgrade the Hugging Face client libraries and install colorednoise (its stdout is discarded).
# NOTE(review): a later install cell pins datasets==2.8.0, which overrides the
# `-U datasets` upgrade done here — confirm which version is intended.
!pip install -U datasets huggingface-hub
!pip install colorednoise > /dev/null

In [None]:
%%capture
!apt install git-lfs

In [None]:
# !pip install augly[audio]
# audiomentations supplies the augmentation transforms imported further down.
# datasets is pinned to 2.8.0 here; this runs after the earlier `-U datasets` install.
!pip install audiomentations
!pip install datasets==2.8.0

In [None]:
from IPython.lib.display import Audio
import torchaudio
import torch
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import json
import pandas as pd
import os

# Make datasets

In [None]:
# Switch the working directory to the SLU folder; later cells use paths relative to it
# (e.g. train_data/Train).
%cd /kaggle/input/soict2023-slu/SLU 

## Prepare data


In [None]:
# Count the entries in train_data/Train (relative to the current working directory).
!ls train_data/Train | wc -l

In [None]:
# Load the training annotations (one JSON object per line) into a DataFrame
# (`df`) and a Hugging Face Dataset (`data`).
# The file is opened as UTF-8 explicitly because the sentences are Vietnamese and
# the platform default encoding is not guaranteed; blank lines are skipped so a
# trailing newline cannot crash json.loads.
with open("/kaggle/input/soict2023-slu/SLU/train_20230909.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(records)
data = Dataset.from_pandas(df)

In [None]:
# data = data.train_test_split(test_size=0.2, shuffle=True)
# Display the dataset summary (the split above is disabled; a split is done later in the notebook).
data

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of integer
            positions (the result is passed to ``pd.DataFrame``).
        num_examples: Number of distinct rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in one call, replacing the retry loop
    # whose `pick in picks` list scans became slow as num_examples approached
    # len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
# Preview two random rows of the raw annotations.
show_random_elements(data, num_examples=2)

- Có thể thấy rằng các đoạn transcription chứa các ký tự đặc biệt, ví dụ `,.?!;:`. Nếu không sử dụng language model, rất khó để phân loại được các ký tự này do chúng không được phát âm rõ ràng, vì vậy ta sẽ loại bỏ chúng.
- Sau đó, ta normalize các câu về dạng lowercase.

In [None]:
# List the raw sentences that contain the character in the literal below.
# NOTE(review): the literal looks like a standalone combining dot-below mark
# (U+0323), i.e. text that is not in precomposed Unicode form — confirm.
df[df["sentence"].str.contains('̣')]["sentence"].values

In [None]:
import re

# Punctuation stripped from transcriptions: , ? . ! - ; : / "
# Written as a raw string: the previous non-raw literal relied on invalid escape
# sequences such as "\," and "\?", which emit a warning on recent Python versions
# and are slated to become errors. The set of matched characters is unchanged.
chars_to_ignore_regex = r'[,?.!\-;:/"]'

def remove_special_characters(batch):
    """Add a normalized ``text`` field derived from ``batch["sentence"]``.

    The characters matched by ``chars_to_ignore_regex`` are removed and the
    result is lowercased. The input mapping is modified in place and returned.

    Args:
        batch: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same mapping with the ``"text"`` key set.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    # NOTE(review): standalone combining marks found in some sentences are kept
    # as-is here. Unicode NFC normalization may be worth considering — confirm
    # against the pretrained vocabulary before changing this.
    return batch

In [None]:
# Add the cleaned, lowercased "text" column to every row.
data = data.map(remove_special_characters)

In [None]:
# Preview five random rows to check the new "text" column.
show_random_elements(data, num_examples=5)

## Make vocab

Trong `CTC`, thông thường ta sẽ phân loại speech chunks thành các letters. Ta sẽ viết hàm mapping để concat tất cả transcriptions thành 1, sau đó transforms string thành set of chars.

In [None]:
def extract_all_chars(batch):
    """Join all transcriptions of a batch and collect their distinct characters.

    Args:
        batch: Mapping whose ``"text"`` entry is a list of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        distinct characters (in set order), ``"all_text"`` holds the
        space-joined text.
    """
    joined = " ".join(batch["text"])
    unique_chars = [*set(joined)]
    return dict(vocab=[unique_chars], all_text=[joined])

In [None]:
# Run extract_all_chars over the dataset in batched mode and drop all original columns,
# leaving only the "vocab" and "all_text" columns.
# NOTE(review): batch_size=-1 is presumably "the whole dataset as one batch" — confirm in the datasets docs.
vocabs = data.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=data.column_names)

In [None]:
# Load the processor of the pretrained Vietnamese checkpoint to read its tokenizer vocabulary.
# Note: `processor` is rebound later in the notebook to a newly built processor.
from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

pretrain_vocab = processor.tokenizer.get_vocab()

In [None]:
# Union of the characters seen in the data and the pretrained vocabulary.
# The special tokens (and "j") are removed from the pretrained vocabulary only,
# which is what the original operator precedence (`-` binds tighter than `|`)
# already did; the parentheses make that explicit.
# NOTE(review): the reason for dropping "j" is not visible here — confirm.
# sorted() makes the order, and therefore the token ids assigned from it,
# identical on every run; plain set order depends on per-process string hash
# randomization.
vocab_list = sorted(
    set(vocabs["vocab"][0])
    | (set(pretrain_vocab.keys()) - {"<s>", "</s>", "<pad>", "<unk>", "|", "j"})
)

In [None]:
# Map every token to its position in vocab_list, then show the tokens.
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))
vocab_dict.keys()

In [None]:
# Re-key the space character as "|" (the word delimiter passed to the tokenizer), keeping its id.
vocab_dict["|"] = vocab_dict.pop(" ")

In [None]:
# Append the unknown and padding tokens at the end of the vocabulary.
# setdefault keeps this cell idempotent: re-running it used to move both tokens
# to new ids (and make them collide), silently corrupting the vocabulary.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
len(vocab_dict)

=> Linear layer cuối (được thêm lên trên pretrained Wav2Vec2 checkpoint) sẽ có chiều output bằng kích thước vocab, tức `len(vocab_dict)` (110 với bộ dữ liệu này).

In [None]:
import json

# Serialize the vocabulary first, then write it to the working directory.
vocab_json = json.dumps(vocab_dict)
with open('/kaggle/working/vocab.json', 'w') as vocab_file:
    vocab_file.write(vocab_json)

In [None]:
from transformers import Wav2Vec2CTCTokenizer

# Build the CTC tokenizer from the vocabulary file written above, using the
# same special tokens that were added to the vocabulary.
tokenizer = Wav2Vec2CTCTokenizer(
    "/kaggle/working/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

## Create Wav2Vec2 Feature Extractor

In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor configured for single-channel features at 16 kHz, with
# normalization on and no attention mask returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

In [None]:
from transformers import Wav2Vec2Processor

# Bundle the new feature extractor and tokenizer into one processor
# (this rebinds the `processor` name used earlier for the pretrained one).
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Upload the generated vocab.json to the model repository on the Hugging Face Hub.
# NOTE(review): the repo id is hard-coded here and the repo name is repeated in the
# push_to_hub cell below — keep the two in sync.
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="/kaggle/working/vocab.json",
    path_in_repo="vocab.json",
    repo_id="foxxy-hm/wav2vec2-base-finetune-vi-v6",
    repo_type="model",
)

In [None]:
# Push the processor (feature extractor + tokenizer) to the Hub.
# NOTE(review): no namespace is given; presumably this resolves to the logged-in
# user's repo of the same name as the vocab upload above — confirm.
processor.push_to_hub("wav2vec2-base-finetune-vi-v6")

## Preprocess Audio Data

Rút trích đặc trưng từ các files âm thanh và chuyển thành các array 1 chiều.

In [None]:
# Move into the audio folder so the paths in the "file" column can be opened directly
# (presumably they are file names relative to this folder — confirm).
# This is a relative %cd: it only works from the SLU directory set earlier, and
# re-running this cell on its own will fail.
%cd train_data/Train

In [None]:
import soundfile as sf

def speech_file_to_array_fn(batch):
    """Read one audio file and attach its samples to the example.

    Args:
        batch: Mapping with the audio path under ``"file"`` and the cleaned
            transcription under ``"text"``.

    Returns:
        The same mapping, updated in place with ``"speech"`` (the samples
        returned by ``sf.read``), ``"sampling_rate"`` and ``"target_text"``
        (a copy of ``"text"``).
    """
    waveform, rate = sf.read(batch["file"])
    batch.update(speech=waveform, sampling_rate=rate, target_text=batch["text"])
    return batch

In [None]:
# Read every audio file; the original columns are dropped, leaving only
# "speech", "sampling_rate" and "target_text".
data = data.map(speech_file_to_array_fn, remove_columns=data.column_names)
data

In [None]:
import IPython.display as ipd
import numpy as np
import random

# random.randint is inclusive on both ends, so randint(0, len(data)) could return
# len(data) and raise IndexError; randrange(len(data)) only yields valid indices.
rand_int = random.randrange(len(data))

# Play the selected clip in the notebook.
ipd.Audio(data=np.asarray(data[rand_int]["speech"]), autoplay=True, rate=16000)

In ra shape của speech input, transcription và sampling rate tương ứng.

In [None]:
# randrange(len(data)) only yields valid indices; randint(0, len(data)) is
# inclusive and could return len(data), raising IndexError.
rand_int = random.randrange(len(data))
# Fetch the row once instead of materializing it for every field.
example = data[rand_int]

print("Target text:", example["target_text"])
print("Input array shape:", np.asarray(example["speech"]).shape)
print("Sampling rate:", example["sampling_rate"])

=> Speech input là array 1 chiều, sampling rate tương ứng là 16kHz, kèm theo target text.

# Data augmentation

In [None]:
from audiomentations import Lambda, Compose, AddGaussianNoise, AddGaussianSNR, TimeStretch, PitchShift, Shift, AddBackgroundNoise, OneOf
import numpy as np
import colorednoise as cn

def PinkNoiseSNR(samples, sample_rate, min_snr=5.0, max_snr=15.0):
    """Add power-law noise (exponent 1) to a waveform at a random signal-to-noise ratio.

    The SNR is drawn uniformly from ``[min_snr, max_snr]`` dB and is defined on
    peak amplitudes: the noise is rescaled so that its peak equals the signal
    peak divided by ``10 ** (snr / 20)``.

    Args:
        samples: NumPy array of audio samples.
        sample_rate: Unused; presumably kept so the function matches the
            ``(samples, sample_rate)`` callback signature of the ``Lambda`` transform.
        min_snr: Lower bound of the SNR in dB.
        max_snr: Upper bound of the SNR in dB.

    Returns:
        float32 array with the noise added. Empty input is returned unchanged
        (cast to float32).
    """
    # Nothing to augment; also avoids calling .max() on an empty array.
    if len(samples) == 0:
        return samples.astype(np.float32)

    snr = np.random.uniform(min_snr, max_snr)
    # Peak amplitude; np.abs replaces the roundabout sqrt(x ** 2).
    a_signal = np.abs(samples).max()
    a_noise = a_signal / (10 ** (snr / 20))

    pink_noise = cn.powerlaw_psd_gaussian(1, len(samples))
    a_pink = np.abs(pink_noise).max()
    # Degenerate all-zero noise: skip the rescale instead of dividing by zero.
    if a_pink == 0:
        return samples.astype(np.float32)

    samples = samples + pink_noise / a_pink * a_noise
    return samples.astype(np.float32)

def VolumeControl(samples, sample_rate, mode="sine", db_limit=10.0):
    """Scale a waveform by a random gain drawn uniformly from ``[-db_limit, db_limit]`` dB.

    Args:
        samples: NumPy array of audio samples (expected to be 1-D: the gain
            curve has ``len(samples)`` entries).
        sample_rate: Unused; presumably kept so the function matches the
            ``(samples, sample_rate)`` callback signature of the ``Lambda`` transform.
        mode: Shape of the gain over the clip.
            ``"uniform"``: one constant gain;
            ``"fade"``: the gain in dB goes linearly from the drawn value at the
            first sample to 0 dB at the last;
            ``"cosine"``: the gain in dB follows one cosine period;
            any other value (including the default ``"sine"``): one sine period.
        db_limit: Maximum absolute gain in dB.

    Returns:
        float32 array of the scaled samples.
    """
    db = np.random.uniform(-db_limit, db_limit)
    n_samples = len(samples)
    if mode == "uniform":
        db_translated = 10 ** (db / 20)
    elif mode == "fade":
        # max(..., 1) avoids 0/0 (NaN output) for a single-sample input; for
        # two or more samples the ramp is unchanged.
        lin = np.arange(n_samples)[::-1] / max(n_samples - 1, 1)
        db_translated = 10 ** (db * lin / 20)
    elif mode == "cosine":
        cosine = np.cos(np.arange(n_samples) / n_samples * np.pi * 2)
        db_translated = 10 ** (db * cosine / 20)
    else:
        sine = np.sin(np.arange(n_samples) / n_samples * np.pi * 2)
        db_translated = 10 ** (db * sine / 20)
    augmented = samples * db_translated
    return augmented.astype(np.float32)

def augment(batch):
    """Run one example's waveform through the augmentation chain.

    The chain is: a ``OneOf`` over pink noise and Gaussian noise (SNR 5-15 dB),
    then time stretch (rate 0.8-1.25), pitch shift (+/- 4 semitones) and volume
    control. Every transform is created with ``p=0.5``.

    Args:
        batch: Mapping with the samples under ``"speech"``.

    Returns:
        A dict holding the augmented samples under ``"speech"``; the chain is
        called with a sample rate of 16000.
    """
    noise_stage = OneOf(
        transforms=[
            Lambda(transform=PinkNoiseSNR, p=0.5),
            AddGaussianSNR(min_snr_db=5.0, max_snr_db=15.0, p=0.5),
        ]
    )
    pipeline = Compose([
        noise_stage,
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Lambda(transform=VolumeControl, p=0.5),
    ])
    waveform = np.asarray(batch["speech"], dtype=(np.float32))
    return {"speech": pipeline(waveform, sample_rate=16000)}
#     sr = 16000
#     transform = Compose([
#       OneOf([
#         GaussianNoiseSNR(min_snr=10),
#         PinkNoiseSNR(min_snr=10)
#       ]),
#       PitchShift(max_steps=2, sr=sr),
#       TimeStretch(),
# #       TimeShift(sr=sr),
#       VolumeControl(mode="sine")
#     ])
#     return {"speech": transform(np.asarray(batch["speech"]).astype(np.float32))}

In [None]:
# Augment a random 20% of the examples.
orig_size = len(data)
augment_size = int(orig_size * 0.2)
# A seeded generator makes the chosen subset reproducible across runs (the seed
# matches the one used for the train/test split). The augmentations themselves
# still draw from unseeded global random state.
subset_rng = np.random.default_rng(42)
random_idxs = subset_rng.choice(orig_size, size=augment_size, replace=False)
data_subset = data.select(random_idxs)
augmented = data_subset.map(augment)

In [None]:
# combined = concatenate_datasets([data, augmented])
# NOTE(review): with the concatenation above disabled, `combined` is just the original
# data; the augmented examples are not part of the saved dataset — confirm this is intended.
combined = data

In [None]:
# Shuffle and hold out 10% as the "test" split, with a fixed seed.
combined = combined.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [None]:
# Display the resulting train/test splits.
combined

In [None]:
# combined.push_to_hub("slu-augmented-data", num_shards={"train": 1, "test": 1})

In [None]:
# new_data = load_dataset("foxxy-hm/slu-augmented-data")
# new_data

In [None]:
# Save each split under /kaggle/working/data with num_shards=1.
# Note that the "test" split is stored under the name valid.dataset.
combined["train"].save_to_disk("/kaggle/working/data/train.dataset", num_shards=1)
combined["test"].save_to_disk("/kaggle/working/data/valid.dataset", num_shards=1)

In [None]:
# Rename each split's Arrow file to dataset.arrow; the matching state.json entries are
# patched by overwrite_state further down in this cell.
%cd /kaggle/working
!mv ./data/train.dataset/data-00000-of-00001.arrow ./data/train.dataset/dataset.arrow 
!mv ./data/valid.dataset/data-00000-of-00001.arrow ./data/valid.dataset/dataset.arrow 
import json

def overwrite_state(filepath):
    """Point the first data-file entry of a saved dataset's state file at ``dataset.arrow``.

    The JSON file at ``filepath`` is read, the ``"filename"`` of the first
    element of ``"_data_files"`` is replaced, and the file is rewritten with a
    2-space indent.

    Args:
        filepath: Path to the ``state.json`` file to patch in place.
    """
    with open(filepath) as src:
        state = json.loads(src.read())

    # Only the first data-file entry is renamed.
    first_entry = state["_data_files"][0]
    first_entry["filename"] = "dataset.arrow"

    serialized = json.dumps(state, indent=2)
    with open(filepath, 'w') as dst:
        dst.write(serialized)

# Patch both saved splits so their state files reference the renamed dataset.arrow.
overwrite_state("./data/valid.dataset/state.json")
overwrite_state("./data/train.dataset/state.json")

In [None]:
import IPython.display as ipd
import numpy as np
import random

# random.randint is inclusive on both ends, so randint(0, len(augmented)) could
# return len(augmented) and raise IndexError; randrange only yields valid indices.
rand_int = random.randrange(len(augmented))

# Print the transcription and play the augmented clip.
print(rand_int, augmented[rand_int]["target_text"])
ipd.Audio(data=np.asarray(augmented[rand_int]["speech"]), autoplay=True, rate=16000)

In [None]:
# aug = Compose([
# #     AddBackgroundNoise(min_snr_db=5.0, max_snr_db=20.0, p=0.5),
#     AddGaussianSNR(min_snr_db=5.0, max_snr_db=20.0, p=0.5),
# #     TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
#     PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
# #     Shift(p=0.5),
# ])
# sample = aug(samples=np.asarray(combined["train"][2974]["speech"], dtype=(np.float32)), sample_rate=16000)
# ipd.Audio(data=sample, autoplay=True, rate=16000)

In [None]:
# sample = augment(combined["train"][2974])["speech"]
# ipd.Audio(data=sample, autoplay=True, rate=16000)