In [None]:
# !pip install datasets[audio] -q
!pip uninstall transformers -y
!pip install transformers==4.20.0 
!pip install pyctcdecode==v0.1.0
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install jiwer -q

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset, load_metric, Dataset#, Audio
import soundfile as sf
import pandas as pd
from transformers.file_utils import cached_path, hf_bucket_url
import os, zipfile
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
import IPython
import torchaudio
import torch
import json

wer_metric = load_metric("wer")

In [None]:
# Load the processor and CTC model for the
# "foxxy-hm/wav2vec2-base-finetune-vi-v6" checkpoint, passing ./cache/ as the
# download cache directory.
cache_dir = './cache/'
processor = Wav2Vec2Processor.from_pretrained("foxxy-hm/wav2vec2-base-finetune-vi-v6", cache_dir=cache_dir)
model = Wav2Vec2ForCTC.from_pretrained("foxxy-hm/wav2vec2-base-finetune-vi-v6", cache_dir=cache_dir)
# model.to("cpu")
# Disabled alternative: download and unzip the 4-gram LM file published with
# "nguyenvulebinh/wav2vec2-base-vietnamese-250h". The notebook builds its own
# 2-gram/3-gram models further down instead.
# lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
# lm_file = cached_path(lm_file,cache_dir=cache_dir)
# with zipfile.ZipFile(lm_file, 'r') as zip_ref:
#     zip_ref.extractall(cache_dir)
# lm_file = cache_dir + 'vi_lm_4grams.bin'

# Load data and preprocess

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()`` and indexing with a list of
            row positions; the indexed result is handed to ``pd.DataFrame``.
        num_examples: How many distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement in a single call, so no
    # retry-until-unique loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
path = "/kaggle/input/soict2023-slu/SLU/public_test/public_test/"

In [None]:
import os

# Build a one-column dataset holding the path of every entry in `path`.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# which would make the row order (and the exported predictions) differ between runs.
test_ds = Dataset.from_dict({
    "file": [os.path.join(path, name) for name in sorted(os.listdir(path))]
})

In [None]:
import soundfile as sf

def speech_file_to_array_fn(batch):
    """Read the audio file referenced by ``batch["file"]`` and attach its contents.

    Args:
        batch: Mapping with a ``"file"`` entry holding the path of an audio file.

    Returns:
        The same ``batch`` object, with ``"speech"`` set to the samples and
        ``"sampling_rate"`` set to the rate returned by ``sf.read``.
    """
    # sf.read yields (samples, rate); unpack straight into the two new fields.
    batch["speech"], batch["sampling_rate"] = sf.read(batch["file"])
    return batch

In [None]:
# Run speech_file_to_array_fn over test_ds; that function stores the decoded
# audio under "speech" and its rate under "sampling_rate" for each row.
test_ds = test_ds.map(speech_file_to_array_fn)

In [None]:
# Sanity check: display two randomly chosen rows of the loaded test set.
show_random_elements(test_ds, num_examples=2)

# Create 2-gram model

In [None]:
%cd /kaggle/working/

In [None]:
import json
import re

# Characters stripped from every transcript before language-model training.
# Written as a raw string so the backslashes reach the regex engine unchanged
# instead of being treated as (invalid) Python string escapes.
chars_to_ignore_regex = r'[\?\.\!\-\;\:\"\“\‘\”\�\।\’]'

train = []
with open('/kaggle/input/soict2023-slu/SLU/train_20230909.jsonl', encoding='utf-8') as f:
    # Each line is one JSON record; only its "sentence" field is used.
    for line in f:
        sample = json.loads(line)["sentence"]
        # Commas are turned into spaces (not deleted) before the other
        # punctuation is removed.
        sample = re.sub(chars_to_ignore_regex, '', sample.replace(",", " "))
        train.append(sample)

# All cleaned sentences are written space-separated into one text file, which
# is fed to lmplz in a later cell.
# NOTE(review): unless a sentence itself contains a newline, the corpus ends up
# as a single line — confirm that is the intended input layout for the LM tool.
with open('/kaggle/working/text.txt', 'w', encoding='utf-8') as f:
    f.write(" ".join(train))

In [None]:
! sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
! wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
! mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
! ls kenlm/build/bin
! kenlm/build/bin/lmplz -o 2 < "text.txt" > "2gram.arpa"

## Create 3-gram model

In [None]:
! kenlm/build/bin/lmplz -o 3 < "text.txt" > "3gram.arpa"

#### Xem 30 dòng đầu tiên 

In [None]:
!head -30 2gram.arpa

- 2-gram đã bao gồm chính xác `<unk>` và `<s>` token, nhưng lại không có token `</s>`
- Do đó, ta sẽ thêm `end-of-sentence` token vào bằng cách thêm dòng `0 </s> -0.17968792` vào dưới `begin-of-sentence` token và tăng ngram 1 count lên 1.

In [None]:
def fix_ngrams(n_grams):
    """Copy ``{n_grams}.arpa`` to ``{n_grams}_correct.arpa`` with an ``</s>`` unigram added.

    Until the first line containing ``<s>`` has been handled, two rewrites apply:
    a line containing ``ngram 1=`` has its count incremented by one, and the
    first line containing ``<s>`` is followed by a copy of itself with ``<s>``
    replaced by ``</s>``. Every other line is copied unchanged.

    Args:
        n_grams: Path prefix (without the ``.arpa`` extension) of the model
            file to read; the output is written next to it.
    """
    with open(f"{n_grams}.arpa", "r", encoding="utf-8") as read_file, \
            open(f"{n_grams}_correct.arpa", "w", encoding="utf-8") as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                # Rebuild the header line rather than string-replacing the
                # count: replacing would also touch the "1" of "ngram 1="
                # whenever the count itself is "1".
                prefix, _, count = line.rstrip("\n").rpartition("=")
                write_file.write(f"{prefix}={int(count.strip()) + 1}\n")
            elif not has_added_eos and "<s>" in line:
                write_file.write(line)
                # Duplicate the begin-of-sentence entry as the end-of-sentence one.
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)

# Produce 2gram_correct.arpa and 3gram_correct.arpa from the lmplz outputs.
fix_ngrams("2gram")
fix_ngrams("3gram")

In [None]:
import kenlm
# Load both patched ARPA files with KenLM.
# NOTE(review): these two objects are not referenced again in the visible
# notebook; this cell appears to serve only as a check that the files load.
ngram_model_2 = kenlm.LanguageModel('./2gram_correct.arpa')
ngram_model_3 = kenlm.LanguageModel('./3gram_correct.arpa')

In [None]:
!head -20 2gram_correct.arpa

## Load N-Gram LM

In [None]:
def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Create a CTC beam-search decoder that scores hypotheses with an n-gram LM.

    Args:
        tokenizer: Tokenizer exposing ``get_vocab()`` plus ``pad_token_id``,
            ``unk_token_id`` and ``word_delimiter_token_id``.
        ngram_lm_path: Path of the language-model file passed to ``kenlm.Model``.

    Returns:
        A ``BeamSearchDecoderCTC`` built from the tokenizer's labels and the
        loaded language model.
    """
    # Labels ordered by token id (ties broken by token text), with the final
    # two entries of that ordering dropped.
    by_id = sorted(tokenizer.get_vocab().items(), key=lambda item: (item[1], item[0]))
    labels = [token for token, _ in by_id][:-2]

    # Applied in this order: the CTC blank (pad) and the unknown token decode
    # to nothing, and the word delimiter decodes to a plain space.
    overrides = (
        (tokenizer.pad_token_id, ""),
        (tokenizer.unk_token_id, ""),
        (tokenizer.word_delimiter_token_id, " "),
    )
    for token_id, text in overrides:
        labels[token_id] = text

    # The pad token's index is passed explicitly as the CTC blank position.
    ctc_alphabet = Alphabet.build_alphabet(labels, ctc_token_idx=tokenizer.pad_token_id)
    scorer = LanguageModel(kenlm.Model(ngram_lm_path))
    return BeamSearchDecoderCTC(ctc_alphabet, language_model=scorer)

In [None]:
# Build one beam-search decoder per patched ARPA file (2-gram and 3-gram),
# both using the tokenizer of the loaded processor.
ngram_lm_model_2 = get_decoder_ngram_model(processor.tokenizer, "./2gram_correct.arpa")
ngram_lm_model_3 = get_decoder_ngram_model(processor.tokenizer, "./3gram_correct.arpa")

# Inference

In [None]:
def map_to_result(batch):
    """Transcribe one example with greedy decoding and with both n-gram decoders.

    Uses the module-level ``model``, ``processor``, ``ngram_lm_model_2`` and
    ``ngram_lm_model_3``.

    Args:
        batch: Mapping with ``"speech"`` (audio samples) and ``"sampling_rate"``.

    Returns:
        The same ``batch`` with three added entries: ``"pred_str"`` (argmax
        decoding), ``"pred_str_with_beam_search_2"`` and
        ``"pred_str_with_beam_search_3"`` (beam search with the 2-gram and
        3-gram decoders, beam width 500).
    """
    # Use the GPU when one is present, otherwise fall back to the CPU instead
    # of failing on the hard-coded "cuda" device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    input_values = processor(
        batch["speech"],
        sampling_rate=batch["sampling_rate"],
        return_tensors="pt",
    ).input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]

    # Convert the logits of the single example to NumPy once and share the
    # array between both decoders.
    example_logits = logits[0].cpu().detach().numpy()
    batch["pred_str_with_beam_search_2"] = ngram_lm_model_2.decode(example_logits, beam_width=500)
    batch["pred_str_with_beam_search_3"] = ngram_lm_model_3.decode(example_logits, beam_width=500)
    return batch


In [None]:
# Run map_to_result over test_ds; it adds the greedy and the two
# beam-search transcriptions to each row.
results = test_ds.map(map_to_result)

In [None]:
# Preview random predictions (num_examples defaults to 10), leaving the
# "speech" and "sampling_rate" columns out of the displayed table.
show_random_elements(results.remove_columns(["speech", "sampling_rate"]))

In [None]:
# Export the predictions to inference_stage1.csv without the DataFrame index.
# NOTE(review): `results` is exported as-is; the "speech" and "sampling_rate"
# columns were only removed on the copy used for display, so they appear to be
# written to the CSV as well — confirm that is intended.
results.to_pandas().to_csv("inference_stage1.csv", index=False)