In [None]:
!pip install keras_preprocessing
!pip install pandarallel
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Load Test Data


In [None]:
import pandas as pd
from datasets import load_from_disk, load_dataset
# Alternative sources for the stage-1 ASR output, kept for reference:
# test_infer_stage1 = load_dataset("foxxy-hm/slu-inference-stage1", split="train")
# test_infer_stage1 = load_from_disk("/kaggle/input/soict-2023-wav2vec2-n-gram-inference-stage-1")
# test_df = test_infer_stage1.to_pandas()

# test_df = pd.read_csv("/kaggle/input/bi-model/inference_stage1 v3.csv")
test_df = pd.read_csv("/kaggle/input/soict-2023-wav2vec2-n-gram-inference-stage-1/inference_stage1.csv")

# Fail fast if a column consumed by later cells is missing, instead of failing
# in the middle of a row-wise `apply`.
required_columns = {
    "file",
    "pred_str",
    "pred_str_with_beam_search_2",
    "pred_str_with_beam_search_3",
}
missing_columns = required_columns - set(test_df.columns)
assert not missing_columns, f"inference CSV is missing columns: {sorted(missing_columns)}"
test_df

# Load Models

In [None]:
# Print the working directory: later cells mix relative paths (`data`, `infer_with_lm`)
# with absolute ones under /kaggle/working.
!pwd

In [None]:
!mkdir data

In [None]:
# Copy the stage-2 training outputs into the working directory: every *.py file
# (presumably the source of the `model`, `make_dict` and `utils` modules imported
# below; confirm) plus the train/valid JSONL splits.
!cp /kaggle/input/output-training-stage-2-v2-08/*.py /kaggle/working/
!cp /kaggle/input/output-training-stage-2-v2-08/data/train_stage2.jsonl /kaggle/working/data/
!cp /kaggle/input/output-training-stage-2-v2-08/data/valid_stage2.jsonl /kaggle/working/data/

In [None]:
import torch 
from model import *
# SECURITY NOTE: torch.load unpickles arbitrary Python objects; only load
# checkpoints that come from a trusted source.
# NOTE(review): the checkpoints appear to hold whole model objects (their `.enc` /
# `.dec` attributes are used later), which is presumably why `from model import *`
# is needed before loading. Confirm.
# `.eval()` switches the loaded modules to inference mode (e.g. disables dropout
# layers) and returns the module itself.
slot_model = torch.load("/kaggle/input/output-training-stage-2-v2-08/model_slot_best.ckpt", map_location=torch.device('cpu')).eval()
intent_model = torch.load("/kaggle/input/output-training-stage-2-v2-08/model_intent_best.ckpt", map_location=torch.device('cpu')).eval()
# slot_model = torch.load("/kaggle/input/soict2023-slu-training-stage-2-v2/model_slot_best.ckpt", map_location=torch.device('cpu'))
# intent_model = torch.load("/kaggle/input/soict2023-slu-training-stage-2-v2/model_intent_best.ckpt", map_location=torch.device('cpu'))

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: collection that supports `len()` and indexing with a list of
            integer positions; the indexed result is handed to `pd.DataFrame`.
        num_examples: how many rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: if more rows are requested than the dataset holds.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry loop is needed to
    # keep the picked positions unique.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
# show_random_elements(test_infer_stage1, num_examples=1)

In [None]:
def labels2token(labels, tokens):
    """Group consecutive slot-tagged tokens into entity dictionaries.

    A run of neighbouring tokens forms one entity as long as they carry the
    same slot. The run ends at an 'O' label, at a change of slot, or at a label
    starting with "B-". Without the last two rules, two different entities that
    touch each other would be merged under the first one's label.

    NOTE(review): the label scheme is not visible in this notebook. Plain labels
    ("device") and BIO-prefixed labels ("B-device" / "I-device") are both
    handled; the emitted "type" is always the raw label that opened the entity.

    Args:
        labels: per-token slot labels; 'O' marks a token outside any entity.
        tokens: the tokens, aligned with `labels` (extra items on the longer
            sequence are ignored, as with `zip`).

    Returns:
        list of {"type": <label>, "filler": <space-joined tokens>} dicts in
        sentence order.
    """
    def _slot_name(label):
        # Drop an optional BIO prefix so "B-x" and "I-x" compare as the same slot.
        return label[2:] if label[:2] in ("B-", "I-") else label

    result = []
    current_label = None   # raw label that opened the entity being built
    current_tokens = []

    for label, token in zip(labels, tokens):
        inside = label != 'O'
        continues = (
            inside
            and current_label is not None
            and not label.startswith("B-")
            and _slot_name(label) == _slot_name(current_label)
        )

        # Close the open entity when this token does not extend it.
        if current_label is not None and not continues:
            result.append({"type": current_label, "filler": ' '.join(current_tokens)})
            current_label = None
            current_tokens = []

        if inside:
            if current_label is None:
                current_label = label
            current_tokens.append(token)

    # Flush an entity that runs to the end of the sentence.
    if current_label is not None:
        result.append({"type": current_label, "filler": ' '.join(current_tokens)})
    return result

In [None]:
from make_dict import idx2intent, idx2slot, idx2word, word2idx
from keras_preprocessing.sequence import pad_sequences
import torch.nn.functional as F 
import torch
import utils
# torch.multiprocessing.set_start_method('spawn')

device="cpu"
def map_to_result(batch, column="pred_str"):
    """Predict the intent and slot entities for one tokenised transcript.

    Args:
        batch: a single row (mapping) holding the token list under `column`
            and the audio path under "file".
        column: name of the field that contains the token list. Defaults to
            "pred_str", so `test_df.apply(map_to_result, axis=1)` keeps working
            unchanged.

    Returns:
        dict with keys "intent" (label looked up in `idx2intent`), "entities"
        (output of `labels2token`) and "file" (the part of the path after the
        last "/").
    """
    # Pad target. NOTE(review): presumably the sequence length the models and
    # `utils.make_mask` expect; confirm. Longer sentences are NOT truncated.
    max_len = 50

    sentences = batch[column]
    # Taken from the tokens themselves, so no pre-computed length column is required.
    real_len_test = [len(sentences)]

    # Convert tokens to vocabulary indices; unknown words fall back to UNK.
    unk_idx = word2idx['UNK']
    sentence_idx = [word2idx.get(w, unk_idx) for w in sentences]
    # Right-pad with PAD up to max_len (no-op when the sentence is already that long).
    sentence_idx += [word2idx["PAD"]] * (max_len - len(sentence_idx))

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        x_test = torch.tensor(sentence_idx).unsqueeze(0).to(device)
        mask_test = utils.make_mask(real_len_test, batch=1).to(device)

        # Each model encodes the input; both decoders consume both encodings.
        hs_test = slot_model.enc(x_test)
        hi_test = intent_model.enc(x_test)

        # Slot
        slot_logits_test = slot_model.dec(hs_test, hi_test)
        log_slot_logits_test = utils.masked_log_softmax(slot_logits_test, mask_test, dim=-1)
        slot_pred_test = torch.argmax(log_slot_logits_test, dim=-1)

        # Intent
        intent_logits_test = intent_model.dec(hi_test, hs_test, real_len_test)
        log_intent_logits_test = F.log_softmax(intent_logits_test, dim=-1)
        res_test = torch.argmax(log_intent_logits_test, dim=-1)

    # Keep the predictions for real (non-padding) tokens and map ids to slot labels.
    slot_ids = slot_pred_test[0][:real_len_test[0]].tolist()
    slot_labels = [idx2slot[slot_id] for slot_id in slot_ids]

    return {"intent": idx2intent[res_test.item()],
            "entities": labels2token(slot_labels, sentences),
            "file": batch["file"].split("/")[-1]}
    
import re
# Character class of punctuation stripped from transcripts: ` ' ? . ! - ; / "
# Written as a raw string so the regex escape `\-` is not read as an (invalid)
# Python string escape sequence.
chars_to_ignore_regex = r'''[`'?.!\-;/"]'''
def remove_special_characters(x):
    """Lower-case and trim `x`, turn commas into spaces and drop ignored punctuation.

    Args:
        x: the text to normalise (str).

    Returns:
        The normalised string. Whitespace is not collapsed, so a replaced comma
        can leave two consecutive spaces.
    """
    x = x.lower().strip()
    # Commas become spaces (not nothing) so the words around them stay separated.
    x = x.replace(",", " ")
    return re.sub(chars_to_ignore_regex, '', x)

def processing(x):
    """Normalise `x` with `remove_special_characters` and split it on whitespace into a token list."""
    return remove_special_characters(x).split()

#### Processing for base model

In [None]:
# Tokenise the base-model transcripts in place. The isinstance guard keeps the
# cell re-runnable: rows that are already token lists are passed through instead
# of being fed to `processing` a second time (which would fail on a list).
test_df["pred_str"] = test_df["pred_str"].parallel_apply(
    lambda text: text if isinstance(text, list) else processing(text)
)
test_df["pred_str"]

In [None]:
# Number of tokens in each base-model transcript.
test_df["sentence_len"] = test_df["pred_str"].map(len)

#### Results for base model

In [None]:
# Run intent/slot inference row by row (axis=1) on the base-model transcripts;
# `results` holds one prediction dict per row.
results = test_df.apply(map_to_result, axis=1)

In [None]:
# Spot-check five randomly chosen predictions (unseeded, so the rows differ between runs).
results.sample(n=5).to_numpy()

In [None]:
!mkdir infer_without_lm

In [None]:
import json
# Write the base-model predictions as JSON Lines: one object per line.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open('infer_without_lm/predictions.jsonl', 'w', encoding='utf-8') as json_file:
    for prediction in results:
        json_file.write(json.dumps(prediction, ensure_ascii=False) + '\n')

#### Processing for base model + 2-gram

In [None]:
# Tokenise the 2-gram beam-search transcripts in place. The isinstance guard keeps
# the cell re-runnable: rows that are already token lists are passed through
# instead of being fed to `processing` a second time (which would fail on a list).
test_df["pred_str_with_beam_search_2"] = test_df["pred_str_with_beam_search_2"].parallel_apply(
    lambda text: text if isinstance(text, list) else processing(text)
)
test_df["pred_str_with_beam_search_2"]

#### Results for base model + 2-gram LM

In [None]:
# results[0][6:652]

In [None]:
def map_to_result(batch, column="pred_str_with_beam_search_2"):
    """Predict the intent and slot entities for one tokenised transcript.

    Args:
        batch: a single row (mapping) holding the token list under `column`
            and the audio path under "file".
        column: name of the field that contains the token list. Defaults to
            "pred_str_with_beam_search_2" (the 2-gram beam-search transcript),
            so `test_df.apply(map_to_result, axis=1)` keeps working unchanged.

    Returns:
        dict with keys "intent" (label looked up in `idx2intent`), "entities"
        (output of `labels2token`) and "file" (the part of the path after the
        last "/").
    """
    # Pad target. NOTE(review): presumably the sequence length the models and
    # `utils.make_mask` expect; confirm. Longer sentences are NOT truncated.
    max_len = 50

    sentences = batch[column]
    real_len_test = [len(sentences)]

    # Convert tokens to vocabulary indices; unknown words fall back to UNK.
    unk_idx = word2idx['UNK']
    sentence_idx = [word2idx.get(w, unk_idx) for w in sentences]
    # Right-pad with PAD up to max_len (no-op when the sentence is already that long).
    sentence_idx += [word2idx["PAD"]] * (max_len - len(sentence_idx))

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        x_test = torch.tensor(sentence_idx).unsqueeze(0).to(device)
        mask_test = utils.make_mask(real_len_test, batch=1).to(device)

        # Each model encodes the input; both decoders consume both encodings.
        hs_test = slot_model.enc(x_test)
        hi_test = intent_model.enc(x_test)

        # Slot
        slot_logits_test = slot_model.dec(hs_test, hi_test)
        log_slot_logits_test = utils.masked_log_softmax(slot_logits_test, mask_test, dim=-1)
        slot_pred_test = torch.argmax(log_slot_logits_test, dim=-1)

        # Intent
        intent_logits_test = intent_model.dec(hi_test, hs_test, real_len_test)
        log_intent_logits_test = F.log_softmax(intent_logits_test, dim=-1)
        res_test = torch.argmax(log_intent_logits_test, dim=-1)

    # Keep the predictions for real (non-padding) tokens and map ids to slot labels.
    slot_ids = slot_pred_test[0][:real_len_test[0]].tolist()
    slot_labels = [idx2slot[slot_id] for slot_id in slot_ids]

    return {"intent": idx2intent[res_test.item()],
            "entities": labels2token(slot_labels, sentences),
            "file": batch["file"].split("/")[-1]}

# Re-run inference with the `map_to_result` defined just above, which reads the
# 2-gram beam-search transcripts; this overwrites the earlier `results`.
results = test_df.apply(map_to_result, axis=1)
import json

In [None]:
# Lift the column-width limit (a global pandas option that stays in effect for
# later cells) so whole transcripts are visible, then compare five random rows
# of the base and 2-gram transcripts side by side.
pd.set_option("display.max_colwidth", None)
transcript_columns = ["pred_str", "pred_str_with_beam_search_2"]
test_df[transcript_columns].sample(n=5)

In [None]:
# Inspect four specific rows by index label.
# NOTE(review): why these rows were picked is not recorded here; `.loc` raises
# KeyError if any of the labels is absent from the index.
test_df.loc[[361, 981, 978, 836]]

In [None]:
# import IPython.display as ipd
# import numpy as np
# import random
# import soundfile as sf

# speech_array, sampling_rate = sf.read("/kaggle/input/soict2023-slu/SLU/public_test/public_test/BA1LQuAP7SlUPuOr8kIQ6Y4.wav")
# speech_array, sampling_rate = sf.read("/kaggle/input/soict2023-slu/SLU/public_test/public_test/4dNNfE4gAAKTkkxiFbqI3M0.wav")
# speech_array, sampling_rate = sf.read("/kaggle/input/soict2023-slu/SLU/public_test/public_test/hdZ72FGMiC86B5FYKYjGmJC.wav")
# st = "/kaggle/input/soict2023-slu/SLU/public_test/public_test/6bTozd2qnF5j7wsn2HMYqLz.wav"
# speech_array, sampling_rate = sf.read(st)
# rand_int = random.randint(0, len(results))

# ipd.Audio(data=np.asarray(speech_array), autoplay=True, rate=16000)

In [None]:
!mkdir infer_with_lm

In [None]:
# Write the current `results` as JSON Lines, one prediction object per line.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open('infer_with_lm/predictions2.jsonl', 'w', encoding='utf-8') as json_file:
    for prediction in results:
        json_file.write(json.dumps(prediction, ensure_ascii=False) + '\n')

## Base model with 3-gram

In [None]:
# Tokenise the 3-gram beam-search transcripts in place. The isinstance guard keeps
# the cell re-runnable: rows that are already token lists are passed through
# instead of being fed to `processing` a second time (which would fail on a list).
test_df["pred_str_with_beam_search_3"] = test_df["pred_str_with_beam_search_3"].parallel_apply(
    lambda text: text if isinstance(text, list) else processing(text)
)
test_df["pred_str_with_beam_search_3"]

In [None]:
def map_to_result(batch, column="pred_str_with_beam_search_3"):
    """Predict the intent and slot entities for one tokenised transcript.

    Args:
        batch: a single row (mapping) holding the token list under `column`
            and the audio path under "file".
        column: name of the field that contains the token list. Defaults to
            "pred_str_with_beam_search_3" (the 3-gram beam-search transcript),
            so `test_df.apply(map_to_result, axis=1)` keeps working unchanged.

    Returns:
        dict with keys "intent" (label looked up in `idx2intent`), "entities"
        (output of `labels2token`) and "file" (the part of the path after the
        last "/").
    """
    # Pad target. NOTE(review): presumably the sequence length the models and
    # `utils.make_mask` expect; confirm. Longer sentences are NOT truncated.
    max_len = 50

    sentences = batch[column]
    real_len_test = [len(sentences)]

    # Convert tokens to vocabulary indices; unknown words fall back to UNK.
    unk_idx = word2idx['UNK']
    sentence_idx = [word2idx.get(w, unk_idx) for w in sentences]
    # Right-pad with PAD up to max_len (no-op when the sentence is already that long).
    sentence_idx += [word2idx["PAD"]] * (max_len - len(sentence_idx))

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        x_test = torch.tensor(sentence_idx).unsqueeze(0).to(device)
        mask_test = utils.make_mask(real_len_test, batch=1).to(device)

        # Each model encodes the input; both decoders consume both encodings.
        hs_test = slot_model.enc(x_test)
        hi_test = intent_model.enc(x_test)

        # Slot
        slot_logits_test = slot_model.dec(hs_test, hi_test)
        log_slot_logits_test = utils.masked_log_softmax(slot_logits_test, mask_test, dim=-1)
        slot_pred_test = torch.argmax(log_slot_logits_test, dim=-1)

        # Intent
        intent_logits_test = intent_model.dec(hi_test, hs_test, real_len_test)
        log_intent_logits_test = F.log_softmax(intent_logits_test, dim=-1)
        res_test = torch.argmax(log_intent_logits_test, dim=-1)

    # Keep the predictions for real (non-padding) tokens and map ids to slot labels.
    slot_ids = slot_pred_test[0][:real_len_test[0]].tolist()
    slot_labels = [idx2slot[slot_id] for slot_id in slot_ids]

    return {"intent": idx2intent[res_test.item()],
            "entities": labels2token(slot_labels, sentences),
            "file": batch["file"].split("/")[-1]}

# Run inference with the `map_to_result` defined just above (3-gram transcripts)
# and store the predictions as JSON Lines, one object per line.
results = test_df.apply(map_to_result, axis=1)
with open('infer_with_lm/predictions3.jsonl', 'w', encoding='utf-8') as json_file:
    json_file.writelines(
        json.dumps(prediction, ensure_ascii=False) + '\n' for prediction in results
    )