# Imports and setup

In [1]:
import os
import json
import re
import numpy as np
import pandas as pd
import random

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from rank_bm25 import BM25Okapi

In [2]:
# Suppress every log record of severity WARNING and below for the whole process.
import logging
logging.disable(logging.WARNING)

# Data

In [3]:
# Absolute, machine-specific locations of the question set and the law corpus (JSON files).
# NOTE(review): both names are assigned again in a later cell of this notebook — confirm which value is intended.
train_2223 = "D:/Lab/ALQAC_2024_VIMONOT5/resource/data_alqac_2024/train.json"
law_path = "D:/Lab/ALQAC_2024_VIMONOT5/resource/data_alqac_2024/law.json"

In [4]:
def random_num(start_ran, end_ran):
    """Return a random integer N with start_ran <= N <= end_ran (both ends inclusive)."""
    # randrange excludes its stop value, so push it one past end_ran to keep end_ran reachable.
    stop_exclusive = end_ran + 1
    return random.randrange(start_ran, stop_exclusive)

def load_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

In [5]:
def query_articles(query, corpus):
    """Look up one article in the law corpus and return it with a bracketed header.

    Args:
        query: mapping with "law_id" and "article_id" naming the wanted article.
        corpus: iterable of laws, each a mapping with "id" and a list "articles"
            whose items carry "id" and "text".

    Returns:
        "[Điều <article_id> <law_id>] <text>" for the first matching article,
        or None when no law/article pair matches.
    """
    wanted_law = query["law_id"]
    wanted_article = query["article_id"]

    for law in corpus:
        if law["id"] != wanted_law:
            continue
        for article in law["articles"]:
            if article["id"] != wanted_article:
                continue
            header = "[Điều " + wanted_article + " " + wanted_law + "] "
            return header + article["text"]

    return None

def rand_articles(query, corpus):
    """Return a randomly chosen article from the same law as `query`.

    The previous implementation called `random_num` with a single argument
    (a TypeError) and then searched for an article whose id equalled a random
    integer, which only works when ids are contiguous numbers. Picking an
    element of the article list directly always yields an existing article.

    Args:
        query: mapping with "law_id" naming the law to sample from.
        corpus: iterable of laws, each a mapping with "id" and a list "articles"
            whose items carry "id" and "text".

    Returns:
        "[Điều <article_id> <law_id>] <text>" for the sampled article, or None
        when the law is not in the corpus or has no articles.
        The sampled article is not guaranteed to differ from query["article_id"].
    """
    query_law_id = query["law_id"]

    for local_law in corpus:
        if local_law["id"] != query_law_id:
            continue
        candidates = local_law["articles"]
        if not candidates:
            # Nothing to sample from in this law; keep scanning like the lookup helper does.
            continue
        picked = random.choice(candidates)
        return "[Điều " + picked["id"] + " " + query_law_id + "] " + picked["text"]

    return None

def rand_database(list_rand, avoid_rule, num):
    """Sample up to `num` candidate texts that are not the article named by `avoid_rule`.

    Fixes over the previous version:
      * `random_num(50)` was called with one argument and raised TypeError;
        candidates are now drawn from the whole list, whatever its length.
      * the parsed candidate (law_id, article_id, score, content) was compared
        with `avoid_rule` as a whole dict, which can never be equal; only the
        law/article identifiers are compared now.
      * a bare `except: continue` inside `while` could spin forever; the
        function now returns fewer than `num` items when not enough eligible
        candidates exist.

    Args:
        list_rand: sequence of candidates, each a mapping with "score" and
            "text" (text starting with a "[Điều <id> <law>]" header).
        avoid_rule: mapping with "law_id" and "article_id" to exclude.
        num: maximum number of texts to return.

    Returns:
        A list of distinct candidate positions' "text" values, sampled without
        replacement, of length min(num, number of eligible candidates).
    """
    if num <= 0 or not list_rand:
        return []

    avoid_key = (avoid_rule["law_id"], avoid_rule["article_id"])
    eligible = []
    for candidate in list_rand:
        try:
            detected = detect_law(candidate)
        except (KeyError, TypeError, AttributeError):
            # Malformed candidate (missing keys / non-string text): skip it.
            continue
        if (detected["law_id"], detected["article_id"]) == avoid_key:
            continue
        eligible.append(candidate["text"])

    return random.sample(eligible, min(num, len(eligible)))

In [6]:
def concat_data(dataset_path, corpus_path):
    """Build positive (question, gold article) pairs from a dataset and a law corpus.

    For every question the text is optionally extended with its multiple-choice
    options, and paired with the first entry of "relevant_articles". Only
    positive pairs (label 1) are produced.

    Args:
        dataset_path: path of the question JSON file.
        corpus_path: path of the law corpus JSON file.

    Returns:
        dict with parallel lists "query", "article" and "label".

    Raises:
        ValueError: when the gold article of a question is not present in the
            corpus (previously this surfaced as an AttributeError on None).
    """
    local_dataset = load_json(dataset_path)
    local_corpus = load_json(corpus_path)

    query_list = []
    answer_list = []
    label_list = []
    for local_data in local_dataset:
        local_text = local_data["text"]
        try:
            choices = local_data["choices"]
            local_text += " " + ". ".join([choices["A"], choices["B"], choices["C"], choices["D"]])
        except (KeyError, TypeError):
            # No (complete) multiple-choice options for this question: keep the bare text.
            pass

        gold_ref = local_data["relevant_articles"][0]
        local_articles = query_articles(gold_ref, local_corpus)
        if local_articles is None:
            raise ValueError(
                "gold article %r of question %r not found in corpus"
                % (gold_ref, local_data.get("question_id"))
            )
        query_list.append(pre_processing(local_text))
        answer_list.append(pre_processing(local_articles))
        label_list.append(1)

    return {
        "query": query_list, 
        "article": answer_list, 
        "label": label_list
    }

# concat_data("/kaggle/input/alqac-2023/top_50_new.json", "/kaggle/input/alqac-2023/law.json")

# BM25

In [7]:
import math

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) for a scalar `x`.

    Evaluated in a numerically stable way: the exponent passed to math.exp is
    never positive, so large-magnitude negative inputs return 0.0 instead of
    raising OverflowError.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, e^x is in (0, 1) and cannot overflow.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [8]:
def pre_processing(text):
    """Flatten `text` to one line: newlines become spaces, whitespace runs collapse, ends are trimmed."""
    pieces = text.replace("\n", " ").split()
    return " ".join(pieces)

def write_jsonfile(filename, data):
    """Serialize `data` as pretty-printed JSON (4-space indent, non-ASCII kept as-is) to `filename`.

    The payload is serialized before the file is opened, so a serialization
    error does not truncate an existing file, and the `with` block guarantees
    the handle is closed even if the write fails.
    """
    result = json.dumps(data, indent=4, ensure_ascii=False)
    with open(filename, "w", encoding="utf8") as myjsonfile:
        myjsonfile.write(result)
    
def create_bm_corpus(corpus_path):
    """Index every article of the law corpus with BM25.

    Each article becomes one document "[Điều <article_id> <law_id>] <text>",
    where the text has been flattened by `pre_processing`. Documents are
    tokenized by splitting on single spaces.

    Returns:
        (bm25_model, documents): the BM25Okapi index and the list of document
        strings, in the same order as the index.
    """
    documents = []
    for law in load_json(corpus_path):
        law_id = law["id"]
        documents.extend(
            "[Điều " + article["id"] + " " + law_id + "] " + pre_processing(article["text"])
            for article in law["articles"]
        )

    tokenized = [document.split(" ") for document in documents]
    return BM25Okapi(tokenized), documents


def minmax_scale(score_list, eps = 0.0001):
    """Min-max scale scores into [0, 1).

    Args:
        score_list: sequence of numeric scores.
        eps: small constant added to the denominator; because of it the
            maximum maps to slightly less than 1 when eps > 0.

    Returns:
        numpy.ndarray of floats, always (the constant-input branch used to
        return a plain list, giving callers two different types):
          * empty input  -> empty array (previously a ValueError from np.min);
          * all equal    -> array of 1.0;
          * otherwise    -> (score - min) / (max - min + eps).
    """
    scores = np.asarray(score_list, dtype=float)
    if scores.size == 0:
        return scores
    min_score = np.min(scores)
    spread = np.max(scores) - min_score
    if spread == 0:
        return np.ones(len(scores))
    return (scores - min_score) / (spread + eps)

def query_bm25(query, corpus, bm25_model, n):
    """Rank the corpus against `query` with BM25 and return the `n` best documents.

    The query is flattened with `pre_processing` and split on single spaces;
    BM25 scores are rescaled with `minmax_scale` before ranking.

    Returns:
        list of {"score": scaled_score, "text": corpus_document}, best first.
    """
    tokens = pre_processing(query).split(" ")
    scaled_scores = minmax_scale(bm25_model.get_scores(tokens))
    # argsort is ascending; reverse it so the highest score comes first.
    best_first = np.argsort(scaled_scores)[::-1][:n]
    return [
        {
            "score": scaled_scores[position],
            "text": corpus[position]
        }
        for position in best_first
    ]
            
def get_top_n_list(dataset_path, corpus_path, top_k=50):
    """Retrieve the BM25 top-k articles for every question of a dataset.

    Args:
        dataset_path: path of the question JSON file.
        corpus_path: path of the law corpus JSON file.
        top_k: number of candidates kept per question.

    Returns:
        list of records with "question_id", "relevant_articles", "text"
        (question plus its options, when present) and "top_n" (the ranked
        candidates produced by `query_bm25`).
    """
    local_dataset = load_json(dataset_path)
    # create_bm_corpus reads the corpus itself, so it is not parsed a second time here.
    bm25, corpus = create_bm_corpus(corpus_path)

    output_list = []
    for local_data in local_dataset:
        local_text = local_data["text"]
        try:
            choices = local_data["choices"]
            local_text += " " + ". ".join([choices["A"], choices["B"], choices["C"], choices["D"]])
        except (KeyError, TypeError):
            # No (complete) multiple-choice options for this question: keep the bare text.
            pass
        output_list.append({
            "question_id": local_data["question_id"],
            "relevant_articles": local_data["relevant_articles"],
            "text": local_text,
            "top_n": query_bm25(local_text, corpus, bm25, top_k)
        })

    return output_list

In [9]:
def concat_file(dataset_path, corpus_path, is_train):
    """Flatten a ranked dataset into parallel query / article / label lists.

    For each record, when `is_train` is true the first gold article is added
    as a label-1 pair (both sides flattened with `pre_processing`). Every
    entry of the record's "top_n" is then added as a label-0 pair, using the
    raw question text and the candidate text as stored.

    Returns:
        dict with parallel lists "query", "article" and "label".
    """
    samples = load_json(dataset_path)
    law_corpus = load_json(corpus_path)

    pairs = {"query": [], "article": [], "label": []}

    def _add(question, article, label):
        pairs["query"].append(question)
        pairs["article"].append(article)
        pairs["label"].append(label)

    for sample in samples:
        # Read but otherwise unused: a record without "question_id" still fails here.
        question_id = sample["question_id"]
        question = sample["text"]
        if is_train:
            gold_text = query_articles(sample["relevant_articles"][0], law_corpus)
            _add(pre_processing(question), pre_processing(gold_text), 1)
        for candidate in sample["top_n"]:
            _add(question, candidate["text"], 0)

    return pairs

# concat_test("/kaggle/working/top_50_test.json", corpus_path)

In [10]:
def detect_law(data):
    """Split a ranked candidate into its law id, article id, score and body text.

    `data["text"]` is expected to look like "[Điều <article_id> <law_id>] <body>".
    The header is everything before the first "]" (minus the leading "[");
    its second token is the article id and the remaining tokens form the law id.

    The body is everything after the FIRST "]" only. The previous version
    re-joined all "]"-separated pieces with an empty string, which silently
    removed every "]" character occurring inside the article body.

    Returns:
        dict with "law_id", "article_id", "score" and "content". When the text
        contains no "]" the content is an empty string.
    """
    score = data["score"]
    header, _, content = data["text"].partition("]")
    tokens = header[1:].split()
    return {
        "law_id": " ".join(tokens[2:]),
        "article_id": " ".join(tokens[1:2]),
        "score": score,
        "content": content
    }

In [11]:
def create_output(output_path,top_k=1):
    """Load a saved ranking file and reshape it into the evaluation layout.

    For every record the first `top_k` entries of "top_n" are parsed with
    `detect_law`.

    Returns:
        list of dicts with "question_id", "question_content" (the record's
        "text"), "relevant_articles" and "bm25_relevant_articles" (the parsed
        candidates).
    """
    return [
        {
            "question_id": record["question_id"],
            "question_content": record["text"],
            "relevant_articles": record["relevant_articles"],
            "bm25_relevant_articles": [
                detect_law(candidate) for candidate in record["top_n"][:top_k]
            ]
        }
        for record in load_json(output_path)
    ]
    
# create_output("/kaggle/working/top_50_test.json")
# write_jsonfile("results_v2.json", create_output("/kaggle/working/top_10_public_test.json",3))

# Create top-k BM25 candidate files

In [12]:
# top_k_list = [1,2,5,10,20,50,100]

# for k in top_k_list:
#     out_file = "top_" + str(k) + "_public_test.json"
#     output = create_output("/kaggle/working/top_100_private_test.json",k)
#     write_jsonfile(out_file, output)

In [None]:
# Reshape the saved ranking into the evaluation layout, keeping at most 100 candidates per question.
# NOTE(review): the input file is the *train* ranking but the output file is named "private" — confirm the intended name.
output_train = create_output("D:/Lab/ALQAC_2024_VIMONOT5/resource/top_100_train.json",100)
write_jsonfile("top_100_private_eval.json", output_train)

# output_private = create_output("/kaggle/working/top_100_private_test.json",100)
# write_jsonfile("top_100_train_eval.json", output_private)

In [None]:
# Keep only the 5 best candidates per question from the saved ranking and write them out.
# NOTE(review): this reads a Kaggle working-directory path while other cells use D:/ paths — confirm the environment.
output_priv_22 = create_output("/kaggle/working/top_100_priv_2022.json",5)
write_jsonfile("top_5_eval_priv_22.json", output_priv_22)

--- 33.19103026390076 seconds ---


In [13]:
def check_in_label(article, relevant_articles_list):
    """Return 1 if `article` matches any gold entry on both "law_id" and "article_id", else 0."""
    law_id, article_id = article["law_id"], article["article_id"]
    is_gold = any(
        law_id == gold["law_id"] and article_id == gold["article_id"]
        for gold in relevant_articles_list
    )
    return 1 if is_gold else 0

In [14]:
def make_df(data, rate = 1):
    """Turn evaluation records into a flat training table.

    For each record all gold candidates (label 1) are emitted first, in rank
    order, followed by non-gold candidates (label 0) in rank order. Negative
    emission stops as soon as the number of negatives equals
    `rate * number_of_positives`; if that equality is never reached (for
    example when the record has no positive) every negative is emitted.

    Args:
        data: list of records with "question_content", "relevant_articles"
            and "bm25_relevant_articles" (items with "content", "score",
            "law_id", "article_id").
        rate: negatives-per-positive target.

    Returns:
        pandas.DataFrame with columns "question_list", "article_list",
        "bm25_score_list" and "label_list".
    """
    questions, articles, scores, labels = [], [], [], []

    def _add_row(question, content, score, label):
        questions.append(question)
        articles.append(content)
        scores.append(score)
        labels.append(label)

    for sample in data:
        question = sample["question_content"]

        # Label every candidate once; both passes below reuse these rows.
        rows = []
        for candidate in sample["bm25_relevant_articles"]:
            content = candidate["content"]
            label = int(check_in_label(candidate, sample["relevant_articles"]))
            rows.append((content, candidate["score"], label))

        n_pos = 0
        for content, score, label in rows:
            if label == 1:
                _add_row(question, content, score, label)
                n_pos += 1

        n_neg = 0
        for content, score, label in rows:
            if label == 0:
                _add_row(question, content, score, label)
                n_neg += 1
                if n_neg == rate * n_pos:
                    break

    return pd.DataFrame({
        "question_list": questions,
        "article_list": articles,
        "bm25_score_list": scores,
        "label_list": labels,
    })

In [26]:
# Input question set and law corpus, plus the two intermediate JSON files written below.
# NOTE(review): `train_2223` now points at the public-test gold file, and the
# intermediate files are named "top_100_train*" although only 5 candidates are kept — confirm naming.
train_2223 = "D:/Lab/ALQAC_2024_VIMONOT5/resource/data_alqac_2024/public_test_GOLD_TASK_1.json"
law_path = "D:/Lab/ALQAC_2024_VIMONOT5/resource/data_alqac_2024/law.json"
p_1_path = "D:/Lab/ALQAC_2024_VIMONOT5/resource/top_100_train.json"
p_2_path = "D:/Lab/ALQAC_2024_VIMONOT5/resource/top_100_train_add.json"

# NOTE(review): a later cell reads ".../resource/public_test_5.csv"; this path has no "resource/" segment.
output_path = "D:/Lab/ALQAC_2024_VIMONOT5/public_test_5.csv"

import time
start_time = time.time()
# Step 1: BM25 top-5 retrieval for every question, saved to p_1_path.
train_one = get_top_n_list(train_2223, law_path, 5)
write_jsonfile(p_1_path, train_one)

# Step 2: reload that file and parse each candidate into law/article ids; the timer covers steps 1-2 only.
output_private = create_output(p_1_path, 5)
print("--- %s seconds ---" % (time.time() - start_time))
write_jsonfile(p_2_path, output_private)

# Step 3: flatten into a labelled table (rate=5 negatives per positive) and save it as CSV.
x = make_df(output_private, 5)
x.to_csv(output_path, index=False)

--- 6.930664777755737 seconds ---


In [None]:
# Retrieve the BM25 top-100 for the private test set and convert it to the evaluation layout.
# NOTE(review): `private_path` is not defined in any visible cell — it must exist in the kernel before running.
start_time = time.time()
private = get_top_n_list(private_path, law_path,100)
write_jsonfile("top_100_private_test.json", private)

# NOTE(review): the file is written with a relative name above but read back from /kaggle/working/ here — confirm they coincide.
output_private = create_output("/kaggle/working/top_100_private_test.json",100)
write_jsonfile("top_100_private_eval.json", output_private)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Display the first converted record as a quick sanity check.
output_private[0]

In [9]:
# Load the three labelled tables that are merged in the next cell.
import pandas as pd

x1 = pd.read_csv("D:/Lab/ALQAC_2024_VIMONOT5/resource/public_test_5.csv")
x2 = pd.read_csv("D:/Lab/ALQAC_2024_VIMONOT5/resource/train_5.csv")
x3 = pd.read_csv("D:/Lab/ALQAC_2024_VIMONOT5/resource/unverified_train_5.csv")

In [23]:
# Stack the three tables row-wise and renumber the index from 0.
x = pd.concat([x1,x2,x3], axis=0).reset_index(drop=True)

In [1]:
# Persist the merged table. Requires `x` from the concat cell; the recorded NameError shows this
# cell was executed in a kernel where that cell had not been run.
x.to_csv("3_train.csv", index=False)

NameError: name 'x' is not defined

In [16]:
import pandas as pd

# NOTE(review): this rebinds `x`, which earlier cells use for the merged training table.
x = pd.read_csv("D:/Lab/ALQAC_2024_VIMONOT5/resource/public_2024_cp5.csv")

# Min-max scale ViMonoT5_score separately within each question_id group.
# NOTE(review): a group whose scores are all equal (including a single-row group) divides 0 by 0 and becomes NaN.

x["ViMonoT5_score"] = x.groupby("question_id")["ViMonoT5_score"].transform(lambda x: (x - x.min()) / (x.max() - x.min()))

In [17]:
def make_output_file(question_id, law_id, article_id, predict):
    """Group the rows predicted as relevant into one record per question.

    The four arguments are parallel sequences (lists or pandas Series). They
    are walked positionally, so a Series with a non-default index is handled
    correctly, and rows are grouped with a dict instead of rescanning the
    result list for every row.

    Args:
        question_id: question id of each row.
        law_id: law id of each row.
        article_id: article id of each row (converted to str in the output).
        predict: 1 for rows predicted relevant; any other value is ignored.

    Returns:
        list of {"question_id": ..., "relevant_articles": [{"law_id": ...,
        "article_id": ...}, ...]}, ordered by first appearance of each
        question; questions with no predicted row are absent.
    """
    grouped = {}
    for qid, law, article, flag in zip(question_id, law_id, article_id, predict):
        if flag != 1:
            continue
        entry = grouped.get(qid)
        if entry is None:
            entry = {"question_id": qid, "relevant_articles": []}
            grouped[qid] = entry
        entry["relevant_articles"].append({
            "law_id": law,
            "article_id": str(article)
        })
    # dicts preserve insertion order, i.e. the order in which questions first appeared.
    return list(grouped.values())

In [18]:
# Gold labels for the public test set and a submission file to score against them.
label_case = load_json("D:/Lab/ALQAC_2024_VIMONOT5/resource/data_alqac_2024/public_test_GOLD_TASK_1.json")
predict_case = load_json("D:/Lab/ALQAC_2024_VIMONOT5/resource/data_alqac_2024/XOSANOWJ-submision-col-bert.json")

In [19]:
def eval_case(predict, label):
    """Compute macro-averaged F2, recall and precision of predictions against gold labels.

    Metrics are computed per predicted question and averaged over the entries
    of `predict` (questions that appear only in `label` are not counted).

    Per-question conventions:
      * a predicted article counts as correct when an equal dict exists in the
        gold "relevant_articles" of the first label with the same question_id;
      * a question_id with no gold label, or with an empty gold list, has
        recall 0 (previously the stale loop variable of the label scan was
        used, i.e. the last label's article count);
      * an empty prediction list has precision 1, as before;
      * F2 = 5 * P * R / (4 * P + R), and 0 when P + R == 0.

    Fixes: the unused micro-F2 expression, which raised ZeroDivisionError when
    every precision and recall was 0, is removed; an empty `predict` returns
    zeros instead of dividing by zero.

    Returns:
        (mean_f2, mean_recall, mean_precision)
    """
    # Index gold articles by question id once; setdefault keeps the first label per id.
    gold_by_qid = {}
    for gold in label:
        gold_by_qid.setdefault(gold["question_id"], gold["relevant_articles"])

    sum_f2 = 0.0
    sum_recall = 0.0
    sum_precision = 0.0
    n_case = 0
    for pred in predict:
        predicted = pred["relevant_articles"]
        gold_articles = gold_by_qid.get(pred["question_id"], [])
        true_positive = sum(1 for article in predicted if article in gold_articles)

        recall = true_positive / len(gold_articles) if gold_articles else 0.0
        precision = true_positive / len(predicted) if predicted else 1
        if precision + recall == 0:
            local_f2 = 0
        else:
            local_f2 = 5 * precision * recall / (4 * precision + recall)

        sum_f2 += local_f2
        sum_recall += recall
        sum_precision += precision
        n_case += 1

    if n_case == 0:
        return 0.0, 0.0, 0.0
    return sum_f2 / n_case, sum_recall / n_case, sum_precision / n_case

In [20]:
# Score the loaded submission; the displayed tuple is (F2, recall, precision).
eval_case(predict_case, label_case)

(0.0, 0.0, 0.04)

In [21]:
# Accumulators for the weight/threshold grid search: one entry per (w_bm25, threshold) combination.
w_bm25_list = []
w_vimono_list = []
threshold_list = []

# Metrics recorded for the same combinations, in the same order.
f2_list = []
recall_list = []
precision_list = []

In [23]:
# Grid search over the BM25 weight (0.0 .. 0.9) and the decision threshold (0.0 .. 0.9), step 0.1.
# NOTE(review): range(0, 10) never tries w_bm25 == 1.0 (BM25 only) — confirm this is intended.
# NOTE(review): results are appended to lists created in another cell, so re-running this cell
# without re-running that one duplicates rows in the output.
for w_bm25 in range(0, 10):
    w_bm25 = w_bm25 / 10
    w_vimono = 1 - w_bm25
    # Weighted sum of the two scores, then min-max rescaled within each question.
    x["Ensemble"] = w_bm25 * x["bm25_score_scaled"] + w_vimono * x["ViMonoT5_score"]
    # NOTE(review): a question whose ensemble scores are all equal divides 0 by 0 and becomes NaN,
    # and NaN > threshold is False, so such rows are never predicted.
    x["Ensemble"] = x.groupby("question_id")["Ensemble"].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
    for i_th in range(0, 10):
        threshold = i_th / 10
        # The lambda parameter shadows the DataFrame `x` only inside the lambda body.
        x["Ensemble_label"] = x["Ensemble"].apply(lambda x: 1 if x > threshold else 0)
        out = make_output_file(x["question_id"], x["law_id"], x["article_id"], x["Ensemble_label"])
        f2, recall, precision = eval_case(out, label_case)
        f2_list.append(f2)
        recall_list.append(recall)
        precision_list.append(precision)
        w_bm25_list.append(w_bm25)
        w_vimono_list.append(w_vimono)
        threshold_list.append(threshold)

# Collect every tried combination with its metrics and save the table.
result = {
    "w_bm25": w_bm25_list,
    "w_vimono": w_vimono_list,
    "threshold": threshold_list,
    "f2": f2_list,
    "recall": recall_list,
    "precision": precision_list
}

result_df = pd.DataFrame(result)
result_df.to_csv("public_re.csv", index=False)
        