# Installation

In [1]:
# Show the GPU(s) visible to this session together with driver and CUDA versions.
!nvidia-smi

Mon Jul  1 05:51:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0              26W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# %pip installs into the environment of the running kernel, whereas a bare
# `!pip` shell call may resolve to a different Python installation.
%pip install datasets transformers
# Pinned to the version this cell's recorded output shows being installed.
%pip install rank_bm25==0.2.2

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [3]:
import os
import json
import re
import numpy as np
import pandas as pd
import random

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from rank_bm25 import BM25Okapi

In [4]:
import logging
# Suppress every log record at WARNING severity and below, process-wide.
logging.disable(logging.WARNING)
# Make CUDA device 0 the current device, then echo the active device index.
torch.cuda.set_device(0)
print(torch.cuda.current_device())

0


# Data

In [5]:
# train_2223 = "/kaggle/input/thesis-data/retrieval/train_2223.json"
# private_path = "/kaggle/input/thesis-data/private_test_2023.json"
# law_path = "/kaggle/input/thesis-data/law.json"
# law2223_path = "/kaggle/input/thesis-data/law_2223.json"
# private_2022 = "/kaggle/input/thesis-data/ALQAC22_test.json"
# train_one_sample = "/kaggle/input/thesis-data/train_one_sample.json"

# Kaggle input locations: the law corpus and the question sets referenced below.
law_path = "/kaggle/input/alqac-2024-data/law.json"
pub_test_2024 = "/kaggle/input/alqac-2024-data/data_2024/public_test.json"
train_2024 = "/kaggle/input/alqac-2024-data/data_2024/train.json" # not used
unverified_train_2024 = "/kaggle/input/alqac-2024-data/data_2024/unverified_train.json"
merged_2224_train = "/kaggle/input/alqac-2024-data/merged_train.json"
private_2023 = "/kaggle/input/alqac-2024-data/private_test_GOLD_TASK_1 (2).json"

# Additional retrieval data that already carries question ids (the "_2" file).
data_team_qa = "/kaggle/input/data-team-qa/retrieval_additional_data_formatted_2.json"

# 2024 private test questions.
private_2024 = "/kaggle/input/all-private-test-related-data-alqac-2024/private_test_TASK_1.json"

In [6]:
# def add_question_ids(json_data):
#     for i, question in enumerate(json_data):
#         question['question_id'] = i + 1
#     return json_data

# input_file = '/kaggle/input/data-team-qa/retrieval_additional_data_formatted.json'
# output_file = 'retrieval_additional_data_formatted_2.json'

# # Read the JSON data from the input file
# with open(input_file, 'r', encoding='utf-8') as f:
#     data = json.load(f)

# # Add question IDs to the data
# data_with_ids = add_question_ids(data)

# # Write the updated JSON data to the output file
# with open(output_file, 'w', encoding='utf-8') as f:
#     json.dump(data_with_ids, f, ensure_ascii=False, indent=4)

In [7]:
def random_num(start_ran, end_ran):
    """Return a random integer N with start_ran <= N <= end_ran (both ends inclusive)."""
    upper_exclusive = end_ran + 1
    return random.randrange(start_ran, upper_exclusive)

def load_json(path):
    """Read the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

In [8]:
def query_articles(query, corpus):
    """Look up the article referenced by `query` inside `corpus`.

    Args:
        query: mapping providing "law_id" and "article_id".
        corpus: iterable of laws; each law has an "id" and a list of
            "articles", each article having an "id" and a "text".

    Returns:
        "[Điều <article_id> <law_id>] <text>" for the first matching article,
        or None when no law/article pair matches.
    """
    wanted_law = query["law_id"]
    wanted_article = query["article_id"]

    for law in corpus:
        if law["id"] != wanted_law:
            continue
        for article in law["articles"]:
            if article["id"] == wanted_article:
                header = "[Điều " + wanted_article + " " + wanted_law + "] "
                return header + article["text"]

    return None

def rand_articles(query, corpus):
    """Return a randomly chosen article from the law referenced by `query`.

    The previous implementation called `random_num` with a single argument
    (it takes two) and assumed article ids are the strings "1".."N". The
    article is now drawn directly from the law's article list, which works
    for any id scheme.

    Args:
        query: mapping providing "law_id" (other keys are ignored).
        corpus: iterable of laws; each law has an "id" and a list of
            "articles", each article having an "id" and a "text".

    Returns:
        "[Điều <article_id> <law_id>] <text>" for the drawn article, or None
        when no law with that id has any article. The article the query
        itself points at is not excluded from the draw.
    """
    query_law_id = query["law_id"]

    for local_law in corpus:
        if local_law["id"] != query_law_id:
            continue
        articles = local_law["articles"]
        if not articles:
            # Nothing to draw from here; keep scanning like the original lookup did.
            continue
        picked = random.choice(articles)
        return "[Điều " + picked["id"] + " " + query_law_id + "] " + picked["text"]

    return None

def rand_database(list_rand, avoid_rule, num):
    """Draw `num` candidate texts at random from `list_rand`, never `avoid_rule`.

    Fixes over the previous version: it called `random_num` with one argument
    (a TypeError), hard-coded the index range 0..50, swallowed every exception
    with a bare `except`, and looped forever when no candidate was eligible.

    Args:
        list_rand: sequence of candidates, each a mapping with a "text" field
            that `detect_law` can parse.
        avoid_rule: the {"law_id", "article_id"} mapping to exclude.
        num: number of texts to return; drawn with replacement, so the
            result can contain repeats (as before).

    Returns:
        A list of `num` texts, or an empty list when `num` <= 0 or no
        candidate is eligible. Candidates are drawn from the whole of
        `list_rand`, not just its first 51 entries.
    """
    eligible = []
    for candidate in list_rand:
        try:
            if detect_law(candidate) == avoid_rule:
                continue
        except (KeyError, TypeError, AttributeError):
            # Malformed candidate (no usable "text" field): leave it out.
            continue
        eligible.append(candidate["text"])

    if num <= 0 or not eligible:
        return []
    return random.choices(eligible, k=num)

In [9]:
def concat_data(dataset_path, corpus_path):
    """Build positive (query, gold article) pairs from a question file.

    For each question the text is extended with its A-D choices when they are
    present, then paired with the first entry of "relevant_articles" looked up
    in the corpus. Both sides are whitespace-normalized with `pre_processing`.

    A question whose gold article cannot be found in the corpus is skipped
    with a warning; previously `pre_processing(None)` raised AttributeError.

    Args:
        dataset_path: JSON file with a list of questions ("text",
            "relevant_articles", optional "choices").
        corpus_path: JSON file with the law corpus.

    Returns:
        Dict with parallel lists "query", "article" and "label" (always 1).
    """
    import warnings  # local import keeps this cell self-contained

    local_dataset = load_json(dataset_path)
    local_corpus = load_json(corpus_path)

    query_list = []
    answer_list = []
    label_list = []
    for local_data in local_dataset:
        local_text = local_data["text"]
        try:
            local_choices = local_data["choices"]
            local_text += " " + ". ".join(local_choices[key] for key in ("A", "B", "C", "D"))
        except (KeyError, TypeError):
            # No (complete) multiple-choice block: keep the bare question text.
            pass

        local_articles = query_articles(local_data["relevant_articles"][0], local_corpus)
        if local_articles is None:
            warnings.warn(
                "concat_data: gold article %r of question %r not found in corpus; skipped"
                % (local_data["relevant_articles"][0], local_data.get("question_id"))
            )
            continue

        query_list.append(pre_processing(local_text))
        answer_list.append(pre_processing(local_articles))
        label_list.append(1)

    # Negative sampling (rand_database over local_data["top_n"], label 0) is
    # currently disabled, so only positive pairs are produced.

    return {
        "query": query_list,
        "article": answer_list,
        "label": label_list
    }

# concat_data("/kaggle/input/alqac-2023/top_50_new.json", "/kaggle/input/alqac-2023/law.json")

# BM25

In [10]:
import math

def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x) of a real number `x`.

    Evaluated in a numerically stable way: the naive form calls
    math.exp(-x), which raises OverflowError for large negative `x`
    (roughly x < -709). Here the exponent passed to math.exp is never
    positive, so the result saturates to 0.0 / 1.0 instead.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x use the equivalent form e**x / (1 + e**x).
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [11]:
def pre_processing(text):
    """Collapse every run of whitespace (newlines included) to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)

def write_jsonfile(filename, data):
    """Write `data` to `filename` as UTF-8 JSON (2-space indent, non-ASCII kept verbatim).

    The payload is serialized before the file is opened, so a serialization
    error does not truncate an existing file. The `with` block guarantees
    the handle is closed even if the write fails.
    """
    result = json.dumps(data, indent=2, ensure_ascii=False)
    with open(filename, "w", encoding="utf8") as myjsonfile:
        myjsonfile.write(result)
    
def create_bm_corpus(corpus_path):
    """Build a BM25 index over every article of the law corpus at `corpus_path`.

    Each article becomes one document of the form
    "[Điều <article_id> <law_id>] <whitespace-normalized text>", tokenized by
    splitting on single spaces.

    Returns:
        (BM25Okapi model, list of document strings) with matching order.
    """
    documents = []
    for law in load_json(corpus_path):
        law_id = law["id"]
        documents.extend(
            "[Điều " + article["id"] + " " + law_id + "] " + pre_processing(article["text"])
            for article in law["articles"]
        )

    tokenized_documents = [document.split(" ") for document in documents]
    return BM25Okapi(tokenized_documents), documents

def query_bm25(query, corpus, bm25_model, n):
    """Score `query` against the BM25 index and return the `n` best documents.

    Args:
        query: raw query string; whitespace-normalized and split on spaces.
        corpus: document strings in the same order the index was built from.
        bm25_model: object exposing get_scores(tokens).
        n: maximum number of results.

    Returns:
        List of {"score", "text"} dicts ordered from highest to lowest score.
    """
    query_tokens = pre_processing(query).split(" ")
    scores = bm25_model.get_scores(query_tokens)
    # Ascending argsort reversed gives best-first indices; keep the first n.
    ranked_indices = np.argsort(scores)[::-1][:n]
    return [{"score": scores[idx], "text": corpus[idx]} for idx in ranked_indices]
            
def get_top_n_list(dataset_path, corpus_path, top_k=50):
    """Retrieve the BM25 top-`top_k` articles for every question of a dataset.

    The question text is extended with its A-D choices when present and then
    used as the BM25 query.

    Changes over the previous version: the corpus is no longer loaded a
    second time into an unused variable, the bare `except` is narrowed to
    the errors a missing or incomplete "choices" block produces, and the
    loop variable is no longer overwritten by the output record.

    Args:
        dataset_path: JSON file with a list of questions ("question_id",
            "text", optional "choices").
        corpus_path: JSON file with the law corpus.
        top_k: number of candidates kept per question.

    Returns:
        List of {"question_id", "text", "top_n"} dicts, one per question;
        "relevant_articles" is not copied into the output.
    """
    local_dataset = load_json(dataset_path)
    bm25, corpus = create_bm_corpus(corpus_path)

    output_list = []

    for local_data in local_dataset:
        local_id = local_data["question_id"]
        local_text = local_data["text"]
        try:
            local_choices = local_data["choices"]
            local_text += " " + ". ".join(local_choices[key] for key in ("A", "B", "C", "D"))
        except (KeyError, TypeError):
            # No (complete) multiple-choice block: query with the bare text.
            pass
        output_list.append({
            "question_id": local_id,
            "text": local_text,
            "top_n": query_bm25(local_text, corpus, bm25, top_k)
        })

    return output_list

In [18]:
# # # Load the main JSON data
# # with open('data.json', 'r', encoding='utf-8') as file:
# #     data = json.load(file)


# # Create a dictionary to map queries to their relevant articles -> Lay label thoi, test k can
# # relevant_articles_map = {}
# # for item in relevant_data:
# #     query = item['text']
# #     relevant_articles = item['relevant_articles']
# #     relevant_articles_map[query] = {(ra['law_id'], ra['article_id']) for ra in relevant_articles}

# # Initialize an empty list to store the processed data
# rows = []

# # Function to extract law_id and article_id
# def extract_law_article_info(text):
#     data = text.split("]")[0][1:]
#     data = data.split()
#     return {
#         "law_id": " ".join(data[2:]),
#         "article_id": data[1]
#     }

# # Process each question
# for question in private_24_data:
#     question_id = question['question_id']
#     query = question['text']
#     query_length = len(query.split())

#     # Extract top_n data
#     top_n = question['top_n']
#     scores = [item['score'] for item in top_n]
#     min_score, max_score = min(scores), max(scores)

#     for item in top_n:
#         content = item['text']
#         bm25_score = item['score']
#         bm25_score_scaled = (bm25_score - min_score) / (max_score - min_score) if max_score > min_score else 0
        
#         # Extract law_id and article_id from the content
#         law_article_info = extract_law_article_info(content)
#         law_id = law_article_info['law_id']
#         article_id = law_article_info['article_id']
        
#         # Clean the content to remove the [law_id article_id] part
#         content_cleaned = ' '.join(content.split('] ', 1)[1:])
        
#         # Calculate the length of the article content
#         article_length = len(content_cleaned.split())
        
#         # Determine the label
#         label = 1 if (law_id, article_id) in relevant_articles_map.get(query, set()) else 0

#         # Append the row data
#         rows.append({
#             'question_id': f'qa_data_{question_id}',
#             'query': query,
#             'law_id': law_id,
#             'article_id': article_id,
#             'content': content_cleaned,
#             'bm25_score': bm25_score,
#             'bm25_score_scaled': bm25_score_scaled,
#             'label': label,
#             'query_length': query_length,
#             'article_length': article_length
#         })

# # Create a DataFrame from the rows list
# df = pd.DataFrame(rows)

In [19]:
# NOTE(review): within the visible cells `df` is only assigned by a later cell
# (the one ending in `df = pd.DataFrame(rows)`); the builder cell directly
# above is fully commented out, so this fails on a fresh top-to-bottom run.
df.head(10)

Unnamed: 0,question_id,query,law_id,article_id,content,bm25_score,bm25_score_scaled,label,query_length,article_length
0,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,6,Các nguyên tắc quản lý viên chức 1. Bảo đảm sự...,91.59893,1.0,0,47,166
1,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,7,Vị trí việc làm 1. Vị trí việc làm là công việ...,70.130809,0.650504,0,47,98
2,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,31,"Bổ nhiệm, thay đổi chức danh nghề nghiệp 1. Vi...",66.734567,0.595214,0,47,241
3,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,37,Bổ nhiệm viên chức quản lý 1. Việc bổ nhiệm vi...,64.113004,0.552536,0,47,265
4,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,32,Thay đổi vị trí việc làm 1. Khi đơn vị sự nghi...,63.77265,0.546995,0,47,143
5,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,20,Căn cứ tuyển dụng Việc tuyển dụng viên chức ph...,58.408925,0.459675,0,47,38
6,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,3,"Giải thích từ ngữ Trong Luật này, các từ ngữ d...",57.645271,0.447243,0,47,251
7,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,41,Nội dung đánh giá viên chức 1. Việc đánh giá v...,57.635767,0.447088,0,47,194
8,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,33,"Chế độ đào tạo, bồi dưỡng viên chức 1. Việc đà...",56.927364,0.435555,0,47,197
9,qa_data_04-DS-1,"Trong nguyên tắc về quản lý viên chức, việc tu...",Luật Viên chức,40,Căn cứ đánh giá viên chức Việc đánh giá viên c...,56.527548,0.429047,0,47,47


In [20]:
# Persist the feature table as CSV, omitting the DataFrame index column.
df.to_csv("bm25_len_output_data_private_24.csv", index = False)

In [14]:
# train_2223 = get_top_n_list(train_2223, law2223_path,100)
# write_jsonfile("top_100_train.json", train_2223)

# BM25 top-100 candidates for every question in `merged_2224_train`, saved as JSON.
train_2224_data = get_top_n_list(merged_2224_train, law_path,100)
write_jsonfile("top_100_train_2224.json", train_2224_data)

In [15]:
# private = get_top_n_list(private_path, law_path,100)
# write_jsonfile("top_100_private_test.json", private)

# BM25 top-100 candidates for every question in `pub_test_2024`, saved as JSON.
pub_test_2024_data = get_top_n_list(pub_test_2024, law_path,100)
write_jsonfile("top_100_pub_test_2024.json", pub_test_2024_data)

In [11]:
# priv_2022 = get_top_n_list(private_2022, law2223_path,100)
# write_jsonfile("top_100_priv_2022.json", priv_2022)

# BM25 top-100 candidates for every question in `private_2023`, saved as JSON.
priv_test_2023_data = get_top_n_list(private_2023, law_path,100)
write_jsonfile("top_100_priv_test_2023.json", priv_test_2023_data)

In [17]:
# BM25 top-100 candidates for every question in `unverified_train_2024`, saved as JSON.
unverified_train_2024_data = get_top_n_list(unverified_train_2024, law_path,100)
write_jsonfile("top_100_unverified_train_2024.json", unverified_train_2024_data)

In [14]:
def concat_file(dataset_path, corpus_path, is_train):
    """Flatten a BM25 top-n file into parallel query / article / label lists.

    Every candidate in a question's "top_n" becomes a row with label 0. When
    `is_train` is true, the first entry of "relevant_articles" is looked up in
    the corpus and added before the candidates as a label-1 row (both sides
    whitespace-normalized with `pre_processing`).

    If the gold article is not found in the corpus the positive row is
    skipped with a warning; previously `pre_processing(None)` raised
    AttributeError.

    Args:
        dataset_path: JSON file with questions carrying "text" and "top_n"
            (and "relevant_articles" when `is_train` is true).
        corpus_path: JSON file with the law corpus.
        is_train: whether to add the gold article as a positive row.

    Returns:
        Dict with parallel lists "query", "article" and "label".
    """
    import warnings  # local import keeps this cell self-contained

    local_dataset = load_json(dataset_path)
    local_corpus = load_json(corpus_path)

    query_list = []
    answer_list = []
    label_list = []
    for local_data in local_dataset:
        local_text = local_data["text"]
        if is_train:
            local_articles = query_articles(local_data["relevant_articles"][0], local_corpus)
            if local_articles is None:
                warnings.warn(
                    "concat_file: gold article %r of question %r not found in corpus; "
                    "positive row skipped"
                    % (local_data["relevant_articles"][0], local_data.get("question_id"))
                )
            else:
                query_list.append(pre_processing(local_text))
                answer_list.append(pre_processing(local_articles))
                label_list.append(1)
        # NOTE(review): every candidate gets label 0, including the gold
        # article if it also appears in "top_n" — confirm whether that pair
        # should be dropped or relabelled for training.
        for candidate in local_data["top_n"]:
            query_list.append(local_text)
            answer_list.append(candidate["text"])
            label_list.append(0)

    return {
        "query": query_list,
        "article": answer_list,
        "label": label_list
    }

# concat_test("/kaggle/working/top_50_test.json", corpus_path)

In [15]:
def detect_law(data):
    """Parse the "[Điều <article_id> <law_id>]" header at the start of data["text"].

    Everything before the first "]" is taken, its first character dropped,
    and the rest split on whitespace: the second token is the article id and
    all following tokens form the law id. Missing parts yield empty strings.

    Returns:
        Dict with keys "law_id" and "article_id".
    """
    header = data["text"].split("]")[0]
    tokens = header[1:].split()
    law_id = " ".join(tokens[2:])
    article_id = " ".join(tokens[1:2])
    return {"law_id": law_id, "article_id": article_id}

In [16]:
def create_output(output_path,top_k=1):
    """Turn a BM25 top-n JSON file into per-question prediction records.

    Args:
        output_path: JSON file with a list of {"question_id", "top_n"} dicts.
        top_k: how many leading "top_n" candidates to keep per question.

    Returns:
        List of {"question_id", "relevant_articles"} dicts, where
        "relevant_articles" holds the `detect_law` result of each kept candidate.
    """
    return [
        {
            "question_id": entry["question_id"],
            "relevant_articles": [detect_law(candidate) for candidate in entry["top_n"][:top_k]],
        }
        for entry in load_json(output_path)
    ]
    
# create_output("/kaggle/working/top_50_test.json")
# write_jsonfile("results_v2.json", create_output("/kaggle/working/top_10_public_test.json",3))

# Create top-k BM25 outputs

In [21]:
# top_k_list = [1,2,5,10,20,50,100]

# for k in top_k_list:
#     out_file = "top_" + str(k) + "_public_test.json"
#     output = create_output("/kaggle/working/top_100_private_test.json",k)
#     write_jsonfile(out_file, output)

In [None]:
# output_train = create_output("/kaggle/working/top_100_train.json",100)
# write_jsonfile("top_100_private_eval.json", output_train)

# output_private = create_output("/kaggle/working/top_100_private_test.json",100)
# write_jsonfile("top_100_train_eval.json", output_private)

In [18]:
# output_priv_22 = create_output("/kaggle/working/top_100_priv_2022.json",5)
# write_jsonfile("top_5_eval_priv_22.json", output_priv_22)

In [28]:
# Convert the top-100 candidates of the data-team QA set into prediction records.
# NOTE(review): no visible cell writes "top_100_data_team_qa.json" (there is no
# get_top_n_list call for `data_team_qa`) — confirm where this file comes from.
output_data_qa = create_output("/kaggle/working/top_100_data_team_qa.json",100)
write_jsonfile("top_100_train_data_qa_eval.json", output_data_qa)

In [24]:
# New --------------------------

# Convert each BM25 top-100 file into {question_id, relevant_articles} records.
output_train_2224 = create_output("/kaggle/working/top_100_train_2224.json",100)
write_jsonfile("top_100_train_2224_eval.json", output_train_2224)

# NOTE(review): the output name below is spelled "unverfied"; check what
# downstream readers expect before renaming it.
output_unverified_train_24 = create_output("/kaggle/working/top_100_unverified_train_2024.json",100)
write_jsonfile("top_100_unverfied_train_24_eval.json", output_unverified_train_24)

output_pub_test_24 = create_output("/kaggle/working/top_100_pub_test_2024.json",100)
write_jsonfile("top_100_pub_test_24_eval.json", output_pub_test_24)

# Only the 5 best candidates per question are kept here.
# NOTE(review): the input is the 2023 private-test file but the output name
# says "_22_" — confirm which year the name should carry.
output_priv_test_23 = create_output("/kaggle/working/top_100_priv_test_2023.json",5)
write_jsonfile("top_5_priv_test_22_eval.json", output_priv_test_23)

In [16]:
# Same 2023 private-test candidates as above, keeping all 100 per question.
output_priv_test_23_100 = create_output("/kaggle/working/top_100_priv_test_2023.json",100)
write_jsonfile("top_100_priv_test_23_eval.json", output_priv_test_23_100)

In [17]:
# import time
# start_time = time.time()
# train_one = get_top_n_list(train_2223, law2223_path,100)
# write_jsonfile("top_100_train_one.json", train_one)

# output_private = create_output("/kaggle/working/top_100_train_one.json",100)
# print("--- %s seconds ---" % (time.time() - start_time))
# write_jsonfile("top_100_train_one_eval.json", output_private)

--- 20.325040102005005 seconds ---


In [19]:
# start_time = time.time()
# private = get_top_n_list(private_path, law_path,100)
# write_jsonfile("top_100_private_test.json", private)

# output_private = create_output("/kaggle/working/top_100_private_test.json",100)
# write_jsonfile("top_100_private_eval.json", output_private)
# print("--- %s seconds ---" % (time.time() - start_time))

--- 3.6871795654296875 seconds ---


### JSON to CSV

In [None]:
# Accumulates one dict per (question, BM25 candidate) pair; converted to a
# DataFrame once the loop further down has finished.
rows = []

# Function to extract law_id and article_id
def extract_law_article_info(text):
    """Parse the "[Điều <article_id> <law_id>]" header at the start of `text`.

    Everything before the first "]" is taken, its first character dropped,
    and the rest split on whitespace: the second token is the article id and
    all following tokens form the law id.

    A header with fewer than two tokens now yields empty strings instead of
    raising IndexError, which matches how `detect_law` treats the same input.

    Returns:
        Dict with keys "law_id" and "article_id".
    """
    data = text.split("]")[0][1:].split()
    return {
        "law_id": " ".join(data[2:]),
        # Slice instead of data[1] so a short header gives "" rather than an error.
        "article_id": " ".join(data[1:2])
    }

# Build one feature row per (question, BM25 candidate) pair.
# NOTE(review): `private_24_data` and `relevant_articles_map` are not assigned
# in any visible cell (the only construction of the map is commented out), so
# this loop raises NameError on a fresh kernel — confirm where they come from.
for question in private_24_data:
    question_id = question['question_id']
    query = question['text']
    # Query length in whitespace-separated tokens.
    query_length = len(query.split())

    # Per-question score range, used for min-max scaling below.
    # min()/max() raise ValueError if a question has an empty top_n list.
    top_n = question['top_n']
    scores = [item['score'] for item in top_n]
    min_score, max_score = min(scores), max(scores)

    for item in top_n:
        content = item['text']
        bm25_score = item['score']
        # Min-max scale within this question; 0 when all scores are equal.
        bm25_score_scaled = (bm25_score - min_score) / (max_score - min_score) if max_score > min_score else 0
        
        # Extract law_id and article_id from the bracketed header of the content
        law_article_info = extract_law_article_info(content)
        law_id = law_article_info['law_id']
        article_id = law_article_info['article_id']
        
        # Drop the leading "[Điều <article_id> <law_id>] " header; yields ""
        # when the content contains no "] " separator.
        content_cleaned = ' '.join(content.split('] ', 1)[1:])
        
        # Article length in whitespace-separated tokens (header excluded)
        article_length = len(content_cleaned.split())
        
        # Label 1 when (law_id, article_id) is in the set stored for this query
        # text; a query missing from the map gets label 0 for every candidate.
        label = 1 if (law_id, article_id) in relevant_articles_map.get(query, set()) else 0

        # Append the row data
        # NOTE(review): ids are prefixed with 'qa_data_' although the loop runs
        # over `private_24_data` — confirm the prefix is intended.
        rows.append({
            'question_id': f'qa_data_{question_id}',
            'query': query,
            'law_id': law_id,
            'article_id': article_id,
            'content': content_cleaned,
            'bm25_score': bm25_score,
            'bm25_score_scaled': bm25_score_scaled,
            'label': label,
            'query_length': query_length,
            'article_length': article_length
        })

# Create a DataFrame from the rows list
df = pd.DataFrame(rows)