In [1]:
import json
import random

# Load the question and law data
def _load_json(path):
    """Read and parse one JSON file.

    The file is decoded explicitly as UTF-8 (the standard encoding for JSON)
    instead of relying on the platform's locale default, so non-ASCII text
    is read the same way on every machine.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


question_data_train = _load_json('/kaggle/input/data-qa-991/question_9.91_train_gui_duy.json')
question_data_test = _load_json('/kaggle/input/data-qa-991/question_9.91_test_gui_duy.json')
law_data = _load_json('/kaggle/input/law-non-dup-12/law_nondup copy 12.json')

In [2]:
# Index every law by its 'id' so a law can be fetched directly by id.
# If two laws share an id, the one appearing later in law_data is kept.
law_dict = {entry['id']: entry for entry in law_data}

In [3]:
# Default number of negative articles sampled per relevant law
# (used for both hard and soft negatives, and in the output file names).
number_of_negative_samples = 2

# Function to find random articles from the same law, excluding the relevant ones
def find_hard_negative(law, relevant_id, num_samples=None):
    """Sample "hard" negatives: articles of ``law`` that are not relevant.

    Args:
        law: A law dict whose ``'content'`` is a list of chapters. Each chapter
            has ``'id_Chapter'`` and ``'content_Chapter'`` (a list of sections);
            each section has ``'id_Section'`` and ``'content_Section'`` (a list
            of articles); each article has ``'id_Article'`` and
            ``'content_Article'``.
        relevant_id: Container of ``(id_Chapter, id_Section, id_Article)``
            tuples identifying the articles that must NOT be returned.
        num_samples: Maximum number of articles to return. ``None`` (the
            default) falls back to the module-level
            ``number_of_negative_samples``.

    Returns:
        A list with up to ``num_samples`` article contents chosen without
        replacement, or ``None`` when the law has no non-relevant article.
    """
    if num_samples is None:
        num_samples = number_of_negative_samples

    # Flatten chapter -> section -> article, keeping only non-relevant articles.
    candidates = [
        article['content_Article']
        for chapter in law['content']
        for section in chapter['content_Chapter']
        for article in section['content_Section']
        if (chapter['id_Chapter'], section['id_Section'], article['id_Article']) not in relevant_id
    ]
    if not candidates:
        return None

    # Never ask for more items than exist: random.sample would raise ValueError.
    return random.sample(candidates, min(num_samples, len(candidates)))

# Helper: tell whether any of the given laws contains at least one article
def _laws_have_articles(law_dict, law_ids):
    """Return True if any law in ``law_ids`` has a section with an article."""
    return any(
        section['content_Section']
        for law_id in law_ids
        for chapter in law_dict[law_id]['content']
        for section in chapter['content_Chapter']
    )


# Function to find random articles from laws other than the correct one
def find_soft_negative(law_dict, correct_law_id, num_laws=2, num_chapters=2, num_sections=2, num_articles=2,
                       num_samples=None):
    """Sample "soft" negatives: articles taken from laws other than the correct one.

    Each pass picks up to ``num_laws`` random other laws, then up to
    ``num_chapters`` chapters per law, ``num_sections`` sections per chapter
    and ``num_articles`` articles per section, and pools the article contents.
    Passes repeat until the pool holds at least ``num_samples`` entries; the
    result is then drawn from the pool without replacement. Because the pool
    accumulates across passes, the same article can appear in it more than once.

    Args:
        law_dict: Mapping from law id to law dict (chapters under ``'content'``,
            sections under ``'content_Chapter'``, articles under
            ``'content_Section'``, article text under ``'content_Article'``).
        correct_law_id: Id of the law that must be excluded from sampling.
        num_laws, num_chapters, num_sections, num_articles: Upper bounds on how
            many items are drawn at each level per pass. Each must be >= 1.
        num_samples: Number of articles to return. ``None`` (the default)
            falls back to the module-level ``number_of_negative_samples``.

    Returns:
        A list of ``num_samples`` article contents, or ``None`` when
        ``num_samples`` is not positive, when there is no other law, or when
        no other law contains any article.

    Raises:
        ValueError: If any of the per-level counts is smaller than 1.
    """
    if num_samples is None:
        num_samples = number_of_negative_samples
    if num_samples <= 0:
        return None
    if min(num_laws, num_chapters, num_sections, num_articles) < 1:
        # A zero count could never collect anything and would loop forever.
        raise ValueError("num_laws, num_chapters, num_sections and num_articles must all be >= 1")

    # The candidate laws do not change between passes, so compute them once.
    other_law_ids = [law_id for law_id in law_dict if law_id != correct_law_id]
    if not other_law_ids:
        return None

    pool = []
    confirmed_articles_exist = False
    while len(pool) < num_samples:
        size_before_pass = len(pool)

        # Clamp every sample size to the population size; random.sample raises
        # ValueError when asked for more items than are available.
        for law_id in random.sample(other_law_ids, min(num_laws, len(other_law_ids))):
            chapters = law_dict[law_id]['content']
            for chapter in random.sample(chapters, min(num_chapters, len(chapters))):
                sections = chapter['content_Chapter']
                for section in random.sample(sections, min(num_sections, len(sections))):
                    articles = section['content_Section']
                    for article in random.sample(articles, min(num_articles, len(articles))):
                        pool.append(article['content_Article'])

        # An empty pass may just be bad luck, or may mean there is nothing to
        # find at all. Check once so the loop cannot spin forever.
        if len(pool) == size_before_pass and not confirmed_articles_exist:
            if not _laws_have_articles(law_dict, other_law_ids):
                break
            confirmed_articles_exist = True

    return random.sample(pool, min(num_samples, len(pool))) if pool else None

# Helper: iterate over the articles of a law that match a (chapter, section, article) id triple
def _matching_articles(law, chapter_id, section_id, article_id):
    """Yield every article of ``law`` located at the given chapter/section/article ids.

    Normally at most one article matches; all matches are yielded so that
    duplicated ids in the data are handled the same way as a plain scan.
    """
    for chapter in law['content']:
        if chapter['id_Chapter'] != chapter_id:
            continue
        for section in chapter['content_Chapter']:
            if section['id_Section'] != section_id:
                continue
            for article in section['content_Section']:
                if article['id_Article'] == article_id:
                    yield article


# Function to process the questions and match the relevant laws
def process_questions(question_data, law_dict):
    """Attach article text and sampled negatives to every relevant law, in place.

    For each entry of ``question['relevant_laws']`` whose law and article can
    be found in ``law_dict``, three keys are added to that entry:

    * ``'content'``: text of the referenced article,
    * ``'soft_negative'``: result of ``find_soft_negative`` (other laws),
    * ``'hard_negative'``: result of ``find_hard_negative`` (same law).

    Entries whose ``'id_Law'`` is not in ``law_dict``, or whose article cannot
    be located inside the law, are left unchanged.

    Args:
        question_data: Iterable of question dicts, each with a
            ``'relevant_laws'`` list of dicts holding ``'id_Law'``,
            ``'id_Chapter'``, ``'id_Section'`` and ``'id_Article'``.
        law_dict: Mapping from law id to law dict.

    Returns:
        None. ``question_data`` is modified in place.
    """
    for question in question_data:
        relevant_laws = question['relevant_laws']

        # Group the relevant (chapter, section, article) keys by law. Those ids
        # are only compared inside one law when hard negatives are drawn, so a
        # relevant article of law B must not remove the same-numbered article
        # of law A from A's hard-negative pool.
        relevant_ids_by_law = {}
        for relevant_law in relevant_laws:
            article_key = (relevant_law['id_Chapter'], relevant_law['id_Section'], relevant_law['id_Article'])
            relevant_ids_by_law.setdefault(relevant_law['id_Law'], []).append(article_key)

        for relevant_law in relevant_laws:
            law_id = relevant_law['id_Law']
            if law_id not in law_dict:
                # Unknown law: nothing to attach.
                continue
            law = law_dict[law_id]

            matches = _matching_articles(
                law,
                relevant_law['id_Chapter'],
                relevant_law['id_Section'],
                relevant_law['id_Article'],
            )
            for article in matches:
                # Add the content of the article to the relevant law in the question
                relevant_law['content'] = article['content_Article']

                # Add soft negatives (articles from other laws)
                relevant_law['soft_negative'] = find_soft_negative(law_dict, law_id)

                # Add hard negatives (other articles of the same law)
                relevant_law['hard_negative'] = find_hard_negative(law, relevant_ids_by_law[law_id])


In [4]:
# Seed the random module before sampling negatives so that a fresh
# "Restart & Run All" regenerates exactly the same train/test files.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Both calls modify the loaded question lists in place, adding 'content',
# 'soft_negative' and 'hard_negative' to each relevant law that can be resolved.
process_questions(question_data_train, law_dict)
process_questions(question_data_test, law_dict)

In [5]:
# Save the updated question data.
# ensure_ascii=False writes non-ASCII characters verbatim, so the files are
# opened explicitly as UTF-8; relying on the locale default could raise
# UnicodeEncodeError or produce files that are not valid UTF-8 JSON.
for split_name, split_data in (('train', question_data_train), ('test', question_data_test)):
    output_path = f'updated_question_file_{split_name}_{number_of_negative_samples}negative.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(split_data, f, ensure_ascii=False, indent=4)