In [1]:
import re
import os
import sys
import shutil
import json
from glob import glob
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv('.env')
from openai import OpenAI
client = OpenAI()
import numpy as np
import pandas as pd
import time
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def read_dir(directory):
    """Load every ``.txt`` file found under ``directory`` (recursively).

    Args:
        directory: Root folder to search.

    Returns:
        A ``(contents, paths)`` tuple of two parallel lists: the UTF-8 decoded
        text of each file and the path it was read from, in glob order.
    """
    pattern = os.path.join(directory, '**/*.txt')
    contents, paths = [], []
    for txt_path in tqdm(glob(pattern, recursive=True)):
        with open(txt_path, 'r', encoding="utf-8") as handle:
            contents.append(handle.read())
        paths.append(txt_path)
    return contents, paths

## **Analyze our legal document length distribution**

In [None]:
# read_dir returns (contents, paths); unpack both so `contents` is the list of
# document texts rather than a 2-tuple of lists.
contents, paths = read_dir("_ตรวจแล้ว")

In [None]:
# Character count of every document.
lengths = list(map(len, contents))
# Number of blank-line-separated chunks in every document.
paragraphs = [len(document.split("\n\n")) for document in contents]

In [None]:
# Histogram of document lengths (in characters).
fig_len, ax_len = plt.subplots(figsize=(10, 5))
ax_len.hist(lengths, bins=30, alpha=0.7, label='Length of strings')
ax_len.set_xlabel('Length')
ax_len.set_ylabel('Frequency')
ax_len.set_title('Distribution of String Lengths')
ax_len.legend()
plt.show()

# Histogram of the number of blank-line-separated chunks per document.
fig_split, ax_split = plt.subplots(figsize=(10, 5))
ax_split.hist(paragraphs, bins=30, alpha=0.7, color='orange', label='Count of "\\n\\n" splits')
ax_split.set_xlabel('Count of "\\n\\n" splits')
ax_split.set_ylabel('Frequency')
ax_split.set_title('Distribution of "\\n\\n" Splits in Strings')
ax_split.legend()
plt.show()

In [None]:
# Build a frame with one row per document plus its length and "\n\n" count.
df = pd.DataFrame({'content': contents})
df['length'] = df['content'].apply(len)
df['split_count'] = df['content'].apply(lambda x: x.count("\n\n"))

# Quartiles of the split counts are used as the inner bin edges.
split_count_quantiles = df['split_count'].quantile([0.25, 0.5, 0.75]).tolist()

# Bin edges: -1 (so that a count of 0 falls in the first bin), the quartiles,
# and the maximum. Quartiles can coincide with each other or with the maximum
# when many documents share the same count; pd.cut rejects repeated edges, so
# collapse duplicates before building the labels.
raw_edges = [-1] + split_count_quantiles + [df['split_count'].max()]
split_bins = sorted({float(edge) for edge in raw_edges})
split_labels = [f"{int(split_bins[i]) + 1} to {int(split_bins[i+1])}" for i in range(len(split_bins)-1)]

# Assign every document to its bin.
df['split_count_bin'] = pd.cut(df['split_count'], bins=split_bins, labels=split_labels, include_lowest=True)

# Count documents per bin, keeping the bins in ascending order.
split_count_bin_counts = df['split_count_bin'].value_counts().reindex(split_labels)

# Bar chart of the "\n\n" split counts per bin.
plt.figure(figsize=(10, 6))
split_count_bin_counts.plot(kind='bar', color='orange')
plt.title('Distribution of "\\n\\n" Split Counts by Detailed Bins')
plt.xlabel('Split Count Range')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def get_n(n):
    """Return how many Q&A pairs to request for a document with ``n`` chunks.

    The original final branch returned ``None``, which made the estimate loop
    fail on long documents. The thresholds now match the ``get_n`` used in the
    "Generating dataset" section so the estimate agrees with what is generated.

    Args:
        n: Number of blank-line-separated chunks in the document.

    Returns:
        3 for n < 5, 5 for n < 8, 8 for n < 13, 14 for n > 18, otherwise 10.
    """
    if n<5: return 3
    elif n<8: return 5
    elif n<13: return 8
    elif n>18: return 14
    else: return 10

In [None]:
# Total number of Q&A pairs that would be requested across all documents.
ns = sum(get_n(p) for p in paragraphs)
print("total estimated questions:", ns)

## **Generating dataset**

In [3]:
def get_n(n):
    """Return how many Q&A pairs to request for a document with ``n`` chunks.

    Args:
        n: Number of blank-line-separated chunks in the document.

    Returns:
        3 for n < 5, 5 for n < 8, 8 for n < 13, 14 for n > 18, otherwise 10.
    """
    # Exclusive upper bounds for the small-document tiers, checked in order.
    for upper_bound, question_count in ((5, 3), (8, 5), (13, 8)):
        if n < upper_bound:
            return question_count
    if n > 18:
        return 14
    return 10

In [4]:
def ask_gpt(
    messages,
    model="gpt-3.5-turbo-1106",
):
    """Send a chat-completion request and report success or failure.

    Args:
        messages: List of chat message dicts, e.g.
            ``[{"role": "system", "content": "..."},
               {"role": "user", "content": "..."}]``.
        model: OpenAI model name.

    Returns:
        ``(True, text)`` with the content of the first choice on success, or
        ``(False, exception)`` when anything raised; the exception is also
        printed.
    """
    request_kwargs = dict(
        model=model,
        messages=messages,
        # default settings
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    try:
        completion = client.chat.completions.create(**request_kwargs)
        answer = completion.choices[0].message.content
    except Exception as error:
        print(f"{error}")
        return False, error
    return True, answer

In [5]:
# Prompt pieces that are concatenated into a single instruction.
# All three refer to the source text by the same tag, "#given knowledge#".

# Task description; `{n}` is the number of Q&A pairs to request.
INSTRUCTION_TEMPLATE = """Act as a dataset generator.
Generate {n} legal question and answer pairs dataset based on the legal facts in a #given knowledge# section by following this requirements:
- The generated contents must be in Thai language.
- Don't use the same phrase/wording directly from #given knowledge# in a generated question.
- Ensure that all text are clear and unambiguous. Specifically, avoid using phrases like "เอกสารนี้ (this document)", pronouns or similar constructs where the object of reference is not explicitly stated.
- Use full reference to the law, for examples, "มาตรา 1 ของพรบ.คอมพิวเตอร์" instead of "มาตรา 1"
- The question must be one of the following types:
    (1) Asking for definition or declaration, for examples, "มาตราที่ 1 ของพรบ.คอมพิวเตอร์เกี่ยวกับอะไร"
    (2) Situation based, for examples, "นายสมชายต่อยเพื่อน ผิดกฎหมายข้อไหน"
    (3) Seeking advice, for examples, "ถ้าเราโดนโกงเงินจากบัญชีธนาคารเราจะทำอย่างไรดี"
- The answer must also include references, for examples, "อ้างอิงจากมาตรา 1,2 และ 4 ของพรบ.คอมพิวเตอร์"
- You must think in a following way: Issue, Rule, Application, Conclusion (IRAC).
"""

# Source text section; `{knowledge}` is the raw document content.
INPUT_TEMPLATE = """
#given knowledge#
{knowledge}
"""

# Output-format section. Braces are doubled so the text survives str.format().
# Keys are double-quoted so that a model copying the example emits valid JSON
# that json.loads can parse.
RESPONSE_TEMPLATE = """
The response must include question, its references from #given knowledge# and the answer in a JSON format as following:
{{
    "1": {{
        "question": "...",
        "references": "...",
        "answer": "..."
    }},
    ...
}}

Response:"""

In [6]:
def process_content(content_path_tuple):
    """Ask the model to generate Q&A pairs for one document.

    Args:
        content_path_tuple: ``(content, path)`` where ``content`` is the
            document text and ``path`` identifies its source file.

    Returns:
        ``(response, path)`` where ``response`` is the raw model reply, or
        ``None`` when the request failed.
    """
    content, path = content_path_tuple
    n = get_n(len(content.split("\n\n")))
    instruction = (
        INSTRUCTION_TEMPLATE.format(n=n)
        + INPUT_TEMPLATE.format(knowledge=content)
        # RESPONSE_TEMPLATE escapes its braces as "{{"/"}}"; format() turns
        # them back into single braces so the model sees a normal JSON example.
        + RESPONSE_TEMPLATE.format()
    )
    messages = [
        {"role": "system", "content": instruction},
    ]
    isSuccess, response = ask_gpt(messages, model="gpt-4-0125-preview")
    return (response if isSuccess else None, path)
    
def run_in_parallel(contents, paths, num_threads=2):
    """Run ``process_content`` over all documents using a thread pool.

    Args:
        contents: List of document texts.
        paths: List of source paths, parallel to ``contents``.
        num_threads: Maximum number of worker threads.

    Returns:
        A list of ``(response, path)`` tuples in the same order as the input
        pairs, regardless of the order in which the requests finished.
    """
    # Pair each document with its path so a worker receives a single argument.
    content_path_tuples = list(zip(contents, paths))

    # Sized from the zipped pairs so every slot is guaranteed to be filled.
    results = [None] * len(content_path_tuples)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember each future's input position to restore the input order.
        future_to_index = {
            executor.submit(process_content, cpt): i
            for i, cpt in enumerate(content_path_tuples)
        }

        # as_completed yields a generator with no length, so pass the total
        # explicitly to get a real progress bar instead of a bare counter.
        for future in tqdm(as_completed(future_to_index), total=len(future_to_index)):
            index = future_to_index[future]
            response, path = future.result()
            results[index] = (response, path)

    return results

In [7]:
# Generate Q&A pairs for every document of one law and save them as CSV,
# together with a per-file record of expected vs. actually parsed pair counts.
legal_name = "พระราชบัญญัติว่าด้วยราคาสินค้าและบริการ"
contents, paths = read_dir(f"_ตรวจแล้ว/{legal_name}")
results = run_in_parallel(contents, paths, num_threads=4)

# Expected pair count per source file; "actual" is filled in while parsing.
metadata = {
    path: {"expected": get_n(len(content.split("\n\n"))), "actual": 0}
    for content, path in zip(contents, paths)
}

rows = []
for response_str, path in results:
    if not response_str:
        # Request failed: keep an empty placeholder row for this source.
        rows.append([None, None, None, path])
        continue
    start_index = response_str.find('{')
    end_index = response_str.rfind('}')
    try:
        if start_index == -1 or end_index < start_index:
            raise ValueError("JSON not found")
        response = json.loads(response_str[start_index:end_index + 1])
        # Build all rows first so a malformed entry cannot leave a partial
        # set of rows behind next to the error row.
        parsed_rows = [
            [value['question'], value['answer'], value['references'], path]
            for value in response.values()
        ]
    except Exception:
        # Keep the raw reply in `references` so it can be repaired later,
        # instead of aborting the whole batch.
        rows.append([None, None, response_str, path])
        continue
    rows.extend(parsed_rows)
    metadata[path]["actual"] = len(parsed_rows)

df = pd.DataFrame(rows, columns=['question', 'answer', 'references', 'source'])

display(df.head(2))
# Metadata goes to dataset/metadata/, which is where the checking and fixing
# cells further down read it from.
os.makedirs("dataset/metadata", exist_ok=True)
if os.path.exists(f"dataset/{legal_name}.csv"):
    shutil.copy(f"dataset/{legal_name}.csv", f"dataset/{legal_name}_backup.csv")
df.to_csv(f"dataset/{legal_name}.csv", index=False, encoding="utf-8")
with open(f"dataset/metadata/metadata_{legal_name}.json", 'w', encoding="utf-8") as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)
print(sum([get_n(len(content.split("\n\n"))) for content in contents]), len(df))

100%|██████████| 6/6 [00:00<00:00, 4978.40it/s]
6it [05:32, 55.46s/it] 


Unnamed: 0,question,answer,references,source
0,พระราชบัญญัติว่าด้วยราคาสินค้าและบริการ พ.ศ. 2...,มีวัตถุประสงค์เพื่อปรับปรุงกฎหมายว่าด้วยการกำห...,หมายเหตุจากพระราชบัญญัติว่าด้วยราคาสินค้าและบร...,_ตรวจแล้ว/พระราชบัญญัติว่าด้วยราคาสินค้าและบริ...
1,การจำหน่ายตามพระราชบัญญัติว่าด้วยราคาสินค้าและ...,"หมายถึงการขาย, แลกเปลี่ยน, ให้, จ่ายแจก, โอนสิ...",มาตรา 4 ของพระราชบัญญัติว่าด้วยราคาสินค้าและบร...,_ตรวจแล้ว/พระราชบัญญัติว่าด้วยราคาสินค้าและบริ...


45 45


In [8]:
# Generate Q&A pairs for every document of one law and save them as CSV,
# together with a per-file record of expected vs. actually parsed pair counts.
legal_name = "พระราชบัญญัติการแข่งขันทางการค้า"
contents, paths = read_dir(f"_ตรวจแล้ว/{legal_name}")
results = run_in_parallel(contents, paths, num_threads=4)

# Expected pair count per source file; "actual" is filled in while parsing.
metadata = {
    path: {"expected": get_n(len(content.split("\n\n"))), "actual": 0}
    for content, path in zip(contents, paths)
}

rows = []
for response_str, path in results:
    if not response_str:
        # Request failed: keep an empty placeholder row for this source.
        rows.append([None, None, None, path])
        continue
    start_index = response_str.find('{')
    end_index = response_str.rfind('}')
    try:
        if start_index == -1 or end_index < start_index:
            raise ValueError("JSON not found")
        response = json.loads(response_str[start_index:end_index + 1])
        # Build all rows first so a malformed entry cannot leave a partial
        # set of rows behind next to the error row.
        parsed_rows = [
            [value['question'], value['answer'], value['references'], path]
            for value in response.values()
        ]
    except Exception:
        # Keep the raw reply in `references` so it can be repaired later,
        # instead of aborting the whole batch.
        rows.append([None, None, response_str, path])
        continue
    rows.extend(parsed_rows)
    metadata[path]["actual"] = len(parsed_rows)

df = pd.DataFrame(rows, columns=['question', 'answer', 'references', 'source'])

display(df.head(2))
# Metadata goes to dataset/metadata/, which is where the checking and fixing
# cells further down read it from.
os.makedirs("dataset/metadata", exist_ok=True)
if os.path.exists(f"dataset/{legal_name}.csv"):
    shutil.copy(f"dataset/{legal_name}.csv", f"dataset/{legal_name}_backup.csv")
df.to_csv(f"dataset/{legal_name}.csv", index=False, encoding="utf-8")
with open(f"dataset/metadata/metadata_{legal_name}.json", 'w', encoding="utf-8") as f:
    json.dump(metadata, f, indent=4, ensure_ascii=False)
print(sum([get_n(len(content.split("\n\n"))) for content in contents]), len(df))

100%|██████████| 9/9 [00:00<00:00, 10951.19it/s]
9it [08:12, 54.73s/it] 


Unnamed: 0,question,answer,references,source
0,,,"{\n 1: {\n question: ""หากบุคคลใดไม่ป...",_ตรวจแล้ว/พระราชบัญญัติการแข่งขันทางการค้า/พระ...
1,หากบริษัทฝ่าฝืนมาตรา 51 วรรคหนึ่ง จะต้องชำระค่...,ต้องชำระค่าปรับทางปกครองในอัตราไม่เกินสองแสนบา...,อ้างอิงจากมาตรา 80 ของพระราชบัญญัติการแข่งขันท...,_ตรวจแล้ว/พระราชบัญญัติการแข่งขันทางการค้า/พระ...


77 70


## **Fix errors**

In [9]:
def fix_json_like_string(json_like_str):
    """Add double quotes around unquoted object keys in a JSON-like string.

    Only words in key position are quoted: at the very start of the input or
    directly after ``{`` or ``,`` (optionally preceded by whitespace) and
    followed by ``:``. Complete double-quoted string literals are skipped, so
    text such as ``"note: x, y: z"`` inside a value is left untouched.

    Args:
        json_like_str: Text resembling JSON whose keys may lack quotes.

    Returns:
        The text with every unquoted key wrapped in double quotes.
    """
    # Alternative 1 consumes a whole string literal (honouring backslash
    # escapes) so nothing inside it can be mistaken for a key.
    # Alternative 2 captures leading whitespace (group 1) and a bare key
    # (group 2) that sits in key position.
    token = re.compile(r'"(?:\\.|[^"\\])*"|(?:(?<=[{,])|\A)(\s*)(\w+)(?=\s*:)')

    def add_quotes(match):
        if match.group(2) is None:
            # A string literal: return it unchanged.
            return match.group(0)
        return f'{match.group(1)}"{match.group(2)}"'

    return token.sub(add_quotes, json_like_str)

In [None]:
# Report every law whose CSV row count differs from the expected pair count
# recorded in its metadata file.
for file_path in tqdm(glob(os.path.join('dataset', '**', 'metadata_*.json'), recursive=True)):
    # Use a context manager so the file handle is closed on every iteration.
    with open(file_path, 'r', encoding="utf-8") as f:
        metadata = json.load(f)
    expected = sum(val["expected"] for val in metadata.values())
    # metadata_<law>.json describes dataset/<law>.csv; derive the CSV path from
    # the file name instead of a separator-dependent string replace.
    law_name = os.path.basename(file_path)[len("metadata_"):-len(".json")]
    csv_path = os.path.join('dataset', f"{law_name}.csv")
    actual = len(pd.read_csv(csv_path, encoding="utf-8"))
    if expected!=actual:
        print(file_path, expected, actual)

In [12]:
# Repair the rows of one law whose model reply could not be parsed: re-ask
# when there is no reply at all, otherwise quote bare JSON keys and re-parse.
legal_name = "พระราชบัญญัติการแข่งขันทางการค้า"
df = pd.read_csv(f"dataset/{legal_name}.csv", encoding="utf-8")
with open(f"dataset/metadata/metadata_{legal_name}.json", 'r', encoding="utf-8") as f:
    metadata = json.load(f)

# A row without a question is a failed generation; its `references` column
# holds either the raw reply or NaN.
condition = df['question'].isna()
df_filtered = df[~condition].reset_index(drop=True)
df_error = df[condition].reset_index(drop=True)

display(df_error.head(len(df_error)))

recovered_rows = []
unhandled_rows = []
for _, row in df_error.iterrows():
    path = row["source"]
    response_str = row['references']
    if not isinstance(response_str, str):
        # No raw reply was stored (NaN after the CSV round trip): ask again.
        with open(path, 'r', encoding="utf-8") as f:
            content = f.read()
        response_str, _ = process_content((content, path))
    if not isinstance(response_str, str):
        # The retry failed as well; record it instead of crashing on None.
        unhandled_rows.append([response_str, path])
        continue

    start_index = response_str.find('{')
    end_index = response_str.rfind('}')
    parsed_rows = None
    if start_index != -1 and end_index > start_index:
        json_str = response_str[start_index:end_index + 1]
        # Second candidate appends a closing brace for replies that were cut
        # off before the outermost object was closed.
        for candidate in (json_str, json_str + '\n}'):
            try:
                response = json.loads(fix_json_like_string(candidate))
                # Collect rows before committing so a failure half-way through
                # cannot leave duplicates once the next candidate succeeds.
                parsed_rows = [
                    [value['question'], value['answer'], value['references'], path]
                    for value in response.values()
                ]
                break
            except Exception:
                parsed_rows = None
    if parsed_rows is None:
        unhandled_rows.append([response_str, path])
    else:
        recovered_rows.extend(parsed_rows)

if recovered_rows:
    df_filtered = pd.concat(
        [df_filtered, pd.DataFrame(recovered_rows, columns=list(df_filtered.columns))],
        ignore_index=True,
    )
unhandle_error = pd.DataFrame(unhandled_rows, columns=['references', 'source'])

print(sum([val["expected"] for val in metadata.values()]), len(df_filtered), len(unhandle_error))
if len(unhandle_error)>0:
    unhandle_error.to_csv(f"dataset/unhandle_error.csv", index=False, encoding="utf-8")

Unnamed: 0,question,answer,references,source
0,,,"{\n 1: {\n question: ""หากบุคคลใดไม่ป...",_ตรวจแล้ว/พระราชบัญญัติการแข่งขันทางการค้า/พระ...


77 77 0


In [13]:
# Compare row counts before and after dropping repeated questions.
df_filtered.reset_index(drop=True, inplace=True)
tmp = (
    df_filtered
    .copy()
    .drop_duplicates(subset=['question'], keep='first')
    .reset_index(drop=True)
)
(len(tmp), len(df_filtered))

(77, 77)

In [14]:
# Write the repaired rows back, keeping the previous CSV as a backup.
df_filtered.reset_index(drop=True, inplace=True)
csv_path = f"dataset/{legal_name}.csv"
if os.path.exists(csv_path):
    shutil.copy(csv_path, f"dataset/{legal_name}_backup.csv")
df_filtered.to_csv(csv_path, index=False, encoding="utf-8")

In [None]:
# Drop repeated questions from every dataset CSV, keeping a backup of any
# file that is changed.
for file_path in tqdm(glob(os.path.join('dataset', '**/*.csv'), recursive=True)):
    if "backup" in file_path:
        continue
    df = pd.read_csv(file_path, encoding="utf-8")
    if 'question' not in df.columns:
        # Not a Q&A file (e.g. unhandle_error.csv has no question column).
        continue
    tmp = df.drop_duplicates(subset=['question'], keep='first').reset_index(drop=True)
    if len(tmp)!=len(df):
        # Back up and overwrite the file being processed, not the CSV of
        # whichever `legal_name` an earlier cell happened to leave behind.
        backup_path = file_path[:-len(".csv")] + "_backup_w_duplicate.csv"
        shutil.copy(file_path, backup_path)
        tmp.to_csv(file_path, index=False, encoding="utf-8")
        print(file_path)
    

## **Finalize data**
Combine the per-law CSV files into `dataset.csv` and shuffle the rows.

In [17]:
# Merge every per-law CSV into one shuffled dataset.csv.
SHUFFLE_SEED = 42  # fixed so a re-run produces the same row order

frames = []
count = 0

for file_path in tqdm(glob(os.path.join('dataset', '**/*.csv'), recursive=True)):
    if "backup" in file_path:
        continue
    tmp = pd.read_csv(file_path, encoding="utf-8")
    if 'question' not in tmp.columns:
        # Skip non-Q&A files such as unhandle_error.csv, which would otherwise
        # add rows without a question or answer to the final dataset.
        continue
    count += len(tmp)
    frames.append(tmp)

# Concatenate once instead of growing the frame inside the loop.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['question', 'answer', 'references', 'source'])

df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df.to_csv("dataset.csv", index=False, encoding="utf-8")
len(df), count

100%|██████████| 41/41 [00:00<00:00, 800.97it/s]


(4121, 4121)