## ***Read and save methods***

In [None]:
#Reading data from the docx file
def read_docx(file_path):
    """Collect the non-blank paragraphs of a document as cleaned strings.

    Args:
        file_path: Location of the document, passed unchanged to ``Document``
            (presumably python-docx's ``Document`` -- confirm against the
            notebook's imports).

    Returns:
        A list with one entry per paragraph whose text is not empty or
        whitespace-only. Each entry is lower-cased and stripped, and the
        entries keep the order in which the paragraphs appear.
    """
    paragraphs = Document(file_path).paragraphs
    return [
        entry.text.lower().strip()
        for entry in paragraphs
        if entry.text.strip()
    ]

def save_to_json(data, output_file):
    """Serialise *data* as JSON into *output_file*, replacing existing content.

    The file is written as UTF-8 with a four-space indent, and non-ASCII
    characters are written as-is instead of being escaped.

    Args:
        data: Any object ``json.dump`` can serialise.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, "w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)

# ***METHOD***

### ***Saving data to data folder***

In [None]:
# Load documents
# `read_docx` hands its argument straight to `Document`, and python-docx opens
# exactly one file per call without expanding wildcards, so a literal
# '../data/*.docx' path cannot be opened. The pattern is expanded here instead
# and the paragraphs of every matching file are concatenated in sorted order.
import glob

docx_paths = sorted(glob.glob('../data/*.docx'))
if not docx_paths:
    raise FileNotFoundError("No .docx files match '../data/*.docx'")

document = []
for docx_path in docx_paths:
    document.extend(read_docx(docx_path))

# Calling chunker
chunker = chunkMethod(raw_data=document)
data = chunker.semantic_chunk(
        model_name="bkai-foundation-models/vietnamese-bi-encoder",
        similarity_threshold=0.8,
        group_max_size=300
)

# Main script
input_file = "../data/law.docx"
output_file = "../data/formatted_data.json"

# Read the data
raw_data = read_docx(input_file)
chunker = chunkMethod(raw_data=raw_data)
chunked_data = chunker.recursive_chunk()

# Export to JSON
save_to_json(chunked_data, output_file)
print(f"Data is saved to: {output_file}")

Data is saved to: ../data/formatted_data.json


### ***Load the JSON data***

In [None]:
# Read ../data/formatted_data.json (the same path the export cell writes to)
# back in through `pd.read_json`.
# NOTE(review): the preprocessing cell reads the columns 'chapter',
# 'chapter_title' and 'chapter_articles' from this frame -- confirm that the
# saved JSON actually has that layout.
json_load = pd.read_json("../data/formatted_data.json")

### ***Preprocessing data***

In [None]:
# Flatten the chapter-level frame into one row per article.
# The three columns are walked in lockstep rather than filtering the whole
# frame by index once per field per row; that was quadratic in the number of
# chapters and only worked when the index was exactly 0..n-1.
# NOTE(review): the result is still bound to the name `dict`, which shadows the
# builtin, because the following cell reads it under that name. Rename both
# cells together.
article_columns = {'chapter': [], 'chapter_title': [], 'article': [], 'title': [], 'lines': []}
for chapter, chapter_title, chapter_articles in zip(
        json_load['chapter'],
        json_load['chapter_title'],
        json_load['chapter_articles']):
    for article in chapter_articles:
        article_columns['chapter'].append(chapter)
        article_columns['chapter_title'].append(chapter_title)
        article_columns['article'].append(article['article'])
        article_columns['title'].append(article['title'])
        article_columns['lines'].append(article['lines'])

dict = pd.DataFrame(article_columns)

In [None]:
# Flatten the per-article frame into one row per article line.
# NOTE: `dict` here is the DataFrame built by the preceding cell, not the
# builtin type.
# The columns are iterated in lockstep instead of re-filtering the frame by
# index for every field of every row (quadratic, and dependent on a 0..n-1
# index).
context_columns = {'chapter': [], 'chapter_title': [], 'article': [], 'title': [], 'context': []}
for chapter, chapter_title, article, title, lines in zip(
        dict['chapter'],
        dict['chapter_title'],
        dict['article'],
        dict['title'],
        dict['lines']):
    # An article with no lines contributes no rows.
    for line in lines:
        context_columns['chapter'].append(chapter)
        context_columns['chapter_title'].append(chapter_title)
        context_columns['article'].append(article)
        context_columns['title'].append(title)
        context_columns['context'].append(line)

context_dict = pd.DataFrame(context_columns)
# Plain list of every context line, in row order.
get_context = context_dict['context'].tolist()

## ***Chunking method***

In [None]:
class chunkMethod:
    def __init__(self,raw_data: list,
                chunk_size: int = 300,
                chunk_overlap: int = 20):
        self.raw_data = raw_data

    def document_based_chunk(self):
        data = []
        current_title = None
        current_chapter = None
        current_title = None
        current_articles = []
        
        for idx, line in enumerate(self.raw_data):
            chapter_match = re.match(r"(chương\s+\w+)\s*(.*)", line)
            if chapter_match:
                if current_chapter and current_articles:
                    data.append({
                        "chapter": current_chapter,
                        "chapter_title": current_title,
                        "chapter_articles": current_articles
                    })

                current_chapter = chapter_match.group(1)
                current_title = self.raw_data[idx + 1] if idx + 1 < len(self.raw_data) else ""
                current_articles = []
                
            elif re.match(r"điều\s+\d+\.", line):
                article_match = re.match(r"(điều\s+\d+\.)(.*)", line)
                if article_match:
                    current_articles.append({
                        "article": article_match.group(1).strip(),
                        "title": article_match.group(2).strip(),
                        "lines": []
                    })
                    
            else:
                if current_articles:
                    current_articles[-1]["lines"].append(line.strip())
        
        if current_chapter and current_articles:
            data.append({
                "chapter": current_chapter,
                "chapter_title": current_title,
                "chapter_articles": current_articles
            })
        
        return data