In [1]:
%load_ext autoreload
%autoreload 2
import re
import os
import sys
from tqdm.autonotebook import tqdm
from pathlib import Path
from glob import glob

import pymupdf

# Two levels above the notebook's working directory; it is put first on the
# import path (presumably the project root, so that `src.*` imports resolve —
# verify against the repository layout).
root_dir = Path.cwd().parent.parent
sys.path.insert(0, str(root_dir))


  from tqdm.autonotebook import tqdm


In [None]:
from src.d00_utils.parsing import (extract_index_from_doc,
                                   get_hierarchy,
                                   extract_previous_hierarchy,
                                   extract_final_hierarchy,
                                   extract_paragraphs)

from src.d01_data.data import json_dump

# Folder the source PDFs are read from, and folder used as the base for the
# parsed output paths.
raw_path = root_dir / 'data' / '01_raw'
intermediate_path = root_dir / 'data' / '02_intermediate'

# glob() yields matches in arbitrary, filesystem-dependent order; sort them so
# the processing order (and therefore the notebook output) is reproducible.
files_path = sorted(Path(p) for p in glob(str(raw_path / '*.pdf')))

In [3]:
# Heading patterns are identical for every file, so they are compiled once.
#
# Single-article heading at the start of a line, e.g. "Artículo 12." or
# "Artículo 12 bis.".
# NOTE(review): suffixes other than "bis" (e.g. "ter", "quater") are not
# matched by this pattern — confirm whether the source texts use them.
ARTICLE_PATTERN = re.compile(r'^Artículo\s+\d+(?:\s+bis)?\.', flags=re.MULTILINE)

# Grouped heading covering several articles, e.g. "Artículos 5 a 9." or
# "Artículos 5 y 6.". Only the heading itself is captured (group 1), without
# the surrounding whitespace.
COMBINED_ARTICLES_PATTERN = re.compile(r'^\s*(Artículos\s+\d+\s+(?:a|y)\s+\d+\.)\s*',
                                       flags=re.MULTILINE)

for file in tqdm(files_path):
    print(f'Parsing file {file}')

    # Open the PDF in a context manager so the document is closed as soon as
    # its text has been extracted, even if a later step raises.
    with pymupdf.open(file) as pdf:
        # Separates the index pages from the rest
        doc, indexes_pages = extract_index_from_doc(doc=pdf)

        # Everything below works on plain strings, so all page text is pulled
        # out while the document is still open.
        index_text = '\n'.join(p.get_text() for p in indexes_pages)
        all_pages_text = '\n'.join(p.get_text() for p in doc)

    # Hierarchy as described by the index pages
    jerarquia = get_hierarchy(index_text)

    # Article markers found in the body text. re.findall always returns a list
    # (empty when nothing matches), so the two results can be concatenated
    # directly; grouped headings are appended after the single-article ones.
    matches = ARTICLE_PATTERN.findall(all_pages_text)
    combine_matches = COMBINED_ARTICLES_PATTERN.findall(all_pages_text)
    matches = matches + combine_matches

    # Extract the previous hierarchical structure from the full text.
    previous_hierarchy = extract_previous_hierarchy(texto=all_pages_text,
                                                    articles=matches,
                                                    jerarquia=jerarquia)

    # Based on the previously extracted hierarchy, determine the final parent-child relationships.
    final_parents = extract_final_hierarchy(hierarchy=previous_hierarchy,
                                            origen=file.stem.lower())

    # Extract paragraphs from the text based on the provided search words (article markers).
    article_texts = extract_paragraphs(text=all_pages_text,
                                       search_words=matches)

    # Make sure no article is missing its hierarchy or its text
    assert len(final_parents) == len(article_texts), 'Mismatch in articles information'

    # For each article, put its hierarchy entry on top of its text
    combined_article_info = {
        article: f'{final_parents[article]}\n{text}'
        for article, text in article_texts.items()
    }

    # Save the result of parsing under the PDF's base name
    json_dump(combined_article_info, intermediate_path / file.stem)
    print()


  0%|          | 0/3 [00:00<?, ?it/s]

Parsing file c:\Users\edgarmp\Desktop\ejemplos\chatbot\data\01_raw\Constitucion Española.pdf
Última página del índice 2
Archivo guardado exitosamente como c:\Users\edgarmp\Desktop\ejemplos\chatbot\data\02_intermediate\Constitucion Española.json

Parsing file c:\Users\edgarmp\Desktop\ejemplos\chatbot\data\01_raw\Código Civil.pdf
Última página del índice 12
Archivo guardado exitosamente como c:\Users\edgarmp\Desktop\ejemplos\chatbot\data\02_intermediate\Código Civil.json

Parsing file c:\Users\edgarmp\Desktop\ejemplos\chatbot\data\01_raw\Código Penal.pdf
Última página del índice 8
Archivo guardado exitosamente como c:\Users\edgarmp\Desktop\ejemplos\chatbot\data\02_intermediate\Código Penal.json

