In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTTextLine, LTChar
from concurrent.futures import ProcessPoolExecutor
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import sent_tokenize
from collections import defaultdict
import math
import json
import gc
import torch
import re
import os
# Reclaim memory left over from a previous run in this kernel: collect
# unreachable Python objects first, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# The tokenizer and the encoder are loaded from the same Hugging Face checkpoint.
# `tokenizer` is what token_counter() uses further down to measure chunk lengths.
E5_CHECKPOINT = 'intfloat/multilingual-e5-base'
tokenizer = AutoTokenizer.from_pretrained(E5_CHECKPOINT)
model = AutoModel.from_pretrained(E5_CHECKPOINT)

In [None]:
def extract_element_texts_from_pdf(pdf_path):
    """Extract the text elements of a PDF with repeated headers/footers removed.

    Every page is read with pdfminer's ``extract_pages``; the text elements of
    each page are collected by ``get_page_texts``, which also counts how often
    each text occurs near the top or bottom of a page. Texts that repeat often
    enough are treated as headers/footers and dropped by
    ``clean_header_footers``; the survivors are passed through
    ``format_final_text``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: The cleaned text elements in document order.
    """
    header_footer_candidates = defaultdict(int)
    page_element_texts = [
        get_page_texts(page_layout, header_footer_candidates)
        for page_layout in extract_pages(pdf_path)
    ]
    num_pages = len(page_element_texts)

    # A text is a header/footer when it occurs on at least half of the pages
    # AND at least twice. Without the floor of 2 the threshold is 0.5 for a
    # one-page PDF and 1.0 for a two-page PDF, so every element in a
    # header/footer position would qualify after a single occurrence and real
    # content would be deleted.
    min_occurrences = max(2, num_pages * 0.5)
    header_footers = {
        text
        for text, occurrences in header_footer_candidates.items()
        if occurrences >= min_occurrences
    }

    cleaned_elements_text = clean_header_footers(page_element_texts, header_footers)
    return format_final_text(cleaned_elements_text)

def get_page_texts(page_layout, header_footer_candidates):
    """Collect the normalized texts of one page's text containers.

    Containers rejected by ``is_skip_element`` are ignored. Each kept text is
    normalized with ``format_element_text``; when its container sits in a
    header/footer position (see ``is_header_footer``) its counter in
    ``header_footer_candidates`` is incremented.

    Args:
        page_layout: Iterable of layout elements for a single page.
        header_footer_candidates: Mapping text -> occurrence count, updated in
            place.

    Returns:
        list[str]: The page's kept texts, in layout order.
    """
    containers = [item for item in page_layout if isinstance(item, LTTextContainer)]
    container_total = len(containers)
    collected = []

    # Positions are counted over all text containers, including skipped ones.
    for position, container in enumerate(containers):
        if not is_skip_element(container):
            text = format_element_text(container.get_text())
            collected.append(text)

            if is_header_footer(position, container_total):
                header_footer_candidates[text] += 1

    return collected

def is_skip_element(element):
    """Tell whether a text element should be left out of the extraction.

    Only the first child of the element and the first child of that line are
    inspected. The element is skipped when that first character's font name
    contains "Italic" and its size rounds to 8.

    Args:
        element: An iterable layout element (a text container).

    Returns:
        bool: True when the element matches the italic/size-8 rule.
    """
    leading_line = next(iter(element), None)
    # An empty line is falsy, so it is rejected here as well.
    if not isinstance(leading_line, LTTextLine) or not leading_line:
        return False

    leading_char = next(iter(leading_line), None)
    if not isinstance(leading_char, LTChar) or not leading_char:
        return False

    return "Italic" in leading_char.fontname and round(leading_char.size) == 8

def is_header_footer(index, total_elements):
    """Tell whether a position lies in a page's header or footer zone.

    The zone is the first five positions (index 0-4) and the last four
    positions (index >= total_elements - 4).

    Args:
        index: Zero-based position of the element on the page.
        total_elements: Number of text elements on the page.

    Returns:
        bool: True when the position is in the header/footer zone.
    """
    in_page_body = 4 < index < total_elements - 4
    return not in_page_body

def format_element_text(element_text):
    """Collapse every run of whitespace (newlines included) to a single space.

    Leading and trailing whitespace is removed as a side effect of splitting.

    Args:
        element_text: Raw text of a layout element.

    Returns:
        str: The text on one line with single spaces between words.
    """
    # str.split() with no argument already treats "\n" as a separator.
    words = element_text.split()
    return " ".join(words)

def clean_header_footers(page_element_texts, header_footers):
    """Flatten the per-page texts, dropping headers, footers and page counters.

    An element is dropped when its text is one of ``header_footers`` or when a
    page counter such as "3 de 10" / "Página 3 de 10" occurs anywhere in it.

    Args:
        page_element_texts: List of pages, each a list of element texts.
        header_footers: Iterable of texts identified as repeated
            headers/footers.

    Returns:
        list[str]: The remaining element texts of all pages, in order.
    """
    # Hoisted out of the loop: compile the pattern once and use a set so that
    # each membership test is O(1) instead of a scan of the whole list.
    # NOTE(review): the first alternative already matches everything the second
    # one does, and because search() matches anywhere, a long paragraph that
    # merely contains "<n> de <n>" is removed entirely. Confirm this is intended.
    page_counter = re.compile(r'(\d+ de \d+)|Página \d+ de \d+')
    repeated_texts = frozenset(header_footers)

    return [
        element_text
        for page in page_element_texts
        for element_text in page
        if element_text not in repeated_texts and not page_counter.search(element_text)
    ]

def format_final_text(cleaned_elements_text):
    """Terminate elements with a period where the next element starts anew.

    A "." is appended to a non-empty element that does not already end in one
    of ``. ! ? : ; -`` when it is the last element, or when the following
    element is non-empty and does not begin with a lowercase letter. Elements
    followed by a lowercase start (a continued sentence) or by an empty string
    are left untouched.

    Args:
        cleaned_elements_text: List of element texts in document order.

    Returns:
        list[str]: A new list of the same length with periods added.
    """
    closing_marks = ('.', '!', '?', ":", ";", "-")
    final_position = len(cleaned_elements_text) - 1
    result = []

    for position, text in enumerate(cleaned_elements_text):
        if position == final_position:
            ends_here = True
        else:
            following = cleaned_elements_text[position + 1]
            ends_here = bool(following) and not following[0].islower()

        if ends_here and text and not text.endswith(closing_marks):
            text += '.'
        result.append(text)

    return result

In [None]:
def word_count(string):
    """Return the number of whitespace-separated words in ``string``."""
    words = string.split()
    return len(words)

def token_counter(string):
    """Return how many tokens the module-level ``tokenizer`` yields for ``string``.

    The count is the second dimension of the ``input_ids`` tensor the tokenizer
    returns for this single string.
    """
    encoded = tokenizer(string, return_tensors='pt')
    _batch_size, sequence_length = encoded['input_ids'].shape
    return sequence_length

def split_string_at_word(input_string, word_limit):
    """Split a string into its first ``word_limit`` words and the remainder.

    Words are whitespace-separated; both halves are re-joined with single
    spaces, so the original spacing is not preserved.

    Args:
        input_string: Text to split.
        word_limit: Number of words to keep in the first half (used as a
            slice bound).

    Returns:
        tuple[str, str]: ``(head, tail)``; either may be the empty string.
    """
    words = input_string.split()
    head_words, tail_words = words[:word_limit], words[word_limit:]
    return ' '.join(head_words), ' '.join(tail_words)

def _split_under_token_limit(text, text_words, text_tokens, max_tokens):
    """Cut ``text`` so that the returned head stays below ``max_tokens`` tokens.

    Returns ``(head, tail)``. The head always holds at least one word (or at
    least one character of an oversized single word), so the caller makes
    progress on every call.
    """
    coefficient = 0.9
    while True:
        # Estimate how many words fit from the text's own words-per-token
        # ratio, with a safety margin that widens after every failed attempt.
        # The limit never drops below one word: a limit of zero would produce
        # an empty head and an unchanged tail, i.e. an endless loop.
        word_limit = max(1, math.floor(coefficient * text_words * max_tokens / text_tokens))
        head, tail = split_string_at_word(text, word_limit)
        if token_counter(head) < max_tokens:
            return head, tail
        if word_limit == 1:
            break
        coefficient -= 0.1

    # A single whitespace-free run is too long on its own (for example a long
    # line of dot leaders): fall back to cutting it by characters.
    cut = len(head)
    while cut > 1 and token_counter(head[:cut]) >= max_tokens:
        cut //= 2
    remainder = " ".join(part for part in (head[cut:], tail) if part)
    return head[:cut], remainder


def normalize_strings(strings, lower=260, upper=320, prefer=280, max_tokens=512):
    """Regroup text elements into chunks of a roughly uniform word count.

    Short elements absorb sentences from the element that follows them, long
    elements are cut at ``prefer`` words, and any element that reaches
    ``max_tokens`` tokens is cut until its head fits. An element is emitted
    as-is when the next element opens a new article or decree.

    Blank elements are ignored and the input list is not modified.

    Args:
        strings: Text elements in document order.
        lower: A chunk with more than this many words (and fewer than
            ``upper``) is emitted without further merging.
        upper: Merging stops before a chunk reaches this many words; a chunk
            with more words is split.
        prefer: Number of words kept when an over-long chunk is split.
        max_tokens: Token count (as measured by ``token_counter``) that a chunk
            must stay below.

    Returns:
        list[str]: The normalized chunks.
    """
    # Work on a private copy: the loop rewrites and deletes entries. Blank
    # entries are dropped up front; previously a blank "next" entry was taken
    # for the end of the document and everything after it was lost.
    pending = [text for text in strings if text.strip()]
    normalized = []
    i = 0

    while i < len(pending):
        current_string = pending[i]
        current_string_words = word_count(current_string)
        current_string_tokens = token_counter(current_string)
        next_string = pending[i + 1] if i + 1 < len(pending) else None

        # 1) Too many tokens: always cut, whatever the word count says.
        if current_string_tokens >= max_tokens:
            head, tail = _split_under_token_limit(
                current_string, current_string_words, current_string_tokens, max_tokens
            )
            normalized.append(head)
            if tail:
                pending[i] = tail
            else:
                i += 1
            continue

        # 2) Already a good size, or the next element starts a new article or
        #    decree and must not be merged into this one.
        next_opens_section = next_string is not None and (
            next_string.lower().startswith(("artículo", "articulo"))
            or next_string.startswith(("Decreto", "DECRETO"))
        )
        if lower < current_string_words < upper or next_opens_section:
            normalized.append(current_string)
            i += 1
            continue

        # 3) Last element: nothing left to merge with.
        if next_string is None:
            normalized.append(current_string)
            i += 1
            continue

        # 4) Try to pull the first sentence of the next element into this one.
        # NOTE(review): sent_tokenize defaults to the English model while the
        # section keywords above are Spanish; consider language='spanish'.
        next_string_sent = sent_tokenize(next_string)
        if next_string_sent and current_string_words + word_count(next_string_sent[0]) < upper:
            pending[i] = current_string + " " + next_string_sent[0]
            if len(next_string_sent) > 1:
                pending[i + 1] = " ".join(next_string_sent[1:])
            else:
                del pending[i + 1]
        elif current_string_words > upper:
            # Too many words: emit the preferred amount and keep the rest.
            head, tail = split_string_at_word(current_string, prefer)
            normalized.append(head)
            if tail:
                pending[i] = tail
            else:
                i += 1
        else:
            normalized.append(current_string)
            i += 1

    return normalized


In [None]:
def preprocess_documents(pdf_path, max_tokens=512):
    """Turn one PDF into a list of text chunks that fit the token limit.

    The text elements are extracted with ``extract_element_texts_from_pdf`` and
    regrouped with ``normalize_strings``; every resulting chunk is then
    re-measured with ``token_counter`` as a safety check.

    Args:
        pdf_path: Path of the PDF file to process.
        max_tokens: Token count every chunk must stay below.

    Returns:
        list[str]: The chunks of the document.

    Raises:
        Exception: "Token Sequence Error" when a chunk reaches ``max_tokens``
            tokens.
    """
    strings_from_doc = extract_element_texts_from_pdf(pdf_path)
    chunks = normalize_strings(strings_from_doc, max_tokens=max_tokens)
    for chunk in chunks:
        if token_counter(chunk) >= max_tokens:
            raise Exception("Token Sequence Error")
    return chunks

def process_pdfs_in_folder(root_folder):
    """Chunk every ``.pdf`` under ``root_folder`` into a sibling ``.json`` file.

    The folder is walked recursively. For each PDF the chunks returned by
    ``preprocess_documents`` are written as a JSON list next to it, under the
    same base name. A PDF whose JSON file already exists is skipped, so an
    interrupted run can be resumed.

    Args:
        root_folder: Directory to search for PDF files.
    """
    for dir_path, _subdirs, filenames in os.walk(root_folder):
        pdf_names = [name for name in filenames if name.endswith('.pdf')]
        for pdf_name in pdf_names:
            base_name = pdf_name.rsplit('.', 1)[0]
            pdf_path = os.path.join(dir_path, pdf_name)
            json_path = os.path.join(dir_path, base_name + '.json')

            if os.path.exists(json_path):
                print("File already exists")
                continue

            print(pdf_path)
            chunks = preprocess_documents(pdf_path)
            with open(json_path, 'w', encoding="utf-8") as json_file:
                json.dump(chunks, json_file, indent=4)

In [None]:
# Chunk every corpus: federal laws and regulations first, then the state and
# municipal ones. Folders are relative to the notebook's working directory.
for corpus_folder in (
    "federal/leyes_federales",
    "federal/reglamentos_federales",
    "state/leyes_estatales",
    "state/reglamentos_estatales",
    "state/reglamentos_municipales/monterrey",
):
    process_pdfs_in_folder(corpus_folder)

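Cleanup (destructive): remove all generated `.json` and `.index` files from the corpus folders, keeping any file whose name contains `schema`. Run this cell only when the output needs to be regenerated from scratch: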

In [None]:
def delete_json_files(folder):
    """Delete the ``.json`` and ``.index`` files found under ``folder``.

    The folder is walked recursively. Files whose name contains "schema" are
    kept. The path of every deleted file is printed.

    Args:
        folder: Directory to clean.
    """
    removable_suffixes = ('.json', '.index')
    for current_dir, _subdirs, names in os.walk(folder):
        for name in names:
            if not name.endswith(removable_suffixes) or "schema" in name:
                continue
            file_path = os.path.join(current_dir, name)
            os.remove(file_path)
            print(f'Removed file: {file_path}')

# Clean the same corpus folders the processing step writes into, in the same
# order: federal first, then state and municipal.
for corpus_folder in (
    "federal/leyes_federales",
    "federal/reglamentos_federales",
    "state/leyes_estatales",
    "state/reglamentos_estatales",
    "state/reglamentos_municipales/monterrey",
):
    delete_json_files(corpus_folder)