# Semantic Chunking
- Instead of splitting text at arbitrary lengths, use Natural Language Processing (NLP) to split at meaningful boundaries:
- Sentence-based: Use NLP tools like Spacy or NLTK to split by sentences.
- Paragraph-based: Treat each paragraph as a chunk.
- Heading-based: Use document structure (e.g., titles, headings, subheadings) to define chunks.
- Slide-based: If the PDF is a slide deck, extract each slide as a chunk.

In [26]:
import nltk

# Download tokenizer model for Portuguese if not already downloaded
nltk.download("punkt")

from nltk.tokenize import sent_tokenize

def semantic_chunking(text, max_length=500):
    sentences = sent_tokenize(text, language="portuguese")  # Tokenize sentences in Portuguese
    chunks, current_chunk = [], ""

    for sent in sentences:
        if len(current_chunk) + len(sent) < max_length:
            current_chunk += " " + sent
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sent

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


[nltk_data] Downloading package punkt to /home/ebezerra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
import fitz  # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text

In [28]:
PDF_FILE = "../data/propostas/tokio_outubro_2024.pdf"  # Change this to your PDF file
print("📄 Extracting text from PDF...")
raw_text = extract_text_from_pdf(PDF_FILE)


📄 Extracting text from PDF...


In [29]:
# raw_text

In [30]:
# Define the output file path
OUTPUT_FILE = "../data/tokio_outubro_2024.txt"  

# Save extracted text to a text file
with open(OUTPUT_FILE, "w", encoding="utf-8") as file:
    file.write(raw_text)

In [31]:
import re

def remove_footer(text):
    """
    Removes footers matching the given pattern.
    """
    # Define a regex pattern based on your footer structure
    footer_pattern = r"\d+\s*\nTokio Marine Seguradora S\.A – Cia \d+\s*\nCondomínio Processo SUSEP nº .* - Versão- .*"

    # Remove matching footer patterns
    cleaned_text = re.sub(footer_pattern, "", text, flags=re.MULTILINE)

    return cleaned_text.strip()

In [32]:
# Load extracted text from file
input_file = "../data/tokio_outubro_2024.txt"
output_file = "../data/tokio_outubro_2024_cleaned.txt"

with open(input_file, "r", encoding="utf-8") as file:
    raw_text = file.read()

# Remove footers
cleaned_text = remove_footer(raw_text)

# Save cleaned text
with open(output_file, "w", encoding="utf-8") as file:
    file.write(cleaned_text)

print("✅ Footer removed successfully!")


✅ Footer removed successfully!


In [33]:
# nltk.download('punkt_tab')

In [34]:
chunks = semantic_chunking(raw_text)

In [35]:
chunks[0]

''

# Extracting summary information

In [36]:
input_string = """
1. ÂMBITO GEOGRÁFICO............................................................................................................................................................. 8 
2. OBJETIVO DO SEGURO ........................................................................................................................................................... 8 
3. DOCUMENTOS DO SEGURO ................................................................................................................................................... 8 
4. LOCAL DE RISCO ..................................................................................................................................................................... 8 
5. EDIFÍCIOS ABRANGIDOS PELO SEGURO.............................................................................................................................. 8 
6. EDIFÍCIOS NÃO ABRANGIDOS PELO SEGURO..................................................................................................................... 9 
7. COBERTURAS DO SEGURO .................................................................................................................................................. 10 
7.1 COBERTURA BÁSICA AMPLA ........................................................................................................................................... 10 
7.2 COBERTURA BÁSICA SIMPLES ........................................................................................................................................ 11 
8 COBERTURAS ACESSÓRIAS ........................................................................................................................... 12 
8.1 ALAGAMENTO E INUNDAÇÃO ......................................................................................................................... 12 
8.2 DANOS ELÉTRICOS ........................................................................................................................................... 13 
8.3 DERRAME OU VAZAMENTO DE CHUVEIROS AUTOMÁTICOS (SPRINKLERS) ........................................... 14 
8.4 DESMORONAMENTO ......................................................................................................................................... 15 
8.5 RESPONSABILIDADE CIVIL DANOS MORAIS ................................................................................................. 16 
8.6 RESPONSABILIDADE CIVIL CONDOMÍNIO ...................................................................................................... 17 
8.7 RESPONSABILIDADE CIVIL GARAGISTA ........................................................................................................ 19 
8.8 RESPONSABILIDADE CIVIL PORTÕES AUTOMÁTICOS ................................................................................ 23 
8.9 RESPONSABILIDADE CIVIL SÍNDICO .............................................................................................................. 24 
8.10 FIDELIDADE ........................................................................................................................................................ 26 
8.11 IMPACTO DE VEÍCULOS.................................................................................................................................... 27 
"""

In [37]:
import re

def extract_matching_patterns(pattern1, pattern2, input_string):
    # Find all matches for both patterns using finditer to get match objects
    matches1 = pattern1.finditer(input_string)
    matches2 = pattern2.finditer(input_string)
    
    # Combine both sets of matches into a list of tuples (with offset)
    combined_matches = []
    
    for match in matches1:
        # Append a tuple with the components and the offset (match.start())
        combined_matches.append((match.group(1), match.group(3).strip(), match.group(4), match.start()))
    
    for match in matches2:
        # Append a tuple with the components and the offset (match.start())
        combined_matches.append((match.group(1), match.group(3).strip(), match.group(4), match.start()))
    
    # Sort the combined list of tuples by the offset (last item in the tuple)
    sorted_matches = sorted(combined_matches, key=lambda match: match[3])  # match[3] is the offset
    
    # Return the sorted list of matched components
    return sorted_matches

# Adjusted regex patterns to handle multi-line section titles
pattern1 = re.compile(r"\s*(\d+(\.\d+)?)\s*\n([\s\S]+?)\s*\.+\s*(\d+)\s*")  # Pattern 1
pattern2 = re.compile(r"\s*(\d+(\.\d+)?)\.\s*\n([\s\S]+?)\s*\.+\s*(\d+)\s*")  # Pattern 2

# Example input with a multi-line title
input_string = """
blah blah
11 
CLÁUSULA PARTICULAR DE EXCLUSÃO PARA SITUAÇÕES NACIONAIS OU INTERNACIONAIS DE 
SANÇÃO, EMBARGO, PROIBIÇÃO OU RESTRIÇÃO................................................................................................... 55 
some other random text
"""

# Call the function and print the results
result = extract_matching_patterns(pattern1, pattern2, input_string)

for match in result:
    print(match)  # Output will include tuple with matched components and sorted by offset


('11', 'CLÁUSULA PARTICULAR DE EXCLUSÃO PARA SITUAÇÕES NACIONAIS OU INTERNACIONAIS DE \nSANÇÃO, EMBARGO, PROIBIÇÃO OU RESTRIÇÃO', '55', 10)


In [38]:
import re

def extract_section_numbers(document_content):
    """
    Extracts section numbers from a given document content while handling multi-line titles correctly.
    
    Args:
        document_content (str): The input string containing the document content.
    
    Returns:
        list: A sorted list of unique section numbers found in the document.
    """
    # Updated regex to properly match section numbers followed by a title
    pattern = re.compile(r'^\s*(\d+(\.\d+)*)\s*\n([^\n]+?)\s*\.+\s*(\d+)\s*$', re.MULTILINE)

    # Find all matches
    matches = pattern.findall(document_content)

    # Extract section numbers, ensuring correct sorting
    extracted_sections = [(match[0], match[1], match[2], match[3]) for match in matches]

    return extracted_sections
    # Sort based on hierarchical order
    # return sorted(set(extracted_sections), key=lambda x: [int(n) for n in x.split('.')])

# Example usage
document_text = """
7. 
COBERTURAS DO SEGURO .................................................................................................................................................. 10 
7.1
COBERTURA BÁSICA AMPLA ........................................................................................................................................... 10 
7.2 
COBERTURA BÁSICA SIMPLES ........................................................................................................................................ 11 
"""

# Extract section numbers
section_numbers = extract_section_numbers(document_text)

# Print results
print(section_numbers)


[('7.1', '.1', 'COBERTURA BÁSICA AMPLA', '10'), ('7.2', '.2', 'COBERTURA BÁSICA SIMPLES', '11')]


In [39]:
import re

def extract_summary(input_string):
    # Combine both sets of matches into a list of tuples (with offset)
    combined_matches = set()
    
    pattern1 = re.compile(r"\s*(\d+(\.\d+)?)\s*\n([^\n]+?)\s*\.+\s*(\d+)\s*")  # Pattern 1
    pattern2 = re.compile(r"\s*(\d+(\.\d+)?)\.\s*\n([^\n]+?)\s*\.+\s*(\d+)\s*")  # Pattern 2

    # Find all matches for both patterns using finditer to get match objects
    matches1 = pattern1.finditer(input_string)
    matches2 = pattern2.finditer(input_string)
    
    for match in matches1:
        # Append a tuple with the components and the offset (match.start())
        combined_matches.add((match.group(1), match.group(3), match.group(4), match.start(), match.end()))
    
    for match in matches2:
        # Append a tuple with the components and the offset (match.start())
        combined_matches.add((match.group(1), match.group(3), match.group(4), match.start(), match.end()))
    
    # Adjusted regex patterns to handle multi-line section titles
    pattern1 = re.compile(r"\s*(\d+(\.\d+)?)\s*\n([\s\S]+?)\s*\.+\s*(\d+)\s*")  # Pattern 1
    pattern2 = re.compile(r"\s*(\d+(\.\d+)?)\.\s*\n([\s\S]+?)\s*\.+\s*(\d+)\s*")  # Pattern 2

    # Find all matches for both patterns using finditer to get match objects
    matches1 = pattern1.finditer(input_string)
    matches2 = pattern2.finditer(input_string)
    
    for match in matches1:
        # Append a tuple with the components and the offset (match.start())
        combined_matches.add((match.group(1), match.group(3), match.group(4), match.start(), match.end()))
    
    for match in matches2:
        # Append a tuple with the components and the offset (match.start())
        combined_matches.add((match.group(1), match.group(3), match.group(4), match.start(), match.end()))

    # Sort the combined list of tuples by the offset (last item in the tuple)
    combined_matches = list(combined_matches)
    sorted_matches = sorted(combined_matches, key=lambda match: match[3])  # match[3] is the offset
    
    # Return the sorted list of matched components
    return sorted_matches

In [40]:
import re

def extract_summary(input_string):
    """
    Extracts section numbers, titles, and page numbers from a document while ensuring 
    that section titles do not contain newline characters.
    
    Args:
        input_string (str): The document content.
    
    Returns:
        list: A sorted list of tuples (section_number, title, page_number, start_offset, end_offset),
              excluding tuples where the title contains newlines.
    """
    # Set to store unique matches
    combined_matches = set()
    
    # Patterns to capture section headers with and without an extra dot
    pattern1 = re.compile(r"\s*(\d+(\.\d+)?)\s*\n([^\n]+?)\s*\.+\s*(\d+)\s*")  # Without dot
    pattern2 = re.compile(r"\s*(\d+(\.\d+)?)\.\s*\n([^\n]+?)\s*\.+\s*(\d+)\s*")  # With dot

    # Find matches and add to set
    for pattern in [pattern1, pattern2]:
        for match in pattern.finditer(input_string):
            section_number, title, page_number = match.group(1), match.group(3), match.group(4)
            start_offset, end_offset = match.start(), match.end()

            # Exclude entries where the title contains newline characters
            if "\n" not in title:
                combined_matches.add((section_number, title, page_number, start_offset, end_offset))

    # Sort the matches by start offset
    sorted_matches = sorted(combined_matches, key=lambda match: match[3])  # Sort by start_offset
    
    return sorted_matches


In [41]:
input_file = "../data/tokio_outubro_2024_cleaned.txt"
# input_file = "../data/snippet.txt"

with open(input_file, "r", encoding="utf-8") as file:
    document_text = file.read()

# Call the function and print the results
result = extract_summary(document_text)

for match in result:
    print(match)

('2024', 'Processo SUSEP nº 15414', '100909', 153, 189)
('1', 'ÂMBITO GEOGRÁFICO', '8', 5054, 5240)
('2', 'OBJETIVO DO SEGURO', '8', 5240, 5422)
('3', 'DOCUMENTOS DO SEGURO', '8', 5422, 5598)
('4', 'LOCAL DE RISCO', '8', 5598, 5786)
('5', 'EDIFÍCIOS ABRANGIDOS PELO SEGURO', '8', 5786, 5952)
('6', 'EDIFÍCIOS NÃO ABRANGIDOS PELO SEGURO', '9', 5952, 6113)
('7', 'COBERTURAS DO SEGURO', '10', 6113, 6289)
('10', '7', '1', 6284, 6294)
('10', '7', '2', 6456, 6466)
('8', 'COBERTURAS ACESSÓRIAS', '12', 6630, 6785)
('8.1', 'ALAGAMENTO E INUNDAÇÃO', '12', 6785, 6939)
('8.2', 'DANOS ELÉTRICOS', '13', 6939, 7104)
('8.3', 'DERRAME OU VAZAMENTO DE CHUVEIROS AUTOMÁTICOS (SPRINKLERS)', '14', 7104, 7216)
('8.4', 'DESMORONAMENTO', '15', 7216, 7378)
('8.5', 'RESPONSABILIDADE CIVIL DANOS MORAIS', '16', 7378, 7521)
('8.6', 'RESPONSABILIDADE CIVIL CONDOMÍNIO', '17', 7521, 7667)
('8.7', 'RESPONSABILIDADE CIVIL GARAGISTA', '19', 7667, 7814)
('8.8', 'RESPONSABILIDADE CIVIL PORTÕES AUTOMÁTICOS', '23', 7814, 7947)

In [42]:
import re

def find_offsets(content, s1, s2):
    """
    Finds the offsets in `content` where `s1` and `s2` appear separated by exactly one space or tab.
    
    Args:
        content (str): The document content.
        s1 (str): The first substring.
        s2 (str): The second substring.
    
    Returns:
        list: A list of starting offsets where `s1` and `s2` appear with a single space or tab in between.
    """
    # Define the regex pattern to find occurrences of s1 and s2 separated by a space or tab
    pattern = re.compile(rf"({re.escape(s1)})[ \t]({re.escape(s2)})")

    # Find all matches and store their start positions
    offsets = [match.start() for match in pattern.finditer(content)]
    
    return offsets


In [43]:
current_section = "8.5"
next_section = "8.6"
start_offsets = find_offsets(document_text, current_section, "RESPONSABILIDADE CIVIL DANOS MORAIS")
end_offsets = find_offsets(document_text, next_section, "RESPONSABILIDADE CIVIL CONDOMÍNIO")

print(f'Limits of section {current_section}: ({start_offsets[0]}, {end_offsets[0]})')

Limits of section 8.5: (46330, 49591)


In [44]:
def extract_substring(content, start_offset, end_offset):
    """
    Extracts a substring from `content` between `offset_start` and `offset_end`.

    Args:
        content (str): The document content.
        offset_start (int): The starting offset (inclusive).
        offset_end (int): The ending offset (exclusive).

    Returns:
        str: The extracted substring.
    """
    assert len(content) > end_offset
    return content[start_offset:end_offset]

In [45]:
# Extract substring
print(len(document_text))
substring = extract_substring(document_text, start_offsets[0], end_offsets[0])
print(substring)

403326
8.5 RESPONSABILIDADE CIVIL DANOS MORAIS 
 
8.5.1 
Riscos Cobertos 
 
Garante, até o Limite Máximo de Indenização contratado o reembolso da indenização pelo qual o condomínio seja responsável 
civilmente a pagar, em sentença judicial transitada em julgado ou em acordo expressamente autorizado pela seguradora, em 
virtude de danos morais, decorrente diretamente de danos materiais e ou danos corporais involuntariamente causados a terceiros 
efetivamente indenizados nas coberturas de Responsabilidade Civil Condomínio ou Síndico, previstas no presente contrato.  
 
Salvo disposição em contrário, esta cobertura também abrange as despesas emergenciais efetuadas pelo Segurado ao tentar 
evitar e/ou minorar os danos causados a terceiros, desde que qualquer acordo com terceiros, judicial ou extrajudicial, 
somente será considerado pela Seguradora, quando submetido previamente a sua aprovação expressa.  
 
Para efeito deste Contrato de Seguro, caracteriza- se Dano Moral, aquele que traz co

# Using Tika

In [46]:
from tika import parser

input_file = "../data/tokio_outubro_2024.txt"
output_file = "../data/tokio_outubro_2024_tika.txt"

parsed_pdf = parser.from_file("../data/propostas/tokio_outubro_2024.pdf",)

raw_text = parsed_pdf['content']

# Remove footers
cleaned_text = remove_footer(raw_text)

# Save cleaned text
with open(output_file, "w", encoding="utf-8") as file:
    file.write(cleaned_text)

parsed_pdf.keys()

dict_keys(['metadata', 'content', 'status'])

In [47]:
parsed_pdf['metadata']

{'pdf:PDFVersion': '1.7',
 'xmp:CreatorTool': 'Microsoft® Word para Microsoft 365',
 'pdf:hasXFA': 'false',
 'access_permission:can_print_degraded': 'true',
 'X-TIKA:Parsed-By-Full-Set': ['org.apache.tika.parser.DefaultParser',
  'org.apache.tika.parser.pdf.PDFParser'],
 'X-TIKA:content_handler': 'ToTextContentHandler',
 'pdf:num3DAnnotations': '0',
 'MSIP_Label_0988faac-7551-4c58-92c7-6ea9d9398ce2_SiteId': '8b7cb950-a7e7-4897-9382-00f1b86d3871',
 'dc:format': 'application/pdf; version=1.7',
 'pdf:docinfo:custom:MSIP_Label_0988faac-7551-4c58-92c7-6ea9d9398ce2_ContentBits': '2',
 'pdf:docinfo:creator_tool': 'Microsoft® Word para Microsoft 365',
 'MSIP_Label_0988faac-7551-4c58-92c7-6ea9d9398ce2_ContentBits': '2',
 'access_permission:fill_in_form': 'true',
 'pdf:hasCollection': 'false',
 'pdf:encrypted': 'false',
 'pdf:containsNonEmbeddedFont': 'true',
 'xmp:CreateDate': '2024-10-06T15:05:16Z',
 'pdf:hasMarkedContent': 'true',
 'xmp:ModifyDate': '2024-10-06T15:05:16Z',
 'pdf:docinfo:creat

# Using llmsherpa

In [48]:
# Install package
# !pip install --upgrade --quiet llmsherpa

In [49]:
# get all documents under the folder
import os
import glob
from datetime import datetime
import time

file_location = "../data/apolices"

from llmsherpa.readers import LayoutPDFReader

llmsherpa_api_url = "http://localhost:5001/parseDocument?renderFormat=all"

pdf_files = glob.glob(file_location + '/*.pdf')

print(f'#PDF files found: {len(pdf_files)}!')
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

# parse documents and create graph
startTime = datetime.now()

for pdf_file in pdf_files:
    print(pdf_file)
    doc = pdf_reader.read_pdf(pdf_file)

    # find the first / in pdf_file from right
    idx = pdf_file.rfind('/')
    pdf_file_name = pdf_file[idx+1:]

    # open a local file to write the JSON
    with open(pdf_file_name + '.json', 'w') as f:
        # convert doc.json from a list to string
        f.write(str(doc.json))

print(f'Total time: {datetime.now() - startTime}')

#PDF files found: 0!
Total time: 0:00:00.000547


In [50]:
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

loader = LLMSherpaFileLoader(
    file_path="../data/apolices/tokio_outubro_2024.pdf",
    new_indent_parser=False,
    apply_ocr=False,
    strategy="sections",
    llmsherpa_api_url="http://localhost:5001/api/parseDocument?renderFormat=all",
)
docs = loader.load()

FileNotFoundError: [Errno 2] No such file or directory: '../data/apolices/tokio_outubro_2024.pdf'

In [None]:
docs

[]