# Get text from PDF Cali

# Texto de Ordenanzas

In [None]:
import os
import re
import pandas as pd
from pdf2image import convert_from_path
import pytesseract

# Function to extract text from scanned PDFs
def extract_text_from_scanned_pdf(pdf_path, dpi=300, lang='eng'):
    """OCR every page of a scanned PDF and return the combined text.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution used when rasterizing the pages; higher values
            improve OCR accuracy at the cost of time and memory.
        lang: Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string``. The default stays ``'eng'``
            for backward compatibility.
            NOTE(review): the article regex in this file targets Spanish
            headings ("ARTÍCULO", "SÉPTIMO", ...); ``'spa'`` is presumably
            the better choice when that language data is installed.

    Returns:
        The OCR text of all pages, joined with newlines.
    """
    # Rasterize the PDF: one image per page.
    images = convert_from_path(pdf_path, dpi=dpi)

    # The file name is loop-invariant, so compute it once for the progress log.
    pdf_name = os.path.basename(pdf_path)

    extracted_text = []
    for page_number, image in enumerate(images, start=1):
        print(f"Processing page {page_number} of {pdf_name}...")
        extracted_text.append(pytesseract.image_to_string(image, lang=lang))

    # Combine text from all pages
    return "\n".join(extracted_text)

# Function to extract articles from text
def extract_articles_from_text(content):
    """Split OCR text into articles.

    An article starts at a heading such as ``ARTÍCULO 1.``, ``ARTICULO 2°-``
    or ``ARTÍCULO PRIMERO:`` (``ARTlCULO`` with a lowercase "l" is accepted
    as well) and runs until the next heading that begins a line, or until
    the end of the text.

    Args:
        content: Full text to scan.

    Returns:
        A list of dicts with the keys ``'Titulo'`` (the heading) and
        ``'Texto'`` (the article body), both stripped of surrounding
        whitespace. The list is empty when no heading is found.
    """
    # Regex alternation takes the first alternative that matches, so the
    # two-word ordinals must come before their one-word prefix "VIGÉSIMO";
    # otherwise "VIGÉSIMO PRIMERO" is cut to "VIGÉSIMO" and "PRIMERO"
    # leaks into the article text.
    ordinals = (
        r'VIGÉSIMO\sPRIMERO', r'VIGÉSIMO\sSEGUNDO',
        'PRIMERO', 'SEGUNDO', 'TERCERO', 'CUARTO', 'QUINTO', 'SEXTO',
        'SÉPTIMO', 'OCTAVO', 'NOVENO', 'DÉCIMO', 'UNDÉCIMO', 'DUODÉCIMO',
        'DECIMOTERCERO', 'DECIMOCUARTO', 'DECIMOQUINTO', 'DECIMOSEXTO',
        'DECIMOSÉPTIMO', 'DECIMOOCTAVO', 'DECIMONOVENO', 'VIGÉSIMO',
    )

    # Heading sub-pattern, shared by the capture group and the lookahead
    # so the two can never drift apart.
    header = (
        r'(?:ART[ÍI]CULO|ARTlCULO)\s(?:\d+[º°\.]?|'
        + '|'.join(ordinals)
        + r')[.\-:]?'
    )

    # Group 1: heading. Group 2: body, lazily extended up to the next
    # heading at the start of a line or the end of the text.
    pattern = r'(' + header + r')\s*(.*?)(?=(?:\n' + header + r'|$))'

    # DOTALL lets the article body span multiple lines.
    matches = re.findall(pattern, content, re.DOTALL)

    return [{'Titulo': title.strip(),  # Article title
             'Texto': body.strip()}    # Article text
            for title, body in matches]

# Function to clean text and remove illegal characters for Excel
def clean_text(text, max_length=32767):
    """Return *text* made safe for an Excel cell.

    Control characters below U+0020 are removed, except tab, line feed and
    carriage return: those are valid in spreadsheet cells, and dropping
    them would glue the last word of one line to the first word of the
    next. The code points U+FFFE and U+FFFF are removed as well.

    Args:
        text: String to sanitize.
        max_length: Maximum number of characters kept (defaults to 32767,
            Excel's per-cell character limit).

    Returns:
        The sanitized string, truncated to ``max_length`` characters.
    """
    allowed_controls = {'\t', '\n', '\r'}
    forbidden_code_points = {0xFFFE, 0xFFFF}
    cleaned = ''.join(
        c for c in text
        if (ord(c) >= 32 or c in allowed_controls)
        and ord(c) not in forbidden_code_points
    )
    return cleaned[:max_length]  # Trim to Excel's maximum character limit

# Function to save structured data to an Excel file
def save_to_excel(data, output_file="structured_articles.xlsx"):
    """Write the structured records to an Excel file.

    Every string value is passed through ``clean_text`` before writing.
    All keys of each record are kept, so extra columns added by callers
    (for example the source file name under ``'Archivo'``) reach the
    spreadsheet instead of being silently dropped.

    Args:
        data: List of dicts, one per row.
        output_file: Path of the Excel file to create.
    """
    # Clean only string values; any other value type is written unchanged.
    cleaned_data = [
        {key: clean_text(value) if isinstance(value, str) else value
         for key, value in item.items()}
        for item in data
    ]

    # Convert data to a DataFrame
    df = pd.DataFrame(cleaned_data)

    # Save DataFrame to an Excel file
    df.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

# Main process for processing all PDFs in a folder
def process_all_pdfs_in_directory(input_directory, output_excel_file="structured_articles.xlsx"):
    """OCR every PDF in a folder and save the extracted articles to Excel.

    Each PDF is converted to text, split into articles, and every article
    is tagged with its source file name under the ``'Archivo'`` key. A
    file that fails to process is reported and skipped so one bad PDF does
    not abort the whole batch.

    Args:
        input_directory: Folder that contains the PDF files.
        output_excel_file: Path of the Excel file to create. Nothing is
            written when no article was extracted.
    """
    all_data = []

    # Sort the names so the row order is reproducible: os.listdir returns
    # entries in arbitrary order.
    for filename in sorted(os.listdir(input_directory)):
        # Compare the extension case-insensitively so ".PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_directory, filename)
        try:
            print(f"Processing file: {filename}")

            # Extract text from the PDF
            content = extract_text_from_scanned_pdf(pdf_path)

            # Extract articles from the text
            articles_data = extract_articles_from_text(content)

            # Add file name to the data
            for article in articles_data:
                article['Archivo'] = filename

            all_data.extend(articles_data)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and move on.
            print(f"Error processing {filename}: {e}. Skipping this file.")

    # Save all structured data to an Excel file
    if all_data:
        save_to_excel(all_data, output_excel_file)
    else:
        print("No data extracted from the PDFs. Excel file not created.")

# Run the main process
if __name__ == "__main__":
    # Folder scanned for PDFs; replace with the folder containing your PDFs.
    input_folder = "/content/"
    # Workbook that receives one row per extracted article.
    output_file = "structured_articles.xlsx"
    process_all_pdfs_in_directory(
        input_directory=input_folder,
        output_excel_file=output_file,
    )


Processing file: Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf
Processing page 1 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 2 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 3 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 4 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 5 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 6 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 7 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 8 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing page 9 of Ord._627-2023_Dic.18_-_Politica_Publica_de_Accion_Comunal.pdf...
Processing file: Ord._666-2024_Ago.01_-_Creacion_Mesa_de_Madres_y_Padres_Comunitarios.pdf
Processing page 1 of Ord._666-2024_Ago.01_-_Creacion_Mesa