# Entity and Article Extraction using LangExtract

This notebook uses LangExtract library to extract government entities and legal article mentions from Mexico City legal documents.

LangExtract provides structured data extraction with built-in validation and type safety.


In [None]:
# Install required packages
%pip install langextract python-dotenv pandas openpyxl


In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import langextract as lx
import time
import re

# Load environment variables
load_dotenv()

# Set OpenAI API key
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

print("✅ LangExtract configured with OpenAI API key")


✅ LangExtract configured with OpenAI API key


In [2]:
# Test LangExtract connection and understand the output format
test_examples = [
    lx.data.ExampleData(
        text="I love apples and bananas from the market.",
        extractions=[
            lx.data.Extraction(extraction_class="fruit", extraction_text="apples"),
            lx.data.Extraction(extraction_class="fruit", extraction_text="bananas")
        ]
    )
]

test_result = lx.extract(
    text_or_documents="Give me 5 random fruits: oranges, grapes, strawberries, mangoes, pineapples",
    prompt_description="Extract fruit names from text",
    examples=test_examples,
    model_id="gpt-4o-mini",
    api_key=os.environ.get('OPENAI_API_KEY'),
    fence_output=True,
    use_schema_constraints=False
)

print("✅ Test extraction completed")
print("Result type:", type(test_result))
print("Result length:", len(test_result) if hasattr(test_result, '__len__') else "No length")

# Inspect the first result to understand its structure
if test_result:
    first_result = test_result[0] if hasattr(test_result, '__getitem__') else test_result
    print("First result type:", type(first_result))
    print("First result attributes:", dir(first_result) if hasattr(first_result, '__dict__') else "No attributes")
    
    # Try to access the result properly
    if hasattr(first_result, 'extractions'):
        print("Extractions found:", len(first_result.extractions))
        for i, extraction in enumerate(first_result.extractions[:3]):
            print(f"  {i+1}. Class: {extraction.extraction_class}, Text: {extraction.extraction_text}")
    elif hasattr(first_result, 'text'):
        print("Text content:", first_result.text[:100] + "..." if len(first_result.text) > 100 else first_result.text)


✅ Test extraction completed
Result type: <class 'langextract.core.data.AnnotatedDocument'>
Result length: No length
First result type: <class 'langextract.core.data.AnnotatedDocument'>
First result attributes: ['__annotations__', '__class__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_document_id', '_tokenized_text', 'document_id', 'extractions', 'text', 'tokenized_text']
Extractions found: 5
  1. Class: fruit, Text: oranges
  2. Class: fruit, Text: grapes
  3. Class: fruit, Text: strawberries


In [2]:
# Load the Excel file
df = pd.read_excel("/Users/alexa/Projects/cdmx_kg/Mexico_City/LEY_AMBIENTAL_DE_LA_CIUDAD_DE_MEXICO.xlsx")
df = df[['row_id','doc_hash','document_name', 'document_section_title', 'text']]
print(f"Loaded {len(df)} rows")
print(df.head())


Loaded 332 rows
              row_id      doc_hash                         document_name  \
0  12406E12_ARTCULO1  1.240600e+16  LEY AMBIENTAL DE LA CIUDAD DE MÉXICO   
1  12406E12_ARTCULO2  1.240600e+16  LEY AMBIENTAL DE LA CIUDAD DE MÉXICO   
2  12406E12_ARTCULO3  1.240600e+16  LEY AMBIENTAL DE LA CIUDAD DE MÉXICO   
3  12406E12_ARTCULO4  1.240600e+16  LEY AMBIENTAL DE LA CIUDAD DE MÉXICO   
4  12406E12_ARTCULO5  1.240600e+16  LEY AMBIENTAL DE LA CIUDAD DE MÉXICO   

  document_section_title                                               text  
0             Artículo 1  º.- La presente Ley es reglamentaria de las di...  
1             Artículo 2  º.- Se consideran de utilidad pública:\n\nI. E...  
2             Artículo 3  º.- En todo lo no previsto en la presente Ley,...  
3             Artículo 4  º.- Para los efectos de esta Ley, se utilizará...  
4             Artículo 5  º.- Son autoridades en materia ambiental en la...  


In [3]:
# Examples for government entity extraction - based on real legal document patterns
gov_entity_examples = [
    lx.data.ExampleData(
        text="Son autoridades en materia ambiental en la Ciudad de México: I. La Jefatura de Gobierno; II. La Secretaría del Medio Ambiente;",
        extractions=[
            lx.data.Extraction(extraction_class="entity", extraction_text="Jefatura de Gobierno"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Secretaría del Medio Ambiente")
        ]
    ),
    lx.data.ExampleData(
        text="La aplicación de esta Ley corresponde al Gobierno de la Ciudad de México, a través de la Secretaría y las Alcaldías.",
        extractions=[
            lx.data.Extraction(extraction_class="entity", extraction_text="Gobierno de la Ciudad de México"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Secretaría"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Alcaldías")
        ]
    ),
    lx.data.ExampleData(
        text="El Tribunal Superior de Justicia de la Ciudad de México y la Procuraduría General de Justicia tendrán competencia.",
        extractions=[
            lx.data.Extraction(extraction_class="entity", extraction_text="Tribunal Superior de Justicia de la Ciudad de México"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Procuraduría General de Justicia")
        ]
    ),
    lx.data.ExampleData(
        text="La Comisión Ambiental Metropolitana y el Instituto de Verificación Administrativa de la Ciudad de México colaborarán.",
        extractions=[
            lx.data.Extraction(extraction_class="entity", extraction_text="Comisión Ambiental Metropolitana"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Instituto de Verificación Administrativa de la Ciudad de México")
        ]
    ),
    lx.data.ExampleData(
        text="SEDEMA coordinará con las alcaldías y la Procuraduría Ambiental y del Ordenamiento Territorial (PAOT).",
        extractions=[
            lx.data.Extraction(extraction_class="entity", extraction_text="SEDEMA"),
            lx.data.Extraction(extraction_class="entity", extraction_text="alcaldías"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Procuraduría Ambiental y del Ordenamiento Territorial"),
            lx.data.Extraction(extraction_class="entity", extraction_text="PAOT")
        ]
    ),
    lx.data.ExampleData(
        text="La Universidad Autónoma de la Ciudad de México (UACM) y la Secretaría de Educación participarán en programas ambientales.",
        extractions=[
            lx.data.Extraction(extraction_class="entity", extraction_text="Universidad Autónoma de la Ciudad de México"),
            lx.data.Extraction(extraction_class="entity", extraction_text="UACM"),
            lx.data.Extraction(extraction_class="entity", extraction_text="Secretaría de Educación")
        ]
    )
]

# Examples for article mention extraction - based on real legal document patterns
article_mention_examples = [
    lx.data.ExampleData(
        text="La presente Ley es reglamentaria de las disposiciones contenidas en el Apartado A del artículo 13 de la Constitución Política de la Ciudad de México",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="artículo 13 de la Constitución Política de la Ciudad de México"),
            lx.data.Extraction(extraction_class="mention", extraction_text="la presente Ley")
        ]
    ),
    lx.data.ExampleData(
        text="En todo lo no previsto en la presente Ley, se aplicará supletoriamente la Ley de Procedimiento Administrativo de la Ciudad de México",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="la presente Ley"),
            lx.data.Extraction(extraction_class="mention", extraction_text="la Ley de Procedimiento Administrativo de la Ciudad de México")
        ]
    ),
    lx.data.ExampleData(
        text="De conformidad con lo establecido en el artículo 4º de la Constitución Política de los Estados Unidos Mexicanos",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="artículo 4º de la Constitución Política de los Estados Unidos Mexicanos")
        ]
    ),
    lx.data.ExampleData(
        text="Para los efectos de esta Ley se aplicarán las disposiciones del Código Civil para el Distrito Federal",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="esta Ley"),
            lx.data.Extraction(extraction_class="mention", extraction_text="del Código Civil para el Distrito Federal")
        ]
    ),
    lx.data.ExampleData(
        text="Las sanciones previstas en los artículos 237 y 238 del Código Penal para el Distrito Federal serán aplicables",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="artículos 237 y 238 del Código Penal para el Distrito Federal")
        ]
    ),
    lx.data.ExampleData(
        text="El reglamento de esta Ley establecerá los procedimientos específicos conforme al artículo 89 fracción I",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="reglamento de esta Ley"),
            lx.data.Extraction(extraction_class="mention", extraction_text="artículo 89 fracción I")
        ]
    ),
    lx.data.ExampleData(
        text="Los lineamientos que establezca el reglamento de la presente Ley deberán considerar los criterios técnicos",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="el reglamento de la presente Ley"),
            lx.data.Extraction(extraction_class="mention", extraction_text="la presente Ley")
        ]
    ),
    lx.data.ExampleData(
        text="Los criterios técnicos y metodológicos se establecerán conforme a las normas oficiales mexicanas que expida la Secretaría",
        extractions=[
            lx.data.Extraction(extraction_class="mention", extraction_text="normas oficiales mexicanas")
        ]
    )
]

print("✅ Improved LangExtract examples defined with better text alignment")




✅ Improved LangExtract examples defined with better text alignment


In [4]:
# Define extraction functions using LangExtract

def extract_government_entities(batch_df):
    """Extract government entities from a batch of documents using LangExtract"""
    
    entities = []
    
    # Process each row individually for better tracking
    for row in batch_df.itertuples():
        try:
            # Use LangExtract to extract entities
            result = lx.extract(
                text_or_documents=row.text,
                prompt_description="Extract ALL government entities, institutions, and legal bodies mentioned in this text. Include Secretarías, Alcaldías, Institutos, Tribunales, Consejos, Comisiones, Universities, Procuradurías, and any other government organizations.",
                examples=gov_entity_examples,
                model_id="gpt-4o-mini",
                api_key=os.environ.get('OPENAI_API_KEY'),
                fence_output=True,
                use_schema_constraints=False
            )
            
            # Handle single AnnotatedDocument result
            if result:
                if hasattr(result, 'extractions'):
                    for extraction in result.extractions:
                        if hasattr(extraction, 'extraction_text'):
                            entities.append({
                                'row_id': row.row_id,
                                'mention': extraction.extraction_text,
                                'entity_type': None  # We'll classify later if needed
                            })
                elif hasattr(result, 'extraction_text'):
                    # Direct extraction object
                    entities.append({
                        'row_id': row.row_id,
                        'mention': result.extraction_text,
                        'entity_type': None
                    })
        
        except Exception as e:
            print(f"Error extracting entities for row {row.row_id}: {e}")
            continue
    
    return entities

def extract_article_mentions(batch_df):
    """Extract article mentions from a batch of documents using LangExtract"""
    
    mentions = []
    
    # Get the current law name for self-references
    current_law = batch_df.iloc[0]['document_name'] if len(batch_df) > 0 else "LEY AMBIENTAL DE LA CIUDAD DE MÉXICO"
    
    # Process each row individually for better tracking
    for row in batch_df.itertuples():
        try:
            # Use LangExtract to extract article mentions
            result = lx.extract(
                text_or_documents=row.text,
                prompt_description=f"Extract ALL legal references from this Mexican legal text. Include article numbers, law names, constitutional references, codes, regulations, decrees, and treaties. For self-references like 'esta Ley' or 'la presente Ley', consider it as referring to: {current_law}",
                examples=article_mention_examples,
                model_id="gpt-4o-mini",
                api_key=os.environ.get('OPENAI_API_KEY'),
                fence_output=True,
                use_schema_constraints=False
            )
            
            # Handle single AnnotatedDocument result
            if result:
                if hasattr(result, 'extractions'):
                    for extraction in result.extractions:
                        if hasattr(extraction, 'extraction_text'):
                            # Parse the extracted text to identify components
                            mention_text = extraction.extraction_text
                            
                            # Try to extract article number and law name from the mention
                            art_num = None
                            law_name = None
                            
                            # Handle regulation self-references
                            if "reglamento de esta ley" in mention_text.lower():
                                law_name = f"REGLAMENTO DE LA {current_law}"
                                processed_mention = mention_text
                            elif "reglamento de la presente ley" in mention_text.lower():
                                law_name = f"REGLAMENTO DE LA {current_law}"
                                processed_mention = mention_text
                            elif "esta ley" in mention_text.lower() or "presente ley" in mention_text.lower():
                                law_name = current_law
                                processed_mention = mention_text
                            else:
                            # Extract article number if present
                            
                                art_match = re.search(r'artículo[s]?\s+(\d+)', mention_text.lower())
                            if art_match:
                                art_num = art_match.group(1)


                            # Simple parsing logic for common patterns
                            if "artículo" in mention_text.lower():
                                # Try to extract article number
                            
                                art_match = re.search(r'artículo\s+(\d+)', mention_text.lower())
                                if art_match:
                                    art_num = art_match.group(1)
                            
                            # Extract law name (everything after "de la", "del", etc.)
                            if "constitución" in mention_text.lower():
                                law_name = "CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO"
                            elif "código civil" in mention_text.lower():
                                law_name = "CÓDIGO CIVIL"
                            elif "esta ley" in mention_text.lower() or "presente ley" in mention_text.lower():
                                law_name = current_law
                            else:
                                law_name = mention_text  # Use the full mention as law name
                            
                            mentions.append({
                                'row_id': row.row_id,
                                'art_num': art_num,
                                'law_name': law_name or mention_text,
                                'mention_extraction': mention_text
                            })
                elif hasattr(result, 'extraction_text'):
                    # Direct extraction object
                    mentions.append({
                        'row_id': row.row_id,
                        'art_num': None,
                        'law_name': result.extraction_text,
                        'mention_extraction': result.extraction_text
                    })
        
        except Exception as e:
            print(f"Error extracting mentions for row {row.row_id}: {e}")
            continue
    
    return mentions

print("✅ Extraction functions defined with individual row processing")


✅ Extraction functions defined with individual row processing


In [5]:
# Process documents in batches

# Collectors for results
all_entities = []
all_mentions = []

# Batch configuration
BATCH_SIZE = 3  # Small batches for better quality with LangExtract
total_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

print(f"Processing {len(df)} articles in {total_batches} batches...")
print(f"Batch size: {BATCH_SIZE}")

for i, start in enumerate(range(0, len(df), BATCH_SIZE)):
    batch = df.iloc[start:start+BATCH_SIZE]
    batch_num = i + 1
    
    print(f"\nProcessing batch {batch_num}/{total_batches} (articles {start+1}-{min(start+BATCH_SIZE, len(df))})")
    
    # Extract government entities
    print("  → Extracting government entities...")
    try:
        entities = extract_government_entities(batch)
        all_entities.extend(entities)
        print(f"    Found {len(entities)} entities")
    except Exception as e:
        print(f"    Error: {e}")
    
    # Extract article mentions  
    print("  → Extracting article mentions...")
    try:
        mentions = extract_article_mentions(batch)
        all_mentions.extend(mentions)
        print(f"    Found {len(mentions)} mentions")
    except Exception as e:
        print(f"    Error: {e}")
    
    # Small delay to avoid rate limits
    time.sleep(0.5)

print(f"\n=== EXTRACTION COMPLETE ===")
print(f"Total government entities found: {len(all_entities)}")
print(f"Total article mentions found: {len(all_mentions)}")


Processing 332 articles in 111 batches...
Batch size: 3

Processing batch 1/111 (articles 1-3)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 9 mentions

Processing batch 2/111 (articles 4-6)
  → Extracting government entities...




    Found 30 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO4: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO5: local variable 'art_match' referenced before assignment
    Found 12 mentions

Processing batch 3/111 (articles 7-9)
  → Extracting government entities...




    Found 29 entities
  → Extracting article mentions...




    Found 54 mentions

Processing batch 4/111 (articles 10-12)
  → Extracting government entities...




    Found 12 entities
  → Extracting article mentions...




    Found 2 mentions

Processing batch 5/111 (articles 13-15)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 6/111 (articles 16-18)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 18 mentions

Processing batch 7/111 (articles 19-21)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 8/111 (articles 22-24)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 16 mentions

Processing batch 9/111 (articles 25-27)
  → Extracting government entities...




    Found 8 entities
  → Extracting article mentions...




    Found 27 mentions

Processing batch 10/111 (articles 28-30)
  → Extracting government entities...




    Found 7 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 11/111 (articles 31-33)
  → Extracting government entities...




    Found 11 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO31: local variable 'art_match' referenced before assignment




    Found 8 mentions

Processing batch 12/111 (articles 34-36)
  → Extracting government entities...




    Found 9 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO34: local variable 'art_match' referenced before assignment




    Found 12 mentions

Processing batch 13/111 (articles 37-39)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




    Found 1 mentions

Processing batch 14/111 (articles 40-42)
  → Extracting government entities...




    Found 6 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO40: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO41: local variable 'art_match' referenced before assignment
    Found 2 mentions

Processing batch 15/111 (articles 43-45)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO43: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO45: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 16/111 (articles 46-48)
  → Extracting government entities...




    Found 5 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 17/111 (articles 49-51)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO49: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO50: local variable 'art_match' referenced before assignment
    Found 4 mentions

Processing batch 18/111 (articles 52-54)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO52: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO53: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO54: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 19/111 (articles 55-57)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 20/111 (articles 58-60)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 21/111 (articles 61-63)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 7 mentions

Processing batch 22/111 (articles 64-66)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO64: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO66: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 23/111 (articles 67-69)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO67: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO68: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 24/111 (articles 70-72)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO71: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 25/111 (articles 73-75)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 2 mentions

Processing batch 26/111 (articles 76-78)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 7 mentions

Processing batch 27/111 (articles 79-81)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 15 mentions

Processing batch 28/111 (articles 82-84)
  → Extracting government entities...




    Found 8 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO82: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO84: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 29/111 (articles 85-87)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO85: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO86: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 30/111 (articles 88-90)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO88: local variable 'art_match' referenced before assignment




    Found 4 mentions

Processing batch 31/111 (articles 91-93)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO91: local variable 'art_match' referenced before assignment




    Found 1 mentions

Processing batch 32/111 (articles 94-96)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 33/111 (articles 97-99)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 34/111 (articles 100-102)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 35/111 (articles 103-105)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO103: local variable 'art_match' referenced before assignment




    Found 0 mentions

Processing batch 36/111 (articles 106-108)
  → Extracting government entities...




    Found 14 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO106: local variable 'art_match' referenced before assignment




    Found 6 mentions

Processing batch 37/111 (articles 109-111)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 7 mentions

Processing batch 38/111 (articles 112-114)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 8 mentions

Processing batch 39/111 (articles 115-117)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 40/111 (articles 118-120)
  → Extracting government entities...




    Found 10 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO118: local variable 'art_match' referenced before assignment




    Found 9 mentions

Processing batch 41/111 (articles 121-123)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO121: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO122: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO123: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 42/111 (articles 124-126)
  → Extracting government entities...




    Found 8 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 43/111 (articles 127-129)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 44/111 (articles 130-132)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 45/111 (articles 133-135)
  → Extracting government entities...




    Found 5 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO133: local variable 'art_match' referenced before assignment




    Found 3 mentions

Processing batch 46/111 (articles 136-138)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 13 mentions

Processing batch 47/111 (articles 139-141)
  → Extracting government entities...




    Found 7 entities
  → Extracting article mentions...




    Found 9 mentions

Processing batch 48/111 (articles 142-144)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 9 mentions

Processing batch 49/111 (articles 145-147)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO146: local variable 'art_match' referenced before assignment
    Found 3 mentions

Processing batch 50/111 (articles 148-150)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 10 mentions

Processing batch 51/111 (articles 151-153)
  → Extracting government entities...




    Found 9 entities
  → Extracting article mentions...




    Found 3 mentions

Processing batch 52/111 (articles 154-156)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO156: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 53/111 (articles 157-159)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 54/111 (articles 160-162)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 55/111 (articles 163-165)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 56/111 (articles 166-168)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 5 mentions

Processing batch 57/111 (articles 169-171)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 3 mentions

Processing batch 58/111 (articles 172-174)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 1 mentions

Processing batch 59/111 (articles 175-177)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 60/111 (articles 178-180)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 2 mentions

Processing batch 61/111 (articles 181-183)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 62/111 (articles 184-186)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO186: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 63/111 (articles 187-189)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO187: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO188: local variable 'art_match' referenced before assignment
    Found 3 mentions

Processing batch 64/111 (articles 190-192)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 65/111 (articles 193-195)
  → Extracting government entities...




    Found 8 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO193: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO195: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 66/111 (articles 196-198)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 67/111 (articles 199-201)
  → Extracting government entities...




    Found 12 entities
  → Extracting article mentions...




    Found 22 mentions

Processing batch 68/111 (articles 202-204)
  → Extracting government entities...




    Found 6 entities
  → Extracting article mentions...




    Found 5 mentions

Processing batch 69/111 (articles 205-207)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 16 mentions

Processing batch 70/111 (articles 208-210)
  → Extracting government entities...




    Found 8 entities
  → Extracting article mentions...




    Found 12 mentions

Processing batch 71/111 (articles 211-213)
  → Extracting government entities...




    Found 9 entities
  → Extracting article mentions...




    Found 9 mentions

Processing batch 72/111 (articles 214-216)
  → Extracting government entities...




    Found 16 entities
  → Extracting article mentions...




    Found 17 mentions

Processing batch 73/111 (articles 217-219)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




    Found 1 mentions

Processing batch 74/111 (articles 220-222)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO222: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 75/111 (articles 223-225)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 1 mentions

Processing batch 76/111 (articles 226-228)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 1 mentions

Processing batch 77/111 (articles 229-231)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 78/111 (articles 232-234)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO233: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 79/111 (articles 235-237)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 3 mentions

Processing batch 80/111 (articles 238-240)
  → Extracting government entities...




    Found 18 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO239: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 81/111 (articles 241-243)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 5 mentions

Processing batch 82/111 (articles 244-246)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO246: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 83/111 (articles 247-249)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO248: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 84/111 (articles 250-252)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO250: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO252: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 85/111 (articles 253-255)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




    Found 3 mentions

Processing batch 86/111 (articles 256-258)
  → Extracting government entities...




    Found 13 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO256: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO257: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO258: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 87/111 (articles 259-261)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




    Found 7 mentions

Processing batch 88/111 (articles 262-264)
  → Extracting government entities...




    Found 8 entities
  → Extracting article mentions...




    Found 11 mentions

Processing batch 89/111 (articles 265-267)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 6 mentions

Processing batch 90/111 (articles 268-270)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO269: local variable 'art_match' referenced before assignment
    Found 2 mentions

Processing batch 91/111 (articles 271-273)
  → Extracting government entities...




    Found 6 entities
  → Extracting article mentions...




    Found 5 mentions

Processing batch 92/111 (articles 274-276)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO274: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO275: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO276: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 93/111 (articles 277-279)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO277: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO279: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 94/111 (articles 280-282)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO280: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO282: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 95/111 (articles 283-285)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO283: local variable 'art_match' referenced before assignment




    Found 2 mentions

Processing batch 96/111 (articles 286-288)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 4 mentions

Processing batch 97/111 (articles 289-291)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO289: local variable 'art_match' referenced before assignment




    Found 0 mentions

Processing batch 98/111 (articles 292-294)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




    Found 2 mentions

Processing batch 99/111 (articles 295-297)
  → Extracting government entities...




    Found 3 entities
  → Extracting article mentions...




    Found 12 mentions

Processing batch 100/111 (articles 298-300)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO299: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO300: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 101/111 (articles 301-303)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO301: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO302: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO303: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 102/111 (articles 304-306)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO304: local variable 'art_match' referenced before assignment




    Found 6 mentions

Processing batch 103/111 (articles 307-309)
  → Extracting government entities...




    Found 0 entities
  → Extracting article mentions...




    Found 0 mentions

Processing batch 104/111 (articles 310-312)
  → Extracting government entities...




    Found 4 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO311: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO312: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 105/111 (articles 313-315)
  → Extracting government entities...




    Found 1 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO313: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO314: local variable 'art_match' referenced before assignment
Error extracting mentions for row 12406E12_ARTCULO315: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 106/111 (articles 316-318)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




    Found 10 mentions

Processing batch 107/111 (articles 319-321)
  → Extracting government entities...




    Found 2 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO321: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 108/111 (articles 322-324)
  → Extracting government entities...




    Found 6 entities
  → Extracting article mentions...




    Found 5 mentions

Processing batch 109/111 (articles 325-327)
  → Extracting government entities...




    Found 5 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO325: local variable 'art_match' referenced before assignment




    Found 1 mentions

Processing batch 110/111 (articles 328-330)
  → Extracting government entities...




    Found 5 entities
  → Extracting article mentions...




Error extracting mentions for row 12406E12_ARTCULO328: local variable 'art_match' referenced before assignment




Error extracting mentions for row 12406E12_ARTCULO329: local variable 'art_match' referenced before assignment
    Found 0 mentions

Processing batch 111/111 (articles 331-332)
  → Extracting government entities...




    Found 5 entities
  → Extracting article mentions...




    Found 2 mentions

=== EXTRACTION COMPLETE ===
Total government entities found: 498
Total article mentions found: 529


In [6]:
# Display sample results

print("=== SAMPLE GOVERNMENT ENTITIES ===")
if all_entities:
    for i, entity in enumerate(all_entities[:10]):
        print(f"{i+1}. {entity['row_id']} | {entity['mention']} | {entity['entity_type'] or 'N/A'}")
    if len(all_entities) > 10:
        print(f"... and {len(all_entities) - 10} more")
else:
    print("No entities found")

print("\n=== SAMPLE ARTICLE MENTIONS ===")
if all_mentions:
    for i, mention in enumerate(all_mentions[:10]):
        art_display = mention['art_num'] if mention['art_num'] else "N/A"
        print(f"{i+1}. {mention['row_id']} | Art. {art_display} | {mention['law_name']}")
        print(f"   Extract: {mention['mention_extraction'][:100]}...")
    if len(all_mentions) > 10:
        print(f"... and {len(all_mentions) - 10} more")
else:
    print("No mentions found")


=== SAMPLE GOVERNMENT ENTITIES ===
1. 12406E12_ARTCULO4 | Secretaría | N/A
2. 12406E12_ARTCULO4 | Secretaría | N/A
3. 12406E12_ARTCULO4 | Secretaría | N/A
4. 12406E12_ARTCULO4 | Secretaría | N/A
5. 12406E12_ARTCULO4 | Secretaría | N/A
6. 12406E12_ARTCULO4 | Secretaría | N/A
7. 12406E12_ARTCULO4 | Agencia Digital de Innovación Pública | N/A
8. 12406E12_ARTCULO4 | Secretaría | N/A
9. 12406E12_ARTCULO4 | Procuraduría Ambiental y del Ordenamiento Territorial | N/A
10. 12406E12_ARTCULO4 | Secretaría | N/A
... and 488 more

=== SAMPLE ARTICLE MENTIONS ===
1. 12406E12_ARTCULO1 | Art. 13 | CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO
   Extract: Apartado A del artículo 13 de la Constitución Política de la Ciudad de México...
2. 12406E12_ARTCULO1 | Art. 16 | CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO
   Extract: Apartado A del artículo 16 de la Constitución Política de la Ciudad de México...
3. 12406E12_ARTCULO1 | Art. 16 | LEY AMBIENTAL DE LA CIUDAD DE MÉXICO
   Extract: la presente Ley...
4

In [7]:
# Convert to DataFrames and save to CSV

# Government entities
if all_entities:
    # Data is already in dictionary format
    entities_df = pd.DataFrame(all_entities)
    
    # Remove duplicates
    entities_df = entities_df.drop_duplicates(subset=['row_id', 'mention'])
    
    # Save to CSV
    entities_df.to_csv("/Users/alexa/Projects/cdmx_kg/data/langextract_gov_entities.csv", 
                       index=False, encoding='utf-8')
    print(f"✅ Saved {len(entities_df)} unique government entities to langextract_gov_entities.csv")
    
    # Display summary
    if 'entity_type' in entities_df.columns:
        print(f"Entity types found: {entities_df['entity_type'].value_counts().head()}")
else:
    print("⚠️  No government entities to save")

# Article mentions
if all_mentions:
    # Data is already in dictionary format
    mentions_df = pd.DataFrame(all_mentions)
    
    # Remove duplicates
    mentions_df = mentions_df.drop_duplicates(subset=['row_id', 'art_num', 'law_name', 'mention_extraction'])
    
    # Save to CSV
    mentions_df.to_csv("/Users/alexa/Projects/cdmx_kg/data/langextract_article_mentions.csv", 
                       index=False, encoding='utf-8')
    print(f"✅ Saved {len(mentions_df)} unique article mentions to langextract_article_mentions.csv")
    
    # Display summary
    print(f"Top referenced laws: {mentions_df['law_name'].value_counts().head()}")
    print(f"Articles with numbers: {mentions_df[mentions_df['art_num'].notna()].shape[0]}")
    print(f"Law-only references: {mentions_df[mentions_df['art_num'].isna()].shape[0]}")
else:
    print("⚠️  No article mentions to save")


✅ Saved 429 unique government entities to langextract_gov_entities.csv
Entity types found: Series([], Name: count, dtype: int64)
✅ Saved 501 unique article mentions to langextract_article_mentions.csv
Top referenced laws: law_name
LEY AMBIENTAL DE LA CIUDAD DE MÉXICO           79
la Secretaría                                  22
la Ley Ambiental de la Ciudad de México        12
normas oficiales mexicanas                     12
normas ambientales para la Ciudad de México    10
Name: count, dtype: int64
Articles with numbers: 27
Law-only references: 474


In [8]:
# Quality analysis and comparison

print("=== EXTRACTION QUALITY ANALYSIS ===")

if all_entities:
    print(f"\n📊 GOVERNMENT ENTITIES:")
    print(f"   Total entities extracted: {len(all_entities)}")
    print(f"   Unique entities: {len(entities_df)}")
    print(f"   Average entities per article: {len(all_entities) / len(df):.2f}")
    
    # Most common entity types
    entity_types = entities_df['entity_type'].value_counts()
    print(f"   Most common entity types:")
    for et, count in entity_types.head().items():
        print(f"     - {et}: {count}")

if all_mentions:
    print(f"\n📊 ARTICLE MENTIONS:")
    print(f"   Total mentions extracted: {len(all_mentions)}")
    print(f"   Unique mentions: {len(mentions_df)}")
    print(f"   Average mentions per article: {len(all_mentions) / len(df):.2f}")
    
    # Analysis of mention types
    with_article_num = mentions_df[mentions_df['art_num'].notna()]
    without_article_num = mentions_df[mentions_df['art_num'].isna()]
    print(f"   Mentions with article numbers: {len(with_article_num)} ({len(with_article_num)/len(mentions_df)*100:.1f}%)")
    print(f"   Law-only mentions: {len(without_article_num)} ({len(without_article_num)/len(mentions_df)*100:.1f}%)")

print(f"\n✅ Extraction completed successfully using LangExtract!")
print(f"📁 Results saved to:")
print(f"   - langextract_gov_entities.csv")
print(f"   - langextract_article_mentions.csv")


=== EXTRACTION QUALITY ANALYSIS ===

📊 GOVERNMENT ENTITIES:
   Total entities extracted: 498
   Unique entities: 429
   Average entities per article: 1.50
   Most common entity types:

📊 ARTICLE MENTIONS:
   Total mentions extracted: 529
   Unique mentions: 501
   Average mentions per article: 1.59
   Mentions with article numbers: 27 (5.4%)
   Law-only mentions: 474 (94.6%)

✅ Extraction completed successfully using LangExtract!
📁 Results saved to:
   - langextract_gov_entities.csv
   - langextract_article_mentions.csv
