# Entity Recognition Pipeline for Mexico City Environmental Law

This script processes legal texts to identify:
1. Article mentions with expanded context (30 words before/after)
2. Government entities and legal references using regex patterns only


In [None]:
import pandas as pd
import re
import hashlib
from collections import defaultdict
from cdmx_entity_patterns_fixed import GOV_ENTITY_REGEX


In [None]:
# ID Generation Functions
def create_document_hash(document_name):
    """Create a short 8-character hash from document name"""
    if pd.isna(document_name):
        return "UNKNOWN"
    # Create MD5 hash and take first 8 characters
    hash_obj = hashlib.md5(str(document_name).encode('utf-8'))
    return hash_obj.hexdigest()[:8].upper()

def generate_scalable_row_id(row_index, document_name, section_title="", prefix="CDMX"):
    """Generate scalable IDs for multiple legal documents"""
    doc_hash = create_document_hash(document_name)
    section_clean = re.sub(r'[^\w\d]', '', str(section_title))[:6].upper() if section_title else "NOSEC"
    return f"{prefix}_{doc_hash}_{section_clean}_{row_index:05d}"

def generate_extraction_id(extraction_type, doc_hash, sequential_num):
    """Generate IDs for extracted data (articles, entities)"""
    return f"{extraction_type}_{doc_hash}_{sequential_num:05d}"

print("✓ ID generation functions created")


✓ ID generation functions created


In [20]:
# Load the environmental law data
ley_ambiental = pd.read_excel('/Users/alexa/Projects/cdmx_kg/Mexico_City/LEY_AMBIENTAL_DE_LA_CIUDAD_DE_MEXICO.xlsx')
print(f"Loaded {len(ley_ambiental)} rows")
print(f"Columns: {ley_ambiental.columns.tolist()}")

print("\n📝 Pre-calculating ID components (EFFICIENT APPROACH)...")

# 1. Add document hash column (calculate once per document)
ley_ambiental['doc_hash'] = ley_ambiental['document_name'].apply(create_document_hash)

# 2. Add cleaned section column
ley_ambiental['section_clean'] = ley_ambiental['document_section_title'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9]', '', str(x))[:13].upper() if pd.notna(x) else "NOSEC"
    #lambda x: re.sub(r'[^\w\d]', '', str(x))[:13].upper() if pd.notna(x) else "NOSEC"
)

# 3. Generate final row IDs using pre-calculated components
ley_ambiental['row_id'] = ley_ambiental.apply(
    lambda row: f"{row['doc_hash']}_{row['section_clean']}", axis=1
)

# Show document hash for reference
doc_hash = ley_ambiental['doc_hash'].iloc[0]
print(f"Document hash for '{ley_ambiental['document_name'].iloc[0]}': {doc_hash}")
print(f"✅ ID components calculated once - much more efficient!")

ley_ambiental[['row_id', 'doc_hash', 'section_clean', 'document_section_title']].head()


Loaded 332 rows
Columns: ['city', 'type_document', 'type_document_2', 'juridiction_level ', 'juridiction_level_name', 'document_name', 'document_start_date', 'document_finished_date', 'document_section_title', 'text']

📝 Pre-calculating ID components (EFFICIENT APPROACH)...
Document hash for 'LEY AMBIENTAL DE LA CIUDAD DE MÉXICO': 12406E12
✅ ID components calculated once - much more efficient!


Unnamed: 0,row_id,doc_hash,section_clean,document_section_title
0,12406E12_ARTCULO1,1.2406e+16,ARTCULO1,Artículo 1
1,12406E12_ARTCULO2,1.2406e+16,ARTCULO2,Artículo 2
2,12406E12_ARTCULO3,1.2406e+16,ARTCULO3,Artículo 3
3,12406E12_ARTCULO4,1.2406e+16,ARTCULO4,Artículo 4
4,12406E12_ARTCULO5,1.2406e+16,ARTCULO5,Artículo 5


In [None]:
# def extract_article_mentions_extended(df, text_column='text', section_column='document_section_title'):
#     """
#     Extract article mentions with 30 words before and after context.
#     Returns only the full_context column for manual review.
#     """
#     # Pattern to match article mentions in Spanish
#     article_pattern = r'\b(?:art[íi]culos?|art\.?)\s*(\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)\b'
    
#     results = []
#     counter = 1
    
#     for idx, row in df.iterrows():
#         text = str(row[text_column]) if pd.notna(row[text_column]) else ""
#         section_title = row[section_column] if pd.notna(row[section_column]) else ""
        
#         # Use pre-calculated document hash (much more efficient!)
#         doc_hash = row.get('doc_hash', 'UNKNOWN')
        
#         # Find all matches in the text
#         matches = list(re.finditer(article_pattern, text, re.IGNORECASE))
        
#         for match in matches:
#             # Get the full matched text
#             matched_text = match.group(0)
            
#             # Split text into words for context extraction
#             words = text.split()
            
#             # Find the position of the match in the word list
#             match_start_char = match.start()
#             match_end_char = match.end()
            
#             # Find which words contain the match
#             char_count = 0
#             start_word_idx = 0
#             end_word_idx = 0
            
#             for i, word in enumerate(words):
#                 word_start = char_count
#                 word_end = char_count + len(word)
                
#                 if word_start <= match_start_char <= word_end:
#                     start_word_idx = i
#                 if word_start <= match_end_char <= word_end:
#                     end_word_idx = i
#                     break
                
#                 char_count += len(word) + 1  # +1 for space
            
#             # Get 30 words before and after (expanded from 10)
#             context_start = max(0, start_word_idx - 30)
#             context_end = min(len(words), end_word_idx + 31)
            
#             words_before = words[context_start:start_word_idx]
#             words_after = words[end_word_idx + 1:context_end]
            
#             context_before = " ".join(words_before)
#             context_after = " ".join(words_after)
            
#             # Only save the full context with highlighted match
#             full_context = f"{context_before} **{matched_text}** {context_after}"
            
#             results.append({
#                 'article_id': generate_extraction_id('ART', doc_hash, counter),
#                 'source_row_id': row.get('row_id', ''),
#                 section_column: section_title,
#                 'full_context': full_context
#             })
#             counter += 1
    
#     return pd.DataFrame(results)

# print("✓ Article extraction function updated with 30-word context")


✓ Article extraction function updated with 30-word context


In [26]:
def extract_article_mentions_improved(df, text_column='text', section_column='document_section_title'):
    """
    VERSIÓN MEJORADA: Detecta múltiples artículos y ofrece contexto flexible
    
    MEJORAS:
    1. ✅ Detecta "artículos 50 y 325" 
    2. ✅ Contexto por caracteres (300) además de palabras (30)
    3. ✅ Múltiples patrones para casos complejos
    """
    
    # PATRONES MEJORADOS
    article_patterns = [
        # Patrón 1: Múltiples artículos con conectores
        r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?(?:\s*(?:y|al|,)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)*',
        
        # Patrón 2: Rangos "del 10 al 15"  
        r'\b(?:art[íi]culos?|art\.?)\s*(?:del\s*)?\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?\s*al\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?',
        
        # Patrón 3: Individual (respaldo)
        r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?'
    ]
    
    results = []
    counter = 1
    
    for idx, row in df.iterrows():
        text = str(row[text_column]) if pd.notna(row[text_column]) else ""
        section_title = row[section_column] if pd.notna(row[section_column]) else ""
        doc_hash = row.get('doc_hash', 'UNKNOWN')
        
        if not text.strip():
            continue
            
        # Aplicar patrones (evita duplicados con set)
        found_matches = set()
        
        for pattern_idx, pattern in enumerate(article_patterns):
            matches = re.finditer(pattern, text, re.IGNORECASE)
            
            for match in matches:
                match_key = (match.start(), match.end(), match.group(0))
                if match_key in found_matches:
                    continue
                found_matches.add(match_key)
                
                matched_text = match.group(0)
                
                # CONTEXTO MEJORADO: 30 palabras + 300 caracteres
                context_words = get_word_context(text, match, 30)
                context_chars = get_char_context(text, match, 300)
                
                results.append({
                    'article_id': f"ART_{doc_hash}_{counter:05d}",
                    'source_row_id': row.get('row_id', ''),
                    'pattern_type': f"Pattern_{pattern_idx + 1}",
                    section_column: section_title,
                    'matched_text': matched_text,
                    'context_30_words': context_words,
                    'context_300_chars': context_chars,
                    'start_char': match.start(),
                    'end_char': match.end(),
                })
                counter += 1
    
    return pd.DataFrame(results)

def get_word_context(text, match, num_words):
    """Contexto por palabras"""
    words = text.split()
    text_before_match = text[:match.start()]
    text_after_match = text[match.end():]
    
    words_before = text_before_match.split()[-num_words:] if text_before_match else []
    words_after = text_after_match.split()[:num_words] if text_after_match else []
    
    before = " ".join(words_before)
    after = " ".join(words_after)
    
    return f"{before} **{match.group(0)}** {after}".strip()

def get_char_context(text, match, num_chars):
    """Contexto por caracteres"""
    start = max(0, match.start() - num_chars)
    end = min(len(text), match.end() + num_chars)
    
    before = text[start:match.start()]
    after = text[match.end():end]
    
    return f"{before}**{match.group(0)}**{after}"

print("✅ FUNCIÓN MEJORADA DE ARTÍCULOS INTEGRADA")
print("   - Detecta múltiples artículos: 'artículos 50 y 325'")
print("   - Contexto dual: 30 palabras + 300 caracteres")
print("   - 3 patrones diferentes para mayor cobertura")

✅ FUNCIÓN MEJORADA DE ARTÍCULOS INTEGRADA
   - Detecta múltiples artículos: 'artículos 50 y 325'
   - Contexto dual: 30 palabras + 300 caracteres
   - 3 patrones diferentes para mayor cobertura


In [22]:
def extract_entities_with_official_patterns(df, text_column='text', section_column='document_section_title'):
    """
    NEW REORGANIZED ENTITY EXTRACTION using official CDMX patterns
    
    Pattern hierarchy:
    1. Official CDMX entities (from government registry)
    2. Legal documents (laws, codes, regulations) 
    3. General government patterns (for non-CDMX entities)
    """
    results = []
    
    # 1. LEGAL DOCUMENT PATTERNS
    legal_patterns = [
        (r'\b(?:Ley|LEY)\s+(?:Orgánica|General|Federal|de|del|para|sobre|[A-ZÁÉÍÓÚÑa-záéíóúñ]+)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{8,150}', 'LAW_MENTION'),
        (r'\b(?:Código|CÓDIGO)\s+(?:de|del|para)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{10,80}', 'LAW_CODE'),
        (r'\b(?:Reglamento|REGLAMENTO)\s+(?:de|del|para)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{10,100}', 'REGULATION'),
        (r'\b(?:Norma|NORMA)\s+Oficial\s+Mexicana\s+[A-Z0-9\-]+', 'NOM'),
        (r'\b(?:Constitución|CONSTITUCIÓN)(?:\s+Política)?\s+(?:de|del)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{5,80}', 'CONSTITUTION'),  # Fixed to catch "Constitución Política"
    ]
    
    # 2. OFFICIAL CDMX GOVERNMENT ENTITIES (from official registry)
    official_cdmx_patterns = []
    for pattern, full_name, category in GOV_ENTITY_REGEX:
        # Clean the pattern - remove problematic double backslashes
        cleaned_pattern = pattern.replace('\\\\', '\\')
        official_cdmx_patterns.append((cleaned_pattern, category))
    
    # 3. GENERAL GOVERNMENT PATTERNS (for entities not in official CDMX registry)
    general_gov_patterns = [
        # Generic secretarias not captured by official patterns
        (r'\b(?:La\s+|la\s+)?(?:Secretaría|SECRETARÍA)(?:\s+(?:de|del)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{5,80})?', 'SECRETARIA_GENERAL'),
        # Generic alcaldias not captured by official patterns  
        (r'\b(?:Alcaldía|ALCALDÍA|Alcaldías|ALCALDÍAS)\b', 'ALCALDIA_GENERAL'),
        # Federal agencies
        (r'\b(?:SEDEMA|SEMARNAT|CONAGUA|PROFEPA|CONANP|COFEPRIS|CONDUSEF)\b', 'FEDERAL_AGENCY'),
        # Autonomous federal organs
        (r'\b(?:INE|INAI|CNDH|COFECE|IFT|INEGI|CONEVAL)\b', 'ORG_AUTONOMO_FED'),
        # Federal parastatals
        (r'\b(?:IMSS|ISSSTE|PEMEX|CFE)\b', 'PARAESTATAL_FED'),
        # Generic organizational patterns
        (r'\b(?:Instituto|Tribunal|Consejo|Comité|Coordinación|Organismo|Centro|Sistema|Registro)\s+(?:de|para|Público)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{8,120}', 'ORG_GENERICO'),
        (r'\b(?:Universidad|UNIVERSIDAD)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{5,50}', 'UNIVERSITY'),
    ]
    
    # Combine all pattern groups with metadata
    all_pattern_groups = [
        ('LEGAL_DOCS', legal_patterns),
        ('CDMX_OFFICIAL', official_cdmx_patterns), 
        ('GENERAL_GOV', general_gov_patterns),
    ]
    
    print(f"Pattern groups loaded:")
    print(f"  - Legal documents: {len(legal_patterns)} patterns")
    print(f"  - Official CDMX entities: {len(official_cdmx_patterns)} patterns") 
    print(f"  - General government: {len(general_gov_patterns)} patterns")
    
    counter = 1
    
    for idx, row in df.iterrows():
        text = str(row[text_column]) if pd.notna(row[text_column]) else ""
        section_title = row[section_column] if pd.notna(row[section_column]) else ""
        
        # Use pre-calculated document hash (much more efficient!)
        doc_hash = row.get('doc_hash', 'UNKNOWN')
        
        if not text.strip():
            continue
        
        # Apply each pattern group
        for group_name, patterns in all_pattern_groups:
            for pattern, entity_label in patterns:
                try:
                    matches = re.finditer(pattern, text, re.IGNORECASE)
                    for match in matches:
                        results.append({
                            'entity_id': generate_extraction_id('ENT', doc_hash, counter),
                            'source_row_id': row.get('row_id', ''),
                            section_column: section_title,
                            'original_text': text,
                            'entity_text': match.group(0).strip(),
                            'entity_label': entity_label,
                            'pattern_group': group_name,
                            'start_char': match.start(),
                            'end_char': match.end(),
                        })
                        counter += 1
                except re.error as e:
                    print(f"Regex error in {group_name} pattern: {e}")
                    continue
    
    return pd.DataFrame(results)

print("NEW organized entity extraction function created with official CDMX patterns")


NEW organized entity extraction function created with official CDMX patterns


In [28]:
# Extract article mentions with extended context
print("Extracting article mentions with 30-word context...")
#article_mentions_df = extract_article_mentions_extended(ley_ambiental)
article_mentions_df = extract_article_mentions_improved(ley_ambiental)


print(f"Found {len(article_mentions_df)} article mentions")
print(f"From {len(ley_ambiental)} original rows")

# Show sample results
if len(article_mentions_df) > 0:
    print("\nSample article mentions:")
    print("=" * 50)
    for i in range(min(3, len(article_mentions_df))):
        row = article_mentions_df.iloc[i]
        print(f"\nExample {i+1}:")
        print(f"Section: {row['document_section_title']}")
        #print(f"Context: {row['full_context'][:200]}...")
        print("-" * 30)


Extracting article mentions with 30-word context...
Found 36 article mentions
From 332 original rows

Sample article mentions:

Example 1:
Section: Artículo 1
------------------------------

Example 2:
Section: Artículo 1
------------------------------

Example 3:
Section: Artículo 25.-
------------------------------


In [24]:
# Extract entities using NEW organized patterns (includes official CDMX entities)
print("Extracting entities using organized patterns with official CDMX registry...")
entities_df = extract_entities_with_official_patterns(ley_ambiental)

print(f"Found {len(entities_df)} entities")

if len(entities_df) > 0:
    print("\nEntity types found:")
    print(entities_df['entity_label'].value_counts())
    
    print("\nSample entities:")
    print("=" * 50)
    for i in range(min(5, len(entities_df))):
        row = entities_df.iloc[i]
        print(f"\n{i+1}. {row['entity_label']}")
        print(f"   Text: '{row['entity_text']}'")
        print(f"   Section: {row['document_section_title']}")


Extracting entities using organized patterns with official CDMX registry...
Pattern groups loaded:
  - Legal documents: 5 patterns
  - Official CDMX entities: 112 patterns
  - General government: 7 patterns
Found 614 entities

Entity types found:
entity_label
SECRETARIA_GENERAL        324
LAW_MENTION                99
ORG_GENERICO               64
CDMX_PODER_EJECUTIVO       49
ALCALDIA_GENERAL           44
CDMX_OTRO                  15
REGULATION                  9
CONSTITUTION                4
CDMX_PODER_LEGISLATIVO      2
CDMX_ORGANOS_AUTONOMOS      2
LAW_CODE                    2
Name: count, dtype: int64

Sample entities:

1. LAW_MENTION
   Text: 'Ley es reglamentaria de las disposiciones contenidas en el Apartado A del artículo'
   Section: Artículo 1

2. CONSTITUTION
   Text: 'Constitución Política de la Ciudad de México'
   Section: Artículo 1

3. ALCALDIA_GENERAL
   Text: 'Alcaldías'
   Section: Artículo 1

4. ORG_GENERICO
   Text: 'organismo público establecido para ello'
   S

In [29]:
# Save results with UTF-8-sig encoding for Excel compatibility
print("Saving results...")

# Save original data with IDs
#original_output = '/Users/alexa/Projects/cdmx_kg/Mexico_City/ley_ambiental_with_ids.csv'
#ley_ambiental.to_csv(original_output, index=False, encoding='utf-8-sig')
#print(f"Original data with IDs saved to: {original_output}")

# Save article mentions with IDs
articles_output = '/Users/alexa/Projects/cdmx_kg/Mexico_City/article_mentions_extended_context.csv'
article_mentions_df.to_csv(articles_output, index=False, encoding='utf-8-sig')
print(f"Article mentions saved to: {articles_output}")

# Save entity recognition results with IDs
entities_output = '/Users/alexa/Projects/cdmx_kg/Mexico_City/entities_regex_extracted_2.csv'
entities_df.to_csv(entities_output, index=False, encoding='utf-8-sig')
print(f"Entities saved to: {entities_output}")

print(f"\nSummary:")
print(f"- Article mentions: {len(article_mentions_df)} (30-word context each)")
print(f"- Entities found: {len(entities_df)} (organized pattern-based extraction)")
print(f"- Both files saved with UTF-8-sig encoding for Excel compatibility")

if len(entities_df) > 0:
    print(f"\nEntity breakdown by pattern group:")
    print(entities_df['pattern_group'].value_counts())
    
    print(f"\nTop entity types found:")
    print(entities_df['entity_label'].value_counts().head(10))
    
    print(f"\nCDMX Official entities breakdown:")
    cdmx_entities = entities_df[entities_df['pattern_group'] == 'CDMX_OFFICIAL']
    if len(cdmx_entities) > 0:
        print(cdmx_entities['entity_label'].value_counts().head(10))


Saving results...
Article mentions saved to: /Users/alexa/Projects/cdmx_kg/Mexico_City/article_mentions_extended_context.csv
Entities saved to: /Users/alexa/Projects/cdmx_kg/Mexico_City/entities_regex_extracted_2.csv

Summary:
- Article mentions: 36 (30-word context each)
- Entities found: 614 (organized pattern-based extraction)
- Both files saved with UTF-8-sig encoding for Excel compatibility

Entity breakdown by pattern group:
pattern_group
GENERAL_GOV      432
LEGAL_DOCS       114
CDMX_OFFICIAL     68
Name: count, dtype: int64

Top entity types found:
entity_label
SECRETARIA_GENERAL        324
LAW_MENTION                99
ORG_GENERICO               64
CDMX_PODER_EJECUTIVO       49
ALCALDIA_GENERAL           44
CDMX_OTRO                  15
REGULATION                  9
CONSTITUTION                4
CDMX_PODER_LEGISLATIVO      2
CDMX_ORGANOS_AUTONOMOS      2
Name: count, dtype: int64

CDMX Official entities breakdown:
entity_label
CDMX_PODER_EJECUTIVO      49
CDMX_OTRO            

In [None]:
# Demonstrate the ID system
print("ID SYSTEM DEMONSTRATION")
print("=" * 50)

if len(ley_ambiental) > 0:
    print(f"Original Data IDs (sample):")
    print(ley_ambiental[['row_id', 'document_section_title']].head(3))
    
if len(article_mentions_df) > 0:
    print(f"\nArticle Mention IDs (sample):")
    print(article_mentions_df[['article_id', 'source_row_id', 'document_section_title']].head(3))
    
if len(entities_df) > 0:
    print(f"\nEntity IDs (sample):")
    print(entities_df[['entity_id', 'source_row_id', 'entity_text']].head(3))

print(f"\n📊 ID Summary:")
print(f"- Original rows: {len(ley_ambiental)} (each with unique row_id)")
print(f"- Article mentions: {len(article_mentions_df)} (each with unique article_id)")
print(f"- Entities: {len(entities_df)} (each with unique entity_id)")
print(f"- All IDs trace back to source via document hash")

# Show ID structure
if len(ley_ambiental) > 0:
    sample_id = ley_ambiental['row_id'].iloc[0]
    print(f"\n🔍 ID Structure Example: {sample_id}")
    parts = sample_id.split('_')
    if len(parts) >= 4:
        print(f"  - Prefix: {parts[0]} (CDMX)")
        print(f"  - Document Hash: {parts[1]} (8-char hash of document name)")
        print(f"  - Section: {parts[2]} (cleaned section title)")
        print(f"  - Row Number: {parts[3]} (5-digit sequential)")
    
print(f"\n✅ This ID system will scale perfectly for multiple documents!")
print(f"\n🚀 EFFICIENCY IMPROVEMENT:")
print(f"   - OLD: Calculate hash {len(article_mentions_df) + len(entities_df)} times")
print(f"   - NEW: Calculate hash only {len(ley_ambiental['doc_hash'].unique())} time(s)")
print(f"   - Performance gain: ~{(len(article_mentions_df) + len(entities_df))//len(ley_ambiental['doc_hash'].unique())}x faster!")
