In [7]:
import pandas as pd
import re
import hashlib
from collections import defaultdict
from cdmx_entity_patterns_fixed import GOV_ENTITY_REGEX
# Use precise patterns instead of generic ones
from federal_laws_patterns_precise import LAWS_REGEX as FEDERAL_LAWS_REGEX, LAW_CATEGORIES as FEDERAL_LAW_CATEGORIES
from cdmx_laws_patterns_precise import LAWS_REGEX as CDMX_LAWS_REGEX, LAW_CATEGORIES as CDMX_LAW_CATEGORIES

In [8]:
# Load the environmental law data (identifiers CSV) into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs on other machines.
df = pd.read_csv(
    filepath_or_buffer='/Users/alexa/Projects/cdmx_kg/data/identifiers_0_half.csv',
)

In [9]:
def extract_context_window(text, match_start, match_end, words_before=30, words_after=30):
    """
    Extract a window of tokens around a character-span match.

    The text is tokenized into runs of word characters and single
    non-whitespace characters (punctuation). The returned strings are the
    tokens re-joined with single spaces, so the original spacing of ``text``
    is not preserved (e.g. ``"Salud,"`` becomes ``"Salud ,"``).

    Match boundaries that fall on whitespace are snapped inward to the nearest
    token that overlaps the span ``[match_start, match_end)``. This matters for
    regexes whose character classes include ``\\s``: such matches frequently
    end with trailing whitespace.

    Args:
        text: Full text string.
        match_start: Start character offset of the match (inclusive).
        match_end: End character offset of the match (exclusive).
        words_before: Maximum number of tokens to include before the match.
        words_after: Maximum number of tokens to include after the match.

    Returns:
        Dictionary with keys:
            'before_context': tokens preceding the match, space-joined.
            'matched_entity': tokens overlapping the match span, space-joined.
            'after_context': tokens following the match, space-joined.
            'full_context': before + '**match**' + after, stripped.
            'words_before_count': number of tokens in 'before_context'.
            'words_after_count': number of tokens in 'after_context'.
    """
    import re
    from bisect import bisect_left, bisect_right

    # Tokenize once and keep every token's character span, so boundaries can be
    # located by binary search instead of building a per-character lookup table.
    token_matches = list(re.finditer(r'\b\w+\b|\S', text))
    words = [token.group(0) for token in token_matches]
    token_starts = [token.start() for token in token_matches]
    token_ends = [token.end() for token in token_matches]

    # First token that ends after match_start: either the token containing
    # match_start, or the next token when match_start sits on whitespace.
    match_start_word = bisect_right(token_ends, match_start)
    # Last token that starts before match_end: either the token containing the
    # final matched character, or the previous token when the match ends on
    # whitespace. For empty text this is -1, which yields empty slices below.
    match_end_word = bisect_left(token_starts, match_end) - 1

    # Calculate context boundaries, clamped to the token list
    context_start = max(0, match_start_word - words_before)
    context_end = min(len(words), match_end_word + words_after + 1)

    # Slice the three token groups
    before_words = words[context_start:match_start_word]
    match_words = words[match_start_word:match_end_word + 1]
    after_words = words[match_end_word + 1:context_end]

    # Join back to text
    before_text = ' '.join(before_words)
    match_text = ' '.join(match_words)
    after_text = ' '.join(after_words)

    # The matched tokens are wrapped in ** markers inside the full context
    full_context = f"{before_text} **{match_text}** {after_text}".strip()

    return {
        'before_context': before_text.strip(),
        'matched_entity': match_text.strip(),
        'after_context': after_text.strip(),
        'full_context': full_context,
        'words_before_count': len(before_words),
        'words_after_count': len(after_words)
    }


# Extraction of laws and government entities

In [11]:
def _compile_pattern_groups(pattern_groups, flags):
    """Compile each (pattern, label) pair once; report and drop patterns that fail to compile."""
    compiled_groups = []
    for group_name, patterns in pattern_groups:
        compiled = []
        for pattern, entity_label in patterns:
            try:
                compiled.append((re.compile(pattern, flags), entity_label))
            except re.error as e:
                # Reported once here instead of once per processed row
                print(f"❌ Regex error in {group_name}: {e}")
        compiled_groups.append((group_name, compiled))
    return compiled_groups


def _build_entity_record(row, section_title, text, match, entity_label, group_name):
    """Build one output record for a regex match, including its 30-word context window."""
    context = extract_context_window(text, match.start(), match.end(), words_before=30, words_after=30)
    return {
        'doc_hash': row.get('doc_hash', ''),
        'row_id': row.get('row_id', ''),
        'section_title': section_title,
        'entity_text': match.group(0).strip(),
        'entity_label': entity_label,
        'pattern_group': group_name,
        'before_context': context['before_context'],
        'after_context': context['after_context'],
        'full_context': context['full_context'],
        'words_before_count': context['words_before_count'],
        'words_after_count': context['words_after_count']
    }


def extract_entities_with_true_hierarchy(df, text_column='text', section_column='document_section_title'):
    """
    TRUE HIERARCHICAL pattern extraction - official sources have PRECEDENCE:
    1. Federal laws and regulations (precise patterns) - PRIORITY
    2. CDMX laws and regulations (precise patterns) - PRIORITY
    3. Official CDMX government entities - PRIORITY
    4. Generic patterns ONLY for text spans NOT matched by official patterns

    Each row's text is scanned in two passes. Pass 1 records every match of
    the official pattern groups. Pass 2 applies the generic fallback patterns
    and keeps a match only when its character span does not overlap any
    official match found in the same row; overlapping fallback matches are
    counted as "blocked". Progress and a final summary are printed to stdout.

    Args:
        df: DataFrame with one document section per row. The columns named by
            ``text_column`` and ``section_column`` must exist; ``doc_hash`` and
            ``row_id`` are copied to the output when present ('' otherwise).
        text_column: Name of the column holding the text to scan.
        section_column: Name of the column holding the section title.

    Returns:
        DataFrame with one row per extracted entity and the columns
        doc_hash, row_id, section_title, entity_text, entity_label,
        pattern_group, before_context, after_context, full_context,
        words_before_count, words_after_count. When nothing is extracted the
        frame is empty but still has these columns.
    """
    import time
    from datetime import datetime

    start_time = time.time()
    results = []
    output_columns = [
        'doc_hash', 'row_id', 'section_title', 'entity_text', 'entity_label',
        'pattern_group', 'before_context', 'after_context', 'full_context',
        'words_before_count', 'words_after_count'
    ]

    print(f"🚀 Starting TRUE HIERARCHICAL extraction at {datetime.now().strftime('%H:%M:%S')}")
    print("=" * 70)

    # Load official patterns (each source entry is a (pattern, full_name, category) triple)
    print("📋 Loading pattern groups...")
    federal_laws_patterns = [(pattern, category) for pattern, full_name, category in FEDERAL_LAWS_REGEX]
    cdmx_laws_patterns = [(pattern, category) for pattern, full_name, category in CDMX_LAWS_REGEX]
    # Collapse doubled backslashes in the government-entity patterns before use
    official_cdmx_patterns = [(pattern.replace('\\\\', '\\'), category) for pattern, full_name, category in GOV_ENTITY_REGEX]

    # Generic fallback patterns
    # NOTE(review): every pattern is applied with re.IGNORECASE, so explicit case
    # alternatives such as (?:Ley|LEY) also match all-lowercase text. Confirm this
    # is intended; it makes the generic patterns match ordinary lowercase phrases.
    legal_patterns = [
        (r'\b(?:Ley|LEY)\s+(?:Orgánica|General|Federal|de|del|para|sobre|[A-ZÁÉÍÓÚÑa-záéíóúñ]+)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{8,150}', 'LAW_MENTION'),
        (r'\b(?:Código|CÓDIGO)\s+(?:de|del|para)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{10,80}', 'LAW_CODE'),
        (r'\b(?:Reglamento|REGLAMENTO)\s+(?:de|del|para)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{10,100}', 'REGULATION'),
        (r'\b(?:Norma|NORMA)\s+Oficial\s+Mexicana\s+[A-Z0-9\-]+', 'NOM'),
        (r'\b(?:Constitución|CONSTITUCIÓN)(?:\s+Política)?\s+(?:de|del)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{5,80}', 'CONSTITUTION'),
    ]

    general_gov_patterns = [
        (r'\b(?:La\s+|la\s+)?(?:Secretaría|SECRETARÍA)(?:\s+(?:de|del)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{5,80})?', 'SECRETARIA_GENERAL'),
        (r'\b(?:Alcaldía|ALCALDÍA|Alcaldías|ALCALDÍAS)\b', 'ALCALDIA_GENERAL'),
        (r'\b(?:SEDEMA|SEMARNAT|CONAGUA|PROFEPA|CONANP|COFEPRIS|CONDUSEF)\b', 'FEDERAL_AGENCY'),
        (r'\b(?:INE|INAI|CNDH|COFECE|IFT|INEGI|CONEVAL)\b', 'ORG_AUTONOMO_FED'),
        (r'\b(?:IMSS|ISSSTE|PEMEX|CFE)\b', 'PARAESTATAL_FED'),
        (r'\b(?:Instituto|Tribunal|Consejo|Comité|Coordinación|Organismo|Centro|Sistema|Registro)\s+(?:de|para|Público)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{8,120}', 'ORG_GENERICO'),
        (r'\b(?:Universidad|UNIVERSIDAD)\s+[A-ZÁÉÍÓÚÑa-záéíóúñ\s]{5,50}', 'UNIVERSITY'),
    ]

    # Group by priority
    official_patterns = [
        ('FEDERAL_LAWS', federal_laws_patterns),
        ('CDMX_LAWS', cdmx_laws_patterns),
        ('CDMX_OFFICIAL', official_cdmx_patterns)
    ]

    fallback_patterns = [
        ('LEGAL_DOCS', legal_patterns),
        ('GENERAL_GOV', general_gov_patterns)
    ]

    print(f"✅ OFFICIAL patterns loaded: {sum(len(patterns) for _, patterns in official_patterns)} (PRIORITY)")
    print(f"⚡ FALLBACK patterns loaded: {sum(len(patterns) for _, patterns in fallback_patterns)} (only for uncaptured text)")
    print(f"📊 Total documents to process: {len(df)}")
    print("=" * 70)

    # Compile every pattern once up front. Passing pattern strings to
    # re.finditer for each row relies on re's bounded internal cache of compiled
    # patterns, which hundreds of distinct patterns can overflow, forcing
    # recompilation on every row.
    compiled_official = _compile_pattern_groups(official_patterns, re.IGNORECASE)
    compiled_fallback = _compile_pattern_groups(fallback_patterns, re.IGNORECASE)

    # Progress tracking variables
    total_rows = len(df)
    official_matches_count = 0
    fallback_matches_count = 0
    blocked_fallback_count = 0
    processed_rows = 0

    for idx, row in df.iterrows():
        processed_rows += 1

        # Progress indicator every 100 rows (counters reflect rows completed so far)
        if processed_rows % 100 == 0 or processed_rows == total_rows:
            elapsed = time.time() - start_time
            progress_pct = (processed_rows / total_rows) * 100
            print(f"📈 Progress: {processed_rows}/{total_rows} ({progress_pct:.1f}%) | "
                  f"Official: {official_matches_count} | Fallback: {fallback_matches_count} | "
                  f"Blocked: {blocked_fallback_count} | Time: {elapsed:.1f}s")

        text = str(row[text_column]) if pd.notna(row[text_column]) else ""
        section_title = row[section_column] if pd.notna(row[section_column]) else ""

        if not text.strip():
            continue

        # Character spans of official matches in this row, used to block fallbacks
        official_spans = []
        row_official_count = 0
        row_fallback_count = 0
        row_blocked_count = 0

        # PASS 1: Extract all OFFICIAL patterns (these have precedence)
        for group_name, patterns in compiled_official:
            for compiled_pattern, entity_label in patterns:
                for match in compiled_pattern.finditer(text):
                    results.append(
                        _build_entity_record(row, section_title, text, match, entity_label, group_name)
                    )
                    official_spans.append((match.start(), match.end()))
                    row_official_count += 1

        # PASS 2: Apply FALLBACK patterns only to uncovered text spans
        for group_name, patterns in compiled_fallback:
            for compiled_pattern, entity_label in patterns:
                for match in compiled_pattern.finditer(text):
                    match_start, match_end = match.start(), match.end()

                    # Two half-open spans overlap when the larger start is
                    # strictly below the smaller end
                    overlaps_official = any(
                        max(match_start, off_start) < min(match_end, off_end)
                        for off_start, off_end in official_spans
                    )

                    if overlaps_official:
                        row_blocked_count += 1
                    else:
                        # Only add if no overlap with official patterns
                        results.append(
                            _build_entity_record(row, section_title, text, match, entity_label, group_name)
                        )
                        row_fallback_count += 1

        # Update global counters
        official_matches_count += row_official_count
        fallback_matches_count += row_fallback_count
        blocked_fallback_count += row_blocked_count

    # Final summary
    total_time = time.time() - start_time
    total_entities = len(results)
    # Guard against ZeroDivisionError when nothing was extracted
    official_share = (official_matches_count / total_entities * 100) if total_entities else 0.0

    print("=" * 70)
    print(f"✅ EXTRACTION COMPLETED at {datetime.now().strftime('%H:%M:%S')}")
    print(f"⏱️  Total processing time: {total_time:.2f} seconds")
    print(f"📊 Documents processed: {processed_rows}")
    print(f"🎯 Official matches: {official_matches_count}")
    print(f"⚡ Fallback matches: {fallback_matches_count}")
    print(f"🚫 Blocked duplicates: {blocked_fallback_count}")
    print(f"📈 Total entities extracted: {total_entities}")
    print(f"💡 Efficiency: {official_share:.1f}% from official patterns")
    print("=" * 70)

    # Explicit columns keep the schema stable even when `results` is empty
    return pd.DataFrame(results, columns=output_columns)


In [12]:
# Run the two-pass extraction (official patterns first, fallback second) over `df`,
# naming the default text and section-title columns explicitly.
entities_df = extract_entities_with_true_hierarchy(
    df,
    text_column='text',
    section_column='document_section_title',
)

print(f"\n🎯 FINAL SUMMARY: Found {len(entities_df)} entities with TRUE HIERARCHICAL extraction")


🚀 Starting TRUE HIERARCHICAL extraction at 15:34:09
📋 Loading pattern groups...
✅ OFFICIAL patterns loaded: 896 (PRIORITY)
⚡ FALLBACK patterns loaded: 12 (only for uncaptured text)
📊 Total documents to process: 11130
📈 Progress: 100/11130 (0.9%) | Official: 208 | Fallback: 413 | Blocked: 72 | Time: 15.8s
📈 Progress: 200/11130 (1.8%) | Official: 256 | Fallback: 599 | Blocked: 108 | Time: 30.0s
📈 Progress: 300/11130 (2.7%) | Official: 284 | Fallback: 714 | Blocked: 114 | Time: 43.7s
📈 Progress: 400/11130 (3.6%) | Official: 312 | Fallback: 850 | Blocked: 140 | Time: 57.2s
📈 Progress: 500/11130 (4.5%) | Official: 356 | Fallback: 940 | Blocked: 163 | Time: 71.0s
📈 Progress: 600/11130 (5.4%) | Official: 368 | Fallback: 978 | Blocked: 170 | Time: 85.1s
📈 Progress: 700/11130 (6.3%) | Official: 492 | Fallback: 1061 | Blocked: 236 | Time: 99.0s
📈 Progress: 800/11130 (7.2%) | Official: 540 | Fallback: 1112 | Blocked: 265 | Time: 112.4s
📈 Progress: 900/11130 (8.1%) | Official: 631 | Fallback: 1238

In [15]:
# Preview a few extracted entities together with their context windows
print("📋 Sample of extracted entities with context:")
print("=" * 120)

# Walk the first 3 results and print a compact, truncated view of each
sample_results = entities_df.head(3)
for idx, row in sample_results.iterrows():
    # Entities from the three official pattern groups are flagged as priority matches
    if row['pattern_group'] in ['FEDERAL_LAWS', 'CDMX_LAWS', 'CDMX_OFFICIAL']:
        priority = "🔥 OFFICIAL"
    else:
        priority = "⚡ FALLBACK"

    print(f"{priority} Entity: {row['entity_text']}")
    print(f"   📍 Label: {row['entity_label']} | Group: {row['pattern_group']}")
    # Context is cut to 150 characters, with an ellipsis when truncated
    print(f"   📝 Context: {row['full_context'][:150]}{'...' if len(row['full_context']) > 150 else ''}")
    print(f"   📊 Words: {row['words_before_count']} before + {row['words_after_count']} after")
    print()

print("=" * 120)
print(f"📈 Column structure: {list(entities_df.columns)}")

# Render the first rows of the result table as the cell output
entities_df.head()

📋 Sample of extracted entities with context:
⚡ FALLBACK Entity: Constitución Política de los Estados Unidos Mexicanos
   📍 Label: CONSTITUTION | Group: LEGAL_DOCS
   📝 Context: en todo lo concerniente a su régimen interior y a su organización política y administrativa . 5 . Las autoridades de la Ciudad ejercen las facultades ...
   📊 Words: 30 before + 30 after

⚡ FALLBACK Entity: Constitución Política de los Estados Unidos Mexicanos
   📍 Label: CONSTITUTION | Group: LEGAL_DOCS
   📝 Context: depende su competitividad , productividad y prosperidad . 8 . El territorio de la Ciudad de México es el que actualmente tiene de conformidad con el a...
   📊 Words: 30 before + 26 after

⚡ FALLBACK Entity: sistema de división de poderes
   📍 Label: ORG_GENERICO | Group: GENERAL_GOV
   📝 Context: y se instituye para beneficio de éste . 3 . La Ciudad adopta para su gobierno la forma republicana , democrática , representativa , laica y popular , ...
   📊 Words: 30 before + 30 after

📈 Column structur

Unnamed: 0,doc_hash,row_id,section_title,entity_text,entity_label,pattern_group,before_context,after_context,full_context,words_before_count,words_after_count
0,F823AF8C,F823AF8C_ARTCULO1,Artículo 1,Constitución Política de los Estados Unidos Me...,CONSTITUTION,LEGAL_DOCS,en todo lo concerniente a su régimen interior ...,", todas aquellas que ésta no concede expresame...",en todo lo concerniente a su régimen interior ...,30,30
1,F823AF8C,F823AF8C_ARTCULO1,Artículo 1,Constitución Política de los Estados Unidos Me...,CONSTITUTION,LEGAL_DOCS,"depende su competitividad , productividad y pr...",. Sus límites geográficos son los fijados por ...,"depende su competitividad , productividad y pr...",30,26
2,F823AF8C,F823AF8C_ARTCULO1,Artículo 1,sistema de división de poderes,ORG_GENERICO,GENERAL_GOV,y se instituye para beneficio de éste . 3 . La...,", pluralismo político y participación social ....",y se instituye para beneficio de éste . 3 . La...,30,30
3,F823AF8C,F823AF8C_ARTCULO3,Artículo 3,Constitución Política de los Estados Unidos Me...,CONSTITUTION,LEGAL_DOCS,"ingreso , la dignificación del trabajo y el sa...",", la igualdad sustantiva , la no discriminació...","ingreso , la dignificación del trabajo y el sa...",30,30
4,F823AF8C,F823AF8C_ARTCULO4,Artículo 4,Constitución Política de los Estados Unidos Me...,CONSTITUTION,LEGAL_DOCS,derechos humanos A . De la protección de los d...,", en los tratados e instrumentos internacional...",derechos humanos A . De la protección de los d...,30,30


In [16]:
# Persist the extracted entities (with doc_hash / row_id identifiers) to CSV.
# The DataFrame index is not written; 'utf-8-sig' prefixes the file with a BOM.
# NOTE(review): absolute, machine-specific output path; consider making it configurable.
entities_output = '/Users/alexa/Projects/cdmx_kg/data/entities_extracted_complete.csv'
entities_df.to_csv(
    path_or_buf=entities_output,
    index=False,
    encoding='utf-8-sig',
)
print(f"Entities saved to: {entities_output}")

Entities saved to: /Users/alexa/Projects/cdmx_kg/data/entities_extracted_complete.csv
