In [1]:
import pandas as pd
import re
import time
from datetime import datetime

print(f"🚀 Starting article extraction process at {datetime.now().strftime('%H:%M:%S')}")
start_time = time.time()

df = pd.read_csv('/Users/alexa/Projects/cdmx_kg/data/identifiers_0_half.csv')

print(f"📊 Data loaded: {len(df)} rows")
print(f"📋 Columns available: {list(df.columns)}")
print(f"⏱️  Data loading took: {time.time() - start_time:.2f} seconds")

🚀 Starting article extraction process at 15:48:02
📊 Data loaded: 11130 rows
📋 Columns available: ['row_id', 'doc_hash', 'document_name', 'document_section_title', 'text']
⏱️  Data loading took: 0.12 seconds


In [None]:
def extract_article_mentions_improved(df, text_column='text', section_column='document_section_title'):
    """
    VERSIÓN MEJORADA: Detecta múltiples artículos y ofrece contexto flexible
    
    MEJORAS:
    1. ✅ Detecta "artículos 50 y 325" 
    2. ✅ Contexto por caracteres (300) además de palabras (30)
    3. ✅ Múltiples patrones para casos complejos
    4. ✅ Tracking de tiempo y progreso
    """
    
    print(f"\n🔍 Starting article extraction from {len(df)} rows...")
    start_time = time.time()
    
    # PATRONES MEJORADOS
    article_patterns = [
        # Patrón 1: Múltiples artículos con conectores
        r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?(?:\s*(?:y|al|,)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)*',
        
        # Patrón 2: Rangos "del 10 al 15"  
        r'\b(?:art[íi]culos?|art\.?)\s*(?:del\s*)?\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?\s*al\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?',
        
        # Patrón 3: Individual (respaldo)
        r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?'
    ]
    
    results = []
    processed_rows = 0
    rows_with_matches = 0
    total_matches = 0
    
    # Progress tracking
    total_rows = len(df)
    progress_interval = max(1, total_rows // 20)  # Show progress every 5%
    
    for idx, row in df.iterrows():
        processed_rows += 1
        
        # Progress indicator
        if processed_rows % progress_interval == 0:
            progress = (processed_rows / total_rows) * 100
            elapsed = time.time() - start_time
            rate = processed_rows / elapsed if elapsed > 0 else 0
            eta = (total_rows - processed_rows) / rate if rate > 0 else 0
            print(f"⏳ Progress: {progress:.1f}% ({processed_rows}/{total_rows}) | "
                  f"Rate: {rate:.1f} rows/sec | ETA: {eta:.1f}s | Matches: {total_matches}")
        
        text = str(row[text_column]) if pd.notna(row[text_column]) else ""
        section_title = row[section_column] if pd.notna(row[section_column]) else ""
        
        # Use existing columns directly (since they exist in the data)
        doc_hash = row['doc_hash']
        row_id = row['row_id']
        
        if not text.strip():
            continue
            
        # Aplicar patrones (evita duplicados con set)
        found_matches = set()
        row_match_count = 0
        
        for pattern_idx, pattern in enumerate(article_patterns):
            matches = re.finditer(pattern, text, re.IGNORECASE)
            
            for match in matches:
                match_key = (match.start(), match.end(), match.group(0))
                if match_key in found_matches:
                    continue
                found_matches.add(match_key)
                
                matched_text = match.group(0)
                
                # CONTEXTO MEJORADO: 30 palabras + 300 caracteres
                context_words = get_word_context(text, match, 30)
                context_chars = get_char_context(text, match, 300)
                
                results.append({
                    'doc_hash': doc_hash,
                    'row_id': row_id,
                    'pattern_type': f"Pattern_{pattern_idx + 1}",
                    #'document_section_title': section_title,
                    'matched_text': matched_text,
                    'context_30_words': context_words,
                    'context_300_chars': context_chars,
                    #'start_char': match.start(),
                    #'end_char': match.end(),
                })
                row_match_count += 1
                total_matches += 1
        
        if row_match_count > 0:
            rows_with_matches += 1
    
    elapsed_time = time.time() - start_time
    
    print(f"\n✅ Extraction completed!")
    print(f"⏱️  Total time: {elapsed_time:.2f} seconds")
    print(f"📊 Processed: {processed_rows} rows")
    print(f"🎯 Rows with matches: {rows_with_matches} ({rows_with_matches/processed_rows*100:.1f}%)")
    print(f"🔢 Total matches found: {total_matches}")
    print(f"⚡ Processing rate: {processed_rows/elapsed_time:.1f} rows/second")
    
    return pd.DataFrame(results)

def get_word_context(text, match, num_words):
    """Contexto por palabras"""
    words = text.split()
    text_before_match = text[:match.start()]
    text_after_match = text[match.end():]
    
    words_before = text_before_match.split()[-num_words:] if text_before_match else []
    words_after = text_after_match.split()[:num_words] if text_after_match else []
    
    before = " ".join(words_before)
    after = " ".join(words_after)
    
    return f"{before} **{match.group(0)}** {after}".strip()

def get_char_context(text, match, num_chars):
    """Contexto por caracteres"""
    start = max(0, match.start() - num_chars)
    end = min(len(text), match.end() + num_chars)
    
    before = text[start:match.start()]
    after = text[match.end():end]
    
    return f"{before}**{match.group(0)}**{after}"

print("✅ FUNCIÓN MEJORADA DE ARTÍCULOS INTEGRADA")
print("   - Detecta múltiples artículos: 'artículos 50 y 325'")
print("   - Contexto dual: 30 palabras + 300 caracteres")
print("   - 3 patrones diferentes para mayor cobertura")

✅ FUNCIÓN MEJORADA DE ARTÍCULOS INTEGRADA
   - Detecta múltiples artículos: 'artículos 50 y 325'
   - Contexto dual: 30 palabras + 300 caracteres
   - 3 patrones diferentes para mayor cobertura


In [3]:
# Execute the extraction with timing
extraction_start = time.time()
mentions_df = extract_article_mentions_improved(df)
extraction_time = time.time() - extraction_start

print(f"\n📈 FINAL RESULTS:")
print(f"🔢 Total mentions extracted: {len(mentions_df)}")
print(f"⏱️  Total extraction time: {extraction_time:.2f} seconds")

if len(mentions_df) > 0:
    print(f"\n📋 Sample of extracted data:")
    print(f"📊 Pattern distribution:")
    pattern_counts = mentions_df['pattern_type'].value_counts()
    for pattern, count in pattern_counts.items():
        print(f"   {pattern}: {count} matches ({count/len(mentions_df)*100:.1f}%)")
else:
    print("⚠️  No article mentions found in the data")


🔍 Starting article extraction from 11130 rows...
⏳ Progress: 5.0% (556/11130) | Rate: 12913.2 rows/sec | ETA: 0.8s | Matches: 72
⏳ Progress: 10.0% (1112/11130) | Rate: 13247.6 rows/sec | ETA: 0.8s | Matches: 234
⏳ Progress: 15.0% (1668/11130) | Rate: 12523.2 rows/sec | ETA: 0.8s | Matches: 432
⏳ Progress: 20.0% (2224/11130) | Rate: 10479.8 rows/sec | ETA: 0.8s | Matches: 705
⏳ Progress: 25.0% (2780/11130) | Rate: 10688.9 rows/sec | ETA: 0.8s | Matches: 856
⏳ Progress: 30.0% (3336/11130) | Rate: 11111.5 rows/sec | ETA: 0.7s | Matches: 951
⏳ Progress: 35.0% (3892/11130) | Rate: 11655.4 rows/sec | ETA: 0.6s | Matches: 1022
⏳ Progress: 40.0% (4448/11130) | Rate: 11698.1 rows/sec | ETA: 0.6s | Matches: 1121
⏳ Progress: 45.0% (5004/11130) | Rate: 11630.5 rows/sec | ETA: 0.5s | Matches: 1342
⏳ Progress: 50.0% (5560/11130) | Rate: 11795.3 rows/sec | ETA: 0.5s | Matches: 1480
⏳ Progress: 55.0% (6116/11130) | Rate: 11950.8 rows/sec | ETA: 0.4s | Matches: 1566
⏳ Progress: 59.9% (6672/11130) | Ra

In [None]:
# 🧪 TEST CASE: Your specific example
test_text = "Consejo de Salud del Distrito Federal: al Consejo integrado en términos de los artículos 22 y 23 de la Ley de Salud del Distrito Federal;"

print("🧪 TESTING CURRENT PATTERNS:")
print(f"Test text: {test_text}")
print()

# Current patterns from the function
article_patterns = [
    # Patrón 1: Múltiples artículos con conectores
    r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?(?:\s*(?:y|al|,)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)*',
    
    # Patrón 2: Rangos "del 10 al 15"  
    r'\b(?:art[íi]culos?|art\.?)\s*(?:del\s*)?\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?\s*al\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?',
    
    # Patrón 3: Individual (respaldo)
    r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?'
]

for i, pattern in enumerate(article_patterns, 1):
    matches = re.findall(pattern, test_text, re.IGNORECASE)
    print(f"Pattern {i}: {matches}")
    if matches:
        print(f"  ✅ Found: {matches[0]}")
    else:
        print(f"  ❌ No match")
    print()

# Test additional challenging cases
print("🧪 TESTING ADDITIONAL EDGE CASES:")
edge_cases = [
    "según el artículo 15 bis y 16 ter de la constitución",
    "conforme a los artículos 1°, 2°, y 3° del reglamento", 
    "artículos del 50 al 75 de la ley general",
    "art. 123, 124 y 125 establecen que...",
    "Los artículos 1, 2, 3, 4 y 5 determinan",
    "artículo 25 quáter de la ley ambiental"
]

for case in edge_cases:
    print(f"Text: {case}")
    for i, pattern in enumerate(article_patterns, 1):
        matches = re.findall(pattern, case, re.IGNORECASE)
        if matches:
            print(f"  Pattern {i} ✅: {matches[0]}")
    print()


In [None]:
# 💡 SUGGESTED IMPROVEMENTS FOR BETTER COVERAGE

print("💡 IMPROVED PATTERNS:")

# Enhanced patterns with better coverage
improved_patterns = [
    # Pattern 1: Multiple articles with various connectors (improved)
    r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?(?:\s*(?:y|e|al|,|;)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)*',
    
    # Pattern 2: Ranges with "del X al Y" or "X al Y"
    r'\b(?:art[íi]culos?|art\.?)\s*(?:del\s*)?\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?\s*al\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?',
    
    # Pattern 3: Comma-separated lists (1, 2, 3, 4 y 5)
    r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?(?:\s*,\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)*(?:\s*(?:y|e)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?)?',
    
    # Pattern 4: Individual articles (backup)
    r'\b(?:art[íi]culos?|art\.?)\s*\d+(?:\s*[°º])?(?:\s*bis|ter|qu[aá]ter)?'
]

print("Testing improved patterns:")
test_cases_all = [
    "artículos 22 y 23 de la Ley",
    "según el artículo 15 bis y 16 ter",
    "artículos 1°, 2°, y 3° del reglamento", 
    "artículos del 50 al 75 de la ley",
    "art. 123, 124 y 125 establecen",
    "artículos 1, 2, 3, 4 y 5 determinan",
    "artículo 25 quáter de la ley"
]

for case in test_cases_all:
    print(f"\nText: {case}")
    for i, pattern in enumerate(improved_patterns, 1):
        matches = re.findall(pattern, case, re.IGNORECASE)
        if matches:
            print(f"  Improved Pattern {i} ✅: {matches[0]}")

print("\n" + "="*50)
print("🔍 RECOMMENDATIONS:")
print("1. ✅ Your current patterns should work for 'artículos 22 y 23'")
print("2. 💡 Consider adding comma-separated pattern for complex lists")
print("3. 🔧 Add 'e' as alternative to 'y' (Spanish grammar)")
print("4. 📝 Consider semicolon (;) as separator in formal texts")
print("5. 🎯 Pattern priority: Most specific → Most general")


In [None]:
#mentions_df = mentions_df[['entity_id','source_row_id','original_text','entity_text', 'entity_label','pattern_group']]
mentions_df.head()

In [None]:
# Save entity recognition results with IDs

mentions_output = '/Users/alexa/Projects/cdmx_kg/data/mentions_extracted_complete.csv'
mentions_df.to_csv(mentions_output, index=False, encoding='utf-8-sig')

