# LAW AND ARTICLE MENTIONS NORMALIZATION PIPELINE
Based on entity_normalization.ipynb - normalizes law names and creates hash IDs for laws and law+article combinations


In [23]:
import hashlib
import pandas as pd
from collections import Counter, defaultdict
import os
from dotenv import load_dotenv
from openai import OpenAI
import time

# Initialize OpenAI client
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

print("✅ OpenAI client initialized successfully!")
print("📚 Law and article mention normalization system ready!")


✅ OpenAI client initialized successfully!
📚 Law and article mention normalization system ready!


In [24]:
def prompt_law_normalization(law_names_batch):
    """
    ChatGPT prompt to normalize law names to their official, standardized forms
    """
    laws_text = "\n".join([f"- {law}" for law in law_names_batch])
    
    return f"""You are a specialized legal document normalization system for Mexican law. Your task is to standardize legal document names to their official, complete forms.

TASK: Normalize the following legal document names to their official names.

NORMALIZATION RULES:
• Use OFFICIAL COMPLETE NAMES as they appear in legal documents
• Standardize abbreviations and variations (e.g., "CDMX" → "CIUDAD DE MÉXICO")
• Remove prefixes like "la", "el" when not part of official name
• Keep proper capitalization and accents
• Maintain consistency in naming conventions
• For constitutions, use full official name
• For laws, codes, and regulations, use complete official title

COMMON PATTERNS TO NORMALIZE:
• "LEY AMBIENTAL DE LA CIUDAD DE MÉXICO" (keep as-is - official name)
• "CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO" (keep as-is - official name)
• "la Ley General..." → "LEY GENERAL..." (remove article, standardize case)
• "Código Penal..." → "CÓDIGO PENAL..." (standardize case)


OUTPUT FORMAT (CSV only, no headers):
original_name,normalized_name

LAW NAMES TO NORMALIZE:
{laws_text}

Provide the normalized version for each law:"""


In [25]:
def prompt_law_normalization_with_reference(law_names_batch, reference_laws):
    """
    ChatGPT prompt to normalize law names using existing identifiers as reference
    """
    laws_text = "\n".join([f"- {law}" for law in law_names_batch])
    reference_text = "\n".join([f"- {law}" for law in reference_laws[:50]])  # Limit to first 50 for context
    
    return f"""You are a specialized legal document normalization system for Mexican law. Your task is to standardize legal document names using an existing reference database.

REFERENCE DATABASE OF OFFICIAL LAW NAMES:
{reference_text}
... and {len(reference_laws)-50 if len(reference_laws) > 50 else 0} more official law names in the database.

TASK: Normalize the following legal document names by matching them to the REFERENCE DATABASE above.

NORMALIZATION RULES (IN PRIORITY ORDER):
1. **PRIMARY**: If the law name matches or is a variation of ANY name in the REFERENCE DATABASE, use the EXACT name from the database
2. **SECONDARY**: If no match found in database, create the official standardized name following these rules:
   • Use OFFICIAL COMPLETE NAMES as they appear in legal documents
   • Standardize abbreviations (e.g., "CDMX" → "CIUDAD DE MÉXICO", "DF" → "DISTRITO FEDERAL")
   • Remove prefixes like "la", "el" when not part of official name
   • Keep proper capitalization and accents
   • Use consistent naming patterns

MATCHING STRATEGY:
• Look for exact matches first
• Look for matches ignoring case
• Look for matches ignoring articles ("la", "el", "del", etc.)
• Look for abbreviated vs full name matches
• Consider regional variations (DF vs Ciudad de México)

EXAMPLES:
• "la Ley Ambiental de la Ciudad de México" → "LEY AMBIENTAL DE LA CIUDAD DE MÉXICO" (found in database)
• "Constitución de la CDMX" → "CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO" (found in database)
• "Ley del Medio Ambiente DF" → "LEY AMBIENTAL DE LA CIUDAD DE MÉXICO" (match found)

OUTPUT FORMAT (CSV only, no headers):
original_name,normalized_name

LAW NAMES TO NORMALIZE:
{laws_text}

For each law, provide the normalized version (preferring exact database matches):"""

def create_new_document_hash(document_name):
    """Generate a hash for a new legal document not in existing identifiers"""
    return hashlib.md5(document_name.encode('utf-8')).hexdigest()[:8].upper()

def create_new_article_hash(document_hash, article_number):
    """Generate a row_id for a new article in format: DOC_HASH_ARTCULO{num}"""
    return f"{document_hash}_ARTCULO{article_number}"


In [26]:
def load_existing_identifiers():
    """Load existing identifiers from identifiers_0.csv to use existing hashes"""
    identifiers_df = pd.read_csv("/Users/alexa/Projects/cdmx_kg/data/identifiers_0.csv")
    
    # Create lookup dictionaries
    # Law name -> doc_hash mapping
    law_to_hash = dict(zip(identifiers_df['document_name'], identifiers_df['doc_hash']))
    
    # Get unique law names for reference
    reference_laws = sorted(identifiers_df['document_name'].unique())
    
    # (Law name, article number) -> row_id mapping for specific articles
    article_lookup = {}
    for _, row in identifiers_df.iterrows():
        if 'Artículo' in str(row['document_section_title']):
            # Extract article number from "Artículo X"
            try:
                art_num = str(row['document_section_title']).replace('Artículo ', '').strip()
                key = (row['document_name'], art_num)
                article_lookup[key] = row['row_id']
            except:
                continue
    
    return law_to_hash, article_lookup, reference_laws

def find_law_hash(law_name, law_to_hash_map, normalization_map=None):
    """Find existing doc_hash for a law name, trying different matching strategies"""
    # Handle NaN or None values
    if pd.isna(law_name) or law_name is None:
        print(f"⚠️  No existing hash found for law: {law_name} (NaN/None)")
        return None
    
    # Convert to string if not already
    law_name = str(law_name).strip()
    if law_name == '' or law_name.lower() == 'nan':
        print(f"⚠️  No existing hash found for law: {law_name} (empty/NaN)")
        return None
    
    # Try direct match first
    if law_name in law_to_hash_map:
        return law_to_hash_map[law_name]
    
    # Try normalized name if normalization was done
    if normalization_map and law_name in normalization_map:
        normalized_name = normalization_map[law_name]
        if normalized_name in law_to_hash_map:
            return law_to_hash_map[normalized_name]
    
    # Try case-insensitive matching
    law_name_upper = law_name.upper()
    for existing_law, hash_id in law_to_hash_map.items():
        if existing_law.upper() == law_name_upper:
            return hash_id
    
    # Try partial matching (for variations in law names)
    for existing_law, hash_id in law_to_hash_map.items():
        # Remove common prefixes and articles for comparison
        clean_existing = existing_law.replace('la ', '').replace('el ', '').replace('LA ', '').replace('EL ', '').strip()
        clean_input = law_name.replace('la ', '').replace('el ', '').replace('LA ', '').replace('EL ', '').strip()
        
        if clean_existing.upper() == clean_input.upper():
            return hash_id
    
    print(f"⚠️  No existing hash found for law: {law_name}")
    return None

def find_law_article_hash(law_name, article_number, article_lookup_map, law_to_hash_map, normalization_map=None):
    """Find existing row_id for a specific law+article combination"""
    if pd.isna(article_number) or article_number == '':
        return None
    
    # Handle NaN or None law names
    if pd.isna(law_name) or law_name is None:
        print(f"⚠️  No existing hash found for law+article: {law_name} - Artículo {article_number} (NaN law name)")
        return None
    
    # Convert to string if not already
    law_name = str(law_name).strip()
    if law_name == '' or law_name.lower() == 'nan':
        print(f"⚠️  No existing hash found for law+article: {law_name} - Artículo {article_number} (empty law name)")
        return None
    
    # Convert article number to string
    art_num_str = str(int(article_number)) if isinstance(article_number, float) else str(article_number)
    
    # Try direct match first
    key = (law_name, art_num_str)
    if key in article_lookup_map:
        return article_lookup_map[key]
    
    # Try with normalized law name
    if normalization_map and law_name in normalization_map:
        normalized_name = normalization_map[law_name]
        key = (normalized_name, art_num_str)
        if key in article_lookup_map:
            return article_lookup_map[key]
    
    # Try case-insensitive matching
    for (existing_law, existing_art), row_id in article_lookup_map.items():
        if existing_law.upper() == law_name.upper() and existing_art == art_num_str:
            return row_id
    
    print(f"⚠️  No existing hash found for law+article: {law_name} - Artículo {art_num_str}")
    return None


In [27]:
def normalize_laws_with_chatgpt_reference(law_names_list, reference_laws, batch_size=10):
    """
    Normalize law names using ChatGPT in batches with reference database
    """
    # Get unique law names to normalize
    unique_laws = list(set([law for law in law_names_list if pd.notna(law) and law.strip() != '']))
    normalization_map = {}
    
    print(f"Normalizing {len(unique_laws)} unique law names in batches of {batch_size}...")
    print(f"Using {len(reference_laws)} reference laws from existing identifiers")
    
    for i in range(0, len(unique_laws), batch_size):
        batch = unique_laws[i:i+batch_size]
        batch_num = (i // batch_size) + 1
        total_batches = (len(unique_laws) + batch_size - 1) // batch_size
        
        print(f"  Processing batch {batch_num}/{total_batches} ({len(batch)} laws)")
        
        try:
            # Call ChatGPT for normalization using reference database
            prompt = prompt_law_normalization_with_reference(batch, reference_laws)
            
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1
            )
            
            output = resp.choices[0].message.content.strip()
            
            # Parse the normalization results
            for line in output.split('\n'):
                if ',' in line:
                    parts = line.split(',', 1)
                    if len(parts) == 2:
                        original = parts[0].strip().strip('"')
                        normalized = parts[1].strip().strip('"')
                        normalization_map[original] = normalized
                        
        except Exception as e:
            print(f"    Error normalizing batch {batch_num}: {e}")
            # Fallback: use original names
            for law in batch:
                normalization_map[law] = law
        
        # Small delay to avoid rate limits
        time.sleep(0.5)
    
    return normalization_map


In [28]:
def create_law_mentions_id_system():
    """
    Complete law mentions ID system: normalize law names and use existing hash IDs
    """
    print("=== LAW AND ARTICLE MENTIONS NORMALIZATION AND ID SYSTEM ===")
    
    # Load data
    mentions_df = pd.read_csv("/Users/alexa/Projects/cdmx_kg/data/article_mentions_test_01.csv")
    
    print(f"Loaded {len(mentions_df)} law/article mentions")
    print(f"Columns: {list(mentions_df.columns)}")
    
    # Step 1: Load existing identifiers
    print("\n--- Step 1: Loading Existing Identifiers ---")
    law_to_hash_map, article_lookup_map, reference_laws = load_existing_identifiers()
    print(f"Loaded {len(law_to_hash_map)} unique laws with existing hashes")
    print(f"Loaded {len(article_lookup_map)} law+article combinations with existing IDs")
    print(f"Loaded {len(reference_laws)} reference law names for normalization")
    
    # Step 2: Normalize law names using ChatGPT with reference database
    print("\n--- Step 2: Law Name Normalization with Reference Database ---")
    law_names = mentions_df['law_name'].tolist()
    normalization_map = normalize_laws_with_chatgpt_reference(law_names, reference_laws)
    
    # Step 3: Create output dataset with existing and new hash IDs
    print("\n--- Step 3: Assigning Hash IDs (Existing + New) ---")
    
    # Process each mention
    processed_mentions = []
    new_documents = {}  # normalized_law_name -> doc_hash for new laws
    new_articles = []   # List of new article records
    unmatched_laws = set()
    unmatched_articles = set()
    
    for idx, row in mentions_df.iterrows():
        original_law = row['law_name']
        article_num = row['art_num']
        
        # Skip rows with NaN law names
        if pd.isna(original_law) or original_law is None:
            print(f"⚠️  Skipping row {idx}: law_name is NaN/None")
            processed_mentions.append({
                'row_id': row['row_id'],
                'art_num': article_num,
                'original_law_name': original_law,
                'normalized_law_name': None,
                'mention_extraction': row['mention_extraction'],
                'law_hash': None,
                'law_article_hash': None
            })
            continue
        
        # Convert to string and validate
        original_law = str(original_law).strip()
        if original_law == '' or original_law.lower() == 'nan':
            print(f"⚠️  Skipping row {idx}: law_name is empty or NaN string")
            processed_mentions.append({
                'row_id': row['row_id'],
                'art_num': article_num,
                'original_law_name': original_law,
                'normalized_law_name': None,
                'mention_extraction': row['mention_extraction'],
                'law_hash': None,
                'law_article_hash': None
            })
            continue
        
        # Get normalized law name
        normalized_law = normalization_map.get(original_law, original_law)
        
        # Find existing law hash (doc_hash)
        law_hash = find_law_hash(original_law, law_to_hash_map, normalization_map)
        
        # If no existing hash found, create a new one (only if we have a valid normalized law)
        if law_hash is None and normalized_law and normalized_law.strip():
            if normalized_law not in new_documents:
                # Create new document hash
                new_doc_hash = create_new_document_hash(normalized_law)
                new_documents[normalized_law] = new_doc_hash
                print(f"  🆕 Created new hash for: {normalized_law} → {new_doc_hash}")
            law_hash = new_documents[normalized_law]
        
        # Find existing law+article hash (row_id) if article number exists
        law_article_hash = None
        if not pd.isna(article_num) and article_num != '':
            law_article_hash = find_law_article_hash(original_law, article_num, article_lookup_map, law_to_hash_map, normalization_map)
            
            # If no existing article hash found, create a new one
            if law_article_hash is None:
                law_article_hash = create_new_article_hash(law_hash, int(article_num))
                new_articles.append({
                    'row_id': law_article_hash,
                    'doc_hash': law_hash,
                    'document_name': normalized_law,
                    'document_section_title': f'Artículo {int(article_num)}'
                })
                print(f"  🆕 Created new article hash: {normalized_law} Art. {article_num} → {law_article_hash}")
        
        processed_mentions.append({
            'row_id': row['row_id'],
            'art_num': article_num,
            'original_law_name': original_law,
            'normalized_law_name': normalized_law,
            'mention_extraction': row['mention_extraction'],
            'law_hash': law_hash,
            'law_article_hash': law_article_hash
        })
    
    result_df = pd.DataFrame(processed_mentions)
    
    # Step 4: Create new identifiers file for new documents
    print("\n--- Step 4: Creating New Identifiers File ---")
    
    # Create new identifiers for documents without articles (just the law)
    new_doc_records = []
    for normalized_law, doc_hash in new_documents.items():
        new_doc_records.append({
            'row_id': doc_hash,  # For documents without specific articles, row_id = doc_hash
            'doc_hash': doc_hash,
            'document_name': normalized_law,
            'document_section_title': 'Full Document'
        })
    
    # Combine new document records with new article records
    all_new_records = new_doc_records + new_articles
    
    if all_new_records:
        new_identifiers_df = pd.DataFrame(all_new_records)
        new_identifiers_file = "/Users/alexa/Projects/cdmx_kg/data/new_identifiers.csv"
        new_identifiers_df.to_csv(new_identifiers_file, index=False, encoding='utf-8')
        print(f"✅ Created new identifiers file: {new_identifiers_file}")
        print(f"   📄 New documents: {len(new_documents)}")
        print(f"   📖 New articles: {len(new_articles)}")
        print(f"   📋 Total new records: {len(all_new_records)}")
    else:
        print("ℹ️  No new documents or articles to add to identifiers")
    
    # Step 5: Create summary statistics
    print("\n--- Step 5: Summary Statistics ---")
    
    total_mentions = len(result_df)
    mentions_with_law_hash = result_df['law_hash'].notna().sum()
    mentions_with_article_hash = result_df['law_article_hash'].notna().sum()
    unique_laws = result_df['normalized_law_name'].nunique()
    unique_law_hashes = result_df['law_hash'].nunique()
    unique_article_hashes = result_df['law_article_hash'].nunique()
    
    print(f"Total mentions processed: {total_mentions}")
    print(f"Mentions with law hash assigned: {mentions_with_law_hash}/{total_mentions} ({mentions_with_law_hash/total_mentions*100:.1f}%)")
    print(f"Mentions with article hash assigned: {mentions_with_article_hash} (where applicable)")
    print(f"Unique normalized laws: {unique_laws}")
    print(f"Unique law hashes (existing + new): {unique_law_hashes}")
    print(f"Unique article hashes (existing + new): {unique_article_hashes}")
    
    # Count normalizations
    normalized_count = len([k for k, v in normalization_map.items() if k != v])
    print(f"Laws that were normalized: {normalized_count}/{len(normalization_map)}")
    
    # Report new vs existing
    existing_matches = len([h for h in result_df['law_hash'] if h in law_to_hash_map.values()])
    new_hashes = len([h for h in result_df['law_hash'] if h in new_documents.values()])
    print(f"Existing law matches: {existing_matches}")
    print(f"New law hashes created: {new_hashes}")
    
    # Step 6: Save results
    output_file = "/Users/alexa/Projects/cdmx_kg/data/normalized_law_mentions_with_complete_hashes.csv"
    result_df.to_csv(output_file, index=False, encoding='utf-8')
    
    print(f"\n✅ Saved normalized mentions with complete hashes to: {output_file}")
    
    return result_df, normalization_map, new_documents, new_articles


In [29]:
# Run the law mentions normalization and ID system
result_df, normalization_map, new_documents, new_articles = create_law_mentions_id_system()


=== LAW AND ARTICLE MENTIONS NORMALIZATION AND ID SYSTEM ===
Loaded 728 law/article mentions
Columns: ['row_id', 'art_num', 'law_name', 'mention_extraction']

--- Step 1: Loading Existing Identifiers ---
Loaded 212 unique laws with existing hashes
Loaded 16164 law+article combinations with existing IDs
Loaded 212 reference law names for normalization

--- Step 2: Law Name Normalization with Reference Database ---
Normalizing 54 unique law names in batches of 10...
Using 212 reference laws from existing identifiers
  Processing batch 1/6 (10 laws)
  Processing batch 2/6 (10 laws)
  Processing batch 3/6 (10 laws)
  Processing batch 4/6 (10 laws)
  Processing batch 5/6 (10 laws)
  Processing batch 6/6 (4 laws)

--- Step 3: Assigning Hash IDs (Existing + New) ---
⚠️  No existing hash found for law: CONSTITUCIÓN POLÍTICA DE LOS ESTADOS UNIDOS MEXICANOS
  🆕 Created new hash for: CONSTITUCIÓN POLÍTICA DE LOS ESTADOS UNIDOS MEXICANOS → EC2B96E7
⚠️  No existing hash found for law+article: CONST

ValueError: invalid literal for int() with base 10: '27 BIS'

In [14]:
# Display sample results
print("\n=== SAMPLE RESULTS ===")
print("\nFirst 10 rows of normalized data:")
display(result_df.head(10))

print("\nLaw normalization examples:")
normalization_examples = [(k, v) for k, v in normalization_map.items() if k != v][:10]
if normalization_examples:
    for original, normalized in normalization_examples:
        print(f"{original} → {normalized}")
else:
    print("No significant normalizations were needed (laws matched reference database)")

print("\nNew documents created:")
if new_documents:
    for law_name, doc_hash in list(new_documents.items())[:5]:
        print(f"  {doc_hash}: {law_name}")
    if len(new_documents) > 5:
        print(f"  ... and {len(new_documents)-5} more")
else:
    print("  No new documents created")

print("\nData types and null counts:")
print(result_df.isnull().sum())



=== SAMPLE RESULTS ===

First 10 rows of normalized data:


Unnamed: 0,row_id,art_num,original_law_name,normalized_law_name,mention_extraction,law_hash,law_article_hash
0,12406E12_ARTCULO1,13.0,CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO,CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO,artículo 13 de la Constitución Política de la ...,F823AF8C,F823AF8C_ARTCULO13
1,12406E12_ARTCULO1,16.0,CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO,CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO,artículo 16 de la Constitución Política de la ...,F823AF8C,F823AF8C_ARTCULO16
2,12406E12_ARTCULO2,,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,esta Ley,12406E12,
3,12406E12_ARTCULO3,,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,la presente Ley,12406E12,
4,12406E12_ARTCULO4,,LEY GENERAL DEL EQUILIBRIO ECOLÓGICO Y LA PROT...,LEY GENERAL DEL EQUILIBRIO ECOLÓGICO Y LA PROT...,la Ley General del Equilibrio Ecológico y la P...,BD25BE0A,
5,12406E12_ARTCULO4,,LEY DE AGUAS NACIONALES,LEY DE AGUAS NACIONALES,la Ley de Aguas Nacionales,DCD3A83B,
6,12406E12_ARTCULO4,,LEY GENERAL DE VIDA SILVESTRE,LEY GENERAL DE VIDA SILVESTRE,la Ley General de Vida Silvestre,D75EAA99,
7,12406E12_ARTCULO4,,LEY GENERAL DE DESARROLLO FORESTAL SUSTENTABLE,LEY GENERAL DE DESARROLLO FORESTAL SUSTENTABLE,la Ley General de Desarrollo Forestal Sustentable,C262D1AD,
8,12406E12_ARTCULO4,,LEY GENERAL PARA LA PREVENCIÓN Y GESTIÓN INTEG...,LEY GENERAL PARA LA PREVENCIÓN Y GESTIÓN INTEG...,la Ley General para la Prevención y Gestión In...,DC6628A9,
9,12406E12_ARTCULO4,,LEY GENERAL DE CAMBIO CLIMÁTICO,LEY GENERAL DE CAMBIO CLIMÁTICO,la Ley General de Cambio Climático,D44C3DBD,



Law normalization examples:
LEY ORGÁNICA DEL PODER EJECUTIVO Y DE LA ADMINISTRACIÓN PÚBLICA DE LA CIUDAD DE MÉXICO → LEY DE LA ADMINISTRACIÓN PÚBLICA DE LA CIUDAD DE MÉXICO
LEY DEL DERECHO AL ACCESO → LEY DE ACCESO DE LAS MUJERES A UNA VIDA LIBRE DE VIOLENCIA DE LA CIUDAD DE MÉXICO

New documents created:
  BD25BE0A: LEY GENERAL DEL EQUILIBRIO ECOLÓGICO Y LA PROTECCIÓN AL AMBIENTE
  DCD3A83B: LEY DE AGUAS NACIONALES
  D75EAA99: LEY GENERAL DE VIDA SILVESTRE
  C262D1AD: LEY GENERAL DE DESARROLLO FORESTAL SUSTENTABLE
  DC6628A9: LEY GENERAL PARA LA PREVENCIÓN Y GESTIÓN INTEGRAL DE LOS RESIDUOS
  ... and 1 more

Data types and null counts:
row_id                  0
art_num                30
original_law_name       0
normalized_law_name     0
mention_extraction      0
law_hash                0
law_article_hash       30
dtype: int64


In [15]:
# Create separate files for unique laws and law+article combinations
print("\n=== CREATING REFERENCE FILES ===")

# Unique laws with their complete hashes (existing + new)
unique_laws_df = result_df[['normalized_law_name', 'law_hash']].drop_duplicates().sort_values('normalized_law_name')
unique_laws_df['frequency'] = result_df['normalized_law_name'].value_counts().reindex(unique_laws_df['normalized_law_name']).values
unique_laws_df['is_new'] = unique_laws_df['law_hash'].isin(new_documents.values())

laws_file = "/Users/alexa/Projects/cdmx_kg/data/unique_laws_with_complete_hashes.csv"
unique_laws_df.to_csv(laws_file, index=False, encoding='utf-8')
print(f"✅ Saved unique laws (existing + new) to: {laws_file}")

# Unique law+article combinations (existing + new)
law_article_combinations = result_df[result_df['law_article_hash'].notna()].copy()
if len(law_article_combinations) > 0:
    unique_combinations_df = law_article_combinations[['normalized_law_name', 'art_num', 'law_hash', 'law_article_hash']].drop_duplicates().sort_values(['normalized_law_name', 'art_num'])
    unique_combinations_df['frequency'] = law_article_combinations.groupby(['normalized_law_name', 'art_num']).size().reindex(unique_combinations_df.set_index(['normalized_law_name', 'art_num']).index).values
    
    # Mark which are new articles
    new_article_hashes = [article['row_id'] for article in new_articles]
    unique_combinations_df['is_new'] = unique_combinations_df['law_article_hash'].isin(new_article_hashes)

    combinations_file = "/Users/alexa/Projects/cdmx_kg/data/unique_law_article_combinations_with_complete_hashes.csv"
    unique_combinations_df.to_csv(combinations_file, index=False, encoding='utf-8')
    print(f"✅ Saved unique law+article combinations (existing + new) to: {combinations_file}")
    print(f"Law+article combinations: {len(unique_combinations_df)} ({unique_combinations_df['is_new'].sum()} new)")
else:
    print("⚠️  No law+article combinations found")

print(f"Unique laws: {len(unique_laws_df)} ({unique_laws_df['is_new'].sum()} new)")

# Complete coverage summary
total_laws_mentioned = result_df['original_law_name'].nunique()
laws_with_hashes = result_df[result_df['law_hash'].notna()]['original_law_name'].nunique()
print(f"\nComplete Coverage:")
print(f"Laws with hashes: {laws_with_hashes}/{total_laws_mentioned} ({laws_with_hashes/total_laws_mentioned*100:.1f}%)")
print(f"New documents created: {len(new_documents)}")
print(f"New articles created: {len(new_articles)}")

# Instructions for updating master identifiers
if new_documents or new_articles:
    print(f"\n📋 TO UPDATE MASTER IDENTIFIERS:")
    print(f"Append the contents of 'new_identifiers.csv' to 'identifiers_0.csv' to keep the master file updated.")



=== CREATING REFERENCE FILES ===
✅ Saved unique laws (existing + new) to: /Users/alexa/Projects/cdmx_kg/data/unique_laws_with_complete_hashes.csv
✅ Saved unique law+article combinations (existing + new) to: /Users/alexa/Projects/cdmx_kg/data/unique_law_article_combinations_with_complete_hashes.csv
Law+article combinations: 2 (0 new)
Unique laws: 14 (6 new)

Complete Coverage:
Laws with hashes: 14/14 (100.0%)
New documents created: 6
New articles created: 0

📋 TO UPDATE MASTER IDENTIFIERS:
Append the contents of 'new_identifiers.csv' to 'identifiers_0.csv' to keep the master file updated.
