In [13]:
# ENTITY NORMALIZATION AND ID SYSTEM
# Step 1: Normalize entity names using ChatGPT
# Step 2: Generate hash IDs for unique entities
# Step 3: Create final output files for knowledge graph

import hashlib
import pandas as pd
from collections import Counter, defaultdict
import os
from dotenv import load_dotenv
from openai import OpenAI

# Initialize OpenAI client
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

print("✅ OpenAI client initialized successfully!")
print("📚 Entity normalization system ready!")

✅ OpenAI client initialized successfully!
📚 Entity normalization system ready!


In [15]:

def prompt_entity_normalization(entities_batch, dictionary_terms):
    """
    ChatGPT prompt to normalize government entity names using the law dictionary
    """
    entities_text = "\n".join([f"- {entity}" for entity in entities_batch])
    dictionary_text = "\n".join([f"- {term}: {definition[:100]}..." for term, definition in dictionary_terms])
    
    return f"""You are a specialized entity normalization system for Mexican government documents. Your task is to standardize government entity names to their official, complete forms.

TASK: Normalize the following government entity mentions to their official names.

DICTIONARY CONTEXT (from environmental law definitions):
{dictionary_text}

NORMALIZATION RULES:
• Use OFFICIAL COMPLETE NAMES (e.g., "la Secretaría" → "Secretaría del Medio Ambiente")
• Standardize abbreviations (e.g., "SEDEMA" → "Secretaría del Medio Ambiente")
• Remove articles when not part of official name (e.g., "la Ciudad de México" → "Ciudad de México")
• Keep official legal document names as-is
• For ambiguous references, choose the most common/official interpretation
• Maintain proper capitalization and accents

COMMON PATTERNS TO NORMALIZE:
• "la Secretaría" → "Secretaría del Medio Ambiente" (context: environmental law)
• "Alcaldía" → "Alcaldías de la Ciudad de México" (generic reference)
• "SEDEMA" → "Secretaría del Medio Ambiente"
• "la Ciudad de México" → "Ciudad de México"
• "Jefatura de Gobierno" → "Jefatura de Gobierno de la Ciudad de México"

OUTPUT FORMAT (CSV only, no headers):
original_mention,normalized_name

ENTITY MENTIONS TO NORMALIZE:
{entities_text}

Provide the normalized version for each entity:"""


In [16]:
def create_entity_hash(normalized_name):
    """Generate a hash ID for a normalized entity name"""
    return hashlib.md5(normalized_name.encode('utf-8')).hexdigest()[:12]



In [17]:
def normalize_entities_with_chatgpt(entities_list, dictionary_df, batch_size=20):
    """
    Normalize entity names using ChatGPT in batches
    """
    # Prepare dictionary terms for context
    dictionary_terms = list(zip(dictionary_df['Termino'], dictionary_df['Definicion']))
    
    # Get unique entities to normalize
    unique_entities = list(set(entities_list))
    normalization_map = {}
    
    print(f"Normalizing {len(unique_entities)} unique entities in batches of {batch_size}...")
    
    for i in range(0, len(unique_entities), batch_size):
        batch = unique_entities[i:i+batch_size]
        batch_num = (i // batch_size) + 1
        total_batches = (len(unique_entities) + batch_size - 1) // batch_size
        
        print(f"  Processing batch {batch_num}/{total_batches} ({len(batch)} entities)")
        
        try:
            # Call ChatGPT for normalization
            prompt = prompt_entity_normalization(batch, dictionary_terms[:10])  # Use top 10 dictionary terms for context
            
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1
            )
            
            output = resp.choices[0].message.content.strip()
            
            # Parse the normalization results
            for line in output.split('\n'):
                if ',' in line:
                    parts = line.split(',', 1)
                    if len(parts) == 2:
                        original = parts[0].strip().strip('"')
                        normalized = parts[1].strip().strip('"')
                        normalization_map[original] = normalized
                        
        except Exception as e:
            print(f"    Error normalizing batch {batch_num}: {e}")
            # Fallback: use original names
            for entity in batch:
                normalization_map[entity] = entity
        
        # Small delay to avoid rate limits
        import time
        time.sleep(0.5)
    
    return normalization_map



In [None]:
def create_entity_id_system():
    """
    Complete entity ID system: normalize entities and create hash IDs
    """
    print("=== ENTITY NORMALIZATION AND ID SYSTEM ===")
    
    # Load data
    gov_entities_df = pd.read_csv("/Users/alexa/Projects/cdmx_kg/data/gov_entities_test_01.csv")
    dictionary_df = pd.read_csv("/Users/alexa/Projects/cdmx_kg/data/dictionary_1.csv")
    
    print(f"Loaded {len(gov_entities_df)} entity mentions")
    print(f"Loaded {len(dictionary_df)} dictionary terms")
    
    # Get all entity mentions
    entity_mentions = gov_entities_df['mention'].tolist()
    
    # Step 1: Normalize entities using ChatGPT
    print("\n--- Step 1: Entity Normalization ---")
    normalization_map = normalize_entities_with_chatgpt(entity_mentions, dictionary_df)
    
    # Step 2: Create hash IDs for normalized entities
    print("\n--- Step 2: Hash ID Generation ---")
    entity_hash_map = {}
    normalized_entities = list(set(normalization_map.values()))
    
    for normalized_name in normalized_entities:
        hash_id = create_entity_hash(normalized_name)
        entity_hash_map[normalized_name] = hash_id
    
    print(f"Generated {len(entity_hash_map)} unique entity hash IDs")
    
    # Step 3: Create output files
    print("\n--- Step 3: Creating Output Files ---")
    
    # Mentions file: [mention_id, mention_text, entity_hash_id, source_article]
    mentions_data = []
    for idx, row in gov_entities_df.iterrows():
        original_mention = row['mention']
        source_article = row['row_id']
        
        # Get normalized name and hash ID
        normalized_name = normalization_map.get(original_mention, original_mention)
        entity_hash_id = entity_hash_map.get(normalized_name, create_entity_hash(normalized_name))
        
        mentions_data.append({
            'mention_id': f"mention_{idx+1:06d}",
            'mention_text': original_mention,
            'entity_hash_id': entity_hash_id,
            'source_article': source_article,
            'normalized_name': normalized_name
        })
    
    mentions_df = pd.DataFrame(mentions_data)
    
    # Entities file: [entity_hash_id, normalized_name, entity_type, frequency]
    entity_freq = Counter([m['normalized_name'] for m in mentions_data])
    
    entities_data = []
    for normalized_name, frequency in entity_freq.items():
        hash_id = entity_hash_map.get(normalized_name, create_entity_hash(normalized_name))
        
        # Determine entity type (simple classification)
        #entity_type = classify_entity_type(normalized_name)
        
        entities_data.append({
            'entity_hash_id': hash_id,
            'normalized_name': normalized_name,
            #'entity_type': entity_type,
            'frequency': frequency
        })
    
    entities_df = pd.DataFrame(entities_data)
    
    # Save files
    mentions_file = "/Users/alexa/Projects/cdmx_kg/data/entity_mentions_with_ids.csv"
    entities_file = "/Users/alexa/Projects/cdmx_kg/data/unique_entities_with_ids.csv"
    
    mentions_df.to_csv(mentions_file, index=False, encoding='utf-8')
    entities_df.to_csv(entities_file, index=False, encoding='utf-8')
    
    print(f"✅ Saved mentions to: {mentions_file}")
    print(f"✅ Saved entities to: {entities_file}")
    
    # Summary statistics
    print(f"\n--- Summary ---")
    print(f"Total mentions: {len(mentions_df)}")
    print(f"Unique entities: {len(entities_df)}")
    print(f"Normalization rate: {len([k for k, v in normalization_map.items() if k != v])}/{len(normalization_map)} entities normalized")
    
    return mentions_df, entities_df, normalization_map



In [None]:
# Run the entity normalization and ID system
mentions_df, entities_df, normalization_map = create_entity_id_system()

=== ENTITY NORMALIZATION AND ID SYSTEM ===
Loaded 413 entity mentions
Loaded 69 dictionary terms

--- Step 1: Entity Normalization ---
Normalizing 125 unique entities in batches of 20...
  Processing batch 1/7 (20 entities)
  Processing batch 2/7 (20 entities)
  Processing batch 3/7 (20 entities)
  Processing batch 4/7 (20 entities)
  Processing batch 5/7 (20 entities)
  Processing batch 6/7 (20 entities)
  Processing batch 7/7 (5 entities)

--- Step 2: Hash ID Generation ---
Generated 103 unique entity hash IDs

--- Step 3: Creating Output Files ---
✅ Saved mentions to: /Users/alexa/Projects/cdmx_kg/data/entity_mentions_with_ids.csv
✅ Saved entities to: /Users/alexa/Projects/cdmx_kg/data/unique_entities_with_ids.csv

--- Summary ---
Total mentions: 413
Unique entities: 102
Normalization rate: 33/126 entities normalized


(         mention_id                                 mention_text  \
 0    mention_000001  Jefatura de Gobierno de la Ciudad de México   
 1    mention_000002                Secretaría del Medio Ambiente   
 2    mention_000003                                la Secretaría   
 3    mention_000004                                     Alcaldía   
 4    mention_000005                          la Ciudad de México   
 ..              ...                                          ...   
 408  mention_000409                                    Alcaldías   
 409  mention_000410                        autoridades federales   
 410  mention_000411                        autoridades estatales   
 411  mention_000412                      autoridades municipales   
 412  mention_000413                                 Procuraduría   
 
     entity_hash_id       source_article  \
 0     9939333ac7ed    12406E12_ARTCULO1   
 1     f673af2dbe8a    12406E12_ARTCULO1   
 2     f673af2dbe8a    12406E12_ARTCUL

In [None]:
# def classify_entity_type(entity_name):
#     """Simple entity type classification based on name patterns"""
#     name_lower = entity_name.lower()
    
#     if 'secretaría' in name_lower or 'sedema' in name_lower:
#         return 'SECRETARIA'
#     elif 'alcaldía' in name_lower:
#         return 'ALCALDIA'
#     elif 'jefatura' in name_lower:
#         return 'JEFATURA'
#     elif 'instituto' in name_lower:
#         return 'INSTITUTO'
#     elif 'tribunal' in name_lower or 'corte' in name_lower:
#         return 'TRIBUNAL'
#     elif 'consejo' in name_lower or 'comisión' in name_lower:
#         return 'CONSEJO_COMISION'
#     elif 'universidad' in name_lower:
#         return 'UNIVERSIDAD'
#     elif 'ley' in name_lower or 'código' in name_lower or 'reglamento' in name_lower:
#         return 'DOCUMENTO_LEGAL'
#     elif 'ciudad de méxico' in name_lower:
#         return 'ENTIDAD_TERRITORIAL'
#     else:
#         return 'OTRO'

# print("Entity normalization and ID system functions loaded!")
# print("Run: create_entity_id_system() to start the process")
