# Connect to the ChatGPT (OpenAI) API to collect entities and relations

In [2]:
import csv
import os
import re
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

# Load key=value pairs from a local .env file into the process environment.
load_dotenv()  # loads variables from .env
# Module-level OpenAI client reused by the later cells. The key is read from the
# OPENAI_API_KEY environment variable (os.getenv returns None when it is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
# Smoke test: send one short prompt to confirm that the client and API key work.
# This performs a live API request each time the cell is run.
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Say 5 random fruits"}],
    temperature=0.1
)

# Print the text of the first returned choice.
print(resp.choices[0].message.content)

Sure! Here are five random fruits:

1. Mango
2. Kiwi
3. Dragon fruit
4. Blueberry
5. Papaya


In [4]:
# Load the source articles from Excel.
# The path defaults to the original location but can be overridden with the
# CDMX_KG_INPUT_XLSX environment variable (for example in .env), so the notebook
# is not tied to one machine's directory layout.
INPUT_XLSX = os.getenv(
    "CDMX_KG_INPUT_XLSX",
    "/Users/alexa/Projects/cdmx_kg/Mexico_City/test_01.xlsx",
)
# Columns consumed by the prompt builders and used to identify each article.
REQUIRED_COLUMNS = ['row_id', 'doc_hash', 'document_name', 'document_section_title', 'text']

# Keep the untouched sheet under its own name so re-running later cells never
# depends on a previously overwritten frame.
raw_df = pd.read_excel(INPUT_XLSX)

# Project to the required columns (raises KeyError naming any missing column).
df = raw_df[REQUIRED_COLUMNS]
print(df.head())

              row_id  doc_hash                                 document_name  \
0  F823AF8C_ARTCULO1  F823AF8C  CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO   
1  F823AF8C_ARTCULO2  F823AF8C  CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO   
2  F823AF8C_ARTCULO3  F823AF8C  CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO   
3  F823AF8C_ARTCULO4  F823AF8C  CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO   
4  F823AF8C_ARTCULO5  F823AF8C  CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO   

  document_section_title                                               text  
0             Artículo 1  De la Ciudad de México\n\n1. \tLa Ciudad de Mé...  
1             Artículo 2  De la naturaleza intercultural, pluriétnica, p...  
2             Artículo 3  De los principios rectores\n\n1. \tLa dignidad...  
3             Artículo 4  Principios de interpretación y aplicación de l...  
4             Artículo 5  Ciudad garantista\n\nA. Progresividad de los d...  


In [5]:
# 1. First, ask the model to identify government entities.
# The output is CSV rows with the columns: row_id | mention

# Prompts
def prompt_entities(batch):
    """Build the government-entity extraction prompt for one batch of articles.

    Args:
        batch: pandas DataFrame (or slice of one) whose rows expose the
            attributes ``row_id``, ``document_name``,
            ``document_section_title`` and ``text``.

    Returns:
        str: The complete prompt. It asks for header-less CSV rows of
        ``row_id,mention`` and ends with the batch rendered as
        ROW_ID / DOCUMENT / SECTION / TEXT stanzas separated by blank lines.
    """
    # One labelled stanza per article so the model can echo the ROW_ID back.
    stanzas = [
        f"ROW_ID: {row.row_id}\n"
        f"DOCUMENT: {row.document_name}\n"
        f"SECTION: {row.document_section_title}\n"
        f"TEXT: {row.text}"
        for row in batch.itertuples()
    ]
    text = "\n\n".join(stanzas)
    # The OUTPUT FORMAT section carries an explicit quoting rule: entity names
    # may contain commas, and without quoting such rows cannot be split back
    # into exactly two columns.
    return f"""
You are a specialized legal entity extraction system for Mexico City government documents. 
Your task is to be EXHAUSTIVE and find ALL government entities mentioned.

CRITICAL: 
- READ EVERY WORD CAREFULLY. Do not miss any government entity, no matter how briefly mentioned.
- DO NOT INCLUDE LAW MENTIONS. ONLY GOVERNMENT ENTITIES.

TASK: Extract ALL government entities, institutions, and legal bodies from these legal texts.

ENTITY TYPES TO IDENTIFY (scan for ALL of these):
- Secretarías (ministries): Secretaría de..., SEDEMA, SEDUVI, SIBISO, STyFE, etc.
- Alcaldías (municipal governments): Alcaldía..., any of the 16 alcaldías
- Institutos and agencies: Instituto de..., Agencia de..., ADIP, etc.
- Tribunales and courts: Tribunal..., Juzgado..., Corte..., etc.
- Consejos and commissions: Consejo de..., Comisión de..., etc.
- Universities and schools: Universidad de..., UACM, etc.
- Procuradurías: Procuraduría..., PAOT, etc.
- Federal entities: SEMARNAT, CONAGUA, PROFEPA, etc.
- Any organization with official government role

COMPREHENSIVE EXTRACTION RULES:
1. Extract EXACTLY as written in the text (preserve case, accents, articles like "la", "el")
2. Include full official names AND abbreviations when mentioned
3. Look for entities in ALL parts of the text, including parentheses, footnotes, lists
4. Capture entities mentioned in different forms (e.g., "la Secretaría", "dicha Secretaría")
5. Include entities that appear in compound phrases
6. Do NOT skip vague references - if it's a government body, include it
7. Be especially careful with lists and enumerated items

SEARCH STRATEGY:
- Read the text word by word
- Look for capital letters that might indicate proper nouns
- Check for organizational keywords: Secretaría, Instituto, Consejo, Comisión, Alcaldía, etc.
- Examine abbreviations in parentheses
- Review any lists or bullet points carefully

OUTPUT FORMAT - CSV rows only, no headers, no markdown blocks:
row_id,mention
If a mention contains a comma, wrap the mention in double quotes so that every row has exactly 2 fields.

EXAMPLES:
12406E12_ARTCULO5,Secretaría del Medio Ambiente
12406E12_ARTCULO5,SEDEMA
12406E12_ARTCULO5,Jefatura de Gobierno de la Ciudad de México
12406E12_ARTCULO5,la Secretaría
12406E12_ARTCULO5,Alcaldía Benito Juárez

INPUT:
{text}
"""


In [6]:
# 2. Second, ask the model to identify article mentions.
# Each article mention must include the article number and the law it belongs to.
# The output is CSV rows with the columns: row_id | art_num | law_name | mention_extraction

def prompt_article_mentions(batch):
    """Build the legal-citation extraction prompt for one batch of articles.

    Args:
        batch: pandas DataFrame (or slice of one) whose rows expose the
            attributes ``row_id``, ``document_name``,
            ``document_section_title`` and ``text``.

    Returns:
        str: The complete prompt. It asks for header-less CSV rows of
        ``row_id,art_num,law_name,mention_extraction`` and ends with the batch
        rendered as ROW_ID / DOCUMENT / SECTION / TEXT stanzas separated by
        blank lines.
    """
    # One labelled stanza per article so the model can echo the ROW_ID back.
    stanzas = [
        f"ROW_ID: {row.row_id}\n"
        f"DOCUMENT: {row.document_name}\n"
        f"SECTION: {row.document_section_title}\n"
        f"TEXT: {row.text}"
        for row in batch.itertuples()
    ]
    text = "\n\n".join(stanzas)
    # The OUTPUT FORMAT section carries an explicit quoting rule: the prompt
    # itself demands complete law names "with commas", and without quoting such
    # rows cannot be split back into exactly four columns.
    return f"""You are a specialized legal citation extraction system for Mexican law documents. Your mission is to find EVERY SINGLE legal reference, no matter how subtle.

CRITICAL: SCAN EVERY WORD. Legal documents often have multiple references per sentence. Miss nothing.

TASK: Find ALL references to articles, laws, codes, and legal documents from the TEXT content only.

IMPORTANT: Only analyze the 'TEXT' field. The article name (SECTION) is just for identification purposes - it tells you which article the mentions were found in.

STRICT EXTRACTION RULE - EXPLICIT MENTIONS ONLY:
- ONLY extract laws, codes, and documents that are EXPLICITLY NAMED in the TEXT
- Include COMPLETE law names - never truncate or abbreviate (e.g., "LEY DEL DERECHO AL ACCESO, DISPOSICIÓN Y SANEAMIENTO DEL AGUA DE LA CIUDAD DE MÉXICO")
- DO NOT infer or assume law names that are not written in the text
- DO NOT extract references to laws that are only implied or suggested

CRITICAL - WHEN TO USE CURRENT LAW (from DOCUMENT field):
- ONLY use current law for: "esta Ley", "la presente Ley", "este Reglamento", "este Código"
- ONLY use current law for: standalone article numbers (e.g., "artículo 5" with NO law mentioned)
- ONLY use current law for: relative references ("artículo anterior", "artículo siguiente")

CRITICAL - WHEN NOT TO USE CURRENT LAW:
- DO NOT use current law for entity mentions like "la Secretaría", "las autoridades"
- DO NOT use current law for general legal concepts like "normatividad", "disposiciones"
- IF NO EXPLICIT LAW NAME AND NOT IN ALLOWED EXCEPTIONS ABOVE → DO NOT EXTRACT


CITATION TYPES TO EXTRACT (find ALL occurrences):
- Article references: "artículo 123", "art. 45", "artículos 1 al 15", "arts.", "apartado"
- Law references: "Ley General de...", "Ley Ambiental", "Ley de...", "dicha Ley", etc.
- Self-references: "esta Ley", "la presente Ley", "este Reglamento", "esta norma"
- Code references: "Código Civil", "Código Penal", "códigos"
- Constitutional references: "Constitución Política", "constitucional", "Carta Magna"
- Regulation references: "Reglamento de...", "reglamentario", "normas reglamentarias"
- Decree references: "Decreto", "decreto ejecutivo"
- Treaties and conventions: "Tratado", "Convención", "acuerdo internacional"
- Relative references: "artículo anterior", "artículo siguiente", "apartado anterior"

EXHAUSTIVE EXTRACTION RULES:
1. ONLY extract mentions from the TEXT field, ignore SECTION and DOCUMENT fields
2. For "esta Ley", "la presente Ley" → replace with actual current law name from DOCUMENT field
3. For "reglamento de esta Ley", "reglamento de la presente Ley" → construct as "REGLAMENTO DE LA [CURRENT LAW NAME]"
4. Extract complete citations as they appear in the text
5. For article + law mentions: Include both article number AND the law/document it refers to
6. For law-only mentions: If no article number is specified, leave art_num empty but include the law name
7. Create ONE ROW per mention found - if one article contains multiple mentions, create multiple rows
8. For article ranges (e.g., "artículos 13 y 16"), create separate rows for each article
9. Check for references using pronouns ("la misma", "dicha ley", "tal disposición")
10. In mention_extraction: Include the COMPLETE TEXT of the specific mention (the full phrase that mentions the article+law or law)
11. Preserve original formatting and punctuation
12. Law names can be very long with commas - extract them in full exactly as written


SPECIAL HANDLING FOR RELATIVE REFERENCES:
13. For "artículo anterior": Extract current article number from SECTION field, subtract 1, use current law from DOCUMENT field
14. For "artículo siguiente": Extract current article number from SECTION field, add 1, use current law from DOCUMENT field
15. For "apartado anterior", "fracción anterior": Only cite the current law name without any article number.
16. Always verify the relative reference makes sense (don't create negative article numbers)

SYSTEMATIC SEARCH APPROACH:
- Read sentence by sentence
- Look for legal keywords: artículo, ley, código, reglamento, decreto, etc.
- Check numbers that might be article references
- Search for relative terms: anterior, siguiente, precedente, subsecuente

OUTPUT FORMAT - CSV rows only, no headers, no markdown blocks:
row_id,art_num,law_name,mention_extraction
Wrap any field that contains a comma in double quotes (standard CSV quoting) so that every row has exactly 4 fields.

Where:
- row_id: Current article being analyzed (from ROW_ID field)
- art_num: Article number being referenced (from TEXT field) - LEAVE EMPTY if no article number mentioned
- law_name: COMPLETE law/document name containing the referenced article (NEVER truncate - use full name exactly as written)
- mention_extraction: The COMPLETE TEXT of the specific mention

EXAMPLES:
For text: "la presente Ley es reglamentaria de las disposiciones contenidas en el Apartado A del artículo 13 y del Apartado A del artículo 16 de la Constitución Política de la Ciudad de México"
Should produce TWO rows:
12406E12_ARTCULO1,13,CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO,artículo 13 de la Constitución Política de la Ciudad de México
12406E12_ARTCULO1,16,CONSTITUCIÓN POLÍTICA DE LA CIUDAD DE MÉXICO,artículo 16 de la Constitución Política de la Ciudad de México

For text: "Se aplicará supletoriamente el Código Civil para el Distrito Federal en materia común"
Should produce ONE row:
12406E12_ARTCULO3,,CÓDIGO CIVIL PARA EL DISTRITO FEDERAL,el Código Civil para el Distrito Federal

For text: "los demás que establezca el reglamento de la presente Ley" (where current law is LEY AMBIENTAL DE LA CIUDAD DE MÉXICO)
Should produce ONE row:
12406E12_ARTCULO5,,REGLAMENTO DE LA LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,el reglamento de la presente Ley

For text: "conforme a lo dispuesto en el artículo anterior" (in ARTÍCULO 5 of LEY AMBIENTAL DE LA CIUDAD DE MÉXICO)
Should produce ONE row:
12406E12_ARTCULO5,4,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,el artículo anterior

ALLOWED - USE CURRENT LAW:
For text: "esta Ley establece" → 12406E12_ARTCULO5,,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,esta Ley
For text: "artículo 10" (standalone number) → 12406E12_ARTCULO5,10,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,artículo 10
For text: "artículo anterior" → 12406E12_ARTCULO5,4,LEY AMBIENTAL DE LA CIUDAD DE MÉXICO,artículo anterior

NOT ALLOWED - DO NOT USE CURRENT LAW:
For text: "según establece la ley" → NO OUTPUT (vague reference)
For text: "dicha disposición" → NO OUTPUT (vague reference) 
For text: "la Secretaría debe aprobar" → NO OUTPUT (entity mention, not legal citation)
For text: "conforme a la normatividad" → NO OUTPUT (general concept, not specific law)

INPUT:
{text}
"""

In [7]:
# Helper function to clean and validate GPT output
def clean_csv_output(raw_output, expected_columns):
    """Parse and validate the CSV-like text returned by the model.

    Lines that are blank, markdown fences, headers, comment/instruction echoes,
    or that contain no comma are dropped. Each remaining line is parsed with
    the ``csv`` module so that double-quoted fields may contain commas (law and
    entity names often do). Rows are then validated and trimmed.

    Args:
        raw_output: Raw text of the model reply. ``None`` or an empty string
            yields an empty result.
        expected_columns: Number of columns every returned row must have.

    Returns:
        list[list[str]]: One list of exactly ``expected_columns`` cleaned
        strings per accepted row, in input order. Rows with too few columns,
        an empty first column, or a first column that does not look like a
        row_id (e.g. ``12406E12_ARTCULO5``) are skipped with a printed notice.
        Rows with too many columns keep the overflow, re-joined with ", ",
        inside the last column instead of silently discarding it.
    """
    if not raw_output:
        return []

    # Shape of a valid row_id, e.g. 12406E12_ARTCULO5 (8 hex chars, "_", label).
    row_id_pattern = re.compile(r'^[A-F0-9]{8}_[A-ZÁÉÍÓÚÑÜ]+\d*$')
    # Markdown fences and echoed comments / prompt section titles.
    noise_prefixes = ('```', '#', '//', 'OUTPUT', 'EXAMPLES')
    # Header rows, matched case-insensitively ('row_id' is what the prompts request).
    header_prefixes = ('row_id', 'art_name', 'document')

    clean_lines = []
    for line in raw_output.strip().splitlines():
        line = line.strip()

        if not line:
            continue
        if line.startswith(noise_prefixes):
            continue
        if line.lower().startswith(header_prefixes):
            continue
        # Must contain commas for CSV format
        if ',' not in line:
            continue

        # Quote-aware split; skipinitialspace lets `, "a, b"` still be read as
        # one quoted field when the model puts a space after the separator.
        try:
            parts = next(csv.reader([line], skipinitialspace=True))
        except (csv.Error, StopIteration):
            parts = line.split(',')

        if len(parts) < expected_columns:
            print(f"    Skipping malformed line (too few columns): {line[:50]}...")
            continue

        if len(parts) > expected_columns:
            # Unquoted commas cannot be attributed to a specific column, so keep
            # the surplus text in the final column rather than dropping it.
            overflow = parts[expected_columns - 1:]
            parts = parts[:expected_columns - 1] + [", ".join(piece.strip() for piece in overflow)]
            print(f"    Note: extra commas folded into last column: {line[:50]}...")

        # Remove surrounding whitespace and stray quote characters.
        cleaned_parts = [part.strip().strip('"').strip("'") for part in parts]

        # Only the first column (row_id) is mandatory; later columns such as
        # art_num may legitimately be empty.
        if not cleaned_parts[0]:
            print(f"    Skipping line with empty essential fields: {line[:50]}...")
            continue

        if not row_id_pattern.match(cleaned_parts[0]):
            print(f"    Skipping line with invalid row_id structure: {line[:50]}...")
            continue

        clean_lines.append(cleaned_parts)

    return clean_lines

# CONFIGURATION - Choose extraction mode
# Read by the batch loop below to pick extract_with_dual_pass or
# extract_with_single_pass, and to choose the pause between batches.
USE_DUAL_PASS: bool = False  # Set to True for dual-pass (0.1 + 0.3), False for single-pass (0.1 only)

# Collectors
# Both lists are (re)created every time this cell runs.
# NOTE(review): nothing in the loop below appends to gov_entities; only
# article_mentions is extended.
gov_entities: list = []
# Rows of [row_id, art_num, law_name, mention_extraction] as cleaned by clean_csv_output.
article_mentions: list = []

# Batch size - reduced for better quality and completeness
# Number of DataFrame rows (articles) sent to the model in a single prompt.
BATCH_SIZE: int = 8  # Even smaller batches for better attention

def extract_with_single_pass(batch, prompt_func, expected_columns,
                             model="gpt-4o-mini", temperature=0.1):
    """Run one chat-completion request for a batch and return the cleaned rows.

    Args:
        batch: Batch of articles; passed unchanged to ``prompt_func``.
        prompt_func: Callable that turns ``batch`` into the prompt string.
        expected_columns: Column count forwarded to ``clean_csv_output``.
        model: Chat model name (defaults to the original hard-coded value).
        temperature: Sampling temperature (defaults to the original 0.1).

    Returns:
        list: Rows returned by ``clean_csv_output``; an empty list when the
        reply has no text.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_func(batch)}],
        temperature=temperature,
    )
    # message.content is optional in the API response; treat a missing reply as
    # empty text instead of raising AttributeError on None.strip().
    output = (resp.choices[0].message.content or "").strip()

    if not output:
        return []
    return clean_csv_output(output, expected_columns)

def extract_with_dual_pass(batch, prompt_func, expected_columns,
                           model="gpt-4o-mini", temperatures=(0.1, 0.3)):
    """Run the same prompt once per temperature and merge the cleaned rows.

    Args:
        batch: Batch of articles; passed unchanged to ``prompt_func``.
        prompt_func: Callable that turns ``batch`` into the prompt string.
        expected_columns: Column count forwarded to ``clean_csv_output``.
        model: Chat model name (defaults to the original hard-coded value).
        temperatures: Sampling temperature for each pass, in order
            (defaults to the original 0.1 then 0.3).

    Returns:
        tuple: ``(unique_results, total)`` where ``unique_results`` is the list
        of distinct rows in first-seen order and ``total`` is the number of
        rows before de-duplication.
    """
    # The prompt does not depend on the temperature, so build it only once.
    prompt = prompt_func(batch)

    combined_results = []
    for temperature in temperatures:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        # message.content is optional in the API response; a missing reply
        # simply contributes no rows instead of raising on None.strip().
        output = (resp.choices[0].message.content or "").strip()
        if output:
            combined_results.extend(clean_csv_output(output, expected_columns))

    # dict.fromkeys keeps the first occurrence of every row and preserves order.
    unique_results = [list(key) for key in dict.fromkeys(tuple(row) for row in combined_results)]

    return unique_results, len(combined_results)

# Number of batches, rounding up so a final partial batch is still processed.
total_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE
extraction_mode = "dual-pass (0.1 + 0.3)" if USE_DUAL_PASS else "single-pass (0.1 only)"
print(f"Processing {len(df)} articles in {total_batches} batches using {extraction_mode}...")

# Pause between batches to avoid rate limits (longer for dual-pass, which sends
# two requests per batch). Computed once because it does not change per batch.
delay = 0.5 if not USE_DUAL_PASS else 1.0

# Batches whose extraction raised, kept so the final summary can report exactly
# which articles are missing: (batch_num, first_article, last_article, error).
failed_batches = []

# NOTE(review): only article mentions are extracted in this loop; prompt_entities
# is never called, so gov_entities stays empty. Confirm this is intentional.
for i, start in enumerate(range(0, len(df), BATCH_SIZE)):
    batch = df.iloc[start:start+BATCH_SIZE]
    batch_num = i + 1
    last_article = min(start+BATCH_SIZE, len(df))

    print(f"\nProcessing batch {batch_num}/{total_batches} (articles {start+1}-{last_article})")

    # Step 1 – Article mentions
    print("  → Extracting article mentions...")
    try:
        if USE_DUAL_PASS:
            # Dual-pass extraction
            unique_mentions, total_mentions = extract_with_dual_pass(batch, prompt_article_mentions, 4)
            print(f"    Found {len(unique_mentions)} unique mentions (from {total_mentions} total)")
        else:
            # Single-pass extraction
            unique_mentions = extract_with_single_pass(batch, prompt_article_mentions, 4)
            print(f"    Found {len(unique_mentions)} mentions")

        article_mentions.extend(unique_mentions)

    except Exception as e:
        # Keep going so one bad batch does not abort the run, but remember it:
        # otherwise the summary would look complete while articles are missing.
        print(f"    Error extracting mentions: {e}")
        failed_batches.append((batch_num, start + 1, last_article, str(e)))

    # No need to wait after the final batch.
    if batch_num < total_batches:
        time.sleep(delay)

print("\n=== EXTRACTION COMPLETE ===")
print(f"Total government entities found: {len(gov_entities)}")
print(f"Total article mentions found: {len(article_mentions)}")

if failed_batches:
    print(f"\n⚠️  {len(failed_batches)} of {total_batches} batches failed and are missing from the results:")
    for failed_num, first_article, final_article, error in failed_batches:
        print(f"    batch {failed_num} (articles {first_article}-{final_article}): {error}")

# Quality summary
print("\n=== QUALITY METRICS ===")
if gov_entities:
    unique_entity_mentions = len(set(tuple(e) for e in gov_entities))
    print(f"Unique government entities: {unique_entity_mentions}")
if article_mentions:
    unique_article_mentions = len(set(tuple(m) for m in article_mentions))
    print(f"Unique article mentions: {unique_article_mentions}")


Processing 573 articles in 72 batches using single-pass (0.1 only)...

Processing batch 1/72 (articles 1-8)
  → Extracting article mentions...
    Found 11 mentions

Processing batch 2/72 (articles 9-16)
  → Extracting article mentions...
    Found 16 mentions

Processing batch 3/72 (articles 17-24)
  → Extracting article mentions...
    Found 8 mentions

Processing batch 4/72 (articles 25-32)
  → Extracting article mentions...
    Found 28 mentions

Processing batch 5/72 (articles 33-40)
  → Extracting article mentions...
    Found 11 mentions

Processing batch 6/72 (articles 41-48)
  → Extracting article mentions...
    Found 32 mentions

Processing batch 7/72 (articles 49-56)
  → Extracting article mentions...
    Found 8 mentions

Processing batch 8/72 (articles 57-64)
  → Extracting article mentions...
    Found 15 mentions

Processing batch 9/72 (articles 65-72)
  → Extracting article mentions...
    Found 9 mentions

Processing batch 10/72 (articles 73-80)
  → Extracting article

In [8]:
# Save results with proper validation
# The output directory defaults to the original location but can be overridden
# with the CDMX_KG_OUTPUT_DIR environment variable (for example in .env).
OUTPUT_DIR = os.getenv("CDMX_KG_OUTPUT_DIR", "/Users/alexa/Projects/cdmx_kg/data")

if gov_entities:
    entities_path = os.path.join(OUTPUT_DIR, "gov_entities_test_01.csv")
    # Create the target directory on first use instead of failing in to_csv.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    entities_df = pd.DataFrame(gov_entities, columns=["row_id", "mention"])
    entities_df.to_csv(entities_path, index=False, encoding='utf-8')
    # Report the file actually written (the old message named a different file).
    print(f"✅ Saved government entities to {entities_path}")
else:
    print("⚠️  No government entities to save")

if article_mentions:
    mentions_path = os.path.join(OUTPUT_DIR, "article_mentions_test_01.csv")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    mentions_df = pd.DataFrame(article_mentions, columns=["row_id", "art_num", "law_name", "mention_extraction"])
    mentions_df.to_csv(mentions_path, index=False, encoding='utf-8')
    print(f"✅ Saved article mentions to {mentions_path}")
else:
    print("⚠️  No article mentions to save")

⚠️  No government entities to save
✅ Saved article mentions to article_mentions.csv


# This code section uses the output of the identified mentions and classifies them

In [None]:
# For government entities:
# Read the dictionary of the document.
# Then go through the list of entities and check whether a mention name needs to be changed based on the dictionary.
# Replace the name if needed.
# Not all mentions will need to be changed, only the ones specified for each document.

# Then look at the list of mentions and create a new list with a hash_id for each different entity.
# Add the hash_id to the list of mentions based on the entity name.
# The output is the list of mentions with the hash_id for each entity, plus the list of unique entities with their hash_id.



