In [1]:
import os
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv
import weaviate
import boto3
from tqdm import tqdm
import asyncio

from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.s3_io_functions import (
    load_parquet_from_s3,
    upload_parquet_to_s3,
)

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup


# Inputs

In [2]:
LLM_JUDGE_MODEL = "gpt4_1_openai"  # or "gpt4o_openai" or "o3_openai"
# LLM_JUDGE_MODEL = "large_mistral"
# input_pairs_xlsx="data/W01/R02_select_pairs_for_eval_dataset/selected_pairs.xlsx"
# input_pairs_xlsx="data/W01/R02_select_pairs_for_eval_dataset/famous_selected_pairs.xlsx"
input_pairs_xlsx="data/W01/R02_select_pairs_for_eval_dataset/selected_famous_close_pairs.xlsx"
prompt_txt = "etc/prompts/prompt_compare_registry_names.txt"
model_config = f"etc/configs/{LLM_JUDGE_MODEL}_config.json"

In [3]:
# Load df_pairs
df_pairs = pd.read_excel(input_pairs_xlsx)
# # seelct first 100 for testing
# df_pairs = df_pairs.head(5)

In [4]:
display(df_pairs.head())

Unnamed: 0,object_id,alias_object_id,full_name,alias,number_of_occurrences,alias_number_of_occurrences,similarity
0,3149,25541,United Network for Organ Sharing Renal Transpl...,United Network for Organ Sharing Kidney Transp...,12,4,0.984532
1,631,11551,Chronic Heart Failure Analysis and Registry in...,Chronic Heart Failure Analysis and Registry in...,7,15,0.995989
2,16188,3678,National CAPD Registry (NCR),National Renal Registry,7,9,0.902307
3,5098,6584,Danish National Diabetes Registry (DNDR),Danish National Diabetes Register (DNDR),17,17,0.996726
4,2520,17252,Swedish National Registers (SNR),Swedish National Registries (SNR),24,4,0.990375


# Build prompts

In [5]:
# Define the prompt template using hybrid approach
def construct_prompt(base_prompt: str, ds1: str, ds2: str) -> str:
    """
    Constructs the final prompt by replacing placeholders with provided dataset strings.

    Args:
        base_prompt (str): The base prompt containing placeholders "{{content_a}}" and "{{content_b}}".
        ds1 (str): The string to replace "{{content_a}}".
        ds2 (str): The string to replace "{{content_b}}".

    Returns:
        str: The final prompt with the placeholders replaced.
    """
    # Replace the placeholders with the provided strings.
    prompt = base_prompt.replace("{{content_a}}", ds1).replace("{{content_b}}", ds2)
    return prompt


In [6]:
# build prompts
prompts = []
total = len(df_pairs)
# logg Building prompts
logging.info(f"Building prompts for {total} pairs")
# Read the base prompt from file (we expect the file to contain the base prompt content)
prompt_file_path = Path(prompt_txt)
with open(prompt_file_path, "r") as pf:
    base_prompt = pf.read().strip()
for idx, row in df_pairs.iterrows():
    ds1 = row["full_name"]
    ds2 = row["alias"]
    # prompts = [{"prompt": f"{i}. Tell me a story.", "custom_id": i} for i in range(num_requests)]
    prompt_dict = {
        "prompt": construct_prompt(base_prompt, ds1, ds2),
        "custom_id": row["object_id"],
    }
    prompts.append(prompt_dict)
# Log the number of prompts built
logging.warning(f"Built {len(prompts)} prompts")



In [7]:
# prompts

# Async inference

In [8]:
import llm_backends
from llm_backends.cache import DiskCacheStorage
from llm_inference.cache.tmp import TmpCacheStorage
cache_storage = TmpCacheStorage()
# ----- INFERENCE (FULL DATASET) -----
backend = llm_backends.OpenAIAsyncBackend(
                api_key=os.getenv("OPENAI_API_KEY"), cache_storage=cache_storage
            )

In [9]:
# Load the model configuration
with open(model_config, "r", encoding="utf-8") as f:
    model_cfg = json.load(f)

In [10]:
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

In [11]:
async def run_async_inference(prompts: list, backend, model_cfg) -> pd.Series:
    """
    Run asynchronous inference on a test sample of pairs using the MistralAsyncBackend.
    """
    # Create async backend using temporary cache storage
    start_time = time.time()

    # Make inferences
    logging.warning(f"Making inferences for {len(prompts)} pairs")
    raw_responses = []
    num_requests = len(prompts)
    pbar = tqdm(total=num_requests, desc="Processing async batch")
    for prompt in prompts:
        # Perform inference
        raw_response = await backend.infer_one(prompt, model_cfg)
        raw_response["custom_id"] = prompt["custom_id"]
        # print(raw_response)
        raw_responses.append(raw_response)
        pbar.update(1)
    pbar.close()
    end_time = time.time()
    # elapsed_time in minutes
    elapsed_time = (end_time - start_time) / 60
    logging.warning(f"Elapsed time for async inference: {elapsed_time:.1f} minutes")
    return raw_responses





Processing async batch:  12%|█▏        | 607/5000 [13:39<1:37:39,  1.33s/it] 

2025-07-30 14:22:46,851 - openai._base_client - INFO - Retrying request to /chat/completions in 0.495231 seconds


Processing async batch:  23%|██▎       | 1152/5000 [26:06<2:20:35,  2.19s/it]

2025-07-30 14:35:14,164 - openai._base_client - INFO - Retrying request to /chat/completions in 0.455344 seconds


Processing async batch:  52%|█████▏    | 2607/5000 [1:01:02<1:21:20,  2.04s/it]

2025-07-30 15:10:10,262 - openai._base_client - INFO - Retrying request to /chat/completions in 0.457817 seconds


Processing async batch:  54%|█████▍    | 2697/5000 [1:03:19<1:09:38,  1.81s/it]

2025-07-30 15:22:16,316 - openai._base_client - INFO - Retrying request to /chat/completions in 0.384464 seconds


Processing async batch:  76%|███████▌  | 3786/5000 [1:37:47<25:45,  1.27s/it]     

2025-07-30 15:46:54,507 - openai._base_client - INFO - Retrying request to /chat/completions in 0.441327 seconds


Processing async batch:  97%|█████████▋| 4837/5000 [2:04:00<03:37,  1.33s/it]  

2025-07-30 16:13:08,249 - openai._base_client - INFO - Retrying request to /chat/completions in 0.442827 seconds


Processing async batch: 100%|█████████▉| 4989/5000 [2:08:01<00:19,  1.79s/it]

2025-07-30 16:17:08,393 - openai._base_client - INFO - Retrying request to /chat/completions in 0.481634 seconds


Processing async batch: 100%|█████████▉| 4990/5000 [2:08:12<00:47,  4.70s/it]

2025-07-30 16:17:19,900 - openai._base_client - INFO - Retrying request to /chat/completions in 0.393847 seconds


Processing async batch: 100%|██████████| 5000/5000 [2:08:38<00:00,  1.54s/it]






In [13]:
# print(raw_responses)

In [14]:
# Create a list to store the records with object_id, prompt, and raw response
prompt_response_records = []
# Precompute a mapping from custom_id to prompt object
prompt_map = {p["custom_id"]: p for p in prompts}
llm_responses = []

for raw_response in tqdm(
    raw_responses, desc=f"Processing responses", leave=False
):
    # show raw responses format
    # print(f"Raw response: {raw_response}")
    custom_id = raw_response.get("custom_id", "")
    prompt_obj = prompt_map.get(custom_id)
    if prompt_obj:
        print(f"Processing response for custom_id: {custom_id}")
        prompt_response_records.append(
            {
                "custom_id": custom_id,
                "prompt": prompt_obj["prompt"],
                "llm_response": raw_response,
            }
        )
        # Parse raw response and add additional info
        parsed_response = backend._parse_response(raw_response)
        parsed_response["custom_id"] = custom_id
        llm_responses.append(parsed_response)
        # print(f"Parsed response: {parsed_response}")
        
logging.warning("Batch inference complete on full dataset")

Processing responses:   0%|          | 0/5000 [00:00<?, ?it/s]

                                                                            

Processing response for custom_id: 3149
Processing response for custom_id: 631
Processing response for custom_id: 16188
Processing response for custom_id: 5098
Processing response for custom_id: 2520
Processing response for custom_id: 1080
Processing response for custom_id: 2241
Processing response for custom_id: 8713
Processing response for custom_id: 12831
Processing response for custom_id: 3311
Processing response for custom_id: 694
Processing response for custom_id: 1428
Processing response for custom_id: 8619
Processing response for custom_id: 2950
Processing response for custom_id: 12828
Processing response for custom_id: 19563
Processing response for custom_id: 17810
Processing response for custom_id: 10345
Processing response for custom_id: 9319
Processing response for custom_id: 14559
Processing response for custom_id: 1906
Processing response for custom_id: 7106
Processing response for custom_id: 4697
Processing response for custom_id: 2531
Processing response for custom_id: 



In [15]:
# ----- POST-PROCESSING -----
# Assign 4 columns to test_sample: "explanation", "final_decision", "custom_id" and "uncertain"
for result in tqdm(llm_responses, desc="Processing responses", leave=False):
    # Extract the custom_id from the result
    custom_id = result.get("custom_id", "")
    
    # Find the corresponding prompt response record
    prompt_response = next(
        (pr for pr in prompt_response_records if pr["custom_id"] == custom_id), None
    )
    
    if prompt_response:
        # print(f"Processing custom_id: {custom_id}")
        # print(result)
        # Extract the explanation from the LLM response
        explanation = result.get("explanation", "")
        
        # Determine the final decision based on the LLM response
        final_decision = result.get("final_decision", "uncertain")
        
        # Add the new columns to the DataFrame
        df_pairs.loc[df_pairs.object_id == custom_id, "explanation"] = explanation
        df_pairs.loc[df_pairs.object_id == custom_id, "final_label"] = 1 if final_decision == "same" else 0
        df_pairs.loc[df_pairs.object_id == custom_id, "custom_id"] = custom_id
        df_pairs.loc[df_pairs.object_id == custom_id, "uncertain"] = result.get("uncertain", False)
    
# show the updated DataFrame
display(df_pairs.head())

                                                                          

Unnamed: 0,object_id,alias_object_id,full_name,alias,number_of_occurrences,alias_number_of_occurrences,similarity,explanation,final_label,custom_id,uncertain
0,3149,25541,United Network for Organ Sharing Renal Transpl...,United Network for Organ Sharing Kidney Transp...,12,4,0.984532,‘Renal’ and ‘kidney’ are synonyms; both refer ...,1.0,3149.0,no
1,631,11551,Chronic Heart Failure Analysis and Registry in...,Chronic Heart Failure Analysis and Registry in...,7,15,0.995989,"CHART-2 is a later, separate phase of the orig...",0.0,631.0,no
2,16188,3678,National CAPD Registry (NCR),National Renal Registry,7,9,0.902307,CAPD (Continuous Ambulatory Peritoneal Dialysi...,0.0,16188.0,yes
3,5098,6584,Danish National Diabetes Registry (DNDR),Danish National Diabetes Register (DNDR),17,17,0.996726,‘Registry’ and ‘Register’ are synonymous here;...,1.0,5098.0,no
4,2520,17252,Swedish National Registers (SNR),Swedish National Registries (SNR),24,4,0.990375,‘Registers’ and ‘Registries’ are plural forms ...,1.0,2520.0,no


In [17]:
# columns order
cols = [
    "object_id",
    "alias_object_id",
    "full_name",
    "alias",
    "number_of_occurrences",
    "alias_number_of_occurrences",
    # "alias_number",
    "similarity",
    "final_label",
    "uncertain",
    "explanation",
]
# Reorder the DataFrame columns
df_pairs = df_pairs[cols]

In [18]:
output_assessed_pairs_xlsx=f"data/W01/R03_eval_pairs_similarity_assessment_with_llm/{LLM_JUDGE_MODEL}/famous_close_assessed_pairs.xlsx"
# Save the updated DataFrame to an Excel file
output_path = Path(output_assessed_pairs_xlsx).parent
output_path.mkdir(parents=True, exist_ok=True)
df_pairs.to_excel(output_assessed_pairs_xlsx, index=False)

# Compute stats

In [None]:
output_assessed_pairs_xlsx=f"data/W01/R03_eval_pairs_similarity_assessment_with_llm/{LLM_JUDGE_MODEL}/famous_close_assessed_pairs.xlsx"
# output_assessed_pairs_xlsx=f"data/W01/R03_eval_pairs_similarity_assessment_with_llm/{LLM_JUDGE_MODEL}/famous_assessed_pairs_v1.xlsx"
# first reload the df_pairs to ensure the new columns are present
df_pairs = pd.read_excel(output_assessed_pairs_xlsx)

In [20]:
display(df_pairs.head())

Unnamed: 0,object_id,alias_object_id,full_name,alias,number_of_occurrences,alias_number_of_occurrences,similarity,final_label,uncertain,explanation
0,3149,25541,United Network for Organ Sharing Renal Transpl...,United Network for Organ Sharing Kidney Transp...,12,4,0.984532,1,no,‘Renal’ and ‘kidney’ are synonyms; both refer ...
1,631,11551,Chronic Heart Failure Analysis and Registry in...,Chronic Heart Failure Analysis and Registry in...,7,15,0.995989,0,no,"CHART-2 is a later, separate phase of the orig..."
2,16188,3678,National CAPD Registry (NCR),National Renal Registry,7,9,0.902307,0,yes,CAPD (Continuous Ambulatory Peritoneal Dialysi...
3,5098,6584,Danish National Diabetes Registry (DNDR),Danish National Diabetes Register (DNDR),17,17,0.996726,1,no,‘Registry’ and ‘Register’ are synonymous here;...
4,2520,17252,Swedish National Registers (SNR),Swedish National Registries (SNR),24,4,0.990375,1,no,‘Registers’ and ‘Registries’ are plural forms ...


In [23]:
# Stats
print("Calculating stats for final_label...")
# ratio of 1s and 0s for final_label in % with one decimal
stats = (df_pairs["final_label"].value_counts(normalize=True) * 100).round(1)
print(f"Stats for final_label:\n{stats}")
# # compute same ratio but per distinct values of "alias_number" (there should be 10 distinct values of "alias_number")
# alias_stats = (df_pairs.groupby("alias_number")["final_label"].value_counts(normalize=True).unstack().fillna(0) * 100).round(1)
# print(f"Stats for final_label per alias_number:\n{alias_stats}")

Calculating stats for final_label...
Stats for final_label:
final_label
0    65.5
1    34.5
Name: proportion, dtype: float64


In [24]:
# Stats
print("Calculating stats for uncertain...")
# ratio of yes and no for uncertain in % with one decimal
stats_uncertain = (df_pairs["uncertain"].value_counts(normalize=True) * 100).round(1)
print(f"Stats for uncertain:\n{stats_uncertain}")
# # compute uncertain ratio but per distinct values of "alias_number" (there should be 10 distinct values of "alias_number")
# alias_stats_uncertain = (df_pairs.groupby("alias_number")["uncertain"].value_counts(normalize=True).unstack().fillna(0) * 100).round(1)
# print(f"Stats for uncertain per alias_number:\n{alias_stats_uncertain}")

Calculating stats for uncertain...
Stats for uncertain:
uncertain
no     76.9
yes    23.1
Name: proportion, dtype: float64
