In [74]:
import os
# Run the notebook from the project root so that relative paths used later
# (e.g. the "data/..." output file) resolve against it; presumably this is
# also what makes the `src.p05_refine_dedup` imports below resolvable.
# The location can be overridden with the P05_WORKING_DIR environment
# variable; the default is the original author's checkout.
working_dir = os.environ.get(
    "P05_WORKING_DIR",
    "/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup",
)
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv
import weaviate
import boto3
from tqdm import tqdm

from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.s3_io_functions import (
    load_parquet_from_s3,
    upload_parquet_to_s3,
)

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup


# Inputs

In [75]:
# S3 object key (folder + file name) of the input parquet that holds the
# registry names and their embeddings.
s3_input_embeddings_parquet = "registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet"

In [76]:
# Load the embeddings table from the bucket named by config.BUCKET_NAME_DEV.
# Folder and file name are split out of `s3_input_embeddings_parquet` (defined
# in the Inputs cell) instead of being retyped, so the input location lives in
# exactly one place.
bucket_name = config.BUCKET_NAME_DEV
folder_path, file_name = s3_input_embeddings_parquet.rsplit("/", 1)
df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=folder_path,
    file_name=file_name,
)



# Select a random sample of entities

In [77]:
# Seed shared by every random draw in this notebook.
seed = 0

# Sampling plan: {alias rank: number of registry names paired at that rank}.
PAIRING_CONFIG = {
    1: 2800,  # 2800 registry names are paired with their closest alias
    2: 1600,  # 1600 registry names are paired with their second closest alias
    3: 600,   # 600 registry names are paired with their third closest alias
    # Larger ranks, currently disabled:
    # 5: 400,
    # 8: 200,
    # 13: 200,
    # 21: 200,
    # 34: 200,
    # 55: 100,
    # 89: 100,
}

# One registry name is drawn per configured pair, so the sample size is the
# sum of all per-rank counts.
total_registry_names = sum(count for count in PAIRING_CONFIG.values())
print(f"Total registry names to select: {total_registry_names}")

Total registry names to select: 5000


In [78]:
# Draw `total_registry_names` rows from df at random (reproducible via `seed`).
df_singles = df.sample(n=total_registry_names, random_state=seed).reset_index(drop=True)

# Add an 'alias_number' column telling which alias rank (1 = closest,
# 2 = second closest, ...) each sampled row will be paired with.
# PAIRING_CONFIG maps rank -> number of rows, so each rank is repeated that
# many times, in the order the ranks are listed.
alias_number = [
    rank
    for rank, count in PAIRING_CONFIG.items()
    for _ in range(count)
]

# The rank list must line up one-to-one with the sampled rows; fail loudly
# instead of silently truncating if the configuration and the sample size
# ever get out of sync.
assert len(alias_number) == len(df_singles), (
    f"PAIRING_CONFIG describes {len(alias_number)} rows "
    f"but {len(df_singles)} rows were sampled"
)
df_singles["alias_number"] = alias_number

In [79]:
# Peek at the last rows of the full embeddings table (`df`, not the sampled
# `df_singles`) to check its columns and size.
display(df.tail())

Unnamed: 0,object_id,full_name,full_name_embedding,number_of_occurrences
54330,54331,Piedmont and Aosta Valley Rare Diseases Regist...,"[-0.016647339, 0.01159668, 0.043823242, 0.0026...",1
54331,54332,Healing Legacies Arts Registry (HLAR),"[-0.05142212, -0.004562378, 0.0015249252, -0.0...",1
54332,54333,Finnish IPF-registry (FIPF),"[-0.033813477, 0.015731812, 0.031234741, -0.00...",1
54333,54334,MASTER trial (MASTER),"[0.0031871796, 0.041625977, 0.013008118, -0.00...",1
54334,54335,German Cancer Consortium (GCC),"[-0.041778564, -0.020401001, 0.04714966, -0.00...",1


In [80]:
# # keep first 5 for testing
# df_singles = df_singles.head(5)

In [81]:
# The "full_name_embedding" column contains an ndarray of 1024 floats (in both df and df_singles).
# For each row of df_singles, compute the cosine similarity with all rows of df and pick the entry
# at the requested rank ('alias_number'). The matching df["full_name"] and its similarity score are
# then stored in df_singles as the new columns 'alias' and 'similarity'.
from sklearn.metrics.pairwise import cosine_similarity
def find_nth_closest_alias(row, df, n):
    """
    Find the n-th closest alias in df to the full_name_embedding of the row.

    Closeness is the cosine similarity between ``row["full_name_embedding"]``
    and every entry of ``df["full_name_embedding"]``; rank 1 is the most
    similar entry, rank 2 the second most similar, and so on.

    Args:
        row: Record (e.g. a DataFrame row) exposing a "full_name_embedding"
            vector.
        df: Candidate table with "full_name_embedding" and "full_name"
            columns.
        n: 1-based rank of the neighbour to return. When ``row`` is itself
            taken from ``df``, rank 1 is expected to be that same row, so
            callers that want a different entry should start at rank 2.

    Returns:
        A ``(full_name, similarity)`` tuple for the n-th most similar row
        of ``df``.

    Raises:
        ValueError: If ``n`` is not between 1 and ``len(df)``. Without this
            check an out-of-range rank would silently select the wrong row
            (for example the *least* similar one for ``n == 0``).
    """
    # Validate before doing the expensive similarity computation.
    if not 1 <= n <= len(df):
        raise ValueError(f"n must be between 1 and {len(df)}, got {n}")

    full_name_embedding = row["full_name_embedding"]
    similarities = cosine_similarity([full_name_embedding], df["full_name_embedding"].tolist())[0]

    # argsort is ascending, so position -n holds the index of the n-th largest similarity.
    nth_largest_index = similarities.argsort()[-n]
    nth_largest_similarity = similarities[nth_largest_index]
    # Read only the name column rather than materialising the whole row.
    nth_closest_alias = df["full_name"].iloc[nth_largest_index]
    return nth_closest_alias, nth_largest_similarity

In [82]:
from tqdm import tqdm
# Register `progress_apply` on pandas objects so the long row-wise pass shows a progress bar.
tqdm.pandas()

# Shuffle df_singles first: alias_number was assigned in contiguous runs, and
# shuffling interleaves the ranks.
df_singles = df_singles.sample(frac=1, random_state=seed).reset_index(drop=True)

# For a quick smoke test, uncomment to keep only the first 500 rows:
# df_singles = df_singles.head(500)

# Look up, for every sampled row, the neighbour at rank alias_number + 1 in
# the full table. The +1 presumably skips the row's own entry, which is
# expected to be its closest match since df_singles was sampled from df.
matches = df_singles.progress_apply(
    lambda row: find_nth_closest_alias(row, df, row["alias_number"] + 1),
    axis=1,
)

# Each match is an (alias name, similarity) pair; split it into two columns.
df_singles["alias"] = [alias for alias, _ in matches]
df_singles["similarity"] = [score for _, score in matches]

100%|██████████| 5000/5000 [23:07<00:00,  3.60it/s]


In [83]:
# display(df_singles.head())

In [84]:
# display(df_singles.iloc[3000:3005])
# display(df_singles.iloc[3600:3605])
# display(df_singles.iloc[4000:4005])
# display(df_singles.iloc[4400:4405])
# display(df_singles.iloc[4600:4605])
# display(df_singles.iloc[4800:4805])
# display(df_singles.iloc[4900:4905])
# display(df_singles.iloc[4990:4995])

In [85]:
# Destination workbook for the selected pairs (relative to the working directory).
output_pairs_xlsx = "data/W01/R02_select_pairs_for_eval_dataset/new_selected_pairs.xlsx"

# Create the target folder, including missing parents, if it is not there yet.
output_dir = Path(output_pairs_xlsx).parent
output_dir.mkdir(exist_ok=True, parents=True)

# Write all columns of df_singles, without the index, to the workbook.
# NOTE(review): df_singles still carries the full_name_embedding column, so the
# embedding vectors are exported as well; confirm that is intended.
df_singles.to_excel(excel_writer=output_pairs_xlsx, index=False)