In [1]:
import json
import pandas as pd
from p06_search_engine import config

# Constants

In [2]:
# Folder containing the first annotation round, and the two workbooks stored in it
PATH_DATA = "/home/jperrio/registry_data_catalog_experiments/datasets/007_search_engine_datasets/1st_annotations"
PATH_DATA_QUERIES = PATH_DATA + "/queries_JP20250718.xlsx"
PATH_DATA_ANNOTATIONS = PATH_DATA + "/annotations_SZ20250721.xlsx"

# Processing

## queries

In [3]:
# Read the queries workbook (first sheet) into a dataframe
df_queries = pd.read_excel(io=PATH_DATA_QUERIES, sheet_name=0)

In [4]:
# Keep only the id/text columns under their dataset names and
# turn the numeric id into a zero-padded "1st-NNN" identifier
df_queries = (
    df_queries
    .rename(columns={"id": "query_id", "query": "query_text"})
    .loc[:, ["query_id", "query_text"]]
    .assign(query_id=lambda df: df["query_id"].map(lambda raw_id: f"1st-{raw_id:03d}"))
)

In [5]:
# Build dataset of queries
# One dict per query ({"query_id": ..., "query_text": ...}); built with the same
# `to_dict(orient="records")` call used for the annotations instead of a
# row-by-row `iterrows()` loop.
queries = df_queries.to_dict(orient="records")

In [6]:
# Serialise the query records as indented JSON at the configured location
with open(config.PATH_DATA_1ST_QUERIES, mode="w") as file:
    file.write(json.dumps(queries, indent=4))

## annotations

In [7]:
# Load annotations

def _clean_annotation_sheet(sheet):
    """Return `sheet` restricted to its valid annotation rows and named columns.

    A row is kept when its `query_id` is made of digits only and its
    `registry_name` is filled in with something other than "Not available".
    Columns whose header starts with "Unnamed:" are discarded.
    """
    has_digit_query_id = sheet["query_id"].astype(str).str.isdigit()
    has_registry_name = sheet["registry_name"].notna()
    is_available = sheet["registry_name"].ne("Not available")
    is_named_column = ~sheet.columns.str.startswith("Unnamed:")
    return sheet.loc[has_digit_query_id & has_registry_name & is_available, is_named_column]


# every sheet of the workbook is read, cleaned, then stacked into one dataframe
dfs_annotations = [
    _clean_annotation_sheet(sheet)
    for sheet in pd.read_excel(PATH_DATA_ANNOTATIONS, sheet_name=None).values()
]
df_annotations = pd.concat(dfs_annotations, ignore_index=True)

In [8]:
# Format annotations

# Sort the annotations and number them: `annotation_id` is the 0-based position
# of each row once sorted by (query_id, registry_name).
df_annotations = df_annotations.sort_values(["query_id", "registry_name"])
df_annotations = df_annotations.reset_index(drop=True).rename_axis("annotation_id").reset_index()

# map registry_names to ids
# Keys are the positional `annotation_id` computed above; values are the registry
# ids attached to that annotation. Positions missing from the mapping (or mapped
# to an empty list) have no registry id yet.
# NOTE(review): because keys are positions, adding/removing rows in the annotation
# workbook shifts every key after it — re-check the mapping when the source changes.
mapping = {
    0: [15266, 28296], 
    1: [23122, 54418], 
    2: [24400], 
    3: [134, 3929, 12678, 939, 11671, 12736, 4255, 12570, 11323, 4141, 20272, 30259, 7524, 13642, 12467, 24836, 14877, 26025, 28487, 29634, 33493, 35694, 41743, 42097, 43737, 45170, 46019, 46823, 48138], 
    4: [385, 5233, 10289, 19471, 17240, 22221, 34135, 39412, 48710, 53360], 
    # NOTE(review): 345943 has one more digit than every other id in this mapping — possible typo, confirm
    5: [2891, 26243, 10277, 5494, 19231, 15398, 20581, 23900, 345943], 
    6: [], #FIXME https://sci-hub.st/https://doi.org/10.1111/j.1399-3046.2004.00232.x
    7: [4956], 
    8: [3202, 1863, 3426, 2138, 18618, 27081, 2062, 24872, 22093, 27183, 50958, 12470, 11153, 26903, 20522, 11690, 28422, 29986, 30149, 33217, 29829, 36920, 38096, 41742, 39261, 43434, 46114, 46770, 53439], 
    9: [], #FIXME https://catalogues.ema.europa.eu/node/2880/administrative-details
    10: [36354], 
    11: [19908], 
    12: [274, 7627, 54583], 
    13: [15826, 43304, 51529], 
    14: [54345], 
    15: [39792, 54367], 
    16: [195, 1028, 22976, 34002, 35105, 48757, 54564], 
    17: [54374, 54475, 280, 44056, 54413], 
    18: [54430], 
    19: [54490], 
    20: [], #FIXME not found
    21: [], #FIXME not found
    22: [54550], 
    23: [5013, 27367, 50509, 47395, 27366], # maybe others
    24: [5298], 
    25: [], #TODO
    26: [54373], 
    27: [10567, 44394], 
    28: [54431], 
    29: [], #FIXME not found
    30: [12419], 
    31: [54511], 
    32: [54518], 
    33: [], #FIXME not found
    34: [28804], 
    35: [2087, 37688, 2978, 33829, 7810, 27508, 41707, 16550, 26107, 10999, 4467, 18034, 16791, 22718, 20672, 24332, 24147, 25461, 18547, 17003, 17581, 32885, 32086, 28343, 33779, 33830, 35635, 35636, 32924, 37054, 38299, 40135, 41295, 43541, 47800, 49159, 52978], 
    36: [4036, 8825], 
    37: [3326], 
    38: [1373], 
    39: [54464], 
    40: [212], 
    41: [23620], 
    42: [], 
    43: [], 
    44: [1459, 11322, 8527, 7922, 32926, 36701, 48081], 
    45: [8318, 1106, 7892, 17480, 41840], 
    46: [6641, 3422, 608, 29589], 
    47: [5765, 24233], 
    48: [26854, 51467], 
    49: [2041, 1326, 131, 530, 18222, 2473, 10459, 13468, 8330, 5027, 3273, 48171], 
    50: [8834, 18404, 33761, 17137, 7389, 19108, 19897, 39981, 42002, 52335, 53289], 
    51: [220, 4614], 
    52: [1541, 44081], 
    53: [7520, 43336], 
    54: [10838], 
    #55: [280, 44056, 907, 52483, 3656, 40406, 49796, 54413], 
    56: [30280, 44627], 
    57: [54545], 
    58: [44476], 
    59: [10491], 
    60: [6465], 
    61: [22531], 
    62: [4982], 
    63: [6020, 11948], 
    64: [38781, 54216], 
    65: [13784, 13522, 30629], 
}
df_annotations["registry_ids"] = df_annotations["annotation_id"].apply(lambda annotation_id: mapping.get(annotation_id, []))

# compose annotation_id
# query ids become "1st-NNN"; annotation ids become 1-based "1st-NNN"
df_annotations["query_id"] = df_annotations["query_id"].apply(lambda query_id: f"1st-{query_id:03d}")
df_annotations["annotation_id"] = df_annotations["annotation_id"].apply(lambda annotation_id: f"1st-{annotation_id+1:03d}")

df_annotations = (
    df_annotations

    # one row per (annotation, registry id)
    .explode("registry_ids", ignore_index=True)
    .rename(columns={"registry_ids": "registry_id"})

    # `explode` turns an empty list into a missing value: annotations without any
    # resolved registry id are dropped so they are not exported as "YES" labels
    # pointing at NaN (which would also be written as the non-standard JSON token NaN)
    .dropna(subset=["registry_id"])

    .sort_values(["query_id", "annotation_id", "registry_id"])
    .assign(
        # 1-based rank of the registry id inside its annotation
        n=lambda df: df.groupby(["query_id", "annotation_id"]).cumcount() + 1,
        # final id: "<annotation_id>-<rank zero-padded to 3 digits>"
        annotation_id=lambda df: df["annotation_id"] + "-" + df["n"].astype(str).str.zfill(3),
    )

    # every listed registry is a positive match for its query
    .assign(annotation_label="YES")

    [["annotation_id", "query_id", "registry_id", "annotation_label"]]
)

In [9]:
# Turn the annotation rows into a list of plain dicts (one per row)
annotations = df_annotations.to_dict(orient="records")

# Serialise the annotation records as indented JSON at the configured location
with open(config.PATH_DATA_1ST_ANNOTATIONS, mode="w") as file:
    file.write(json.dumps(annotations, indent=4))