In [4]:
import os
import mlflow
import plotly.express as px

from p06_search_engine import config

In [5]:
# Set up MLflow: configure the tracking store and select the experiment to read from.
# The working directory is switched to the project root first.
# NOTE(review): presumably so that a relative tracking URI / relative data paths
# resolve against the project folder — confirm against `config`.
os.chdir(config.PATH_PROJECT)
mlflow.set_tracking_uri(config.MLFLOW_URI)
# Being the last expression of the cell, the returned Experiment object is echoed below.
mlflow.set_experiment(config.MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='file:///home/jperrio/registry_data_catalog_experiments/projects/P06_search_engine/mlruns/232264857835789669', creation_time=1755763493203, experiment_id='232264857835789669', last_update_time=1755763493203, lifecycle_stage='active', name='search_engine', tags={}>

In [6]:
# Fetch every finished run of the active MLflow experiment as a DataFrame.
df_benchmarks = mlflow.search_runs(filter_string="status = 'FINISHED'")

# Columns kept for the benchmark table: run name, then all parameters, then all metrics.
run_columns = df_benchmarks.columns
param_columns = run_columns[run_columns.str.startswith("params.")]
metric_columns = run_columns[run_columns.str.startswith("metrics.")]

# Parameters whose logged value is reduced to its last dot-separated component,
# with underscores turned into dashes and lower-cased (e.g. "X.SOME_VALUE" -> "some-value").
enum_like_params = [
    "params.searching_distance",
    "params.searching_tokenization",
    "params.searching_fusion_type",
]

df_benchmarks = (
    df_benchmarks[["tags.mlflow.runName", *param_columns, *metric_columns]]

    # The literal string "None" is treated as a missing value.
    .where(lambda runs: runs.ne("None"))

    # Best-effort cast to float; anything that cannot be cast is left as it was.
    # NOTE(review): the table displayed below still prints parameters such as
    # `thresholding_relevance` with full string precision, and a later cell compares
    # `params.searching_alpha` to the string "0.75" — this suggests the parameter
    # columns remain strings after this step. Confirm before relying on numeric params.
    .astype(float, errors="ignore")

    # Normalise the enum-like parameters (`name=name` binds the column per lambda).
    .assign(**{
        name: (lambda runs, name=name: runs[name].str.split(".").str[-1].str.replace("_", "-").str.lower())
        for name in enum_like_params
    })

    #.loc[lambda df: df["params.locations"].isna() | df["params.locations"].eq("None") | df["params.locations"].eq(False)] #FIXME

    # Best mean average precision first.
    .sort_values("metrics.searching_map", ascending=False)
)
display(df_benchmarks)

Unnamed: 0,tags.mlflow.runName,params.preparing_locations,params.preparing_conditions,params.searching_tokenization,params.searching_fusion_type,params.searching_method,params.searching_distance,params.searching_alpha,params.preparing_acronym,params.preparing_name,params.thresholding_relevance,params.thresholding_rank,metrics.thresholding_recall,metrics.thresholding_precision,metrics.searching_map,metrics.thresholding_f1,metrics.thresholding_n_registries
201,name_acronym_locations1_conditions3___vector_d...,1,3,,,vector,cosine,,True,True,-0.21527457833290098,46,0.540069,0.417567,0.585359,0.401651,19.294118
198,name_acronym_locations1_conditions5___vector_d...,1,5,,,vector,cosine,,True,True,-0.21516536474227904,37,0.517894,0.424201,0.581011,0.397497,16.784314
213,name_acronym_conditions3___vector_distanceCOSINE,0,3,,,vector,cosine,,True,True,-0.21644224166870119,41,0.510450,0.431164,0.573250,0.404193,18.156863
210,name_acronym_conditions5___vector_distanceCOSINE,0,5,,,vector,cosine,,True,True,-0.21136171340942383,32,0.449053,0.473841,0.565811,0.395670,13.647059
165,name_conditions5___vector_distanceCOSINE,0,5,,,vector,cosine,,False,True,-0.21541929841041563,38,0.509558,0.495438,0.564241,0.435978,17.686275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,name_acronym___hybrid_distanceHAMMING_tokeniza...,0,0,trigram,ranked,hybrid,hamming,0.75,True,True,0.06556109994649884,,0.221750,0.013830,0.009347,0.024945,280.000000
49,name_acronym___hybrid_distanceHAMMING_tokeniza...,0,0,trigram,relative-score,hybrid,hamming,0.75,True,True,0.0649556890130043,,0.216982,0.014021,0.007530,0.025253,280.039216
33,name_acronym_conditions1___vector_distanceHAMMING,0,1,,,vector,hamming,,True,True,,1,0.003922,0.019608,0.005074,0.006536,1.000000
234,name___vector_distanceHAMMING,,,,,vector,hamming,,False,True,-1023.0,,0.072194,0.002431,0.000982,0.004570,500.000000


In [7]:
# Work on a copy: the following cells repeatedly reassign `df` to filtered subsets,
# while `df_benchmarks` keeps the full benchmark table.
df = df_benchmarks.copy()

In [8]:
### Visualize performances ###

# One bar per run, coloured by search method, bars ordered as the rows of `df`.
for metric_name in ["metrics.searching_map"]:
    # Human-readable metric name, e.g. "metrics.searching_map" -> "searching map".
    metric_label = metric_name.split(".")[-1].replace("_", " ")

    fig = px.bar(
        df,
        x="tags.mlflow.runName",
        y=metric_name,
        color="params.searching_method",
        text_auto=".3f",
        # Keep the x axis in the current row order of `df`.
        category_orders={"tags.mlflow.runName": df["tags.mlflow.runName"].tolist()},
        labels={
            metric_name: metric_label.title(),
            "params.searching_method": "Method",
            "tags.mlflow.runName": "Experiment",
        },
        title=f"Performances by experiment on metric: <i>'{metric_label.lower()}'</i>",
        template="plotly_white",
    )
    fig.update_xaxes(tickfont_size=7.5)
    fig.update_yaxes(rangemode="tozero")
    # Reference lines at 0 (solid) and 1 (dotted).
    fig.add_hline(0, line_dash="solid")
    fig.add_hline(1, line_dash="dot")
    fig.show()

In [9]:
def _keep_most_explored_configurations(df, param_name):
    """Keep the runs whose configuration was tried with the most values of `param_name`.

    A "configuration" is the combination of every `params.*` column except
    `param_name` itself and the `params.thresholding*` columns. For each
    configuration the number of distinct (non-null) values of `param_name` is
    counted, and only the rows of the configurations reaching the maximum count
    are returned, so the compared parameter values are averaged over the same
    set of configurations.

    Args:
        df: Benchmark table with `params.*` columns.
        param_name: Column holding the parameter being compared.

    Returns:
        The filtered DataFrame (the input itself when there is nothing to group on).
    """
    is_context_column = (
        df.columns.str.startswith("params.")
        & ~df.columns.str.startswith("params.thresholding")
        & (df.columns != param_name)
    )
    context_columns = df.columns[is_context_column].tolist()
    if df.empty or not context_columns:
        # Nothing to group by: every row belongs to the same (only) configuration.
        return df

    # dropna=False: a missing parameter value is a legitimate part of a configuration.
    n_values = df.groupby(context_columns, dropna=False)[param_name].transform("nunique")
    return df.loc[n_values == n_values.max()]


def plot_parameter_comparison(df, param_name, metric_name, filter=False):
    """Show a grouped bar chart of a metric's mean (with std error bars) per parameter value.

    Runs are grouped by `param_name` and `params.searching_method`; rows where one
    of these is missing are dropped by the group-by. The bars show the mean of
    `metric_name`, sorted from best to worst, one colour per parameter value.

    Args:
        df: Benchmark table (one row per run).
        param_name: Parameter column to compare, e.g. "params.searching_alpha".
        metric_name: Metric column to aggregate, e.g. "metrics.searching_map".
        filter: When True, only keep the configurations that were evaluated with the
            largest number of distinct values of `param_name`, so that every value
            is averaged over comparable runs. (The name shadows the builtin but is
            kept for backward compatibility with keyword callers.)

    Returns:
        None. The figure is displayed with `fig.show()`.
    """
    if filter:
        # Count the distinct values of the parameter under comparison itself
        # (not of one fixed, hard-coded parameter column).
        df = _keep_most_explored_configurations(df, param_name)

    # Order-preserving de-duplication: `param_name` may itself be the method column.
    group_columns = list(dict.fromkeys([param_name, "params.searching_method"]))
    df = (
        df
        .groupby(group_columns)[metric_name]
        .describe()
        .sort_values("mean", ascending=False)
        .reset_index()
    )
    fig = (
        px.bar(
            df, 
            x="params.searching_method", 
            y="mean", error_y="std", 
            color=param_name, 
            barmode="group", 
            text_auto=".3f", 
            # NOTE(review): `True` and `1` are the same dict key in Python (both map to
            # "lightgreen" here), and the keys are bools/ints — if the parameter values
            # are strings these entries may never match; confirm the column dtypes.
            color_discrete_map={
                False: "lightcoral", 
                True: "lightgreen", 
                1: "lightgreen", 
                3: "green", 
                5: "darkgreen", 
            }, 
            color_discrete_sequence=px.colors.qualitative.Alphabet, 
            # Keep methods and parameter values in best-to-worst order of the mean.
            category_orders={
                "params.searching_method": df["params.searching_method"].drop_duplicates().tolist(), 
                param_name: df[param_name].drop_duplicates().tolist(), 
            },
            labels={
                "mean": metric_name.split(".")[-1].replace("_", " ").title(), 
                param_name: param_name.split(".")[-1].replace("_", " ").title(), 
                "params.searching_method": "Method", 
            }, 
            title=f"Comparison of performances by parameter: <i>'{param_name.split('.')[-1].replace('_', ' ').lower()}'</i>", 
            template="plotly_white", 
        )
        .update_yaxes(rangemode="tozero").add_hline(0, line_dash="solid").add_hline(1, line_dash="dot")
    )
    fig.show()



metric_name = "metrics.searching_map"

# Each step below plots one parameter, then narrows `df` to the retained value
# (rows where the parameter is missing are always kept).

# Vector search
## distance: compared on vector runs only, then keep "cosine"
plot_parameter_comparison(
    df.loc[lambda runs: runs["params.searching_method"].eq("vector")],
    param_name="params.searching_distance",
    metric_name=metric_name,
    filter=True,
)
df = df.loc[lambda runs: runs["params.searching_distance"].eq("cosine") | runs["params.searching_distance"].isna()]

# Keyword search
## tokenization: compared on keyword runs only, then keep "trigram"
plot_parameter_comparison(
    df.loc[lambda runs: runs["params.searching_method"].eq("keyword")],
    param_name="params.searching_tokenization",
    metric_name=metric_name,
    filter=True,
)
df = df.loc[lambda runs: runs["params.searching_tokenization"].eq("trigram") | runs["params.searching_tokenization"].isna()]

# Hybrid search
## alpha and fusion type: plot both first, then keep alpha "0.75" and fusion "ranked"
## (alpha is compared as a string here)
for hybrid_param in ["params.searching_alpha", "params.searching_fusion_type"]:
    plot_parameter_comparison(df, param_name=hybrid_param, metric_name=metric_name, filter=True)
df = df.loc[lambda runs: runs["params.searching_alpha"].eq("0.75") | runs["params.searching_alpha"].isna()]
df = df.loc[lambda runs: runs["params.searching_fusion_type"].eq("ranked") | runs["params.searching_fusion_type"].isna()]

# Searching
## method
plot_parameter_comparison(df, param_name="params.searching_method", metric_name=metric_name)

# Preparing
## acronym, locations, conditions
for preparing_param in ["params.preparing_acronym", "params.preparing_locations", "params.preparing_conditions"]:
    plot_parameter_comparison(df, param_name=preparing_param, metric_name=metric_name)

In [11]:
import numpy as np
import pandas as pd
from p06_search_engine import io

# Number of rows of the per-k curve kept for plotting.
TOP_K = 100

df_annotations = pd.read_json(config.PATH_DATA_2ND_ANNOTATIONS)

# Build one precision@k / recall@k curve per experiment, averaged over the queries.
dfs = []
for experiment_name in ["name_acronym_locations1_conditions3___vector_distanceCOSINE"]:
    df_searches = pd.read_json(f"{config.PATH_DATA_SEARCHES}/{experiment_name}.json")

    # Search results and annotations side by side (outer join keeps rows present in
    # only one of the two), ordered by rank within each query. Rows without a
    # `search_rank` are sent to the end with rank = +inf.
    ranked = (
        df_searches
        .merge(df_annotations, on=["query_id", "registry_id"], how="outer")
        .sort_values(by=["query_id", "registry_id"])
        .assign(
            search_rank=lambda rows: rows["search_rank"].fillna(np.inf),
            annotation_label=lambda rows: pd.Categorical(rows["annotation_label"], categories=["YES", "NO"], ordered=True),
        )
        .sort_values(["query_id", "search_rank"])
        .assign(
            k=lambda rows: rows["search_rank"].fillna(np.inf),
            # A row counts as relevant only when it is annotated "YES".
            y=lambda rows: rows["annotation_label"] == "YES",
        )
    )

    # Per query: relevant rows seen up to the current rank, and relevant rows overall.
    relevant_so_far = ranked.groupby("query_id")["y"].transform("cumsum")
    relevant_total = ranked.groupby("query_id")["y"].transform("sum")

    # Average both ratios over the queries for each k, keeping the first TOP_K values of k.
    curve = (
        ranked
        .assign(
            precision_k=relevant_so_far / ranked["k"],
            recall_k=relevant_so_far / relevant_total,
        )
        .groupby("k")[["precision_k", "recall_k"]]
        .mean()
        .head(TOP_K)
        .assign(experiment_name=experiment_name)
        .reset_index()
    )
    df = curve[["experiment_name", "k", "precision_k", "recall_k"]]
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

# One line chart per metric, one line per experiment.
for metric_name in ["precision_k", "recall_k"]:
    fig = px.line(
        df,
        x="k",
        y=metric_name,
        color="experiment_name",
        labels={"precision_k": "Precision@k", "recall_k": "Recall@k"},
        color_discrete_sequence=px.colors.qualitative.Alphabet,
        category_orders={"experiment_name": df["experiment_name"].tolist()},
        title=f"Performance on metric: <i>'{metric_name}'</i> by experiment",
        template="plotly_white",
    )
    fig.update_xaxes(rangemode="tozero")
    fig.update_yaxes(rangemode="tozero")
    fig.add_hline(1, line_dash="dot")
    fig.show()

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from p06_search_engine import config

df_queries = pd.read_json(config.PATH_DATA_2ND_QUERIES)
df_registries = pd.read_json(config.PATH_DATA_REGISTRIES)

experiment_name = "name_acronym_locations1_conditions3___vector_distanceCOSINE"
df_searches = pd.read_json(f"{config.PATH_DATA_SEARCHES}/{experiment_name}.json")
df_annotations = pd.read_json(config.PATH_DATA_2ND_ANNOTATIONS)

# Search results joined with annotations (outer join), enriched with registry and
# query details, ranked within each query, then scored at every rank.
df = (
    df_searches
    .merge(df_annotations, on=["query_id", "registry_id"], how="outer")
    .merge(df_registries, on="registry_id", how="left")
    .merge(df_queries, on="query_id", how="left")
    .sort_values(by=["query_id", "registry_id"])
    .assign(
        # Rows without a rank go last (rank = +inf).
        search_rank=lambda rows: rows["search_rank"].fillna(np.inf),
        annotation_label=lambda rows: pd.Categorical(rows["annotation_label"], categories=["YES", "NO"], ordered=True),
    )
    .sort_values(["query_id", "search_rank"])
    .assign(
        k=lambda rows: rows["search_rank"].fillna(np.inf),
        y=lambda rows: rows["annotation_label"] == "YES",
        # Relevant rows seen so far in the query, divided by the rank.
        precision_k=lambda rows: rows.groupby("query_id")["y"].transform("cumsum") / rows["k"],
        # Relevant rows seen so far, divided by all relevant rows of the query.
        recall_k=lambda rows: rows.groupby("query_id")["y"].transform("cumsum") / rows.groupby("query_id")["y"].transform("sum"),
        # Harmonic mean of the two; undefined values (0/0) become 0.
        f1_k=lambda rows: 2 * (rows["precision_k"] * rows["recall_k"] / (rows["precision_k"] + rows["recall_k"])).fillna(0),
    )
)

# Per query: mean of precision_k over the rows annotated "YES", worst queries first.
s_plot = (
    df
    .loc[lambda rows: rows["annotation_label"].eq("YES")]
    .groupby(["query_id", "query_text"])["precision_k"]
    .mean()
    .reset_index()
    .sort_values("precision_k")
)

# Mean over the queries of the per-query values.
mean_over_queries = s_plot["precision_k"].mean()
display(mean_over_queries)

fig = px.bar(
    s_plot,
    x="query_id",
    y="precision_k",
    text_auto=".3f",
    hover_data=["query_id", "query_text"],
    title="MeanAveragePrecision by query",
    template="plotly_white",
)
fig.add_hline(mean_over_queries, annotation_text="mean")
fig.add_hline(s_plot["precision_k"].median(), line_dash="dot", annotation_text="median")
fig.show()

In [19]:
# Count the annotations per query and label (one column per label, 0 where a label
# never occurs for a query), then plot the number of "YES" per query in ascending order.
label_counts_by_query = (
    df_annotations
    .groupby("query_id")["annotation_label"]
    .value_counts()
    .unstack("annotation_label")
    .fillna(0)
)
px.bar(label_counts_by_query["YES"].sort_values())