In [21]:
import pandas as pd
import plotly.express as px

from p06_search_engine import config, io

In [42]:
# Make "plotly_white" the default template for the Plotly Express figures created in this notebook.
px.defaults.template = "plotly_white"

# Load data

In [43]:
# Read the three inputs through the project's io helpers; each path is taken from config.
queries = io.read_queries(
    path_data_queries=config.PATH_DATA_2ND_QUERIES,
)
registries = io.read_registries(
    path_data_registries=config.PATH_DATA_REGISTRIES,
)
annotations = io.read_annotations(
    path_data_annotations=config.PATH_DATA_2ND_ANNOTATIONS,
)

In [44]:
# Wrap each loaded collection in a DataFrame, in the same order as the original assignments.
df_queries, df_registries, df_annotations = (
    pd.DataFrame(records) for records in (queries, registries, annotations)
)

# Describe dataset

In [71]:
# Count annotation labels per query: one row per (query_id, query_text), one column per label.
df_plot = (
    df_annotations
    # Bring in the remaining df_queries columns (presumably including query_text — verify).
    # validate="many_to_one" makes a duplicated query_id in df_queries fail loudly instead of
    # silently duplicating annotation rows and inflating the counts.
    .merge(df_queries, on="query_id", how="left", validate="many_to_one")
    # NOTE(review): groupby drops NaN keys by default, so annotations whose query_id has no
    # match in df_queries (NaN query_text) are left out of the counts — confirm this is intended.
    .groupby(["query_id", "query_text"])
    ["annotation_label"]
    .value_counts()
    # fill_value=0 keeps the counts as integers; unstack followed by fillna(0) upcasts to float.
    .unstack("annotation_label", fill_value=0)
    # MAYBE = largest per-query annotation total minus this query's own total, so every row
    # sums to the same height. Presumably this stands for registries without an explicit
    # label — TODO confirm. An existing "MAYBE" label column would be overwritten here.
    .assign(MAYBE=lambda df: df.sum(axis=1).max() - df.sum(axis=1))
    # Most YES first; ties broken by fewest NO.
    .sort_values(["YES", "NO"], ascending=[False, True])
)

# Bar chart of the label counts per query, with bars kept in the row order of df_plot.
df_bars = df_plot.reset_index()

fig = px.bar(
    df_bars,
    x="query_id",
    y=["YES", "NO", "MAYBE"],
    color="variable",
    color_discrete_map={"YES": "green", "NO": "red", "MAYBE": "lightcoral"},
    category_orders={
        "variable": ["YES", "MAYBE", "NO"],
        # Must be a flat list of query_id values: df_plot.index is a MultiIndex of
        # (query_id, query_text) tuples, which never match the values on the x axis.
        "query_id": df_bars["query_id"].tolist(),
    },
    labels={"query_id": "Id of the query", "value": "Number of registries", "variable": "Label of the annotation"},
    title="Annotation labels per query",
)
fig.show()