In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import matplotlib.pyplot as plt
import pandas as pd
from tinydb import TinyDB

sys.path.append("../../../")
from sqqd.defaults import ANNOTATION_PATH, OUTPUT_PATH

## Data loading

In [None]:
def load_annotation_results():
    """Merge annotation records and result records into one dict keyed by ``id``.

    Reads the ``"results"`` table of ``annotated_db.json`` (under
    ``ANNOTATION_PATH``) and the ``"results"`` table of ``results_db.json``
    (under ``OUTPUT_PATH``). Annotation records are loaded first; a result
    record with a matching ``id`` is merged into the annotation record
    (result fields win on key collisions), otherwise it is added on its own.

    Returns:
        dict: maps each record ``id`` to its merged record. Records without
        an ``"id"`` field all share the key ``None``, so later ones overwrite
        or merge into earlier ones.
    """
    merged_records = {}

    # Open each database as a context manager so its storage file is closed
    # once the records are in memory; the returned documents are plain
    # in-memory mappings and remain usable after the database is closed.
    with TinyDB(
        ANNOTATION_PATH / "annotated_db.json", indent=4, ensure_ascii=False
    ) as annotation_results_db:
        for annotation_record in annotation_results_db.table("results").all():
            merged_records[annotation_record.get("id")] = annotation_record

    with TinyDB(
        OUTPUT_PATH / "results_db.json", indent=4, ensure_ascii=False
    ) as results_db:
        for result_record in results_db.table("results").all():
            record_id = result_record.get("id")
            if record_id in merged_records:
                # Same question in both databases: enrich the annotation record.
                merged_records[record_id].update(result_record)
            else:
                # Result without an annotation record: keep it as-is.
                merged_records[record_id] = result_record

    return merged_records

In [None]:
# Dict keyed by record id, built from annotated_db.json and results_db.json.
annotated_data = load_annotation_results()

## Statistics and plotting

In [None]:
# Tally annotation labels (0, 1, 2) per template.
#
# The loader also returns result records that have no annotation record, so
# only entries carrying both a template and an annotation are counted;
# indexing the others directly would raise KeyError.
annotated_entries = [
    entry
    for entry in annotated_data.values()
    if entry.get("template") is not None and entry.get("annotation") is not None
]

# Sort the template names so the bar order in the plots below is stable
# across kernel restarts (plain set iteration order is not).
annotated_data_counts = {
    template: {0: 0, 1: 0, 2: 0}
    for template in sorted(
        {entry["template"] for entry in annotated_entries}, key=str
    )
}

# Counting occurrences of annotations for each template.
# A label other than 0, 1 or 2 still raises KeyError here on purpose: it
# signals unexpected data rather than being silently counted.
for entry in annotated_entries:
    annotated_data_counts[entry["template"]][entry["annotation"]] += 1

In [None]:
# Per-template totals summed over every annotation label.
template_names = [*annotated_data_counts]
total_counts = [
    sum(label_counts.values()) for label_counts in annotated_data_counts.values()
]
total_sum = sum(total_counts)

print(f"Total count of questions before annotation: {total_sum}")
print("Counts for each template before annotation: ")

# One bar per template, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(template_names, total_counts, color="skyblue")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_xlabel("Template")
ax.set_ylabel("Count")
ax.set_title("Counts for Each Template")
fig.tight_layout()
plt.show()

In [None]:
# Number of entries labelled 1 for each template, and their overall sum.
correct_counts = {
    template: counts.get(1, 0) for template, counts in annotated_data_counts.items()
}
total_correct = sum(correct_counts.values())

print(f"Total count of questions after annotation: {total_correct}")
print("Counts for each template after annotation: ")

# One bar per template, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(list(correct_counts.keys()), list(correct_counts.values()), color="skyblue")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_xlabel("Template")
ax.set_ylabel("Count")
ax.set_title("Counts for Each Template")
fig.tight_layout()
plt.show()

During the annotation process, it turned out that some question construction patterns were repeated in the structurally non-broad templates (such as one-hop templates). These questions were marked as "resembling" and were also excluded from the final result.

In [None]:
# Grouped bar chart: one cluster per template, one bar per annotation label.
templates = list(annotated_data_counts)

# Display name -> label code used as key in the per-template count dicts.
label_codes = {"Incorrect": 0, "Correct": 1, "Resembling": 2}
values = {
    label: [annotated_data_counts[name][code] for name in templates]
    for label, code in label_codes.items()
}

bar_width = 0.25

# First series sits on the integer positions; each following series is
# shifted right by one more bar width.
bar1_positions = list(range(len(templates)))
bar2_positions = [pos + bar_width for pos in bar1_positions]
bar3_positions = [pos + bar_width * 2 for pos in bar1_positions]

# Plotting the bars
fig, ax = plt.subplots(figsize=(12, 8))

series = (
    (bar1_positions, "Correct", "green"),
    (bar2_positions, "Incorrect", "red"),
    (bar3_positions, "Resembling", "blue"),
)
for positions, label, color in series:
    ax.bar(positions, values[label], width=bar_width, color=color, label=label)

ax.set_xlabel("Categories")
ax.set_ylabel("Values")
ax.set_title("Bar Plot with Segmented Colors")
# Tick under the middle bar of each cluster.
ax.set_xticks([pos + bar_width for pos in bar1_positions])
ax.set_xticklabels(templates, rotation=45, ha="right")
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Summary table: one row per template, one column per annotation label,
# plus a row-wise total.
column_names = {0: "Incorrect", 1: "Correct", 2: "Resembling"}

# Built with templates as columns, then flipped so templates become rows.
df_summarizing = pd.DataFrame(annotated_data_counts)
labelled_counts = df_summarizing.T.rename(columns=column_names)
df_summarizing_swapped = labelled_counts.assign(Total=labelled_counts.sum(axis=1))

print("Annotation summarizing table: ")
print(df_summarizing_swapped)