# Evaluate quality of different extractors on different data sets

In [1]:
# Move the working directory one level up (the recorded output shows the repository root),
# presumably so that the `aset`, `datasets` and `experiments` packages imported below resolve.
# NOTE: this is relative to the current directory, so re-running the cell moves up once more.
%cd ..

D:\repo\aset-dev


In [2]:
import logging.config

import pandas as pd

from aset.configuration import ASETPipeline
from aset.preprocessing.extraction import StanzaNERExtractor, FigerNERExtractor, SpacyNERExtractor
from aset.resources import ResourceManager
from aset.statistics import Statistics
from aset.data.data import ASETAttribute, ASETDocument, ASETDocumentBase
from aset.interaction import EmptyInteractionCallback, InteractionCallback
from aset.status import EmptyStatusCallback
from experiments.util import consider_overlap_as_match

In [3]:
from datasets.aaai import aaai
from datasets.corona import corona
from datasets.aviation import aviation
from datasets.countries import countries
from datasets.nobel import nobel
from datasets.skyscrapers import skyscrapers

In [4]:
# Instantiate the project's ResourceManager. The instance is not referenced by any later cell
# shown in this notebook.
# NOTE(review): presumably the extractors obtain their models/APIs through it - confirm whether
# it has to be entered/closed explicitly.
resource_manager = ResourceManager()

In [4]:
# Configure the root logger: INFO and above, each record prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Calling getLogger() without a name returns the root logger.
logger = logging.getLogger()

In [5]:
# Datasets to evaluate, as (name, dataset module) pairs. The name keys the result
# dictionaries and is used as the file name of the exported CSV.
DATASETS = [
    ("aaai", aaai),
    ("aviation", aviation),
    ("corona", corona),
    ("countries", countries),
    ("nobel", nobel),
    ("skyscrapers", skyscrapers),
]

In [6]:
# Extractors under evaluation, as (name, extractor instance) pairs. The name keys the
# per-extractor results of every dataset.
EXTRACTORS = [
    ("Stanza", StanzaNERExtractor()),
    ("SpacyEnCoreWebLg", SpacyNERExtractor("SpacyEnCoreWebLg")),
    ("SpacyEnCoreSciMd", SpacyNERExtractor("SpacyEnCoreSciMd")),
]
# Alternative configuration that evaluates only the FIGER extractor:
#EXTRACTORS = [('FIGER', FigerNERExtractor())]

2022-02-27 19:46:59,691 - aset.resources - INFO - Load resource 'FigerAPI'.
2022-02-27 19:46:59,809 - aset.resources - INFO - Loaded resource 'FigerAPI' in 0.1169741153717041 seconds.


In [7]:
# Evaluation results, all keyed by dataset name first:
#   ground_truth[dataset][attribute]
#       number of documents that have at least one annotated mention of the attribute
#   results_absolute[dataset][extractor][attribute]
#       number of documents in which an annotated mention overlaps an extracted nugget
#   results_relative[dataset][extractor][attribute]
#       results_absolute / ground_truth; NaN if no document mentions the attribute
ground_truth = {}
results_absolute = {}
results_relative = {}

for dataset_name, dataset in DATASETS:
    documents = dataset.load_dataset()

    ################################################################################################################
    # document base
    ################################################################################################################
    # select the "user-provided" attribute names and create mappings between them and the dataset's attribute names
    # (here the user-provided names are simply the dataset's own attribute names, so the mapping is the identity)
    user_attribute_names = dataset.ATTRIBUTES
    user_attribute_name2attribute_name = {
        u_attr_name: attr_name for u_attr_name, attr_name in zip(user_attribute_names, dataset.ATTRIBUTES)
    }

    results_absolute[dataset_name] = {}
    results_relative[dataset_name] = {}

    # count, per attribute, the documents with a non-empty list of annotated mentions
    ground_truth[dataset_name] = {
        attribute: sum(1 for document in documents if document["mentions"][attribute])
        for attribute in dataset.ATTRIBUTES
    }

    for extractor_name, extractor in EXTRACTORS:
        # Build a fresh document base for every extractor so that nuggets left behind by a previously
        # evaluated extractor can never be counted as matches for this one.
        document_base = ASETDocumentBase(
            documents=[ASETDocument(doc["id"], doc["text"]) for doc in documents],
            attributes=[ASETAttribute(attribute_name) for attribute_name in user_attribute_names]
        )

        statistics = Statistics(do_collect=True)

        results_absolute[dataset_name][extractor_name] = {}
        results_relative[dataset_name][extractor_name] = {}

        statistics["user_provided_attribute_names"] = user_attribute_names
        statistics["dataset"]["dataset_name"] = dataset.NAME
        statistics["dataset"]["attributes"] = dataset.ATTRIBUTES
        statistics["dataset"]["num_documents"] = len(documents)

        # the pipeline consists of the extractor only
        aset_pipeline = ASETPipeline([
            extractor,
        ])

        statistics["preprocessing"]["config"] = aset_pipeline.to_config()

        aset_pipeline(
            document_base=document_base,
            interaction_callback=EmptyInteractionCallback(),
            status_callback=EmptyStatusCallback(),
            statistics=statistics["preprocessing"]
        )

        for attribute in dataset.ATTRIBUTES:
            num_matched = 0
            # documents and document_base.documents are paired by position; the document base was
            # built from `documents` in the same order above
            for document, aset_document in zip(documents, document_base.documents):
                # a document counts once if any annotated mention overlaps any extracted nugget;
                # any() stops at the first match
                if any(
                    consider_overlap_as_match(mention["start_char"], mention["end_char"],
                                              nugget.start_char, nugget.end_char)
                    for mention in document["mentions"][attribute]
                    for nugget in aset_document.nuggets
                ):
                    num_matched += 1

            results_absolute[dataset_name][extractor_name][attribute] = num_matched
            statistics["preprocessing"]["results"]["num_extracted"][attribute] = num_matched

            # the ratio is undefined when no document mentions the attribute; store NaN instead of
            # raising ZeroDivisionError
            num_expected = ground_truth[dataset_name][attribute]
            results_relative[dataset_name][extractor_name][attribute] = (
                num_matched / num_expected if num_expected else float("nan")
            )

2022-02-27 19:47:02,539 - aset.configuration - INFO - Execute the pipeline.
2022-02-27 19:47:02,540 - aset.status - INFO - Running the pipeline... ~%
2022-02-27 19:47:02,540 - aset.configuration - INFO - Execute FigerNERExtractor.
2022-02-27 19:47:02,541 - aset.status - INFO - Running FigerNERExtractor... ~%
2022-02-27 19:47:02,541 - aset.status - INFO - Running FigerNERExtractor... 0%
2022-02-27 19:48:57,967 - aset.status - INFO - Running FigerNERExtractor... 5%
2022-02-27 19:48:59,722 - aset.status - INFO - Running FigerNERExtractor... 10%
2022-02-27 19:49:01,897 - aset.status - INFO - Running FigerNERExtractor... 15%
2022-02-27 19:49:07,446 - aset.status - INFO - Running FigerNERExtractor... 19%
2022-02-27 19:49:09,352 - aset.status - INFO - Running FigerNERExtractor... 24%
2022-02-27 19:49:12,021 - aset.status - INFO - Running FigerNERExtractor... 29%
2022-02-27 19:49:13,949 - aset.status - INFO - Running FigerNERExtractor... 34%
2022-02-27 19:49:14,735 - aset.status - INFO - Runni

In [8]:
# Directory (relative to the working directory) that receives one result CSV per dataset.
OUT_PATH = "experiments/results/extraction"

In [9]:
# Export the relative results of every dataset to "<OUT_PATH>/<dataset_name>.csv"
# (one row per extractor, one column per attribute).
for dataset_name, _ in DATASETS:
    # NOTE: despite its name, gt_df holds the relative results, not the ground truth counts
    gt_df = pd.DataFrame.from_dict(results_relative[dataset_name], orient="index")

    # Let pandas open the file itself: a text-mode handle opened without newline="" yields
    # doubled line endings (blank rows) on Windows.
    gt_df.to_csv(f"{OUT_PATH}/{dataset_name}.csv")

In [11]:
# Display, per attribute of the "corona" dataset, the number of documents with at least one annotated mention.
pd.DataFrame.from_dict(ground_truth["corona"], orient='index')

Unnamed: 0,0
date,100
new_cases,98
new_deaths,98
incidence,100
patients_intensive_care,99
vaccinated,53
twice_vaccinated,33
