<!---
Blue Brain Search is a text mining toolbox focused on scientific use cases.

Copyright (C) 2020  Blue Brain Project, EPFL.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
-->

# Evaluation of NER Models

In [None]:
# Install the four SciSpaCy NER model packages (release v0.2.5) that are
# loaded by name later in this notebook.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_craft_md-0.2.5.tar.gz \
    https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_jnlpba_md-0.2.5.tar.gz \
    https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz \
    https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bionlp13cg_md-0.2.5.tar.gz

In [None]:
from collections import OrderedDict
from pathlib import Path
import sqlite3

import numpy as np
import pandas as pd
import spacy


from bluesearch.mining.eval import (annotations2df, spacy2df, unique_etypes, plot_ner_confusion_matrix, 
                                  ner_report, ner_errors, remove_punctuation)

import matplotlib.pyplot as plt
%matplotlib inline

## Notebook parameters

In [None]:
# Names of the SciSpaCy NER model packages to load and evaluate.
SCISPACY_MODELS = [
    'en_ner_craft_md',
    'en_ner_jnlpba_md',
    'en_ner_bc5cdr_md',
    'en_ner_bionlp13cg_md',
]

# Directory that the annotation file names below are resolved against.
ANNOTATIONS_JSONL_PATH = Path('../data_and_models/annotations/ner/')

# Mapping {prodigy_dataset_name: annotator_name}.
# Insertion order matters: annotators[0] is treated as the ground truth in
# the evaluation cells, annotators[1] as a second opinion.
DATASETS_2_ANNOTATORS = OrderedDict()
DATASETS_2_ANNOTATORS['annotations3_EmmanuelleLogette_2020-07-06_raw1_8FirstLabels.jsonl'] = 'EmmanuelleLogette'
DATASETS_2_ANNOTATORS['annotations4_CharlotteLorin_2020-07-02_raw1_8FirstLabels.jsonl'] = 'CharlotteLorin'

# Annotator names, in the same order as the datasets above.
annotators = list(DATASETS_2_ANNOTATORS.values())

## Import annotations from experts

In [None]:
# Load one dataframe of annotations per expert, keyed by annotator name.
df = OrderedDict()
for annotations_file, annotator in DATASETS_2_ANNOTATORS.items():
    df[annotator] = annotations2df(ANNOTATIONS_JSONL_PATH / annotations_file)

# Inner join: keep only the tokens (id) of the sentences (source) that both
# experts annotated. The overlapping columns get one copy per annotator,
# distinguished by a `_<annotator name>` suffix.
df = pd.merge(
    df[annotators[1]],
    df[annotators[0]],
    how='inner',
    on=['source', 'id', 'text', 'start_char', 'end_char'],
    suffixes=(f'_{annotators[1]}', f'_{annotators[0]}'),
)

# Summary of what was loaded, followed by a preview of the merged table.
annotators_names = '\n - '.join(annotators)
print(f'Loaded annotations for {len(df):,d} tokens, provided by the following expert annotators:\n - {annotators_names}.')
display(df.head())

## Add annotations predicted by SciSpaCy models

<div>
<strong> Attention! (1) </strong> 
<br>
    Predicted annotations must be generated as done here, by calling the SciSpaCy models on the text sentence by sentence. Running batch inference on the whole <code>df['text']</code> column would allow the models to look beyond sentence boundaries, so the predictions would depend on the order of the sentences in the dataframe.
    </div>
    
    
<div>
<strong> Attention! (2) </strong>
<br>
    In many cases the entity type names in the predicted annotations do not match the ones in the ground truth. To fix that, most evaluation functions support a dictionary parameter <code>etypes_map</code>.
</div>

In [None]:
# Load every SciSpaCy model once, keyed by its package name.
ner_models = {model_name: spacy.load(model_name) for model_name in SCISPACY_MODELS}

# Add one column of predictions (`class_<model name>`) per NER model.
for ner_model_name, ner_model in ner_models.items():
    sentence_dfs = []

    # Run the model on one sentence (`source`) at a time so that predictions
    # cannot be influenced by neighbouring rows of the dataframe.
    for source, df_ in df.groupby('source'):
        df_ = df_.sort_values(by='id', ignore_index=True)
        df_sentence = spacy2df(spacy_model=ner_model, ground_truth_tokenization=df_['text'].to_list())
        # NOTE(review): assumes spacy2df returns exactly one row per
        # ground-truth token, in the same order - confirm against its docs.
        df_sentence['id'] = df_['id'].values
        df_sentence['source'] = source
        sentence_dfs.append(df_sentence)

    model_df = pd.concat(sentence_dfs, ignore_index=True).rename(
        columns={'class': f'class_{ner_model_name}'}
    )

    # Attach this model's predictions to the matching tokens.
    df = df.merge(model_df, on=['source', 'id', 'text'], how='inner')

scispacy_models_names = "\n - ".join(SCISPACY_MODELS)
# The first fragment must end with a space: adjacent string literals are
# concatenated verbatim, which previously printed "predicted bythe following".
print(f'Added annotations for {len(df):,d} tokens, predicted by '
      f'the following NER models: \n - {scispacy_models_names}.')
display(df.head())

## NER Evaluation

The following NER evaluation functions have a parameter <code>mode</code> which can take one of the two following values.
<ul>
    <li><code>"token"</code>: Each token is considered separately, and only the entity type (e.g. <code>"DISEASE"</code>) of annotations is considered, without considering the full IOB annotations (e.g. <code>"B-DISEASE"</code>).</li>
    <li><code>"entity"</code>: Entities are considered as units, even if they cover a span of several tokens. A True Positive is defined as two entity annotations matching exactly from the first to the last token of the span.</li>

</ul>

<div>
<strong>Note.</strong> For the purpose of the evaluations shown in this section, we will consider the annotations of <code>EmmanuelleLogette</code> as the ground truth. The reports are therefore produced with respect to the labels used by this annotator.
</div>

<div>
<strong>Note.</strong> Annotation of punctuation elements as parts of an entity can originate from small mistakes in the manual labeling process. For this reason, we can remove all punctuation elements by calling <code>remove_punctuation()</code>.
</div>

In [None]:
# Remove punctuation elements from the annotations: tagging them as part of
# an entity can stem from small mistakes in the manual labeling process.
df = remove_punctuation(df)

### <code>unique_etypes</code>: analyze the distribution of annotations per entity type

In [None]:
# For each evaluation mode, draw side-by-side bar charts of the number of
# annotations per entity type: ground-truth expert (left) vs. first SciSpaCy
# model (right).
for mode in ('token', 'entity'):
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for ax_, annotator in zip(axes, (annotators[0], SCISPACY_MODELS[0])):
        iob_annotations = df[f'class_{annotator}']
        etypes, counts = unique_etypes(iob_annotations, mode=mode, return_counts=True)
        ax_.bar(etypes, counts)
        # Write the exact count on top of each bar.
        for x, y in enumerate(counts):
            ax_.text(x, y, f'{y:,d}', ha='center', va='bottom')
        # Rotate the existing category labels instead of calling
        # set_xticklabels(), which should only be used once tick positions
        # have been fixed and otherwise triggers a matplotlib warning.
        ax_.tick_params(axis='x', labelrotation=45)
        ax_.set_ylabel('Count')
        ax_.grid()
        ax_.set_title(f'Annotations {annotator} [mode = {mode}]')

### <code>ner_report</code>: summarize ner scores

<strong>Note.</strong> If one needs to access exact numeric values of the report, it is possible to call the function with <code>return_dict=True</code>.

In [None]:
# Ground truth: the annotations of the first expert.
iob_true = df[f'class_{annotators[0]}']

# Report scores for the second expert and for the first SciSpaCy model,
# in both evaluation modes.
for mode in ('token', 'entity'):
    for annotator_pred in (annotators[1], SCISPACY_MODELS[0]):
        print(f'Annotations {annotator_pred} [mode = {mode}]')
        iob_pred = df[f'class_{annotator_pred}']

        # Model predictions use different entity type names than the experts,
        # so an explicit name mapping is passed for SciSpaCy models only.
        report_kwargs = {}
        if annotator_pred in SCISPACY_MODELS:
            report_kwargs['etypes_map'] = {
                'CHEMICAL': 'CHEBI',
                'CELL_TYPE': 'CL',
                'ORGANISM': 'TAXON',
                'PROTEIN': 'GGP',
            }

        print(ner_report(iob_true, iob_pred, mode=mode, return_dict=False, **report_kwargs))
        print()
    print()

### <code>ner_errors</code>: show false negatives and false positives

<strong>Note.</strong> For reasons of space, we only print results relative to one entity type, but you can change the value of <code>"ETYPE"</code> to see the other results.

In [None]:
# Entity type whose errors are printed; change it to inspect another type.
ETYPE = 'CELL_TYPE'

# Ground truth: the annotations of the first expert.
iob_true = df[f'class_{annotators[0]}']

for mode in ('token', 'entity'):
    for annotator_pred in (annotators[1], SCISPACY_MODELS[0]):
        print(f'Annotations {annotator_pred} [mode = {mode}]')
        iob_pred = df[f'class_{annotator_pred}']

        # Model predictions use different entity type names than the experts,
        # so an explicit name mapping is passed for SciSpaCy models only.
        errors_kwargs = {}
        if annotator_pred in SCISPACY_MODELS:
            errors_kwargs['etypes_map'] = {
                'CHEMICAL': 'CHEBI',
                'CELL_TYPE': 'CL',
                'ORGANISM': 'TAXON',
                'PROTEIN': 'GGP',
            }
        results_dict = ner_errors(
            iob_true, iob_pred, mode=mode, return_dict=True, tokens=df['text'], **errors_kwargs
        )

        print(f'------Entity type: {ETYPE}------')
        # Same layout for both error kinds: a header, then the repr of each item.
        for error_key, title in (('false_neg', 'False Negatives'), ('false_pos', 'False Positives')):
            print(f'--- {title} ---')
            print(' '.join(map(repr, results_dict[ETYPE][error_key])))
        print()
    print()

### <code>ner_confusion_matrix</code>: breakdown ner predictions against ground truth

<div>
<strong>Note.</strong> The function <code>plot_ner_confusion_matrix</code> is just a wrapper around <code>ner_confusion_matrix</code>.
</div>
<div>
<strong>Note.</strong> The normalization parameter <code>normalize</code> can take 4 possible values: <code>None, "true", "pred", "all"</code>.
</div>

In [None]:
# Ground truth: the annotations of the first expert.
iob_true = df[f'class_{annotators[0]}']
modes = ('token', 'entity')

# One figure per predictor (second expert, then every SciSpaCy model),
# with one confusion matrix per evaluation mode.
# NOTE(review): unlike the report/error cells, no etypes_map is passed here -
# confirm whether that is intended for the SciSpaCy models.
for annotator_pred in (annotators[1], *SCISPACY_MODELS):
    iob_pred = df[f'class_{annotator_pred}']

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    for mode, ax_ in zip(modes, axes):
        plot_ner_confusion_matrix(iob_true, iob_pred, mode=mode, ax=ax_, normalize=None)
        ax_.set_title(f'{annotator_pred} [mode = {mode}]')
    fig.tight_layout()