## CoNLL-2003 Example for Text Extensions for Pandas
### Part 1

To run this notebook, you will need to obtain a copy of the CoNLL-2003 data set's corpus.
Drop the corpus's files into the following locations:
* conll_03/eng.testa
* conll_03/eng.testb
* conll_03/eng.train


In [1]:
# INITIALIZATION BOILERPLATE

# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
import sys
if (sys.path[0] != ".."):
    sys.path[0] = ".."
    
# Libraries
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
import text_extensions_for_pandas as tp

In [2]:
# Read gold standard data for the validation set.

# Note that this data is NOT kept in source control; you need to obtain 
# an appropriate license and download the data yourself separately to
# run this notebook.

# Note also that the original corpus started with the special "-DOCSTART-"
# tag, while other versions start right in with the 
# If you have one of those other versions, you'll need to add the following
# two lines at the beginning to make the tokens line up with the output in
# ner.tgz:
# -DOCSTART- O
# 
# ^^^ note blank line after special token.
#
# If you need to add those lines, you should also remove the extra 
# "-DOCSTART-" token at the end of each file.

gold_standard = tp.conll_2003_to_dataframes("../conll_03/eng.testa")

# tp.conll_2003_to_dataframes() returns a list of dataframes
gold_standard[0]

Unnamed: 0,char_span,token_span,ent_iob,ent_type,sentence
0,"[0, 10): '-DOCSTART-'","[0, 10): '-DOCSTART-'",O,,"[0, 10): '-DOCSTART-'"
1,"[11, 17): 'SOCCER'","[11, 17): 'SOCCER'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
2,"[17, 18): '-'","[17, 18): '-'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
3,"[19, 24): 'JAPAN'","[19, 24): 'JAPAN'",B,LOC,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
4,"[25, 28): 'GET'","[25, 28): 'GET'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
...,...,...,...,...,...
415,"[2176, 2180): 'each'","[2176, 2180): 'each'",O,,"[2136, 2195): 'All four teams are level with o..."
416,"[2181, 2185): 'from'","[2181, 2185): 'from'",O,,"[2136, 2195): 'All four teams are level with o..."
417,"[2186, 2189): 'one'","[2186, 2189): 'one'",O,,"[2136, 2195): 'All four teams are level with o..."
418,"[2190, 2194): 'game'","[2190, 2194): 'game'",O,,"[2136, 2195): 'All four teams are level with o..."


In [3]:
# Read the outputs of the "bender" team in the original competition.
# Yes, this file is called "testa" in one data set and "testb" in the other.
# Go figure.
bender_output = tp.conll_2003_output_to_dataframes(
    gold_standard, "../resources/conll_03/ner/results/bender/eng.testb")
bender_output[0].head(20)

Unnamed: 0,char_span,token_span,ent_iob,ent_type,sentence
0,"[0, 10): '-DOCSTART-'","[0, 10): '-DOCSTART-'",O,,"[0, 10): '-DOCSTART-'"
1,"[11, 17): 'SOCCER'","[11, 17): 'SOCCER'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
2,"[17, 18): '-'","[17, 18): '-'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
3,"[19, 24): 'JAPAN'","[19, 24): 'JAPAN'",B,LOC,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
4,"[25, 28): 'GET'","[25, 28): 'GET'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
5,"[29, 34): 'LUCKY'","[29, 34): 'LUCKY'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
6,"[35, 38): 'WIN'","[35, 38): 'WIN'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
7,"[38, 39): ','","[38, 39): ','",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
8,"[40, 45): 'CHINA'","[40, 45): 'CHINA'",B,LOC,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."
9,"[46, 48): 'IN'","[46, 48): 'IN'",O,,"[11, 65): 'SOCCER- JAPAN GET LUCKY WIN, CHINA ..."


In [4]:
# Convert the gold standard to spans.
# Again, one dataframe per document.
gold_standard_spans = [tp.iob_to_spans(df) for df in gold_standard]
bender_output_spans = [tp.iob_to_spans(df) for df in bender_output]
bender_output_spans[0].head(10)

Unnamed: 0,token_span,ent_type
0,"[19, 24): 'JAPAN'",LOC
1,"[40, 45): 'CHINA'",LOC
2,"[66, 77): 'Nadim Ladki'",PER
3,"[78, 84): 'AL-AIN'",LOC
4,"[86, 106): 'United Arab Emirates'",ORG
5,"[118, 123): 'Japan'",LOC
6,"[151, 160): 'Asian Cup'",MISC
7,"[196, 201): 'Syria'",LOC
8,"[249, 254): 'China'",LOC
9,"[363, 373): 'Uzbekistan'",LOC


In [5]:
# Let's look at just PER annotations
gold_person = [df[df["ent_type"] == "PER"] for df in gold_standard_spans]
bender_person = [df[df["ent_type"] == "PER"] for df in bender_output_spans]
gold_person[0]

Unnamed: 0,token_span,ent_type
1,"[40, 45): 'CHINA'",PER
2,"[66, 77): 'Nadim Ladki'",PER
12,"[482, 495): 'Igor Shkvyrin'",PER
14,"[618, 632): 'Oleg Shatskiku'",PER
21,"[1079, 1092): 'Takuya Takagi'",PER
22,"[1148, 1168): 'Hiroshige Yanagimoto'",PER
24,"[1216, 1227): 'Salem Bitar'",PER
26,"[1360, 1372): 'Hassan Abbas'",PER
27,"[1489, 1494): 'Bitar'",PER
28,"[1504, 1518): 'Nader Jokhadar'",PER


In [6]:
# We can also ask these span columns to render themselves to HTML for a
# closer look at the target document.
bender_person[0]["token_span"].values

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,66,77,13,15,Nadim Ladki
1,487,495,90,91,Shkvyrin
2,618,632,113,115,Oleg Shatskiku
3,1079,1092,199,201,Takuya Takagi
4,1148,1168,213,215,Hiroshige Yanagimoto
5,1216,1227,222,224,Salem Bitar
6,1360,1372,250,252,Hassan Abbas
7,1504,1518,280,282,Nader Jokhadar
8,1762,1770,328,330,Shu Kamo


In [7]:
# Let's show how to evaluate these results against the gold standard.
# We could look at exact matches...
gold_person[0].merge(bender_person[0])

Unnamed: 0,token_span,ent_type
0,"[66, 77): 'Nadim Ladki'",PER
1,"[618, 632): 'Oleg Shatskiku'",PER
2,"[1079, 1092): 'Takuya Takagi'",PER
3,"[1148, 1168): 'Hiroshige Yanagimoto'",PER
4,"[1216, 1227): 'Salem Bitar'",PER
5,"[1360, 1372): 'Hassan Abbas'",PER
6,"[1504, 1518): 'Nader Jokhadar'",PER
7,"[1762, 1770): 'Shu Kamo'",PER


In [8]:
# ...or we could give credit for partial matches contained entirely 
# within a true match:
tp.contain_join(gold_person[0]["token_span"], bender_person[0]["token_span"], "gold", "extracted")

Unnamed: 0,gold,extracted
0,"[66, 77): 'Nadim Ladki'","[66, 77): 'Nadim Ladki'"
1,"[482, 495): 'Igor Shkvyrin'","[487, 495): 'Shkvyrin'"
2,"[618, 632): 'Oleg Shatskiku'","[618, 632): 'Oleg Shatskiku'"
3,"[1079, 1092): 'Takuya Takagi'","[1079, 1092): 'Takuya Takagi'"
4,"[1148, 1168): 'Hiroshige Yanagimoto'","[1148, 1168): 'Hiroshige Yanagimoto'"
5,"[1216, 1227): 'Salem Bitar'","[1216, 1227): 'Salem Bitar'"
6,"[1360, 1372): 'Hassan Abbas'","[1360, 1372): 'Hassan Abbas'"
7,"[1504, 1518): 'Nader Jokhadar'","[1504, 1518): 'Nader Jokhadar'"
8,"[1762, 1770): 'Shu Kamo'","[1762, 1770): 'Shu Kamo'"


In [9]:
# ...or we could give credit for matches that overlap at all with
# a true match:
tp.overlap_join(gold_person[0]["token_span"], bender_person[0]["token_span"], "gold", "extracted")

Unnamed: 0,gold,extracted
0,"[66, 77): 'Nadim Ladki'","[66, 77): 'Nadim Ladki'"
1,"[482, 495): 'Igor Shkvyrin'","[487, 495): 'Shkvyrin'"
2,"[618, 632): 'Oleg Shatskiku'","[618, 632): 'Oleg Shatskiku'"
3,"[1079, 1092): 'Takuya Takagi'","[1079, 1092): 'Takuya Takagi'"
4,"[1148, 1168): 'Hiroshige Yanagimoto'","[1148, 1168): 'Hiroshige Yanagimoto'"
5,"[1216, 1227): 'Salem Bitar'","[1216, 1227): 'Salem Bitar'"
6,"[1360, 1372): 'Hassan Abbas'","[1360, 1372): 'Hassan Abbas'"
7,"[1504, 1518): 'Nader Jokhadar'","[1504, 1518): 'Nader Jokhadar'"
8,"[1762, 1770): 'Shu Kamo'","[1762, 1770): 'Shu Kamo'"


In [10]:
# Let's stick with exact matches for now.
# Iterate over the pairs of dataframes for all the documents finding the
# inputs we need to compute precision and recall for each document, and
# wrap these values in a new dataframe.
num_true_positives = [len(gold_person[i].merge(bender_person[i]).index)
                      for i in range(len(gold_person))]
num_extracted = [len(df.index) for df in bender_person]
num_entities = [len(df.index) for df in gold_person]
doc_num = np.arange(len(gold_person))

stats_by_doc = pd.DataFrame({
    "doc_num": doc_num,
    "num_true_positives": num_true_positives,
    "num_extracted": num_extracted,
    "num_entities": num_entities
})
stats_by_doc

Unnamed: 0,doc_num,num_true_positives,num_extracted,num_entities
0,0,8,9,12
1,1,31,31,31
2,2,32,33,40
3,3,20,20,20
4,4,5,5,5
...,...,...,...,...
226,226,2,2,2
227,227,4,4,6
228,228,4,5,4
229,229,0,0,0


In [11]:
# Collection-wide precision and recall can be computed by aggregating
# our dataframe:
num_true_positives = stats_by_doc["num_true_positives"].sum()
num_entities = stats_by_doc["num_entities"].sum()
num_extracted = stats_by_doc["num_extracted"].sum()

precision = num_true_positives / num_extracted
recall = num_true_positives / num_entities
F1 = 2.0 * (precision * recall) / (precision + recall)
print(
"""Number of correct answers: {}
Number of entities identified: {}
Actual number of entities: {}
Precision: {:1.4f}
Recall: {:1.4f}
F1: {:1.4f}""".format(num_true_positives, num_entities, num_entities, precision, recall, F1))

Number of correct answers: 1421
Number of entities identified: 1617
Actual number of entities: 1617
Precision: 0.8977
Recall: 0.8788
F1: 0.8881


In [12]:
# The above numbers match up with the official results.
# (last line below)
!head -14 ../resources/conll_03/ner/results/bender/conlleval.out

eng.testa
processed 51578 tokens with 5942 phrases; found: 5846 phrases; correct: 5280.
accuracy:  98.07%; precision:  90.32%; recall:  88.86%; FB1:  89.58
              LOC: precision:  93.27%; recall:  93.58%; FB1:  93.42
             MISC: precision:  88.51%; recall:  81.02%; FB1:  84.60
              ORG: precision:  84.67%; recall:  83.59%; FB1:  84.13
              PER: precision:  92.26%; recall:  91.91%; FB1:  92.09
eng.testb
processed 46666 tokens with 5648 phrases; found: 5548 phrases; correct: 4698.
accuracy:  96.80%; precision:  84.68%; recall:  83.18%; FB1:  83.92
              LOC: precision:  86.44%; recall:  89.81%; FB1:  88.09
             MISC: precision:  78.35%; recall:  73.22%; FB1:  75.70
              ORG: precision:  80.27%; recall:  76.16%; FB1:  78.16
              PER: precision:  89.77%; recall:  87.88%; FB1:  88.81


In [13]:
# Let's also add some additional columns with per-document stats:
stats_by_doc["precision"] = stats_by_doc["num_true_positives"] / stats_by_doc["num_extracted"]
stats_by_doc["recall"] = stats_by_doc["num_true_positives"] / stats_by_doc["num_entities"]
stats_by_doc["F1"] = 2.0 * (stats_by_doc["precision"] * stats_by_doc["recall"]) / (stats_by_doc["precision"] + stats_by_doc["recall"])
stats_by_doc

Unnamed: 0,doc_num,num_true_positives,num_extracted,num_entities,precision,recall,F1
0,0,8,9,12,0.888889,0.666667,0.761905
1,1,31,31,31,1.000000,1.000000,1.000000
2,2,32,33,40,0.969697,0.800000,0.876712
3,3,20,20,20,1.000000,1.000000,1.000000
4,4,5,5,5,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...
226,226,2,2,2,1.000000,1.000000,1.000000
227,227,4,4,6,1.000000,0.666667,0.800000
228,228,4,5,4,0.800000,1.000000,0.888889
229,229,0,0,0,,,


In [14]:
# Let's zero in on the ten most problematic documents by F1 score.
stats_by_doc.sort_values("F1").head(10)

Unnamed: 0,doc_num,num_true_positives,num_extracted,num_entities,precision,recall,F1
75,75,2,21,2,0.095238,1.0,0.173913
7,7,1,2,4,0.5,0.25,0.333333
8,8,1,1,5,1.0,0.2,0.333333
138,138,2,2,10,1.0,0.2,0.333333
161,161,1,3,2,0.333333,0.5,0.4
104,104,2,3,6,0.666667,0.333333,0.444444
185,185,1,2,2,0.5,0.5,0.5
131,131,1,2,2,0.5,0.5,0.5
85,85,1,1,3,1.0,0.333333,0.5
43,43,7,8,16,0.875,0.4375,0.583333


In [15]:
# What's happening with document 75?
gold_person[75]

Unnamed: 0,token_span,ent_type
2,"[53, 70): 'Brendan Intindola'",PER
54,"[2243, 2253): 'Marc Cohen'",PER


In [16]:
bender_person[75]

Unnamed: 0,token_span,ent_type
2,"[53, 70): 'Brendan Intindola'",PER
6,"[178, 186): 'Santa Fe'",PER
8,"[208, 216): 'Santa Fe'",PER
10,"[265, 273): 'Santa Fe'",PER
14,"[456, 464): 'Santa Fe'",PER
16,"[693, 701): 'Santa Fe'",PER
18,"[827, 835): 'Santa Fe'",PER
25,"[1347, 1355): 'Santa Fe'",PER
28,"[1471, 1475): 'Dome'",PER
30,"[1578, 1586): 'Santa Fe'",PER
