# CoNLL_View_Doc.ipynb

Notebook for viewing individual documents from the CoNLL-2003 corpus, for use
alongside the other CoNLL-related notebooks in this directory.

In [1]:
# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Put that parent directory at the front of the Python include path.
#
# The entry is inserted rather than assigned over sys.path[0], so that whatever
# entry was first before this cell ran stays on the path and modules located
# through it remain importable. The guard keeps the cell idempotent: re-running
# it does not stack up additional ".." entries at the front.
import sys
if sys.path[0] != "..":
    sys.path.insert(0, "..")
    
# Libraries
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
import text_extensions_for_pandas as tp

# Common code shared across notebooks comes from util.py
import util

In [2]:
# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
#  to the terms of the license when using this data set!
# The result maps each fold name ("train", "dev", "test") to the path of the
# corresponding file; it is displayed below as the cell's output.
data_set_info = util.get_conll_data()
data_set_info

{'train': 'outputs/eng.train',
 'dev': 'outputs/eng.testa',
 'test': 'outputs/eng.testb'}

In [3]:
# Parse every fold of the raw dataset, keeping its original tokenization.
# Each fold becomes a list of dataframes; the "pos" and "phrase_*" columns
# are discarded right after parsing.
unused_columns = ["pos", "phrase_iob", "phrase_type"]
corpus_raw = {}
for fold_name, file_name in data_set_info.items():
    df_list = tp.conll_2003_to_dataframes(
        file_name,
        ["pos", "phrase", "ent"],
        [False, True, True],
    )
    corpus_raw[fold_name] = [df.drop(columns=unused_columns) for df in df_list]


In [4]:
# Convert IOB2-tagged tokens to spans: for every fold, run each dataframe
# in that fold's list through tp.iob_to_spans, preserving list order.
all_spans = {
    fold_key: [tp.iob_to_spans(tokens_df) for tokens_df in fold_dfs]
    for fold_key, fold_dfs in corpus_raw.items()
}

In [5]:
# Lift the 60-row cap on dataframe display so dataframes are shown in full
pd.set_option("display.max_rows", None)

In [1024]:
# Select the document to view. `fold` is a key of all_spans and `doc_offset`
# is a position within that fold's list of per-document span dataframes.
# Edit these two values to look at a different document.
fold = "dev"
doc_offset = 214
doc_df = all_spans[fold][doc_offset]
doc_df

Unnamed: 0,token_span,ent_type
0,"[25, 28): 'IRA'",ORG
1,"[46, 52): 'Venice'",LOC
2,"[63, 74): 'Vera Haller'",PER
3,"[83, 88): 'Italy'",LOC
4,"[121, 132): 'Neil Jordan'",PER
5,"[187, 202): 'Michael Collins'",MISC
6,"[239, 242): 'IRA'",ORG
7,"[285, 305): 'Venice Film Festival'",MISC
8,"[326, 337): 'Liam Neeson'",PER
9,"[342, 355): 'Julia Roberts'",PER


In [1025]:
# Show the array backing the "token_span" column. Its display lists, for each
# span, the begin/end offsets, the begin_token/end_token offsets and the
# covered text.
doc_df["token_span"].values

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,25,28,2,3,IRA
1,46,52,6,7,Venice
2,63,74,9,11,Vera Haller
3,83,88,13,14,Italy
4,121,132,17,19,Neil Jordan
5,187,202,31,33,Michael Collins
6,239,242,40,41,IRA
7,285,305,49,52,Venice Film Festival
8,326,337,57,59,Liam Neeson
9,342,355,60,62,Julia Roberts


In [982]:
# Dataframe of tokens for finding offsets
# Reads the same `fold` and `doc_offset` as the span view, so re-run this cell
# after changing either of them; otherwise the output shown here belongs to a
# previously selected document.
toks_df = corpus_raw[fold][doc_offset]
toks_df

Unnamed: 0,char_span,token_span,ent_iob,ent_type,sentence
0,"[0, 10): '-DOCSTART-'","[0, 10): '-DOCSTART-'",O,,"[0, 10): '-DOCSTART-'"
1,"[11, 16): 'HORSE'","[11, 16): 'HORSE'",O,,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
2,"[17, 23): 'RACING'","[17, 23): 'RACING'",O,,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
3,"[23, 24): '-'","[23, 24): '-'",O,,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
4,"[25, 36): 'TATTERSALLS'","[25, 36): 'TATTERSALLS'",B,MISC,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
5,"[37, 45): 'BREEDERS'","[37, 45): 'BREEDERS'",I,MISC,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
6,"[46, 52): 'STAKES'","[46, 52): 'STAKES'",O,,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
7,"[53, 59): 'RESULT'","[53, 59): 'RESULT'",O,,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
8,"[59, 60): '.'","[59, 60): '.'",O,,"[11, 60): 'HORSE RACING- TATTERSALLS BREEDERS ..."
9,"[61, 67): 'DUBLIN'","[61, 67): 'DUBLIN'",B,LOC,"[61, 78): 'DUBLIN 1996-08-31'"
