# CoNLL_View_Doc.ipynb

Notebook for viewing individual documents from the CoNLL corpus, for use
alongside the other CoNLL-related notebooks in this directory.

In [1]:
# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
# Insert a new entry rather than overwriting sys.path[0], so that whatever
# location was previously first on the path remains importable. The guard
# keeps this cell idempotent when it is re-run.
import sys
if not sys.path or sys.path[0] != "..":
    sys.path.insert(0, "..")
    
# Libraries
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
import text_extensions_for_pandas as tp

# Common code shared across notebooks comes from util.py
import util

In [2]:
# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
#  to the terms of the license when using this data set!
# The result displayed below maps each fold name ("train", "dev", "test")
# to the path of the file holding that fold.
data_set_info = util.get_conll_data()
data_set_info

{'train': 'outputs/eng.train',
 'dev': 'outputs/eng.testa',
 'test': 'outputs/eng.testb'}

In [3]:
# Parse every fold of the corpus in its original tokenization, producing one
# dataframe per document, and discard the part-of-speech and phrase columns
# so that only the columns used by the rest of this notebook remain.
corpus_raw = {
    fold_name: [
        doc_tokens.drop(columns=["pos", "phrase_iob", "phrase_type"])
        for doc_tokens in tp.conll_2003_to_dataframes(
            file_name,
            ["pos", "phrase", "ent"],
            [False, True, True],
        )
    ]
    for fold_name, file_name in data_set_info.items()
}


In [4]:
# For each fold, turn every document's IOB2-tagged tokens into a dataframe
# of entity spans.
all_spans = {
    fold_name: [tp.iob_to_spans(doc_tokens) for doc_tokens in fold_docs]
    for fold_name, fold_docs in corpus_raw.items()
}

In [166]:
# Lift the 60-row cap so that dataframes are displayed in full
pd.set_option("display.max_rows", None)

In [266]:
# Choose the document to inspect: `fold` is a key of `all_spans`, and
# `doc_offset` is the position of the document within that fold's list.
fold = "test"
doc_offset = 223
# Dataframe of entity spans for the selected document
doc_df = all_spans[fold][doc_offset]
doc_df

Unnamed: 0,token_span,ent_type
0,"[11, 14): 'NHL'",ORG
1,"[45, 53): 'NEW YORK'",LOC
2,"[76, 91): 'National Hockey'",ORG
3,"[92, 98): 'League'",ORG
4,"[136, 146): 'NY RANGERS'",ORG
5,"[149, 156): 'Toronto'",ORG
6,"[159, 166): 'BUFFALO'",ORG
7,"[169, 176): 'Anaheim'",ORG
8,"[184, 194): 'Pittsburgh'",ORG
9,"[197, 207): 'WASHINGTON'",ORG


In [267]:
# Display the values of the "token_span" column on their own; the output below
# lists begin, end, begin_token, end_token and covered_text for each span.
doc_df["token_span"].values

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,11,14,1,2,NHL
1,45,53,9,11,NEW YORK
2,76,91,14,16,National Hockey
3,92,98,16,17,League
4,136,146,27,29,NY RANGERS
5,149,156,30,31,Toronto
6,159,166,32,33,BUFFALO
7,169,176,34,35,Anaheim
8,184,194,39,40,Pittsburgh
9,197,207,41,42,WASHINGTON
