# CoNLL_View_Doc.ipynb

Notebook for viewing individual documents from the CoNLL-2003 corpus, for use
alongside the other CoNLL-related notebooks in this directory.

In [1]:
# The Jupyter kernel for this notebook usually starts up inside the notebooks
# directory, but the text_extensions_for_pandas package code is in the parent
# directory. Add that parent directory to the front of the Python include path.
import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

# Libraries
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
import text_extensions_for_pandas as tp

# Common code shared across notebooks comes from util.py
import util

In [2]:
# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
#  to the terms of the license when using this data set!
# As this cell's output shows, `data_set_info` maps each fold name
# ("train", "dev", "test") to the path of that fold's file.
data_set_info = util.get_conll_data()
# Bare expression on the last line so that the notebook displays the mapping.
data_set_info

{'train': 'outputs/eng.train',
 'dev': 'outputs/eng.testa',
 'test': 'outputs/eng.testb'}

In [3]:
# Load every fold of the corpus, keeping the corpus's original tokenization.
# Only the entity tags are needed here, so the part-of-speech and phrase
# columns are discarded from every dataframe.
columns_to_drop = ["pos", "phrase_iob", "phrase_type"]
corpus_raw = {}
for fold_name, file_name in data_set_info.items():
    # `df_list` holds the dataframes parsed from this fold's file. Judging by
    # the resulting column names, the tags flagged True in the second list
    # ("phrase" and "ent") come back as `<name>_iob` / `<name>_type` column
    # pairs, while "pos" stays a single column.
    df_list = tp.conll_2003_to_dataframes(
        file_name,
        ["pos", "phrase", "ent"],
        [False, True, True],
    )
    corpus_raw[fold_name] = [
        df.drop(columns=columns_to_drop) for df in df_list
    ]


In [4]:
# For every fold, turn each document's IOB2-tagged tokens into a dataframe
# of entity spans. The result has the same keys and list order as corpus_raw.
all_spans = {
    fold_key: list(map(tp.iob_to_spans, fold_docs))
    for fold_key, fold_docs in corpus_raw.items()
}

In [5]:
# Show every row when a dataframe is displayed, instead of truncating the
# output at pandas' default limit of 60 rows.
pd.set_option("display.max_rows", None)

In [6]:
# Choose the document to view. `fold` must be one of the keys of `all_spans`
# ("train", "dev" or "test"); `doc_offset` is the document's position within
# that fold's list.
fold = "dev"
doc_offset = 28
# Entity spans of the selected document.
doc_df = all_spans[fold][doc_offset]
doc_df

Unnamed: 0,token_span,ent_type
0,"[20, 29): 'AUSTRALIA'",LOC
1,"[58, 65): 'COLOMBO'",LOC
2,"[77, 86): 'Australia'",LOC
3,"[127, 136): 'Sri Lanka'",LOC
4,"[195, 201): 'Singer'",MISC
5,"[244, 253): 'Australia'",LOC
6,"[255, 264): 'Ian Healy'",PER
7,"[276, 289): 'Michael Bevan'",PER
8,"[291, 297): 'Damien'",PER
9,"[298, 306): 'Flemming'",PER


In [7]:
# Switch off the `repr_html_show_offsets` flag on the array behind the
# "token_span" column, then display that array.
doc_spans = doc_df["token_span"].values
doc_spans.repr_html_show_offsets = False
doc_spans

In [8]:
# Dataframe of tokens for finding offsets
# Uses the same `fold` and `doc_offset` as `doc_df`, but reads from
# `corpus_raw`, so it has one row per token rather than one row per span.
toks_df = corpus_raw[fold][doc_offset]
toks_df

Unnamed: 0,char_span,token_span,ent_iob,ent_type,sentence,line_num
0,"[0, 10): '-DOCSTART-'","[0, 10): '-DOCSTART-'",O,,"[0, 10): '-DOCSTART-'",7780
1,"[11, 18): 'CRICKET'","[11, 18): 'CRICKET'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7782
2,"[18, 19): '-'","[18, 19): '-'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7783
3,"[20, 29): 'AUSTRALIA'","[20, 29): 'AUSTRALIA'",B,LOC,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7784
4,"[30, 33): 'WIN'","[30, 33): 'WIN'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7785
5,"[34, 38): 'TOSS'","[34, 38): 'TOSS'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7786
6,"[39, 42): 'AND'","[39, 42): 'AND'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7787
7,"[43, 49): 'CHOOSE'","[43, 49): 'CHOOSE'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7788
8,"[50, 52): 'TO'","[50, 52): 'TO'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7789
9,"[53, 56): 'BAT'","[53, 56): 'BAT'",O,,"[11, 57): 'CRICKET- AUSTRALIA WIN TOSS AND CHO...",7790


###### 