# CoNLL_View_Doc.ipynb

Notebook for viewing individual documents from the CoNLL-2003 corpus, for use
alongside the other CoNLL-related notebooks in this directory.

In [1]:
# Libraries
import os
import sys
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
# If it is missing, replace the bare import failure with setup instructions.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # Chain explicitly (`from e`) so the traceback shows the original import
    # failure as the direct cause of this more helpful error.
    raise Exception("text_extensions_for_pandas package not found on the Jupyter "
                    "kernel's path. Please either run:\n"
                    "   ln -s ../../text_extensions_for_pandas .\n"
                    "from the directory containing this notebook, or use a Python "
                    "environment on which you have used `pip` to install the package.") from e


In [2]:
# Download the data set into the local "outputs" directory and cache it there
# (presumably the download is skipped when the files are already present --
# confirm against `maybe_download_conll_data`).
# NOTE: This data set is licensed for research use only. Be sure to adhere
#  to the terms of the license when using this data set!
# The result maps each fold name to the path of that fold's file.
data_set_info = tp.io.conll.maybe_download_conll_data("outputs")
# Display the fold name -> file path mapping.
data_set_info

{'train': 'outputs/eng.train',
 'dev': 'outputs/eng.testa',
 'test': 'outputs/eng.testb'}

In [3]:
# Load each fold of the corpus in its original tokenization: one dataframe of
# tokens per document, grouped by fold name. The part-of-speech and phrase
# (chunk) columns are not needed for viewing entities, so they are dropped.
# The boolean flags line up with the column names; they appear to mark which
# columns carry IOB tags (those come back as `<name>_iob` / `<name>_type`).
corpus_raw = {
    fold_name: [
        doc_toks.drop(columns=["pos", "phrase_iob", "phrase_type"])
        for doc_toks in tp.io.conll.conll_2003_to_dataframes(
            file_name, ["pos", "phrase", "ent"], [False, True, True]
        )
    ]
    for fold_name, file_name in data_set_info.items()
}


In [4]:
# Turn the IOB2-tagged tokens of every document into a dataframe of spans,
# keeping the same fold name -> list-of-documents layout as `corpus_raw`.
all_spans = {}
for fold_key, fold_docs in corpus_raw.items():
    all_spans[fold_key] = [tp.io.conll.iob_to_spans(toks) for toks in fold_docs]

In [5]:
# Lift pandas' 60-row display limit so that dataframes are shown in full.
pd.set_option("display.max_rows", None)

In [6]:
# Choose the document to inspect: `fold` is the corpus split (a key of
# `all_spans`, i.e. "train", "dev" or "test") and `doc_offset` is the position
# of the document within that split's list of documents.
fold = "train"
doc_offset = 943
# Spans dataframe of the selected document.
doc_df = all_spans[fold][doc_offset]
doc_df

Unnamed: 0,span,ent_type
0,"[25, 35): 'SAN MARINO'",LOC
1,"[36, 46): 'GRAND PRIX'",MISC
2,"[63, 68): 'IMOLA'",LOC
3,"[70, 75): 'Italy'",LOC
4,"[129, 139): 'San Marino'",LOC
5,"[159, 169): 'Grand Prix'",MISC
6,"[174, 188): 'Michael Doohan'",PER
7,"[190, 199): 'Australia'",LOC
8,"[201, 206): 'Honda'",ORG
9,"[228, 245): 'Jean-Michel Bayle'",PER


In [7]:
# Display the array backing the "span" column on its own.
# Disabled display toggle, kept for experimentation (presumably hides the
# begin/end offsets in the HTML rendering -- TODO confirm before enabling):
#doc_df["span"].values.repr_html_show_offsets = False
doc_df["span"].values

Unnamed: 0,begin,end,context
0,25,35,SAN MARINO
1,36,46,GRAND PRIX
2,63,68,IMOLA
3,70,75,Italy
4,129,139,San Marino
5,159,169,Grand Prix
6,174,188,Michael Doohan
7,190,199,Australia
8,201,206,Honda
9,228,245,Jean-Michel Bayle


In [8]:
# Token-level dataframe for the document selected by `fold` / `doc_offset`,
# used to look up the character offsets of individual tokens.
toks_df = corpus_raw[fold][doc_offset]
toks_df

Unnamed: 0,span,ent_iob,ent_type,sentence,line_num
0,"[0, 10): '-DOCSTART-'",O,,"[0, 10): '-DOCSTART-'",219100
1,"[11, 23): 'MOTORCYCLING'",O,,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219102
2,"[23, 24): '-'",O,,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219103
3,"[25, 28): 'SAN'",B,LOC,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219104
4,"[29, 35): 'MARINO'",I,LOC,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219105
5,"[36, 41): 'GRAND'",B,MISC,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219106
6,"[42, 46): 'PRIX'",I,MISC,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219107
7,"[47, 55): 'PRACTICE'",O,,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219108
8,"[56, 61): 'TIMES'",O,,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219109
9,"[61, 62): '.'",O,,"[11, 62): 'MOTORCYCLING- SAN MARINO GRAND PRIX...",219110


###### 