# CoNLL_View_Doc.ipynb

Notebook for viewing individual documents from the CoNLL-2003 corpus, for use
alongside the other CoNLL-related notebooks in this directory.

In [2]:
# Libraries
import os
import sys
import numpy as np
import pandas as pd

# Import text_extensions_for_pandas, preferring whatever copy the current
# Python environment provides and falling back to the source tree located at
# PROJECT_ROOT (relative to the working directory) when it is not installed.
PROJECT_ROOT = "../.."
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as import_error:
    if os.getcwd().endswith("corpus"):
        # The working directory path ends in "corpus", which is taken to mean
        # we are running from within the project source tree: make
        # PROJECT_ROOT importable (only once) and retry the import.
        if PROJECT_ROOT not in sys.path:
            sys.path.insert(0, PROJECT_ROOT)
        import text_extensions_for_pandas as tp
    else:
        # Running from somewhere else; surface the original import failure.
        raise import_error

# Code shared among notebooks is kept in util.py, in this directory.
import util

In [5]:
# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
#  to the terms of the license when using this data set!
# The call is given the directory name "outputs". Per the saved output of this
# cell, the result maps the fold names 'train', 'dev' and 'test' to file paths
# under that directory.
data_set_info = tp.io.conll.maybe_download_conll_data("outputs")
# Display the mapping as the cell's output.
data_set_info

{'train': 'outputs/eng.train',
 'dev': 'outputs/eng.testa',
 'test': 'outputs/eng.testb'}

In [7]:
# The raw dataset in its original tokenization, keyed by fold name.
# Every file listed in `data_set_info` is parsed into a list of DataFrames,
# and the "pos", "phrase_iob" and "phrase_type" columns are dropped from each
# DataFrame before it is stored.
corpus_raw = {
    fold_name: [
        doc_tokens.drop(columns=["pos", "phrase_iob", "phrase_type"])
        for doc_tokens in tp.io.conll.conll_2003_to_dataframes(
            file_name, ["pos", "phrase", "ent"], [False, True, True])
    ]
    for fold_name, file_name in data_set_info.items()
}


In [9]:
# Convert IOB2-tagged tokens to spans: one converted DataFrame per entry of
# `corpus_raw`, kept under the same fold name and in the same order.
all_spans = {
    fold_name: [
        tp.io.conll.iob_to_spans(doc_tokens) for doc_tokens in fold_docs
    ]
    for fold_name, fold_docs in corpus_raw.items()
}

In [376]:
# Remove the 60-row cap on displayed DataFrames so that whole documents are
# shown without truncation.
pd.set_option("display.max_rows", None)

In [1701]:
# Choose the document to view. `fold` is a key of `all_spans` and `doc_offset`
# is the position of the document within that fold's list. Both names are
# reused by the token-table cell further down.
fold = "train"
doc_offset = 943
doc_df = all_spans[fold][doc_offset]
# Display the spans of the selected document.
doc_df

Unnamed: 0,span,ent_type
0,"[25, 35): 'SAN MARINO'",LOC
1,"[36, 46): 'GRAND PRIX'",MISC
2,"[63, 68): 'IMOLA'",LOC
3,"[70, 75): 'Italy'",LOC
4,"[129, 139): 'San Marino'",LOC
5,"[159, 169): 'Grand Prix'",MISC
6,"[174, 188): 'Michael Doohan'",PER
7,"[190, 199): 'Australia'",LOC
8,"[201, 206): 'Honda'",ORG
9,"[228, 245): 'Jean-Michel Bayle'",PER


In [1702]:
# Set the `repr_html_show_offsets` flag of the span column's backing array to
# False (judging by the name, this hides offsets in the HTML rendering -- TODO
# confirm), then display that same array.
doc_span_array = doc_df["span"].values
doc_span_array.repr_html_show_offsets = False
doc_span_array

In [1668]:
# Dataframe of tokens for finding offsets
# Looks up the same `fold` / `doc_offset` as `doc_df`, but in `corpus_raw`
# (the token-level DataFrames) rather than in `all_spans`.
# NOTE(review): the saved output of this cell shows different text than the
#  saved span table for `doc_df`, and the execution counts are out of order;
#  re-run the notebook top to bottom to refresh the outputs.
toks_df = corpus_raw[fold][doc_offset]
# Display the token table.
toks_df

Unnamed: 0,span,ent_iob,ent_type,sentence,line_num
0,"[0, 10): '-DOCSTART-'",O,,"[0, 10): '-DOCSTART-'",192363
1,"[11, 19): 'BASEBALL'",O,,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192365
2,"[19, 20): '-'",O,,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192366
3,"[21, 26): 'MAJOR'",B,MISC,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192367
4,"[27, 33): 'LEAGUE'",I,MISC,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192368
5,"[34, 41): 'RESULTS'",O,,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192369
6,"[42, 51): 'WEDNESDAY'",O,,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192370
7,"[51, 52): '.'",O,,"[11, 52): 'BASEBALL- MAJOR LEAGUE RESULTS WEDN...",192371
8,"[53, 56): 'NEW'",B,LOC,"[53, 72): 'NEW YORK 1996-08-29'",192373
9,"[57, 61): 'YORK'",I,LOC,"[53, 72): 'NEW YORK 1996-08-29'",192374


###### 