# CoNLL_View_Doc.ipynb

Notebook for viewing individual documents from the CoNLL corpus, for use
alongside the other CoNLL-related notebooks in this directory.

In [None]:
# Libraries
import os
import sys
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
# A failed import is converted into an error whose message explains the two
# supported ways of making the package visible to the kernel.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # ModuleNotFoundError is also raised when one of the package's own
    # dependencies is missing, so chain the original error explicitly: the
    # traceback then still names the module that could not be found.
    raise Exception("text_extensions_for_pandas package not found on the Jupyter "
                    "kernel's path. Please either run:\n"
                    "   ln -s ../../text_extensions_for_pandas .\n"
                    "from the directory containing this notebook, or use a Python "
                    "environment on which you have used `pip` to install the package.") from e


In [None]:
# Download and cache the data set.
# NOTE: This data set is licensed for research use only. Be sure to adhere
#  to the terms of the license when using this data set!
# The "outputs" argument is presumably the local directory used as the
# download cache -- TODO confirm against the library documentation.
# The returned object is consumed below as (fold name, file name) pairs.
data_set_info = tp.io.conll.maybe_download_conll_data("outputs")
# Bare expression at the end of the cell: shows the value as the cell output.
data_set_info

In [None]:
# Parse every fold of the corpus, keeping its original tokenization.
# corpus_raw maps each fold name to a list of dataframes; the cells below
# index that list by document offset.
corpus_raw = {}
for fold_name, file_name in data_set_info.items():
    # Three tag columns are read from the file. The boolean list lines up
    # with the names: "phrase" and "ent" are flagged, "pos" is not
    # (presumably the flag marks IOB-encoded columns -- confirm in the
    # library documentation).
    df_list = tp.io.conll.conll_2003_to_dataframes(
        file_name,
        ["pos", "phrase", "ent"],
        [False, True, True],
    )
    # Only the entity tags are needed from here on, so discard the
    # "pos" and "phrase_*" columns from every dataframe.
    corpus_raw[fold_name] = [
        df.drop(["pos", "phrase_iob", "phrase_type"], axis="columns")
        for df in df_list
    ]


In [None]:
# Convert IOB2-tagged tokens to spans.
# The result mirrors corpus_raw: same fold keys, and within each fold one
# span dataframe per token dataframe, in the same order.
all_spans = {
    fold_key: [tp.io.conll.iob_to_spans(tokens) for tokens in fold_docs]
    for fold_key, fold_docs in corpus_raw.items()
}

In [None]:
# Lift the 60-row cap on dataframe display so that whole documents are shown.
pd.set_option("display.max_rows", None)

In [None]:
# Choose the document to view: `fold` is a key of `all_spans`, and
# `doc_offset` is the position of the document within that fold's list.
# Edit these two values to look at a different document.
fold = "train"
doc_offset = 943
# Span dataframe for the selected document.
doc_df = all_spans[fold][doc_offset]
# Bare expression at the end of the cell: shows the dataframe as the cell output.
doc_df

In [None]:
# Show the values of the "span" column on their own.
# The commented-out line below is an optional display toggle. Judging by the
# attribute name, it presumably hides the offsets in the HTML rendering --
# TODO confirm against the library documentation before relying on it.
#doc_df["span"].values.repr_html_show_offsets = False
doc_df["span"].values

In [None]:
# Dataframe of tokens for finding offsets
# Looked up with the same `fold` and `doc_offset` as the span dataframe, so
# these rows are the tokens the spans of the selected document were built from.
toks_df = corpus_raw[fold][doc_offset]
# Bare expression at the end of the cell: shows the dataframe as the cell output.
toks_df

###### 