# Data_Formats.ipynb

Demonstration of the differences in file size and read time between different file formats for NLP corpora.

In [1]:
import os
import sys
import pandas as pd
import spacy
import pathlib

# text_extensions_for_pandas is the library this notebook demonstrates.
# Prefer an installed copy; otherwise fall back to the local source tree.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as import_error:
    # The fallback is only attempted when the working directory is the
    # project's "notebooks" directory, so that ".." points at the source
    # tree holding the package. Anywhere else, surface the original error.
    launched_from_notebooks_dir = os.getcwd().endswith("notebooks")
    if not launched_from_notebooks_dir:
        raise import_error
    # Put the parent directory first on the import path (once) and retry.
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp

In [2]:
# Download the CoNLL-2003 data set if necessary
# The files are placed under the "outputs" directory. The returned value maps
# each fold name ("train", "dev", "test") to the path of that fold's file.
data_set_info = tp.io.conll.maybe_download_conll_data("outputs")
# Display the mapping so the reader can see where each fold lives.
data_set_info

{'train': 'outputs/eng.train',
 'dev': 'outputs/eng.testa',
 'test': 'outputs/eng.testb'}

In [3]:
# Baseline measurement: the on-disk size of the training fold in its
# original CoNLL format.
train_fold_conll = data_set_info["train"]
conll_bytes = os.path.getsize(train_fold_conll)

f"Size of training fold in CoNLL format: {conll_bytes // 1024} kib"

'Size of training fold in CoNLL format: 3206 kib'

In [4]:
%%time

# Parse the CoNLL file into one DataFrame per document.
# Besides the token itself, each line carries three tag columns; the flags
# below mark which of them are IOB-encoded (those come back split into
# separate "<name>_iob" and "<name>_type" columns).
conll_column_names = ["pos", "phrase", "ent"]
conll_iob_flags = [False, True, True]
train_fold_dfs = tp.io.conll.conll_2003_to_dataframes(
    train_fold_conll,
    column_names=conll_column_names,
    iob_columns=conll_iob_flags,
)

CPU times: user 1.62 s, sys: 23.2 ms, total: 1.65 s
Wall time: 1.65 s


In [5]:
# Workaround: serialization of multi-doc token span arrays is not yet
# implemented, so rebuild each document's "sentence" column as a plain
# SpanArray over the same target text, keeping the begin/end offsets.
for df in train_fold_dfs:
    token_spans = df["span"].array
    sentence_spans = df["sentence"].array
    df["sentence"] = tp.SpanArray(
        token_spans.target_text,
        sentence_spans.begin,
        sentence_spans.end,
    )

In [6]:
# Stack the per-document DataFrames into a single frame for the whole fold,
# renumbering the rows 0..n-1 instead of keeping each document's own index.
train_fold = pd.concat(train_fold_dfs, ignore_index=True)
train_fold

Unnamed: 0,span,pos,phrase_iob,phrase_type,ent_iob,ent_type,sentence,line_num
0,"[0, 10): '-DOCSTART-'",-X-,O,,O,,"[0, 10): '-DOCSTART-'",0
1,"[11, 13): 'EU'",NNP,B,NP,B,ORG,"[11, 58): 'EU rejects German call to boycott B...",2
2,"[14, 21): 'rejects'",VBZ,B,VP,O,,"[11, 58): 'EU rejects German call to boycott B...",3
3,"[22, 28): 'German'",JJ,B,NP,B,MISC,"[11, 58): 'EU rejects German call to boycott B...",4
4,"[29, 33): 'call'",NN,I,NP,O,,"[11, 58): 'EU rejects German call to boycott B...",5
...,...,...,...,...,...,...,...,...
204562,"[149, 154): 'three'",CD,I,NP,O,,"[140, 154): 'Division three'",219547
204563,"[155, 162): 'Swansea'",NN,B,NP,B,ORG,"[155, 174): 'Swansea 1 Lincoln 2'",219549
204564,"[163, 164): '1'",CD,I,NP,O,,"[155, 174): 'Swansea 1 Lincoln 2'",219550
204565,"[165, 172): 'Lincoln'",NNP,I,NP,B,ORG,"[155, 174): 'Swansea 1 Lincoln 2'",219551


In [7]:
# Write out the training fold as a Feather file
# The path is kept in a variable so the file's size can be measured afterwards.
train_fold_feather = "outputs/eng.train.feather"
train_fold.to_feather(train_fold_feather)

In [8]:
# Measure the on-disk footprint of the Feather file, for comparison with the
# original CoNLL file.
feather_bytes = os.path.getsize(train_fold_feather)

f"Size of training fold in Feather format: {feather_bytes // 1024} kib"

'Size of training fold in Feather format: 7449 kib'

In [9]:
# Write out the training fold as a Parquet file
# The path is kept in a variable so the file can be measured and read back.
train_fold_parquet = "outputs/eng.train.parquet"
train_fold.to_parquet(train_fold_parquet)

In [10]:
# Measure the on-disk footprint of the Parquet file, for comparison with the
# CoNLL and Feather versions.
parquet_bytes = os.path.getsize(train_fold_parquet)

f"Size of training fold in Parquet format: {parquet_bytes // 1024} kib"

'Size of training fold in Parquet format: 3575 kib'

In [11]:
%%time

# Read the Parquet file back in (timed by the %%time magic on this cell).
# NOTE(review): this comment previously said "Feather", but the call below
# reads the Parquet file; the Feather file is never read back in this notebook.
pd.read_parquet(train_fold_parquet)

CPU times: user 35.5 ms, sys: 8.72 ms, total: 44.2 ms
Wall time: 28.6 ms


Unnamed: 0,span,pos,phrase_iob,phrase_type,ent_iob,ent_type,sentence,line_num
0,"[0, 10): '-DOCSTART-'",-X-,O,,O,,"[0, 10): '-DOCSTART-'",0
1,"[11, 13): 'EU'",NNP,B,NP,B,ORG,"[11, 58): 'EU rejects German call to boycott B...",2
2,"[14, 21): 'rejects'",VBZ,B,VP,O,,"[11, 58): 'EU rejects German call to boycott B...",3
3,"[22, 28): 'German'",JJ,B,NP,B,MISC,"[11, 58): 'EU rejects German call to boycott B...",4
4,"[29, 33): 'call'",NN,I,NP,O,,"[11, 58): 'EU rejects German call to boycott B...",5
...,...,...,...,...,...,...,...,...
204562,"[149, 154): 'three'",CD,I,NP,O,,"[140, 154): 'Division three'",219547
204563,"[155, 162): 'Swansea'",NN,B,NP,B,ORG,"[155, 174): 'Swansea 1 Lincoln 2'",219549
204564,"[163, 164): '1'",CD,I,NP,O,,"[155, 174): 'Swansea 1 Lincoln 2'",219550
204565,"[165, 172): 'Lincoln'",NNP,I,NP,B,ORG,"[155, 174): 'Swansea 1 Lincoln 2'",219551


In [12]:
# SpaCy's converter is picky about how document boundaries are formatted.
# Write a copy of the training fold in which every line starting with
# "-DOCSTART-" is replaced by the exact delimiter line SpaCy expects; all
# other lines are copied through unchanged.
SPACY_DOC_DELIMITER = "-DOCSTART- -X- O O"
train_fold_conll_spacy = "outputs/eng.train2.conll"

with open(train_fold_conll, "r") as f:
    lines = f.readlines()

with open(train_fold_conll_spacy, "w") as f:
    f.writelines(
        SPACY_DOC_DELIMITER + "\n" if line.startswith("-DOCSTART-") else line
        for line in lines
    )

In [13]:
%%time

# Load the SpaCy-friendly copy of the training fold as a single string.
with open(train_fold_conll_spacy) as f:
    file_contents = f.read()

# Parse the text with SpaCy's CoNLL NER converter. The converter returns a
# generator, so materialize it into a list to make the timing above cover
# the full parse rather than just the generator's creation.
convert_conll_ner = spacy.training.converters.conll_ner_to_docs
training_docs_generator = convert_conll_ner(
    file_contents, n_sents=0, seg_sents=False
)
training_docs = list(training_docs_generator)

CPU times: user 1.07 s, sys: 73.1 ms, total: 1.15 s
Wall time: 1.15 s


In [14]:
%%time

# Convert to SpaCy binary
# Runs the `spacy convert` command-line tool on the rewritten CoNLL file and
# writes the result into the outputs directory (outputs/eng.train2.spacy).
# NOTE(review): `python` here is resolved by the shell, which may not be the
# interpreter the notebook kernel runs on -- confirm when reproducing.
! python -m spacy convert outputs/eng.train2.conll outputs

[38;5;3m⚠ Document delimiters found, automatic document segmentation with `-n`
disabled.[0m
[38;5;2m✔ Generated output file (946 documents): outputs/eng.train2.spacy[0m
CPU times: user 13.2 ms, sys: 21.6 ms, total: 34.8 ms
Wall time: 2.88 s


In [15]:
%%time

# Convert to SpaCy JSON
# Same conversion as the binary case, but `--file-type json` selects JSON
# output (outputs/eng.train2.json).
# NOTE(review): `python` here is resolved by the shell, which may not be the
# interpreter the notebook kernel runs on -- confirm when reproducing.
! python -m spacy convert outputs/eng.train2.conll outputs --file-type json

[38;5;3m⚠ Document delimiters found, automatic document segmentation with `-n`
disabled.[0m
[38;5;2m✔ Generated output file (1 documents): outputs/eng.train2.json[0m
CPU times: user 11 ms, sys: 19.2 ms, total: 30.1 ms
Wall time: 2.78 s


In [16]:
%%time 

# Read the SpaCy binary file produced by `spacy convert` back in.
train_fold_spacy = "outputs/eng.train2.spacy"

# The corpus is called with a pipeline object; a blank English pipeline is
# used here.
nlp = spacy.blank("en")
corpus = spacy.training.Corpus(train_fold_spacy)

# Calling the corpus returns a generator; drain it into a list so that the
# timing covers actually reading the data.
train_data_generator = corpus(nlp)
train_data = list(train_data_generator)

CPU times: user 782 ms, sys: 24.8 ms, total: 807 ms
Wall time: 806 ms


In [17]:
# Read the training fold with nltk
import nltk

In [18]:
# NLTK reader needs the corpus to be an entire directory.
# Create a dedicated directory (no error if it already exists, thanks to -p)
# and copy the original CoNLL training fold into it as "eng.train".
!mkdir -p outputs/eng.train.nltk
!cp outputs/eng.train outputs/eng.train.nltk

In [19]:
%%time
# Read the copied training fold with NLTK's CoNLL corpus reader.
# NLTK resolves file ids relative to `root`, so the reader must be given the
# bare name of the file inside outputs/eng.train.nltk. Registering the
# "outputs/eng.train" path held in train_fold_conll would point at a file that
# does not exist under the root, and the reader's default file list would
# then be unusable.
NLTK_FILEID = "eng.train"
reader = nltk.corpus.reader.conll.ConllCorpusReader(
    root="outputs/eng.train.nltk",
    fileids=[NLTK_FILEID],
    # One entry per column of the file; the third column is skipped.
    columntypes=["words", "pos", "ignore", "ne"],
)
# The reader's views are materialized into lists so that the timing covers
# reading the whole file.
tagged_words = list(reader.tagged_words(NLTK_FILEID))
sentences = list(reader.sents(NLTK_FILEID))

CPU times: user 881 ms, sys: 115 ms, total: 996 ms
Wall time: 993 ms


In [20]:
# Peek at the reader's output: a sequence of (word, part-of-speech tag) pairs.
reader.tagged_words("eng.train")

[('EU', 'NNP'), ('rejects', 'VBZ'), ('German', 'JJ'), ...]