In [1]:
import os
import sys
import numpy as np
import pandas as pd
import json
import feather

# And of course we need the text_extensions_for_pandas library itself.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("notebooks"):
        raise e
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp

In [5]:
# Set up file locations and download the sample data if necessary.
BASE_DIR = 'CoNLL_u_test_inputs/'
FEATHER_FILE = "conllu_database.feather"

# NOTE: this has to be the *raw* file URL. The "github.com/<repo>/blob/..." form
# serves GitHub's HTML viewer page, not the CoNLL-U text itself. The trailing
# file name is unchanged from the blob URL.
# If an earlier run already saved an HTML page under this file name, delete it
# so that the real data gets fetched (assumes the helper skips existing files,
# as the comment below states -- TODO confirm).
ewt_file_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu"
conll_09_test_data_url =  'https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip'

# Download the files if they have not already been downloaded.
conll_09_path = tp.io.conll.maybe_download_dataset_data(BASE_DIR, conll_09_test_data_url)
conllu_ewt_path = tp.io.conll.maybe_download_dataset_data(BASE_DIR, ewt_file_url)

# If you already have access to the full CoNLL 2009 dataset, name the file
# accordingly and uncomment this line.
# conll_09_path = BASE_DIR + 'CoNLL2009-ST-evaluation-English.conllu'

In [17]:
# Load two quite different corpora that both use the CoNLL-U style file format.

# The reader looks for EWT-style column names by default, so the column layout
# of this particular CoNLL-09 file has to be spelled out explicitly.
conll_09_cols = [
    "LEMMA", "PLEMMA",
    'POS', 'PPOS',
    'FEAT', 'PFEAT',
    'head', 'phead',
    'DEPREL', 'PDEPREL',
    'FILLPRED', 'PRED',
]

conllu_09_docs = tp.io.conll.conll_u_to_dataframes(
    conll_09_path,
    column_names=conll_09_cols,
)
# Keep only the columns of interest from the first document, then preview it.
conllu_09_doc = conllu_09_docs[0].drop(
    columns=[
        "PLEMMA", 'PPOS', 'PFEAT', 'phead', 'PDEPREL', 'FILLPRED',
        'sentence', 'line_num',
    ]
)
display(conllu_09_doc.head())


# The same function also reads an EWT-style file; no column names are passed
# here, so the defaults are used.
conll_u_docs = tp.io.conll.conll_u_to_dataframes(conllu_ewt_path)
# Pick one document to preview.
DOC_NUM = 0
doc_df = conll_u_docs[DOC_NUM]
# Show the first ten tokens; the "sentence" column is dropped for brevity.
doc_df.head(10).drop(columns=["sentence"])

Unnamed: 0,span,LEMMA,POS,FEAT,head,DEPREL,PRED,predicate,pred0arg,pred1arg,pred2arg,pred3arg,pred4arg,pred5arg,pred6arg,pred7arg,pred8arg,pred9arg,pred10arg
0,"[0, 3): 'The'",the,DT,,1.0,NMOD,,,,,,,,,,,,,
1,"[4, 11): 'economy'",economy,NN,,3.0,NMOD,,A1,,,,,,,,,,,
2,"[11, 13): ''s'",'s,POS,,1.0,SUFFIX,,,,,,,,,,,,,
3,"[14, 25): 'temperature'",temperature,NN,,4.0,SBJ,temperature.01,A2,A1,,,,,,,,,,
4,"[26, 30): 'will'",will,MD,,,ROOT,,,AM-MOD,,,,,,,,,,


Unnamed: 0,span,lemma,upostag,xpostag,features,head,deprel,deps,misc,sentence_id,paragraph_id,doc_id,line_num
0,"[0, 4): 'From'",from,ADP,IN,,2.0,case,3:case,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,4
1,"[5, 8): 'the'",the,DET,DT,Definite=Def|PronType=Art,2.0,det,3:det,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,5
2,"[9, 11): 'AP'",AP,PROPN,NNP,Number=Sing,3.0,obl,4:obl:from,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,6
3,"[12, 17): 'comes'",come,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,,root,0:root,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,7
4,"[18, 22): 'this'",this,DET,DT,Number=Sing|PronType=Dem,5.0,det,6:det,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,8
5,"[23, 28): 'story'",story,NOUN,NN,Number=Sing,3.0,nsubj,4:nsubj,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,9
6,"[28, 29): ':'",:,PUNCT,:,,3.0,punct,4:punct,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,10
7,"[30, 39): 'President'",President,PROPN,NNP,Number=Sing,11.0,nsubj,5:nsubj,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,15
8,"[40, 44): 'Bush'",Bush,PROPN,NNP,Number=Sing,7.0,flat,1:flat,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,16
9,"[45, 47): 'on'",on,ADP,IN,,10.0,case,4:case,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,17


In [18]:
# First, combine all of the documents into one large dataframe, for simplicity.

# Because the dataframes are being concatenated, the "head" fields must be
# shifted by the number of rows that precede each document so that they still
# point at their intended targets in the combined frame.
#
# Shifted *copies* are built with `assign` instead of applying `+=` to the
# frames held in `conll_u_docs`: `list.copy()` is shallow, so an in-place update
# would silently modify the original documents, and re-running this cell would
# shift the heads a second time.
df_starts_at = 0
temp = []
for df in conll_u_docs:
    temp.append(df.assign(head=df["head"] + df_starts_at))
    df_starts_at += df.shape[0]

# Now concatenate all the documents into one big dataframe with a fresh index.
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
complete_df = pd.concat(temp, ignore_index=True)

# Show the total row count and the last few rows, restricted to a few columns
# for compactness.
display(complete_df.shape[0])
complete_df.tail()[["span","lemma","upostag","head","deprel"]]

25151

Unnamed: 0,span,lemma,upostag,head,deprel
25146,"[251, 254): 'and'",and,CCONJ,25150.0,cc
25147,"[255, 256): 'a'",a,DET,25150.0,det
25148,"[257, 261): 'very'",very,ADV,25149.0,advmod
25149,"[262, 275): 'knowledgeable'",knowledgeable,ADJ,25150.0,amod
25150,"[276, 281): 'staff'",staff,NOUN,25145.0,conj


In [19]:
# One advantage of using pandas dataframes is that they can be written and read
# significantly faster than the raw conllu files. Here the combined dataframe
# is saved with feather (backed by pyarrow) so it can be reloaded later.

# Writing multi-document files is currently not supported, so as a workaround
# the "sentence" column is converted from a TokenSpanArray to a SpanArray over
# the same target text as the "span" column.
sentence_spans = complete_df["sentence"].array
complete_df["sentence"] = tp.SpanArray(
    complete_df["span"].array.target_text,
    sentence_spans.begin,
    sentence_spans.end,
)

# Finally, write the dataframe to disk using feather.
path = BASE_DIR + FEATHER_FILE
# The chunksize is increased slightly to allow writing in a single block.
feather.write_dataframe(
    complete_df,
    path,
    chunksize=65536 * 8,
)
print(f"File written to {path}")

File written to CoNLL_u_test_inputs/conllu_database.feather


In [20]:
# Load the dataframe back from the feather file; it can be used just as before.
re_read_df = feather.read_dataframe(path)
print(f"size is {re_read_df.size}")
# Show the same columns of the last few rows as were shown for `complete_df`.
re_read_df[["span","lemma","upostag","head","deprel"]].tail()

size is 352114


Unnamed: 0,span,lemma,upostag,head,deprel
25146,"[251, 254): 'and'",and,CCONJ,25150.0,cc
25147,"[255, 256): 'a'",a,DET,25150.0,det
25148,"[257, 261): 'very'",very,ADV,25149.0,advmod
25149,"[262, 275): 'knowledgeable'",knowledgeable,ADJ,25150.0,amod
25150,"[276, 281): 'staff'",staff,NOUN,25145.0,conj


In [21]:
# Thanks to the support built into Text Extensions for Pandas, powerful data
# visualization tools can be used directly on these dataframes. Here spaCy's
# dependency tree visualization is used to show the parse tree as specified in
# the raw conllu file. Other integrations with spaCy, as well as with other
# packages, are possible.

Sentence_num = 110
# Use pandas to select the n-th sentence of the dataset: group the rows by
# sentence id (sort=False keeps the groups in order of first appearance) and
# take the group at position `Sentence_num`. Despite its name,
# `first_sentence` holds that n-th sentence, not sentence 0.
sentence_key, first_sentence = list(
    re_read_df.groupby("sentence_id", sort=False)
)[Sentence_num]
# Display the selected sentence's tokens, followed by its parse tree.
display(
    first_sentence[
        ["span", "lemma", "upostag", "xpostag", "head", "deprel", "sentence"]
    ]
)
tp.io.spacy.render_parse_tree(
    first_sentence,
    tag_col="upostag",
    label_col="deprel",
    head_col="head",
)

Unnamed: 0,span,lemma,upostag,xpostag,head,deprel,sentence
2510,"[979, 982): 'And'",and,CCONJ,CC,2514.0,cc,"[979, 1014): 'And what do we get for this effo..."
2511,"[983, 987): 'what'",what,PRON,WP,2514.0,obj,"[979, 1014): 'And what do we get for this effo..."
2512,"[988, 990): 'do'",do,AUX,VBP,2514.0,aux,"[979, 1014): 'And what do we get for this effo..."
2513,"[991, 993): 'we'",we,PRON,PRP,2514.0,nsubj,"[979, 1014): 'And what do we get for this effo..."
2514,"[994, 997): 'get'",get,VERB,VB,,root,"[979, 1014): 'And what do we get for this effo..."
2515,"[998, 1001): 'for'",for,ADP,IN,2517.0,case,"[979, 1014): 'And what do we get for this effo..."
2516,"[1002, 1006): 'this'",this,DET,DT,2517.0,det,"[979, 1014): 'And what do we get for this effo..."
2517,"[1007, 1013): 'effort'",effort,NOUN,NN,2514.0,obl,"[979, 1014): 'And what do we get for this effo..."
2518,"[1013, 1014): '?'",?,PUNCT,.,2514.0,punct,"[979, 1014): 'And what do we get for this effo..."
