In [119]:
import json
import os
import sys

import feather
import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.pipeline
import transformers

# And of course we need the text_extensions_for_pandas library itself.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("notebooks"):
        raise e
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp

In [37]:
# Local locations: the directory that holds the input files, and the name of
# the Feather file written later in the notebook.
BASE_DIR = "CoNLL_u_test_inputs/"
FEATHER_FILE = "conllu_database.feather"

# Remote sources: the UD English-EWT files share a common URL prefix, and the
# CoNLL-2009 English trial data ships as a zip archive.
ewt_base_url = "https://github.com/UniversalDependencies/UD_English-EWT/blob/master/en_ewt-ud-"
ewt_dev_url = f"{ewt_base_url}dev.conllu"
conll_09_test_data_url = "https://ufal.mff.cuni.cz/conll2009-st/trial/CoNLL2009-ST-English-trial.zip"

# Fetch each file into BASE_DIR unless it has already been downloaded, and
# keep the resulting local paths.
conll_09_path = tp.io.conll.maybe_download_dataset_data(
    BASE_DIR, conll_09_test_data_url
)
conllu_ewt_path = tp.io.conll.maybe_download_dataset_data(
    BASE_DIR, ewt_dev_url
)

# If you already have access to the full CoNLL-2009 dataset, name the file
# accordingly and uncomment this line:
# conll_09_path = BASE_DIR + 'CoNLL2009-ST-evaluation-English.conllu'

In [17]:
# Load two very different corpora that are both stored in the CoNLL-U file format.

# The reader looks for EWT-style column names by default, so the CoNLL-09 file
# needs its own list of column names.
conll_09_cols = [
    "LEMMA", "PLEMMA", "POS", "PPOS", "FEAT", "PFEAT",
    "head", "phead", "DEPREL", "PDEPREL", "FILLPRED", "PRED",
]
conllu_09_docs = tp.io.conll.conll_u_to_dataframes(
    conll_09_path, column_names=conll_09_cols
)

# Keep the preview compact: drop the columns we do not want to show, then
# display the first rows of the first document.
conll_09_hidden_cols = [
    "PLEMMA", "PPOS", "PFEAT", "phead", "PDEPREL", "FILLPRED",
    "sentence", "line_num",
]
conllu_09_doc = conllu_09_docs[0].drop(columns=conll_09_hidden_cols)
display(conllu_09_doc.head())

# The same function reads the EWT-style file; no column names are needed here
# because the defaults already match.
conll_u_docs = tp.io.conll.conll_u_to_dataframes(conllu_ewt_path)

# Pick one document to work with and preview it.
DOC_NUM = 0
doc_df = conll_u_docs[DOC_NUM]
# The "sentence" column is dropped from the preview for brevity.
doc_preview = doc_df.head(10)
doc_preview.drop(columns=["sentence"])

Unnamed: 0,span,LEMMA,POS,FEAT,head,DEPREL,PRED,predicate,pred0arg,pred1arg,pred2arg,pred3arg,pred4arg,pred5arg,pred6arg,pred7arg,pred8arg,pred9arg,pred10arg
0,"[0, 3): 'The'",the,DT,,1.0,NMOD,,,,,,,,,,,,,
1,"[4, 11): 'economy'",economy,NN,,3.0,NMOD,,A1,,,,,,,,,,,
2,"[11, 13): ''s'",'s,POS,,1.0,SUFFIX,,,,,,,,,,,,,
3,"[14, 25): 'temperature'",temperature,NN,,4.0,SBJ,temperature.01,A2,A1,,,,,,,,,,
4,"[26, 30): 'will'",will,MD,,,ROOT,,,AM-MOD,,,,,,,,,,


Unnamed: 0,span,lemma,upostag,xpostag,features,head,deprel,deps,misc,sentence_id,paragraph_id,doc_id,line_num
0,"[0, 4): 'From'",from,ADP,IN,,2.0,case,3:case,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,4
1,"[5, 8): 'the'",the,DET,DT,Definite=Def|PronType=Art,2.0,det,3:det,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,5
2,"[9, 11): 'AP'",AP,PROPN,NNP,Number=Sing,3.0,obl,4:obl:from,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,6
3,"[12, 17): 'comes'",come,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,,root,0:root,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,7
4,"[18, 22): 'this'",this,DET,DT,Number=Sing|PronType=Dem,5.0,det,6:det,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,8
5,"[23, 28): 'story'",story,NOUN,NN,Number=Sing,3.0,nsubj,4:nsubj,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,9
6,"[28, 29): ':'",:,PUNCT,:,,3.0,punct,4:punct,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,10
7,"[30, 39): 'President'",President,PROPN,NNP,Number=Sing,11.0,nsubj,5:nsubj,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,15
8,"[40, 44): 'Bush'",Bush,PROPN,NNP,Number=Sing,7.0,flat,1:flat,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,16
9,"[45, 47): 'on'",on,ADP,IN,,10.0,case,4:case,,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,weblog-blogspot.com_nominations_20041117172713...,17


In [18]:
# Combine all of the documents into one large DataFrame, for simplicity.

# The "head" values are shifted by the number of rows that precede each
# document in the combined frame, so that they still point at their intended
# targets after concatenation.
#
# The shift is applied with `assign`, which returns a new frame: the documents
# in `conll_u_docs` are left untouched, so this cell gives the same result no
# matter how many times it is run. (A shallow `list.copy()` followed by
# `df["head"] += ...` would modify the original frames in place.)
doc_lengths = [len(doc) for doc in conll_u_docs]
doc_offsets = np.cumsum([0] + doc_lengths[:-1])
shifted_docs = [
    doc.assign(head=doc["head"] + int(offset))
    for doc, offset in zip(conll_u_docs, doc_offsets)
]

# Concatenate in a single step. `pd.concat` replaces `DataFrame.append`,
# which is no longer available in pandas 2.x.
complete_df = pd.concat(shifted_docs, ignore_index=True)

# Show the row count and the last few rows, restricted to a handful of
# columns for compactness.
display(complete_df.shape[0])
complete_df.tail()[["span", "lemma", "upostag", "head", "deprel"]]

25151

Unnamed: 0,span,lemma,upostag,head,deprel
25146,"[251, 254): 'and'",and,CCONJ,25150.0,cc
25147,"[255, 256): 'a'",a,DET,25150.0,det
25148,"[257, 261): 'very'",very,ADV,25149.0,advmod
25149,"[262, 275): 'knowledgeable'",knowledgeable,ADJ,25150.0,amod
25150,"[276, 281): 'staff'",staff,NOUN,25145.0,conj


In [19]:
# One advantage of pandas DataFrames is that they can be written and read back
# significantly faster than the raw CoNLL-U files. Here the combined frame is
# saved with pyarrow-backed Feather.

# Writing multi-document files is currently not supported, so as a workaround
# the "sentence" column is converted from a TokenSpanArray to a SpanArray that
# reuses the same begin/end offsets.
corpus_text = complete_df["span"].array.target_text
sentence_spans = complete_df["sentence"].array
complete_df["sentence"] = tp.SpanArray(
    corpus_text,
    sentence_spans.begin,
    sentence_spans.end,
)

# Write to file using Feather. The chunk size is raised slightly so the frame
# can be written in a single block.
path = BASE_DIR + FEATHER_FILE
feather_chunk_size = 65536 * 8
feather.write_dataframe(complete_df, path, chunksize=feather_chunk_size)
print(f"File written to {path}")

File written to CoNLL_u_test_inputs/conllu_database.feather


In [78]:
# Read the frame back from disk; from here on it can be used just like the
# in-memory one.
re_read_df = feather.read_dataframe(path)
print(f"size is {re_read_df.size}")

# Preview the same rows and columns that were shown before writing.
preview_columns = ["span", "lemma", "upostag", "head", "deprel"]
re_read_df.tail()[preview_columns]

size is 352114


Unnamed: 0,span,lemma,upostag,head,deprel
25146,"[251, 254): 'and'",and,CCONJ,25150.0,cc
25147,"[255, 256): 'a'",a,DET,25150.0,det
25148,"[257, 261): 'very'",very,ADV,25149.0,advmod
25149,"[262, 275): 'knowledgeable'",knowledgeable,ADJ,25150.0,amod
25150,"[276, 281): 'staff'",staff,NOUN,25145.0,conj


In [35]:
# Text Extensions for Pandas integrates with visualization tools. Here spaCy's
# dependency-tree renderer draws the parse tree exactly as it is specified in
# the raw CoNLL-U file. Other integrations, with spaCy and with other
# packages, are possible as well.

Sentence_num = 110

# Group the tokens by sentence, keeping the order in which sentences appear,
# and take the group at position `Sentence_num`.
# Note: despite its name, `first_sentence` holds sentence number
# `Sentence_num`, not the first sentence of the corpus.
sentence_groups = list(re_read_df.groupby("sentence_id", sort=False))
first_sentence = sentence_groups[Sentence_num][1]

# Show the tokens of that sentence, followed by its parse tree.
sentence_columns = ["span", "lemma", "upostag", "xpostag", "head", "deprel", "sentence"]
display(first_sentence[sentence_columns])
tp.io.spacy.render_parse_tree(
    first_sentence,
    tag_col="upostag",
    label_col="deprel",
    head_col="head",
)

Unnamed: 0,span,lemma,upostag,xpostag,head,deprel,sentence
2510,"[979, 982): 'And'",and,CCONJ,CC,2514.0,cc,"[979, 1014): 'And what do we get for this effo..."
2511,"[983, 987): 'what'",what,PRON,WP,2514.0,obj,"[979, 1014): 'And what do we get for this effo..."
2512,"[988, 990): 'do'",do,AUX,VBP,2514.0,aux,"[979, 1014): 'And what do we get for this effo..."
2513,"[991, 993): 'we'",we,PRON,PRP,2514.0,nsubj,"[979, 1014): 'And what do we get for this effo..."
2514,"[994, 997): 'get'",get,VERB,VB,,root,"[979, 1014): 'And what do we get for this effo..."
2515,"[998, 1001): 'for'",for,ADP,IN,2517.0,case,"[979, 1014): 'And what do we get for this effo..."
2516,"[1002, 1006): 'this'",this,DET,DT,2517.0,det,"[979, 1014): 'And what do we get for this effo..."
2517,"[1007, 1013): 'effort'",effort,NOUN,NN,2514.0,obl,"[979, 1014): 'And what do we get for this effo..."
2518,"[1013, 1014): '?'",?,PUNCT,.,2514.0,punct,"[979, 1014): 'And what do we get for this effo..."


# Train a classifier model
Now we use more Text Extensions for Pandas integrations, this time with BERT (via the `transformers` library) and scikit-learn, to quickly and easily train a classifier model on our data.



In [116]:
# Training needs the whole EWT dataset: download every fold (if it is not
# already present), then parse each one into a list of DataFrames.
fold_paths = {
    fold: tp.io.conll.maybe_download_dataset_data(
        BASE_DIR, ewt_base_url + f"{fold}.conllu"
    )
    for fold in ("test", "dev", "train")
}

fold_docs = {}
for fold in fold_paths:
    parsed_docs = tp.io.conll.conll_u_to_dataframes(fold_paths[fold])
    fold_docs[fold] = parsed_docs
    print(f"converted fold: '{fold}' to list of {len(parsed_docs)} dataframes")
    # To inspect the start of a parsed fold, uncomment:
    # display(fold_docs[fold][0].head()[['span','lemma','upostag','features','sentence']])


converted fold: 'test' to list of 316 dataframes
converted fold: 'dev' to list of 318 dataframes
converted fold: 'train' to list of 540 dataframes


In [125]:
# Next, each fold is converted to BERT-compatible tokenization.
# This cell loads a pretrained BERT-compatible tokenizer and the matching
# pretrained model from `transformers`; the tokenizer splits the documents and
# the model produces the BERT embeddings in the cells that follow.
# See the 'Model_Training_with_BERT' notebook for more information on this process.
#
# `transformers` is imported together with the notebook's other dependencies
# in the first cell, so this cell also runs on a freshly restarted kernel.

bert_model_name = "dslim/bert-base-NER"
tokenizer = transformers.BertTokenizerFast.from_pretrained(bert_model_name)
bert = transformers.BertModel.from_pretrained(bert_model_name)

# Build a pandas categorical dtype for the label we want to predict: the
# universal part-of-speech tag.
# The tag inventory is taken from the DataFrame that was re-read from Feather.
# NOTE(review): that frame holds only the EWT dev file -- confirm it contains
# every tag that occurs in the train and test folds.
upostags = list(re_read_df["upostag"].unique())
upostag_dtype, upostag_list, upostag_dict = tp.io.conll.make_iob_tag_categories(upostags)
upostag_dtype

CategoricalDtype(categories=['O', 'B-ADP', 'B-DET', 'B-PROPN', 'B-VERB', 'B-NOUN',
                  'B-PUNCT', 'B-NUM', 'B-PART', 'B-ADJ', 'B-ADV', 'B-AUX',
                  'B-PRON', 'B-CCONJ', 'B-SCONJ', 'B-X', 'B-SYM', 'B-INTJ',
                  'I-ADP', 'I-DET', 'I-PROPN', 'I-VERB', 'I-NOUN', 'I-PUNCT',
                  'I-NUM', 'I-PART', 'I-ADJ', 'I-ADV', 'I-AUX', 'I-PRON',
                  'I-CCONJ', 'I-SCONJ', 'I-X', 'I-SYM', 'I-INTJ'],
, ordered=False)

In [95]:
# Walk through the preparation of one example document for a BERT model:
# tokenize the text, align the corpus spans with the BERT tokens, convert the
# tags to IOB form, attach the categorical classes, and compute embeddings.

# BERT tokens over the document's target text.
doc_text = doc_df.loc[0, "span"].target_text
tokenized = tp.io.bert.make_bert_tokens(doc_text, tokenizer)

# Align to text: convert the "raw" spans into spans over the BERT tokens.
aligned_spans = tp.TokenSpanArray.align_to_tokens(tokenized["span"], doc_df["span"])

# IOB-formatted token -> UPOSTAG labels.
iob_columns = tp.io.conll.spans_to_iob(aligned_spans, doc_df["upostag"])
tokenized[["postag_iob", "postag"]] = iob_columns

# Token classes expressed in our categorical dtype.
tokenized = tp.io.conll.add_token_classes(
    tokenized,
    upostag_dtype,
    iob_col_name="postag_iob",
    entity_type_col_name="postag",
)

# Finally, one embedding per BERT token.
embeddings_df = tp.io.bert.add_embeddings(tokenized, bert)
embeddings_df

Unnamed: 0,token_id,span,input_id,token_type_id,attention_mask,special_tokens_mask,postag_iob,postag,token_class,token_class_id,embedding
0,0,"[0, 0): ''",101,0,1,True,O,,O,0,"[ -0.3136824, -0.12475453, 0.657083..."
1,1,"[0, 4): 'From'",1622,0,1,False,B,ADP,B-ADP,1,"[ -0.16610569, 0.0027155988, 0.836163..."
2,2,"[5, 8): 'the'",1103,0,1,False,B,DET,B-DET,2,"[ -0.5103949, -0.43374223, 0.522710..."
3,3,"[9, 11): 'AP'",10997,0,1,False,B,PROPN,B-PROPN,3,"[ -0.6617647, -0.4930782, -0.01439116..."
4,4,"[12, 17): 'comes'",2502,0,1,False,B,VERB,B-VERB,4,"[ -0.12817244, -0.1552504, 0.5966705..."
...,...,...,...,...,...,...,...,...,...,...,...
98,98,"[463, 465): 'St'",1457,0,1,False,B,PROPN,B-PROPN,3,"[ 1.2990547, 1.8487868, -0.3757655..."
99,99,"[465, 468): 'ead'",12393,0,1,False,I,PROPN,I-PROPN,20,"[ 0.38742343, 1.0229412, 0.3259903..."
100,100,"[468, 471): 'man'",1399,0,1,False,I,PROPN,I-PROPN,20,"[ 1.0614744, 1.2706978, -0.5080642..."
101,101,"[471, 472): '.'",119,0,1,False,B,PUNCT,B-PUNCT,6,"[ -0.22779426, -0.14614452, 0.612667..."


In [99]:
# Wrap the preparation steps in a function so they can be applied to every
# document of the dataset.
def preprocess_document(document, tokenizer, bert, token_class_dtype=None):
    """Turn one parsed document into a DataFrame of BERT tokens with labels and embeddings.

    The steps are: build BERT tokens for the document's target text, align the
    document's spans with those tokens, convert the "upostag" values to IOB
    form ("postag_iob" / "postag" columns), add the token-class columns, and
    add the embeddings.

    Parameters
    ----------
    document : pandas.DataFrame
        One parsed document. Must contain a "span" column whose elements
        expose ``target_text`` and an "upostag" column.
    tokenizer
        Tokenizer passed to ``tp.io.bert.make_bert_tokens``.
    bert
        Model passed to ``tp.io.bert.add_embeddings``.
    token_class_dtype : optional
        Categorical dtype passed to ``tp.io.conll.add_token_classes``.
        Defaults to the notebook-level ``upostag_dtype``.

    Returns
    -------
    The value returned by ``tp.io.bert.add_embeddings`` for the labelled tokens.

    Raises
    ------
    ValueError
        If ``document`` has no rows, since there is then no span to take the
        target text from.
    """
    if len(document) == 0:
        raise ValueError(
            "Cannot preprocess an empty document: there is no 'span' value "
            "to take the target text from."
        )
    if token_class_dtype is None:
        # Fall back to the dtype built earlier in the notebook.
        token_class_dtype = upostag_dtype

    # Take the first row by position rather than by the label 0, so documents
    # whose index does not start at 0 (e.g. slices of a larger frame) work too.
    doc_text = document["span"].iloc[0].target_text
    bert_tokens = tp.io.bert.make_bert_tokens(doc_text, tokenizer)

    aligned_spans = tp.TokenSpanArray.align_to_tokens(bert_tokens["span"], document["span"])
    bert_tokens[["postag_iob", "postag"]] = tp.io.conll.spans_to_iob(
        aligned_spans, document["upostag"]
    )
    bert_tokens = tp.io.conll.add_token_classes(
        bert_tokens,
        token_class_dtype,
        iob_col_name="postag_iob",
        entity_type_col_name="postag",
    )
    return tp.io.bert.add_embeddings(bert_tokens, bert)

In [107]:
# Run the preprocessing over every document of every fold, with a progress
# bar per fold.
bert_docs_by_fold = {}
for fold, docs in fold_docs.items():
    print(f"processing fold {fold}")
    bert_docs_by_fold[fold] = tp.jupyter.run_with_progress_bar(
        len(docs),
        lambda i: preprocess_document(docs[i], tokenizer, bert),
    )

processing fold test


IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=316, style=ProgressStyle(desc…

processing fold dev


IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=318, style=ProgressStyle(desc…

processing fold train


IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=540, style=ProgressStyle(desc…

In [111]:
# Combine the folds into one DataFrame and save it to a Feather file, so the
# (slow) preprocessing does not necessarily have to be repeated.
corpus_df = tp.io.conll.combine_folds(bert_docs_by_fold)

# Keep the covered text of every token as a plain column; the "span" column
# itself is left out of the saved file.
# NOTE(review): presumably "span" is dropped because spans over several
# documents cannot be written to Feather -- confirm.
corpus_df["text"] = corpus_df["span"].apply(lambda s: s.covered_text)

# Create the output directory first: `to_feather` does not create missing
# parent directories, so the write would fail on a fresh checkout.
os.makedirs("outputs", exist_ok=True)
corpus_df.drop(columns=["span"]).to_feather("outputs/conll_u_corpus.feather")
corpus_df

Unnamed: 0,fold,doc_num,token_id,span,input_id,token_type_id,attention_mask,special_tokens_mask,postag_iob,postag,token_class,token_class_id,embedding,text
0,test,0,0,"[0, 0): ''",101,0,1,True,O,,O,0,"[ -0.37686592, -0.14841378, 0.7398001...",
1,test,0,1,"[0, 4): 'What'",1327,0,1,False,B,PRON,B-PRON,12,"[ -0.23266968, -0.40546328, 0.617192...",What
2,test,0,2,"[5, 7): 'if'",1191,0,1,False,B,SCONJ,B-SCONJ,14,"[ -0.8156859, -0.04782569, 0.08148429...",if
3,test,0,3,"[8, 14): 'Google'",7986,0,1,False,B,PROPN,B-PROPN,3,"[ 0.78967804, -0.8511879, -0.4881262...",Google
4,test,0,4,"[15, 17): 'Mo'",12556,0,1,False,B,VERB,B-VERB,4,"[ -0.25935018, 0.5710723, -0.0910664...",Mo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307892,train,539,756,"[3152, 3154): 'my'",1139,0,1,False,B,PRON,B-PRON,12,"[ -0.06984596, -0.4646067, 0.8547705...",my
307893,train,539,757,"[3155, 3158): 'car'",1610,0,1,False,B,NOUN,B-NOUN,5,"[ 0.14624132, -0.46386197, 0.596684...",car
307894,train,539,758,"[3158, 3159): ')'",114,0,1,False,B,PUNCT,B-PUNCT,6,"[ -0.090651065, -0.29592788, 0.597023...",)
307895,train,539,759,"[3159, 3160): '.'",119,0,1,False,B,PUNCT,B-PUNCT,6,"[ 0.031023545, -0.27608734, 0.782190...",.


In [113]:
# Re-read the saved corpus if it is not available in this kernel.
# The name is looked up in `globals()` first: on a fresh kernel where the
# preprocessing cells were skipped, `corpus_df` does not exist at all, and
# testing `corpus_df is None` directly would raise a NameError.
if "corpus_df" not in globals() or corpus_df is None or corpus_df.size == 0:
    corpus_df = pd.read_feather("outputs/conll_u_corpus.feather")

In [110]:
# Select the rows of the training fold; these are what the model is fitted on.
is_train_row = corpus_df["fold"] == "train"
train_df = corpus_df[is_train_row]
train_df

Unnamed: 0,fold,doc_num,token_id,input_id,token_type_id,attention_mask,special_tokens_mask,postag_iob,postag,token_class,token_class_id,embedding
64729,train,0,0,101,0,1,True,O,,O,0,"[ -0.41927838, -0.22575253, 0.6648760..."
64730,train,0,1,2586,0,1,False,B,PROPN,B-PROPN,3,"[ -0.36961424, -1.0804733, -0.283367..."
64731,train,0,2,118,0,1,False,B,PUNCT,B-PUNCT,6,"[ -0.9178737, -0.94624436, -0.808995..."
64732,train,0,3,163,0,1,False,B,PROPN,B-PROPN,3,"[ -0.90530086, -0.97086835, -1.440879..."
64733,train,0,4,19853,0,1,False,I,PROPN,I-PROPN,20,"[ -1.1586123, -1.149766, -1.194975..."
...,...,...,...,...,...,...,...,...,...,...,...,...
307892,train,539,756,1139,0,1,False,B,PRON,B-PRON,12,"[ -0.06984596, -0.4646067, 0.8547705..."
307893,train,539,757,1610,0,1,False,B,NOUN,B-NOUN,5,"[ 0.14624132, -0.46386197, 0.596684..."
307894,train,539,758,114,0,1,False,B,PUNCT,B-PUNCT,6,"[ -0.090651065, -0.29592788, 0.597023..."
307895,train,539,759,119,0,1,False,B,PUNCT,B-PUNCT,6,"[ 0.031023545, -0.27608734, 0.782190..."


In [126]:
# Train the model with scikit-learn.
MULTI_CLASS = "multinomial"

# Iteration limit for the L-BFGS optimizer used when fitting the logistic
# regression model. 100 ==> Fast; 10000 ==> Full convergence
LBGFS_ITERATIONS = 1000

# Multinomial logistic regression over the embeddings.
mlogreg = sklearn.linear_model.LogisticRegression(
    multi_class=MULTI_CLASS,
    verbose=10,
    max_iter=LBGFS_ITERATIONS,
)

# The pipeline holds a single step. A standard scaler only makes a difference
# for certain classes of embeddings and is left disabled:
#   ("scaler", sklearn.preprocessing.StandardScaler()),
base_pipeline = sklearn.pipeline.Pipeline([("mlogreg", mlogreg)])

# Features are the per-token embeddings; targets are the token class ids.
X_train = train_df["embedding"].values
Y_train = train_df["token_class_id"]
base_model = base_pipeline.fit(X_train, Y_train)
base_model


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 25.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 25.0min finished


Pipeline(steps=[('mlogreg',
                 LogisticRegression(max_iter=1000, multi_class='multinomial',
                                    verbose=10))])

In [127]:
def infer_on_df(df: pd.DataFrame, id_to_class_dict, predictor) -> pd.DataFrame:
    """Run `predictor` over the embeddings in `df` and attach its predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "embedding" column. It is not modified; the result is
        built on a copy.
    id_to_class_dict
        Lookup from class id to class label, indexed as
        ``id_to_class_dict[class_id]`` (a list or a dict both work).
    predictor
        Fitted model exposing ``predict_proba``. If it also exposes
        ``classes_``, that attribute is used to translate probability columns
        into class ids.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with these columns added: "p_id" (predicted class id),
        "p_postag" (predicted class label), "p_iob" and "p_type" (the two
        values returned by ``tp.io.conll.decode_class_labels``), and
        "raw_output" (the class probabilities).
    """
    result_df = df.copy()
    probabilities = predictor.predict_proba(result_df["embedding"])
    raw_outputs = tp.TensorArray(probabilities)

    # The argmax is a *column position* in the probability matrix. For
    # scikit-learn classifiers column j belongs to `classes_[j]`, which only
    # equals j when every class id occurred in the training data. Translate
    # through `classes_` so a class missing from training cannot shift the ids.
    best_columns = np.argmax(probabilities, axis=1)
    known_classes = getattr(predictor, "classes_", None)
    if known_classes is None:
        # No class listing available: treat the column position as the id.
        result_df["p_id"] = best_columns
    else:
        result_df["p_id"] = np.asarray(known_classes)[best_columns]

    result_df["p_postag"] = result_df["p_id"].apply(lambda p_id: id_to_class_dict[p_id])
    iobs, types = tp.io.conll.decode_class_labels(result_df["p_postag"].values)
    result_df["p_iob"] = iobs
    result_df["p_type"] = types
    result_df["raw_output"] = raw_outputs
    return result_df

# Run inference over one held-out fold and show the result.
# NOTE(review): the variable is called `dev_results`, but the rows selected
# here belong to the "test" fold -- confirm which fold is intended.
held_out_df = corpus_df[corpus_df["fold"] == "test"]
dev_results = infer_on_df(held_out_df, upostag_list, base_model)
dev_results

Unnamed: 0,fold,doc_num,token_id,span,input_id,token_type_id,attention_mask,special_tokens_mask,postag_iob,postag,token_class,token_class_id,embedding,text,p_id,p_postag,p_iob,p_type,raw_output
0,test,0,0,"[0, 0): ''",101,0,1,True,O,,O,0,"[ -0.37686592, -0.14841378, 0.7398001...",,0,O,O,,"[ 0.9999999977550698, 1.2013331410996974e-1..."
1,test,0,1,"[0, 4): 'What'",1327,0,1,False,B,PRON,B-PRON,12,"[ -0.23266968, -0.40546328, 0.617192...",What,6,B-PUNCT,B,PUNCT,"[1.1636381343139794e-10, 6.632241756523535e-0..."
2,test,0,2,"[5, 7): 'if'",1191,0,1,False,B,SCONJ,B-SCONJ,14,"[ -0.8156859, -0.04782569, 0.08148429...",if,14,B-SCONJ,B,SCONJ,"[1.9222457126669029e-10, 0.003846132485503873..."
3,test,0,3,"[8, 14): 'Google'",7986,0,1,False,B,PROPN,B-PROPN,3,"[ 0.78967804, -0.8511879, -0.4881262...",Google,3,B-PROPN,B,PROPN,"[ 7.078586986533554e-14, 5.643222225393274e-1..."
4,test,0,4,"[15, 17): 'Mo'",12556,0,1,False,B,VERB,B-VERB,4,"[ -0.25935018, 0.5710723, -0.0910664...",Mo,5,B-NOUN,B,NOUN,"[ 6.094384616059652e-15, 4.748000666238187e-0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32675,test,315,57,"[315, 324): 'exercises'",11536,0,1,False,B,NOUN,B-NOUN,5,"[ 0.04967872, -0.29388765, 0.8163368...",exercises,5,B-NOUN,B,NOUN,"[1.0632510146485344e-09, 2.1276845533963685e-0..."
32676,test,315,58,"[325, 327): 'to'",1106,0,1,False,B,PART,B-PART,8,"[ 0.10523166, -0.3070524, 0.730677...",to,8,B-PART,B,PART,"[2.6366796918071336e-08, 0.2350564146996362..."
32677,test,315,59,"[328, 331): 'use'",1329,0,1,False,B,VERB,B-VERB,4,"[ 0.13797167, -0.47705936, 0.8551439...",use,4,B-VERB,B,VERB,"[2.5537423737537782e-09, 2.6663335749291686e-0..."
32678,test,315,60,"[331, 332): '.'",119,0,1,False,B,PUNCT,B-PUNCT,6,"[ 0.014988249, -0.32625836, 0.820147...",.,6,B-PUNCT,B,PUNCT,"[ 6.589471900572395e-08, 8.925740601368359e-0..."
