In [1]:
"""original copied from https://github.com/grew-nlp/grewpy/blob/master/examples/test_corpus.py"""
import json
import os
import sys
from pathlib import Path
from pprint import pprint
from collections import namedtuple

import pandas as pd
# from grewpy import Graph, CorpusDraft, Request, Corpus, request_counter
from grewpy import (Corpus, 
                    # CorpusDraft, Graph, 
                    Request, request_counter)
from grewpy.grew import GrewError as GrewError

# sys.path.insert(0, os.path.abspath(os.path.join( os.path.dirname(__file__), "../"))) # Use local grew lib

_META_TUP = namedtuple(
    'meta_info', 
    ['sent_id', 'doc_id', 'sent_int', 'prev_id', 'prev_text', 'next_id', 'next_text'])

def corpus_from_path(path):
    return Corpus(str(path))

def docs(expr):

    try:
        print(expr.__doc__)
    except AttributeError:
        print('None')

connected to port: 8888


In [2]:
conllu_path = Path(
    "data/corpora/gitrepo_puddin/2smallest.conll/apw_eng_199911.conllu"
    # "data/corpora/gitrepo_puddin/2smallest.conll/nyt_eng_200405.conllu"
    )
co = corpus_from_path(conllu_path)

 Should be able to set this up to have `conllu_path` (and `pat_path`?)
 as input, and run it in parallel on a list of files,
 even files from different directories

 ...did I just rewrite the entire subset code today? 🤦‍♀️

In [3]:
print("\n=============== len ===============")
print(f"sentence count in {conllu_path.name} = {len(co)}")
print(request_counter())

print(f"len(co[0]) = {len(co[0])}")
print(f"len(co[-1]) = {len(co[-1])}")
print(f"[len(g) for g in co[-3:]] = {[len(g) for g in co[-3:]]}")
# other forms co[-3:-1], co[1:7:2], ...


sentence count in apw_eng_199911.conllu = 1147
1
len(co[0]) = 33
len(co[-1]) = 10
[len(g) for g in co[-3:]] = [9, 31, 10]


In [4]:
print("\n=============== Count request in a corpus ===============")
for xpos in ("RB.*", "JJ.*"):
    # xpos="RB.*"
    if xpos.startswith('RB'):
        print('# ADVERBS')
    elif xpos.startswith('JJ'):
        print('# ADJECTIVES')

    req = Request(f'X[xpos=re"{xpos}"]')

    print(" ----- basic count -----")
    print(f"total {xpos} in {conllu_path.name} = ", co.count(req))

    print(" ----- count with clustering -----")
    print(f"total {xpos} in {conllu_path.name}, clustered by exact POS:")
    # print(request_counter())
    print(pd.Series(co.count(req, ["X.xpos"])).to_frame().rename(columns={0:'total'}).to_markdown(), '\n')


# ADVERBS
 ----- basic count -----
total RB.* in apw_eng_199911.conllu =  840
 ----- count with clustering -----
total RB.* in apw_eng_199911.conllu, clustered by exact POS:
|     |   total |
|:----|--------:|
| RBS |      10 |
| RBR |      32 |
| RB  |     798 | 

# ADJECTIVES
 ----- basic count -----
total JJ.* in apw_eng_199911.conllu =  1809
 ----- count with clustering -----
total JJ.* in apw_eng_199911.conllu, clustered by exact POS:
|     |   total |
|:----|--------:|
| JJS |      55 |
| JJR |      66 |
| JJ  |    1688 | 



In [5]:
#* ALL bigrams
req = Request('ADJ [xpos=re"JJ.?"];'
              'mod: ADJ -[advmod]-> ADV;'
              'ADV < ADJ'
              )
print(str(req))

pattern {ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ}


In [6]:
pat_path = Path('Pat/advadj/all-RB-JJs.pat')
pat_str = pat_path.read_text(encoding='utf8')
print(pat_str)

pattern { 
    ADJ [xpos=re"JJ.?"]; 
    mod: ADJ -[advmod]-> ADV;  
    ADV < ADJ
}

% will match e.g. `not uninteresting`



 ⚠️ Just running the raw pattern file text will result in an error:

In [7]:
print(str(Request(pat_str)))
try: 
    co.count(Request(pat_str))
except: 
    print('ERROR! Bad request. (handled to prevent cancelation of following cells)')

pattern {pattern { 
    ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ
}

% will match e.g. `not uninteresting`}
ERROR! Bad request. (handled to prevent cancelation of following cells)


In [8]:
def grewpize_pat(raw_text): 
    return ''.join(line.strip() for line in raw_text.split('{', 1)[1].split('}',1)[0].strip().splitlines())
clean_str = grewpize_pat(pat_str)
print(clean_str.replace(';', ';\n'))
print('# actual form:')
print(clean_str)

ADJ [xpos=re"JJ.?"];
mod: ADJ -[advmod]-> ADV;
ADV < ADJ
# actual form:
ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ


In [9]:
read_req = Request(clean_str)
print(str(read_req))

pattern {ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ}


In [10]:
# or, all in one go: 
full_read_req = Request(grewpize_pat(pat_path.read_text(encoding='utf8')))
str(full_read_req) == str(read_req)

True

In [11]:
co.count(read_req)

80

In [12]:
print("\n=============== Count `ADV ADJ` bigrams ===============")
print(f"total `ADV ADJ` bigrams in {conllu_path.name}: {co.count(req)}")
print("\n----- count with clustering -----")
print(f"`ADV ADJ` bigrams in {conllu_path.name}, clustered by ADV lemma:")
# print(json.dumps(co.count(req, ["ADV.lemma"]), indent=2))
pd.Series(co.count(req, ["ADV.lemma"])).to_frame().reset_index().rename(
    columns={'index':'adverb', 0:'total_bigrams'}
    ).sort_values('total_bigrams', ascending=False)


total `ADV ADJ` bigrams in apw_eng_199911.conllu: 80

----- count with clustering -----
`ADV ADJ` bigrams in apw_eng_199911.conllu, clustered by ADV lemma:


Unnamed: 0,adverb,total_bigrams
15,more,12
14,most,9
2,too,7
1,very,6
4,so,5
31,as,5
23,how,3
6,reportedly,3
7,really,3
16,long,2


In [13]:
print(f"Top 10 `ADV ADJ` bigrams in {conllu_path.name}")
pd.json_normalize(co.count(req, ["ADV.lemma", "ADJ.lemma"]), sep='_').transpose(
    ).rename(columns={0:'total'}).nlargest(10, 'total')

Top 10 `ADV ADJ` bigrams in apw_eng_199911.conllu


Unnamed: 0,total
reportedly_close,3
how_much,3
very_long,2
too_high,2
so_bad,2
most_valuable,2
most_important,2
more_expensive,2
more_difficult,2
long_admired,2


In [14]:
match_list = co.search(req)

print("\n=============== `ADV ADJ` bigram match info ===============")
pd.json_normalize(match_list, sep='_')




Unnamed: 0,sent_id,matching_nodes_ADV,matching_nodes_ADJ,matching_edges_mod_source,matching_edges_mod_label,matching_edges_mod_target
0,apw_eng_19991101_0059_18,23,24,24,advmod,23
1,apw_eng_19991101_0059_16,4,5,5,advmod,4
2,apw_eng_19991101_0059_15,5,6,6,advmod,5
3,apw_eng_19991101_0059_5,28,29,29,advmod,28
4,apw_eng_19991101_0059_5,31,32,32,advmod,31
...,...,...,...,...,...,...
75,apw_eng_19991101_0006_20,28,29,29,advmod,28
76,apw_eng_19991101_0006_8,10,11,11,advmod,10
77,apw_eng_19991101_0005_14,9,10,10,advmod,9
78,apw_eng_19991101_0002_19,13,14,14,advmod,13


In [15]:
def gen_conllus(match_list, corpus):
    
    # for sent in match_list:
    for i, sent in enumerate(match_list):
        parse = corpus.get(sent['sent_id'])
        if i < 3:
            print(parse.to_conll())
        yield parse.to_conll()+'\n'

In [16]:
conllu_gen = gen_conllus(match_list, co)

subset_dir = conllu_path.parent.joinpath(f'subset_{pat_path.parent.stem}')
if not subset_dir.is_dir(): 
    subset_dir.mkdir()
    
subset_path = subset_dir.joinpath(f'{pat_path.stem}+{conllu_path.name}')
subset_path.write_text('\n'.join(conllu_gen), encoding='utf8')

# sent_id = apw_eng_19991101_0059_18
# text = Katz , an adviser to local governments and an expert on stadium financing , has tried to position himself as a moderate best able to build on the economic recovery generated under popular two-term Mayor Edward G. Rendell .
1	Katz	Katz	_	NNP	_	16	nsubj	16:nsubj	_
2	,	,	_	,	_	0	-	0:-	_
3	an	a	_	DT	_	4	det	4:det	_
4	adviser	adviser	_	NN	_	1	appos	1:appos	_
5	to	to	_	TO	_	4	prep	4:prep	_
6	local	local	_	JJ	_	7	amod	7:amod	_
7	governments	government	_	NNS	_	5	pobj	5:pobj	_
8	and	and	_	CC	_	4	cc	4:cc	_
9	an	a	_	DT	_	10	det	10:det	_
10	expert	expert	_	NN	_	4	conj	4:conj	_
11	on	on	_	IN	_	10	prep	10:prep	_
12	stadium	stadium	_	NN	_	13	nn	13:nn	_
13	financing	financing	_	NN	_	11	pobj	11:pobj	_
14	,	,	_	,	_	0	-	0:-	_
15	has	has	_	AUXZ	_	16	dep	16:dep	_
16	tried	try	_	VBN	_	0	root	0:root	_
17	to	to	_	TO	_	18	aux	18:aux	_
18	position	position	_	VB	_	16	xcomp	16:xcomp	_
19	himself	himself	_	PRP	_	18	dobj	18:dobj	_
20	as	as	_	IN	_	18	prep	18:prep	_
21	a	

101831

In [17]:
advadj_subset = corpus_from_path(subset_path)

 ## modifier bigram only

In [18]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    # 'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ}
-------------------------
hits: 80


 ## `not` somewhere in sentence

In [19]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    'NEG [lemma="not"];'
    # 'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"]}
-------------------------
hits: 14


 ## known `NEG` lemma somewhere in sentence

In [20]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"]}
-------------------------
hits: 24


 ## known `NEG` lemma preceding `ADV` token node

In [21]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 NEG << ADV}
-------------------------
hits: 20


 ## known `NEG` lemma with `ADJ` node as its **target** (in dependency relationship)
 This is not expected. No $NegPol$ patterns cover relationships of this direction.

In [22]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: NEG -> ADJ;
	 NEG << ADV}
-------------------------
hits: 0


 ## known `NEG` lemma with `ADJ` node as its **head/source**

In [23]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: ADJ -> NEG;
	 NEG << ADV}
-------------------------
hits: 6


 👆 This pattern, which does not impose any restrictions on the *type*
 of dependency relationship between `NEG` and `ADJ` yields duplicate results
 in the case of any "enhanced" parsing

In [24]:
hits = co.search(neg_req)
pprint(hits[:2])

[{'matching': {'edges': {'mod': {'label': 'advmod',
                                 'source': '12',
                                 'target': '11'},
                         'neg': {'label': {'1': 'neg', 'enhanced': 'yes'},
                                 'source': '12',
                                 'target': '10'}},
               'nodes': {'ADJ': '12', 'ADV': '11', 'NEG': '10'}},
  'sent_id': 'apw_eng_19991101_0031_5'},
 {'matching': {'edges': {'mod': {'label': 'advmod',
                                 'source': '12',
                                 'target': '11'},
                         'neg': {'label': 'neg',
                                 'source': '12',
                                 'target': '10'}},
               'nodes': {'ADJ': '12', 'ADV': '11', 'NEG': '10'}},
  'sent_id': 'apw_eng_19991101_0031_5'}]


In [25]:
pprint({f"{h['sent_id']} ~ hit {i} neg label":h['matching']['edges']['neg']['label'] for i,h in enumerate(hits[:16])})

{'apw_eng_19991101_0021_10 ~ hit 4 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'apw_eng_19991101_0021_10 ~ hit 5 neg label': 'neg',
 'apw_eng_19991101_0025_10 ~ hit 2 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'apw_eng_19991101_0025_10 ~ hit 3 neg label': 'neg',
 'apw_eng_19991101_0031_5 ~ hit 0 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'apw_eng_19991101_0031_5 ~ hit 1 neg label': 'neg'}


 ## known `NEG` lemma with `ADJ` node as its *head/source* and the relationship type does not start with "E"
 This is to prevent issue with duplicate records for "Enhanced" versions of the dependency.

In [26]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: ADJ -[re"[^E].*"]-> NEG;
	 NEG << ADV}
-------------------------
hits: 3


 ...and now the duplicate hits (as well as the output structure unpredictability) have been removed:

In [27]:
pprint({f"{h['sent_id']} ~ hit {i} neg label":h['matching']['edges']['neg']['label'] for i,h in enumerate(co.search(neg_req)[:16])})

{'apw_eng_19991101_0021_10 ~ hit 2 neg label': 'neg',
 'apw_eng_19991101_0025_10 ~ hit 1 neg label': 'neg',
 'apw_eng_19991101_0031_5 ~ hit 0 neg label': 'neg'}


In [28]:
pd.json_normalize(co.count(neg_req, ["ADV.lemma", "NEG.lemma"]), sep='_').transpose().rename(columns={0:'total'}).nlargest(10,'total')

Unnamed: 0,total
so_not,2
fully_not,1


In [29]:
neg_df = pd.json_normalize(co.search(neg_req), sep='_').convert_dtypes()
neg_df.columns = (
    neg_df.columns
    .str.replace('matching_', '')
    .str.replace('nodes_', 'index_')
    .str.replace('edges_', 'dep_'))
neg_df

Unnamed: 0,sent_id,index_NEG,index_ADV,index_ADJ,dep_neg_source,dep_neg_label,dep_neg_target,dep_mod_source,dep_mod_label,dep_mod_target
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14


In [30]:
for m in neg_df.sample(min(len(neg_df),8)).sent_id.apply(lambda i: co.get(i).meta): 
    print(f"{m['sent_id']}:\t{m['text']}")

apw_eng_19991101_0031_5:	It was not clear why the Ingush border was not fully open .
apw_eng_19991101_0021_10:	AT & claims that limiting the number of access providers may not be so bad , because a company that can plan its investment in all the equipment it requires to run the Internet through TV cables can offer service faster , cheaper and more efficiently .
apw_eng_19991101_0025_10:	AT & claims that limiting the number of access providers may not be so bad , because a company that can plan its investment in all the equipment it requires to run the Internet through TV cables can offer service faster , cheaper and more efficiently .


 ## Thoughts...
 Instead of creating new subset `.conllu` files with all matching
 sentences + their context sentences (preceding and following),
 it should be possible to take only what is needed for the context sentences
 (i.e. `sent_id` and `sent_text`, the only things put into the hit table)
 and then pull those directly from the meta info.

 The next big question, though, is whether to create the subset file at all in that case.
 **This new module essentially makes the first 2 steps of the pipeline code obsolete.**
 If this functionality had existed 2 years ago... smh 😐

 But I'm not going to rework *everything* at this point.

 ## Plan
 I will create the `advadj` subset and then run the pipeline on those.

 If I remove the context sentences, `fill_match_info` will need to be modified to not collect them
 (since anything it would pull would be incorrect).
 However, it would not necessarily need to add them at all at that point.
 I wanted to make sure I had access to what they are, but so far, I haven't used them.
 If there exists a table where they can be looked up if need be, that should suffice.
 The sentence IDs are stable and unique identifiers, so any table indexed by `sent_id`
 will be easy to connect with the existing data.

 Mock Table Schematic

 > | sent_id | doc_id | conllu_id | sent_text | prev_id | next_id | prev_text | next_text |
 > |:--------|:-------|:----------|:----------|:--------|:--------|:----------|:----------|
 > | ... | ... | ... | ... | ... | ... | ... | ... |

In [31]:
# hit_meta = pd.json_normalize(neg_df.sent_id.apply(lambda i: co.get(i).meta['text'])
#                              ).convert_dtypes()#.rename(columns={'':'newdoc_id'})
hit_meta = neg_df.sent_id.to_frame()
hit_meta = hit_meta.assign(sent_text = hit_meta.sent_id.apply(lambda i: co.get(i).meta['text']),
                           conllu_id=conllu_path.stem, 
                        #    newdoc_id=hit_meta.newdoc_id.str.replace('# newdoc id = ', '').fillna('')
                        )
hit_meta

Unnamed: 0,sent_id,sent_text,conllu_id
0,apw_eng_19991101_0031_5,It was not clear why the Ingush border was not...,apw_eng_199911
1,apw_eng_19991101_0025_10,AT & claims that limiting the number of access...,apw_eng_199911
2,apw_eng_19991101_0021_10,AT & claims that limiting the number of access...,apw_eng_199911


In [32]:
def parse_sent_id(sent_id):
    doc_id, ordinal_str = sent_id.rsplit('_', 1)
    ordinal_int = int(ordinal_str)

    row = (sent_id, doc_id, ordinal_int)
    for context_ix in (ordinal_int + i for i in (-1, 1)): 
        c_text = ''
        c_id = ''
        #> conllu doc sentence numbering starts at 1
        if context_ix > 0:
            c_id = f'{doc_id}_{context_ix}'
            try: 
                c_obj = co.get(c_id)
            except GrewError: 
                c_id = ''
            else: 
                c_text = c_obj.meta['text']
        row += (c_id, c_text)
        
    yield _META_TUP._make(row)

In [33]:
sid = hit_meta.sent_id[0]
print(pd.DataFrame(parse_sent_id(sid)).set_index('sent_id').transpose().to_markdown())

|           | apw_eng_19991101_0031_5                                                                                                                        |
|:----------|:-----------------------------------------------------------------------------------------------------------------------------------------------|
| doc_id    | apw_eng_19991101_0031                                                                                                                          |
| sent_int  | 5                                                                                                                                              |
| prev_id   | apw_eng_19991101_0031_4                                                                                                                        |
| prev_text | But no other vehicles were let through , and a line of cars and trucks crammed with refugees stretched for several miles on the Chechen side . |
| next_id   | apw_eng_19991101_0031_6         

In [34]:
context_info = pd.concat(pd.DataFrame(parse_sent_id(s)) for s in hit_meta.sent_id)
context_info.head()

Unnamed: 0,sent_id,doc_id,sent_int,prev_id,prev_text,next_id,next_text
0,apw_eng_19991101_0031_5,apw_eng_19991101_0031,5,apw_eng_19991101_0031_4,"But no other vehicles were let through , and a...",apw_eng_19991101_0031_6,-LBQ- We are ready to let the refugees pass ...
0,apw_eng_19991101_0025_10,apw_eng_19991101_0025,10,apw_eng_19991101_0025_9,"-LBQ- That 's called logical , consistent th...",apw_eng_19991101_0025_11,-LBQ- We 're moving into an information-base...
0,apw_eng_19991101_0021_10,apw_eng_19991101_0021,10,apw_eng_19991101_0021_9,"-LBQ- That 's called logical , consistent th...",apw_eng_19991101_0021_11,-LBQ- We 're moving into an information-base...


In [35]:
meta_info = hit_meta.set_index('sent_id').join(context_info.set_index('sent_id')).rename(columns={'text':'sent_text'})
meta_info = meta_info[['conllu_id', 'doc_id', 'sent_int', 'prev_id', 'next_id', 'prev_text', 'sent_text', 'next_text']]
print(meta_info.sample(min(len(meta_info),2)).to_markdown())
print("\n---\n")
print(meta_info.iloc[-1, :].squeeze().to_markdown())

| sent_id                  | conllu_id      | doc_id                |   sent_int | prev_id                 | next_id                  | prev_text                                                                                                                                                                                                                                            | sent_text                                                                                                                                                                                                                                                    | next_text                                                |
|:-------------------------|:---------------|:----------------------|-----------:|:------------------------|:-------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [36]:
meta_info.to_csv(subset_dir.joinpath(conllu_path.stem+'+meta.psv'), sep='|')

In [37]:
neg_df.loc[:, ['sent_text', 'conllu_id', 'doc_id']] =  neg_df.sent_id.apply(
    lambda x: meta_info.loc[x, ['sent_text', 'conllu_id', 'doc_id']])
neg_df.sample(min(len(neg_df),10))

Unnamed: 0,sent_id,index_NEG,index_ADV,index_ADJ,dep_neg_source,dep_neg_label,dep_neg_target,dep_mod_source,dep_mod_label,dep_mod_target,sent_text,conllu_id,doc_id
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11,It was not clear why the Ingush border was not...,apw_eng_199911,apw_eng_19991101_0031
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14,AT & claims that limiting the number of access...,apw_eng_199911,apw_eng_19991101_0021
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14,AT & claims that limiting the number of access...,apw_eng_199911,apw_eng_19991101_0025
