In [1]:
"""original copied from https://github.com/grew-nlp/grewpy/blob/master/examples/test_corpus.py"""
import json
import os
import sys
from pathlib import Path
from pprint import pprint
from collections import namedtuple

import pandas as pd
# from grewpy import Graph, CorpusDraft, Request, Corpus, request_counter
from grewpy import (Corpus, 
                    # CorpusDraft, Graph, 
                    Request, request_counter)
from grewpy.grew import GrewError as GrewError

# sys.path.insert(0, os.path.abspath(os.path.join( os.path.dirname(__file__), "../"))) # Use local grew lib

_META_TUP = namedtuple(
    'meta_info', 
    ['sent_id', 'doc_id', 'sent_int', 'prev_id', 'prev_text', 'next_id', 'next_text'])

def corpus_from_path(path):
    return Corpus(str(path))

def docs(expr):

    try:
        print(expr.__doc__)
    except AttributeError:
        print('None')

connected to port: 8888


In [2]:
conllu_path = Path(
    # "data/corpora/gitrepo_puddin/2smallest.conll/apw_eng_199911.conllu"
    "data/corpora/gitrepo_puddin/2smallest.conll/nyt_eng_200405.conllu"
    )
co = corpus_from_path(conllu_path)

 Should be able to set this up to have `conllu_path` (and `pat_path`?)
 as input, and run it in parallel on a list of files,
 even files from different directories

 ...did I just rewrite the entire subset code today? 🤦‍♀️

In [3]:
print("\n=============== len ===============")
print(f"sentence count in {conllu_path.name} = {len(co)}")
print(request_counter())

print(f"len(co[0]) = {len(co[0])}")
print(f"len(co[-1]) = {len(co[-1])}")
print(f"[len(g) for g in co[-3:]] = {[len(g) for g in co[-3:]]}")
# other forms co[-3:-1], co[1:7:2], ...


sentence count in nyt_eng_200405.conllu = 11599
1
len(co[0]) = 5
len(co[-1]) = 4
[len(g) for g in co[-3:]] = [8, 9, 4]


In [4]:
print("\n=============== Count request in a corpus ===============")
for xpos in ("RB.*", "JJ.*"):
    # xpos="RB.*"
    if xpos.startswith('RB'):
        print('# ADVERBS')
    elif xpos.startswith('JJ'):
        print('# ADJECTIVES')

    req = Request(f'X[xpos=re"{xpos}"]')

    print(" ----- basic count -----")
    print(f"total {xpos} in {conllu_path.name} = ", co.count(req))

    print(" ----- count with clustering -----")
    print(f"total {xpos} in {conllu_path.name}, clustered by exact POS:")
    # print(request_counter())
    print(pd.Series(co.count(req, ["X.xpos"])).to_frame().rename(columns={0:'total'}).to_markdown(), '\n')


# ADVERBS
 ----- basic count -----
total RB.* in nyt_eng_200405.conllu =  8558
 ----- count with clustering -----
total RB.* in nyt_eng_200405.conllu, clustered by exact POS:
|     |   total |
|:----|--------:|
| RBS |     118 |
| RBR |     338 |
| RB  |    8102 | 

# ADJECTIVES
 ----- basic count -----
total JJ.* in nyt_eng_200405.conllu =  16212
 ----- count with clustering -----
total JJ.* in nyt_eng_200405.conllu, clustered by exact POS:
|     |   total |
|:----|--------:|
| JJS |     439 |
| JJR |     670 |
| JJ  |   15103 | 



In [5]:
#* ALL bigrams
req = Request('ADJ [xpos=re"JJ.?"];'
              'mod: ADJ -[advmod]-> ADV;'
              'ADV < ADJ'
              )
print(str(req))

pattern {ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ}


In [6]:
pat_path = Path('Pat/advadj/all-RB-JJs.pat')
pat_str = pat_path.read_text(encoding='utf8')
print(pat_str)

pattern { 
    ADJ [xpos=re"JJ.?"]; 
    mod: ADJ -[advmod]-> ADV;  
    ADV < ADJ
}

% will match e.g. `not uninteresting`



 ⚠️ Just running the raw pattern file text will result in an error:

In [7]:
print(str(Request(pat_str)))
try: 
    co.count(Request(pat_str))
except: 
    print('ERROR! Bad request. (handled to prevent cancelation of following cells)')

pattern {pattern { 
    ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ
}

% will match e.g. `not uninteresting`}
ERROR! Bad request. (handled to prevent cancelation of following cells)


In [8]:
def grewpize_pat(raw_text): 
    return ''.join(line.strip() for line in raw_text.split('{', 1)[1].split('}',1)[0].strip().splitlines())
clean_str = grewpize_pat(pat_str)
print(clean_str.replace(';', ';\n'))
print('# actual form:')
print(clean_str)

ADJ [xpos=re"JJ.?"];
mod: ADJ -[advmod]-> ADV;
ADV < ADJ
# actual form:
ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ


In [9]:
read_req = Request(clean_str)
print(str(read_req))

pattern {ADJ [xpos=re"JJ.?"];mod: ADJ -[advmod]-> ADV;ADV < ADJ}


In [10]:
# or, all in one go: 
full_read_req = Request(grewpize_pat(pat_path.read_text(encoding='utf8')))
str(full_read_req) == str(read_req)

True

In [11]:
co.count(read_req)

1158

In [12]:
print("\n=============== Count `ADV ADJ` bigrams ===============")
print(f"total `ADV ADJ` bigrams in {conllu_path.name}: {co.count(req)}")
print("\n----- count with clustering -----")
print(f"`ADV ADJ` bigrams in {conllu_path.name}, clustered by ADV lemma:")
# print(json.dumps(co.count(req, ["ADV.lemma"]), indent=2))
pd.Series(co.count(req, ["ADV.lemma"])).to_frame().reset_index().rename(
    columns={'index':'adverb', 0:'total_bigrams'}
    ).sort_values('total_bigrams', ascending=False)


total `ADV ADJ` bigrams in nyt_eng_200405.conllu: 1158

----- count with clustering -----
`ADV ADJ` bigrams in nyt_eng_200405.conllu, clustered by ADV lemma:


Unnamed: 0,adverb,total_bigrams
101,more,148
11,very,93
100,most,87
187,as,73
25,too,69
...,...,...
109,lovably,1
108,magnificently,1
107,mainly,1
106,materially,1


In [13]:
print(f"Top 10 `ADV ADJ` bigrams in {conllu_path.name}")
pd.json_normalize(co.count(req, ["ADV.lemma", "ADJ.lemma"]), sep='_').transpose(
    ).rename(columns={0:'total'}).nlargest(10, 'total')

Top 10 `ADV ADJ` bigrams in nyt_eng_200405.conllu


Unnamed: 0,total
too_much,11
so_much,11
how_much,11
so_many,10
as_much,10
more_important,8
more_likely,7
how_many,7
too_tight,6
too_many,6


In [14]:
match_list = co.search(req)

print("\n=============== `ADV ADJ` bigram match info ===============")
pd.json_normalize(match_list, sep='_')




Unnamed: 0,sent_id,matching_nodes_ADV,matching_nodes_ADJ,matching_edges_mod_source,matching_edges_mod_label,matching_edges_mod_target
0,nyt_eng_20040507_0026_38,6,7,7,advmod,6
1,nyt_eng_20040507_0026_37,30,31,31,advmod,30
2,nyt_eng_20040507_0026_35,10,11,11,advmod,10
3,nyt_eng_20040507_0026_33,23,24,24,advmod,23
4,nyt_eng_20040507_0026_26,4,5,5,advmod,4
...,...,...,...,...,...,...
1153,nyt_eng_20040501_0002_2,25,26,26,advmod,25
1154,nyt_eng_20040501_0001_46,13,14,14,advmod,13
1155,nyt_eng_20040501_0001_35,9,10,10,advmod,9
1156,nyt_eng_20040501_0001_30,15,16,16,advmod,15


In [15]:
def gen_conllus(match_list, corpus):
    
    # for sent in match_list:
    for i, sent in enumerate(match_list):
        parse = corpus.get(sent['sent_id'])
        if i < 3:
            print(parse.to_conll())
        yield parse.to_conll()+'\n'

In [16]:
conllu_gen = gen_conllus(match_list, co)

subset_dir = conllu_path.parent.joinpath(f'subset_{pat_path.parent.stem}')
if not subset_dir.is_dir(): 
    subset_dir.mkdir()
    
subset_path = subset_dir.joinpath(f'{pat_path.stem}+{conllu_path.name}')
subset_path.write_text('\n'.join(conllu_gen), encoding='utf8')

# sent_id = nyt_eng_20040507_0026_38
# text = And the competitions themselves are impressively edited  -COL-  like a fever dream , with the winners sliding into a pool of Lady P-H 's beer .
1	and	and	_	CC	_	7	cc	7:cc	_
2	the	the	_	DT	_	3	det	3:det	_
3	competitions	competition	_	NNS	_	7	nsubj	7:nsubj	_
4	themselves	themselves	_	PRP	_	3	dep	3:dep	_
5	are	be	_	VBP	_	7	cop	7:cop	_
6	impressively	impressively	_	RB	_	7	advmod	7:advmod	_
7	edited	edited	_	JJ	_	0	root	0:root	_
8	:	:	_	:	_	0	-	0:-	_
9	like	like	_	IN	_	7	prep	7:prep	_
10	a	a	_	DT	_	12	det	12:det	_
11	fever	fever	_	NN	_	12	nn	12:nn	_
12	dream	dream	_	NN	_	9	pobj	9:pobj	_
13	,	,	_	,	_	0	-	0:-	_
14	with	with	_	IN	_	7	prep	7:prep	_
15	the	the	_	DT	_	16	det	16:det	_
16	winners	winner	_	NNS	_	17	nsubj	17:nsubj	_
17	sliding	slide	_	VBG	_	14	pcomp	14:pcomp	_
18	into	into	_	IN	_	17	prep	17:prep	_
19	a	a	_	DT	_	20	det	20:det	_
20	pool	pool	_	NN	_	18	pobj	18:pobj	_
21	of	of	_	IN	_	20	prep	20:prep	_
22	Lady	Lady	_	NNP	_	23	nn	23:nn	_
23	P-H

1470725

In [17]:
advadj_subset = corpus_from_path(subset_path)

 ## modifier bigram only

In [18]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    # 'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ}
-------------------------
hits: 1158


 ## `not` somewhere in sentence

In [19]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    'NEG [lemma="not"];'
    # 'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"]}
-------------------------
hits: 194


 ## known `NEG` lemma somewhere in sentence

In [20]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"]}
-------------------------
hits: 268


 ## known `NEG` lemma preceding `ADV` token node

In [21]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 NEG << ADV}
-------------------------
hits: 178


 ## known `NEG` lemma with `ADJ` node as its **target** (in dependency relationship)
 This is not expected. No $NegPol$ patterns cover relationships of this direction.

In [22]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: NEG -> ADJ;
	 NEG << ADV}
-------------------------
hits: 6


 ## known `NEG` lemma with `ADJ` node as its **head/source**

In [23]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: ADJ -> NEG;
	 NEG << ADV}
-------------------------
hits: 104


 👆 This pattern, which does not impose any restrictions on the *type*
 of dependency relationship between `NEG` and `ADJ` yields duplicate results
 in the case of any "enhanced" parsing

In [24]:
hits = co.search(neg_req)
pprint(hits[:2])

[{'matching': {'edges': {'mod': {'label': 'advmod',
                                 'source': '24',
                                 'target': '23'},
                         'neg': {'label': {'1': 'neg', 'enhanced': 'yes'},
                                 'source': '24',
                                 'target': '21'}},
               'nodes': {'ADJ': '24', 'ADV': '23', 'NEG': '21'}},
  'sent_id': 'nyt_eng_20040507_0026_33'},
 {'matching': {'edges': {'mod': {'label': 'advmod',
                                 'source': '24',
                                 'target': '23'},
                         'neg': {'label': 'neg',
                                 'source': '24',
                                 'target': '21'}},
               'nodes': {'ADJ': '24', 'ADV': '23', 'NEG': '21'}},
  'sent_id': 'nyt_eng_20040507_0026_33'}]


In [25]:
pprint({f"{h['sent_id']} ~ hit {i} neg label":h['matching']['edges']['neg']['label'] for i,h in enumerate(hits[:16])})

{'nyt_eng_20040506_0003_12 ~ hit 14 neg label': {'1': 'advmod',
                                                 'enhanced': 'yes'},
 'nyt_eng_20040506_0003_12 ~ hit 15 neg label': 'advmod',
 'nyt_eng_20040506_0029_4 ~ hit 12 neg label': {'1': 'cc', 'enhanced': 'yes'},
 'nyt_eng_20040506_0029_4 ~ hit 13 neg label': 'cc',
 'nyt_eng_20040507_0002_16 ~ hit 10 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'nyt_eng_20040507_0002_16 ~ hit 11 neg label': 'neg',
 'nyt_eng_20040507_0010_6 ~ hit 8 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'nyt_eng_20040507_0010_6 ~ hit 9 neg label': 'neg',
 'nyt_eng_20040507_0011_15 ~ hit 6 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'nyt_eng_20040507_0011_15 ~ hit 7 neg label': 'neg',
 'nyt_eng_20040507_0016_19 ~ hit 4 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'nyt_eng_20040507_0016_19 ~ hit 5 neg label': 'neg',
 'nyt_eng_20040507_0020_4 ~ hit 2 neg label': {'1': 'neg', 'enhanced': 'yes'},
 'nyt_eng_20040507_0020_4 ~ hit 3 neg label': 'neg',
 'nyt_eng

 ## known `NEG` lemma with `ADJ` node as its *head/source* and the relationship type does not start with "E"
 This is to prevent issue with duplicate records for "Enhanced" versions of the dependency.

In [26]:
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', co.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: ADJ -[re"[^E].*"]-> NEG;
	 NEG << ADV}
-------------------------
hits: 52


 ...and now the duplicate hits (as well as the output structure unpredictability) have been removed:

In [27]:
pprint({f"{h['sent_id']} ~ hit {i} neg label":h['matching']['edges']['neg']['label'] for i,h in enumerate(co.search(neg_req)[:16])})

{'nyt_eng_20040505_0021_35 ~ hit 15 neg label': 'neg',
 'nyt_eng_20040505_0023_33 ~ hit 14 neg label': 'neg',
 'nyt_eng_20040505_0023_40 ~ hit 13 neg label': 'neg',
 'nyt_eng_20040505_0026_23 ~ hit 12 neg label': 'neg',
 'nyt_eng_20040505_0042_1 ~ hit 11 neg label': 'neg',
 'nyt_eng_20040505_0042_11 ~ hit 10 neg label': 'neg',
 'nyt_eng_20040505_0045_10 ~ hit 9 neg label': 'neg',
 'nyt_eng_20040505_0060_44 ~ hit 8 neg label': 'neg',
 'nyt_eng_20040506_0003_12 ~ hit 7 neg label': 'advmod',
 'nyt_eng_20040506_0029_4 ~ hit 6 neg label': 'cc',
 'nyt_eng_20040507_0002_16 ~ hit 5 neg label': 'neg',
 'nyt_eng_20040507_0010_6 ~ hit 4 neg label': 'neg',
 'nyt_eng_20040507_0011_15 ~ hit 3 neg label': 'neg',
 'nyt_eng_20040507_0016_19 ~ hit 2 neg label': 'neg',
 'nyt_eng_20040507_0020_4 ~ hit 1 neg label': 'neg',
 'nyt_eng_20040507_0026_33 ~ hit 0 neg label': 'neg'}


In [28]:
pd.json_normalize(co.count(neg_req, ["ADV.lemma", "NEG.lemma"]), sep='_').transpose().rename(columns={0:'total'}).nlargest(10,'total')

Unnamed: 0,total
as_not,11
too_not,5
always_not,5
so_not,4
that_not,2
really_not,2
more_without,2
more_never,2
yet_not,1
very_not,1


In [29]:
neg_df = pd.json_normalize(co.search(neg_req), sep='_').convert_dtypes()
neg_df.columns = (
    neg_df.columns
    .str.replace('matching_', '')
    .str.replace('nodes_', 'index_')
    .str.replace('edges_', 'dep_'))
neg_df

Unnamed: 0,sent_id,index_NEG,index_ADV,index_ADJ,dep_neg_source,dep_neg_label,dep_neg_target,dep_mod_source,dep_mod_label,dep_mod_target
0,nyt_eng_20040507_0026_33,21,23,24,24,neg,21,24,advmod,23
1,nyt_eng_20040507_0020_4,3,9,10,10,neg,3,10,advmod,9
2,nyt_eng_20040507_0016_19,5,6,7,7,neg,5,7,advmod,6
3,nyt_eng_20040507_0011_15,9,10,11,11,neg,9,11,advmod,10
4,nyt_eng_20040507_0010_6,19,20,21,21,neg,19,21,advmod,20
5,nyt_eng_20040507_0002_16,29,31,32,32,neg,29,32,advmod,31
6,nyt_eng_20040506_0029_4,2,7,8,8,cc,2,8,advmod,7
7,nyt_eng_20040506_0003_12,22,23,24,24,advmod,22,24,advmod,23
8,nyt_eng_20040505_0060_44,6,7,8,8,neg,6,8,advmod,7
9,nyt_eng_20040505_0045_10,22,23,24,24,neg,22,24,advmod,23


In [30]:
for m in neg_df.sample(8).sent_id.apply(lambda i: co.get(i).meta): 
    print(f"{m['sent_id']}:\t{m['text']}")

nyt_eng_20040507_0011_15:	As horrible as that is , it is n't as bad as being shot dead .
nyt_eng_20040503_0043_28:	 -LBQ-  Nothing is more important than talent .
nyt_eng_20040504_0006_10:	 -LBQ-  It 's everything else she writes in the e-mail after that that 's not too pleasant .
nyt_eng_20040502_0017_41:	But it was not quite enough .
nyt_eng_20040504_0010_13:	And the politics have never been more complicated or fraught with implications for the world as a whole .
nyt_eng_20040504_0041_4:	 -LBQ-  They are not as good as they will be , but as we sit here in May 2004 I can say the life of the Ohio citizen is improving .
nyt_eng_20040505_0021_35:	And the network 's once-mighty  -LBQ-  Must See TV  -RDQ-  Thursday night schedule is n't nearly as formidable .
nyt_eng_20040501_0024_7:	Kerry appeared irritated and insisted the Republican attacks were not so effective .


 ## Thoughts...
 Instead of creating new subset `.conllu` files with all matching
 sentences + their context sentences (preceding and following),
 it should be possible to take only what is needed for the context sentences
 (i.e. `sent_id` and `sent_text`, the only things put into the hit table)
 and then pull those directly from the meta info.

 The next big question, though, is whether to create the subset file at all in that case.
 **This new module essentially makes the first 2 steps of the pipeline code obsolete.**
 If this functionality had existed 2 years ago... smh 😐

 But I'm not going to rework *everything* at this point.

 ## Plan
 I will create the `advadj` subset and then run the pipeline on those.

 If I remove the context sentences, `fill_match_info` will need to be modified to not collect them
 (since anything it would pull would be incorrect).
 However, it would not necessarily need to add them at all at that point.
 I wanted to make sure I had access to what they are, but so far, I haven't used them.
 If there exists a table where they can be looked up if need be, that should suffice.
 The sentence IDs are stable and unique identifiers, so any table indexed by `sent_id`
 will be easy to connect with the existing data.

 Mock Table Schematic

 > | sent_id | doc_id | conllu_id | sent_text | prev_id | next_id | prev_text | next_text |
 > |:--------|:-------|:----------|:----------|:--------|:--------|:----------|:----------|
 > | sent_id | doc_id | conllu_id | sent_text | prev_id | next_id | prev_text | next_text |

In [31]:
hit_meta = pd.json_normalize(neg_df.sent_id.apply(lambda i: co.get(i).meta)
                             ).convert_dtypes().rename(columns={'':'newdoc_id'})
hit_meta = hit_meta.assign(conllu_id=conllu_path.stem, 
                           newdoc_id=hit_meta.newdoc_id.str.replace('# newdoc id = ', '').fillna(''))
hit_meta

Unnamed: 0,sent_id,text,newdoc_id,conllu_id
0,nyt_eng_20040507_0026_33,The performers ' sense of play softens the arc...,,nyt_eng_200405
1,nyt_eng_20040507_0020_4,"It is not , in fact , all that hard to diagnos...",,nyt_eng_200405
2,nyt_eng_20040507_0016_19,And the reviews were n't just bad ; they were ...,,nyt_eng_200405
3,nyt_eng_20040507_0011_15,"As horrible as that is , it is n't as bad as b...",,nyt_eng_200405
4,nyt_eng_20040507_0010_6,"In its simplest form , the argument goes like ...",,nyt_eng_200405
5,nyt_eng_20040507_0002_16,He knows that New York fans were disappointed ...,,nyt_eng_200405
6,nyt_eng_20040506_0029_4,But never have the stakes been as high or the ...,,nyt_eng_200405
7,nyt_eng_20040506_0003_12,He might get only $ 3.5 billion _ the payout f...,,nyt_eng_200405
8,nyt_eng_20040505_0060_44,Even those with insurance are n't always able ...,,nyt_eng_200405
9,nyt_eng_20040505_0045_10,-LBQ- So while the market risk clearly rises...,,nyt_eng_200405


In [32]:
def parse_sent_id(sent_id):
    doc_id, ordinal_str = sent_id.rsplit('_', 1)
    ordinal_int = int(ordinal_str)

    row = (sent_id, doc_id, ordinal_int)
    for context_ix in (ordinal_int + i for i in (-1, 1)): 
        c_text = ''
        c_id = ''
        #> conllu doc sentence numbering starts at 1
        if context_ix > 0:
            c_id = f'{doc_id}_{context_ix}'
            try: 
                c_obj = co.get(c_id)
            except GrewError: 
                c_id = ''
            else: 
                c_text = c_obj.meta['text']
        row += (c_id, c_text)
        
    yield _META_TUP._make(row)

In [33]:
sid = hit_meta.sent_id[0]
print(pd.DataFrame(parse_sent_id(sid)).set_index('sent_id').transpose().to_markdown())

|           | nyt_eng_20040507_0026_33                                                                                                                                                                            |
|:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| doc_id    | nyt_eng_20040507_0026                                                                                                                                                                               |
| sent_int  | 33                                                                                                                                                                                                  |
| prev_id   | nyt_eng_20040507_0026_32                                                                                                                  

In [34]:
context_info = pd.concat(pd.DataFrame(parse_sent_id(s)) for s in hit_meta.sent_id)
context_info.head()

Unnamed: 0,sent_id,doc_id,sent_int,prev_id,prev_text,next_id,next_text
0,nyt_eng_20040507_0026_33,nyt_eng_20040507_0026,33,nyt_eng_20040507_0026_32,"It remains , to some extent , fractured , held...",nyt_eng_20040507_0026_34,"With a blond wig and outsize delivery , she 's..."
0,nyt_eng_20040507_0020_4,nyt_eng_20040507_0020,4,nyt_eng_20040507_0020_3,"Over the past two weeks , Schroeder has lashed...",nyt_eng_20040507_0020_5,"This suggests that , at least for Germany , th..."
0,nyt_eng_20040507_0016_19,nyt_eng_20040507_0016,19,nyt_eng_20040507_0016_18,The disc topped hundreds of worst-of-the-year ...,nyt_eng_20040507_0016_20,Here are excerpts from a few random reviews of...
0,nyt_eng_20040507_0011_15,nyt_eng_20040507_0011,15,nyt_eng_20040507_0011_14,The soldiers under Calley 's command certainly...,nyt_eng_20040507_0011_16,"At the time , most people said they wanted Cal..."
0,nyt_eng_20040507_0010_6,nyt_eng_20040507_0010,6,nyt_eng_20040507_0010_5,It took root among thinkers in the 18th centur...,nyt_eng_20040507_0010_7,These passions overwhelm our reason and lead u...


In [35]:
meta_info = hit_meta.set_index('sent_id').join(context_info.set_index('sent_id')).rename(columns={'text':'sent_text'})
meta_info = meta_info[['conllu_id', 'doc_id', 'sent_int', 'prev_id', 'next_id', 'prev_text', 'sent_text', 'next_text']]
print(meta_info.sample(2).to_markdown())
print("\n---\n")
print(meta_info.iloc[-1, :].squeeze().to_markdown())

| sent_id                  | conllu_id      | doc_id                |   sent_int | prev_id                  | next_id                  | prev_text                                                                                                            | sent_text                                                                                                                               | next_text                                                                                                |
|:-------------------------|:---------------|:----------------------|-----------:|:-------------------------|:-------------------------|:---------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------

In [36]:
meta_info.to_csv(subset_dir.joinpath(conllu_path.stem+'+meta.psv'), sep='|')

In [37]:
neg_df.loc[:, ['sent_text', 'conllu_id', 'doc_id']] =  neg_df.sent_id.apply(
    lambda x: meta_info.loc[x, ['sent_text', 'conllu_id', 'doc_id']])
neg_df.sample(10)

Unnamed: 0,sent_id,index_NEG,index_ADV,index_ADJ,dep_neg_source,dep_neg_label,dep_neg_target,dep_mod_source,dep_mod_label,dep_mod_target,sent_text,conllu_id,doc_id
25,nyt_eng_20040504_0041_4,4,5,6,6,neg,4,6,advmod,5,"-LBQ- They are not as good as they will be ,...",nyt_eng_200405,nyt_eng_20040504_0041
27,nyt_eng_20040504_0010_33,25,28,29,29,neg,25,29,advmod,28,"As for Egypt , its delegation boasts former Un...",nyt_eng_200405,nyt_eng_20040504_0010
50,nyt_eng_20040501_0012_46,5,6,7,7,neg,5,7,advmod,6,"Besides , Google is not yet all-powerful , as ...",nyt_eng_200405,nyt_eng_20040501_0012
21,nyt_eng_20040504_0054_16,37,38,39,39,neg,37,39,advmod,38,-LBQ- They may feel that when they 're teen-...,nyt_eng_200405,nyt_eng_20040504_0054
45,nyt_eng_20040501_0029_27,24,25,26,26,neg,24,26,advmod,25,But as Williams ' role diminished with Nelson ...,nyt_eng_200405,nyt_eng_20040501_0029
44,nyt_eng_20040501_0030_40,4,5,6,6,neg,4,6,advmod,5,-LBQ- I 'm not so sure that 's what he said ...,nyt_eng_200405,nyt_eng_20040501_0030
15,nyt_eng_20040505_0021_35,15,17,18,18,neg,15,18,advmod,17,And the network 's once-mighty -LBQ- Must Se...,nyt_eng_200405,nyt_eng_20040505_0021
36,nyt_eng_20040503_0024_11,20,21,22,22,neg,20,22,advmod,21,The researchers studied one of the squid 's sm...,nyt_eng_200405,nyt_eng_20040503_0024
5,nyt_eng_20040507_0002_16,29,31,32,32,neg,29,32,advmod,31,He knows that New York fans were disappointed ...,nyt_eng_200405,nyt_eng_20040507_0002
39,nyt_eng_20040502_0041_44,3,4,5,5,neg,3,5,advmod,4,"Life is not always fair , and neither is baseb...",nyt_eng_200405,nyt_eng_20040502_0041
