In [25]:
"""original copied from https://github.com/grew-nlp/grewpy/blob/master/examples/test_corpus.py"""
import json
import os
import sys
from pathlib import Path
from pprint import pprint

import pandas as pd
# from grewpy import Graph, CorpusDraft, Request, Corpus, request_counter
from grewpy import (Corpus, 
                    # CorpusDraft, Graph, 
                    Request, request_counter)

# sys.path.insert(0, os.path.abspath(os.path.join( os.path.dirname(__file__), "../"))) # Use local grew lib


def grewpy_corpus(path):
    return Corpus(str(path))

def docs(expr):

    try:
        print(expr.__doc__)
    except AttributeError:
        print('None')

In [26]:
pud_file = Path(
    "data/corpora/gitrepo_puddin/2smallest.conll/apw_eng_199911.conllu")
pud = grewpy_corpus(str(pud_file))

print("\n=============== len ===============")
print(f"sentence count in {pud_file.name} = {len(pud)}")
print(request_counter())

# print("\n=============== Get one graph ===============")
# sent_id = "apw_eng_19991101_0001_1"
# graph = pud[sent_id]
# print(f"nb of nodes of {sent_id} = ", len(graph))
# print(request_counter())

print(f"len(pud[0]) = {len(pud[0])}")
print(f"len(pud[-1]) = {len(pud[-1])}")
print(f"[len(g) for g in pud[-3:]] = {[len(g) for g in pud[-3:]]}")
# other forms pud[-3:-1], pud[1:7:2], ...


# dprint ("\n=============== Iteration on graphs of a corpus ===============")
# print ("⚠️  generate one request to Grew backend for each graph")
# acc = 0
# for sent_id in pud.get_sent_ids():
#   acc += len(pud[sent_id])
# print(f"nb of nodes in {pud_file} = ", acc)
# print (request_counter())


sentence count in apw_eng_199911.conllu = 1147
199
len(pud[0]) = 33
len(pud[-1]) = 10
[len(g) for g in pud[-3:]] = [9, 31, 10]


In [27]:
print("\n=============== Count request in a corpus ===============")
# upos="ADV"
# req = Request(f"X[upos={upos}]")
for xpos in ("RB.*", "JJ.*"):
    # xpos="RB.*"
    if xpos.startswith('RB'):
        print('# ADVERBS')
    elif xpos.startswith('JJ'):
        print('# ADJECTIVES')

    req = Request(f'X[xpos=re"{xpos}"]')

    print(" ----- basic count -----")
    print(f"total {xpos} in {pud_file.name} = ", pud.count(req))

    print(" ----- count with clustering -----")
    print(f"total {xpos} in {pud_file.name}, clustered by exact POS:")
    # print(request_counter())
    print(pd.Series(pud.count(req, ["X.xpos"])).to_frame().rename(columns={0:'total'}).to_markdown(), '\n')


# ADVERBS
 ----- basic count -----
total RB.* in apw_eng_199911.conllu =  840
 ----- count with clustering -----
total RB.* in apw_eng_199911.conllu, clustered by exact POS:
|     |   total |
|:----|--------:|
| RBS |      10 |
| RBR |      32 |
| RB  |     798 | 

# ADJECTIVES
 ----- basic count -----
total JJ.* in apw_eng_199911.conllu =  1809
 ----- count with clustering -----
total JJ.* in apw_eng_199911.conllu, clustered by exact POS:
|     |   total |
|:----|--------:|
| JJS |      55 |
| JJR |      66 |
| JJ  |    1688 | 



In [28]:
#* ALL bigrams
req = Request('ADJ [xpos=re"JJ.?"];'
              'mod: ADJ -[advmod]-> ADV;'
              'ADV < ADJ'
              )

In [29]:
print("\n=============== Count `ADV ADJ` bigrams ===============")
print(f"total `ADV ADJ` bigrams in {pud_file.name}: {pud.count(req)}")
print("\n----- count with clustering -----")
print(f"`ADV ADJ` bigrams in {pud_file.name}, clustered by ADV lemma:")
# print(json.dumps(pud.count(req, ["ADV.lemma"]), indent=2))
pd.Series(pud.count(req, ["ADV.lemma"])).to_frame().reset_index().rename(
    columns={'index':'adverb', 0:'total_bigrams'}
    ).sort_values('total_bigrams', ascending=False)


total `ADV ADJ` bigrams in apw_eng_199911.conllu: 80

----- count with clustering -----
`ADV ADJ` bigrams in apw_eng_199911.conllu, clustered by ADV lemma:


Unnamed: 0,adverb,total_bigrams
15,more,12
14,most,9
2,too,7
1,very,6
4,so,5
31,as,5
23,how,3
6,reportedly,3
7,really,3
16,long,2


In [30]:
print(f"Top 10 `ADV ADJ` bigrams in {pud_file.name}")
pd.json_normalize(pud.count(req, ["ADV.lemma", "ADJ.lemma"]), sep='_').transpose(
    ).rename(columns={0:'total'}).nlargest(10, 'total')

Top 10 `ADV ADJ` bigrams in apw_eng_199911.conllu


Unnamed: 0,total
reportedly_close,3
how_much,3
very_long,2
too_high,2
so_bad,2
most_valuable,2
most_important,2
more_expensive,2
more_difficult,2
long_admired,2


In [31]:
match_list = pud.search(req)

print("\n=============== `ADV ADJ` bigram match info ===============")
pd.json_normalize(match_list, sep='_')#.transpose().rename(columns={0:'total'}).nlargest(10, 'total')




Unnamed: 0,sent_id,matching_nodes_ADV,matching_nodes_ADJ,matching_edges_mod_source,matching_edges_mod_label,matching_edges_mod_target
0,apw_eng_19991101_0059_18,23,24,24,advmod,23
1,apw_eng_19991101_0059_16,4,5,5,advmod,4
2,apw_eng_19991101_0059_15,5,6,6,advmod,5
3,apw_eng_19991101_0059_5,28,29,29,advmod,28
4,apw_eng_19991101_0059_5,31,32,32,advmod,31
...,...,...,...,...,...,...
75,apw_eng_19991101_0006_20,28,29,29,advmod,28
76,apw_eng_19991101_0006_8,10,11,11,advmod,10
77,apw_eng_19991101_0005_14,9,10,10,advmod,9
78,apw_eng_19991101_0002_19,13,14,14,advmod,13


In [32]:
def gen_conllus(match_list, corpus):
    
    # for sent in match_list:
    for i, sent in enumerate(match_list):
        parse = corpus.get(sent['sent_id'])
        if i < 3:
            print(parse.to_conll())
        yield parse.to_conll()+'\n'

In [33]:
conllu_gen = gen_conllus(match_list, pud)

test_out_conllu_path = pud_file.with_name(f'pat-match_{pud_file.name}')
test_out_conllu_path.write_text('\n'.join(conllu_gen), encoding='utf8')

# sent_id = apw_eng_19991101_0059_18
# text = Katz , an adviser to local governments and an expert on stadium financing , has tried to position himself as a moderate best able to build on the economic recovery generated under popular two-term Mayor Edward G. Rendell .
1	Katz	Katz	_	NNP	_	16	nsubj	16:nsubj	_
2	,	,	_	,	_	0	-	0:-	_
3	an	a	_	DT	_	4	det	4:det	_
4	adviser	adviser	_	NN	_	1	appos	1:appos	_
5	to	to	_	TO	_	4	prep	4:prep	_
6	local	local	_	JJ	_	7	amod	7:amod	_
7	governments	government	_	NNS	_	5	pobj	5:pobj	_
8	and	and	_	CC	_	4	cc	4:cc	_
9	an	a	_	DT	_	10	det	10:det	_
10	expert	expert	_	NN	_	4	conj	4:conj	_
11	on	on	_	IN	_	10	prep	10:prep	_
12	stadium	stadium	_	NN	_	13	nn	13:nn	_
13	financing	financing	_	NN	_	11	pobj	11:pobj	_
14	,	,	_	,	_	0	-	0:-	_
15	has	has	_	AUXZ	_	16	dep	16:dep	_
16	tried	try	_	VBN	_	0	root	0:root	_
17	to	to	_	TO	_	18	aux	18:aux	_
18	position	position	_	VB	_	16	xcomp	16:xcomp	_
19	himself	himself	_	PRP	_	18	dobj	18:dobj	_
20	as	as	_	IN	_	18	prep	18:prep	_
21	a	

101831

In [34]:
advadj_subset = grewpy_corpus(str(test_out_conllu_path))

In [35]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    # 'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ}
-------------------------
hits: 80


In [36]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    'NEG [lemma="not"];'
    # 'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"]}
-------------------------
hits: 14


In [37]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    # 'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"]}
-------------------------
hits: 24


In [38]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 NEG << ADV}
-------------------------
hits: 20


In [39]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: ADJ -> NEG;
	 NEG << ADV}
-------------------------
hits: 6


In [40]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    # 'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: NEG -> ADJ;
	 NEG << ADV}
-------------------------
hits: 0


In [41]:
#* Negated
neg_req = Request(
    'ADJ [xpos=re"JJ.?"];'
    'mod: ADJ -[advmod]-> ADV;'
    'ADV < ADJ;'
    # 'NEG [lemma="not"];'
    'NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];'
    'neg: ADJ -[re"[^E].*"]-> NEG;'
    # 'neg: ADJ -> NEG;'
    # 'neg: NEG -> ADJ;'
    'NEG << ADV;'
    )
print(str(neg_req).replace(';', ';\n\t '))
print('-------------------------\nhits:', pud.count(neg_req))

pattern {ADJ [xpos=re"JJ.?"];
	 mod: ADJ -[advmod]-> ADV;
	 ADV < ADJ;
	 NEG [lemma="not"|"hardly"|"scarcely"|"never"|"rarely"|"barely"|"seldom"|"no"|"nothing"|"none"|"nobody"|"neither"|"without"|"few"|"nor"];
	 neg: ADJ -[re"[^E].*"]-> NEG;
	 NEG << ADV}
-------------------------
hits: 3


In [42]:
pd.json_normalize(pud.count(neg_req, ["ADV.lemma", "NEG.lemma"]), sep='_')

Unnamed: 0,so_not,fully_not
0,2,1


In [43]:
pd.json_normalize(pud.count(neg_req, ["ADV.lemma", "NEG.lemma"]), sep='_').transpose().rename(columns={0:'total'})

Unnamed: 0,total
so_not,2
fully_not,1


In [44]:
pd.json_normalize(pud.search(neg_req), sep='_')

Unnamed: 0,sent_id,matching_nodes_NEG,matching_nodes_ADV,matching_nodes_ADJ,matching_edges_neg_source,matching_edges_neg_label,matching_edges_neg_target,matching_edges_mod_source,matching_edges_mod_label,matching_edges_mod_target
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14


In [45]:
neg_df = pd.json_normalize(pud.search(neg_req), sep='_')
neg_df

Unnamed: 0,sent_id,matching_nodes_NEG,matching_nodes_ADV,matching_nodes_ADJ,matching_edges_neg_source,matching_edges_neg_label,matching_edges_neg_target,matching_edges_mod_source,matching_edges_mod_label,matching_edges_mod_target
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14


In [46]:
neg_df.sent_id.apply(lambda i: pud.get(i).meta['text'])

0    It was not clear why the Ingush border was not...
1    AT & claims that limiting the number of access...
2    AT & claims that limiting the number of access...
Name: sent_id, dtype: object

In [47]:
for m in neg_df.sent_id.apply(lambda i: pud.get(i).meta): 
    pprint(m)

{'sent_id': 'apw_eng_19991101_0031_5',
 'text': 'It was not clear why the Ingush border was not fully open .'}
{'sent_id': 'apw_eng_19991101_0025_10',
 'text': 'AT & claims that limiting the number of access providers may not be '
         'so bad , because a company that can plan its investment in all the '
         'equipment it requires to run the Internet through TV cables can '
         'offer service faster , cheaper and more efficiently .'}
{'sent_id': 'apw_eng_19991101_0021_10',
 'text': 'AT & claims that limiting the number of access providers may not be '
         'so bad , because a company that can plan its investment in all the '
         'equipment it requires to run the Internet through TV cables can '
         'offer service faster , cheaper and more efficiently .'}


In [48]:
pd.json_normalize(neg_df.sent_id.apply(lambda i: pud.get(i).meta), sep='_')

Unnamed: 0,sent_id,text
0,apw_eng_19991101_0031_5,It was not clear why the Ingush border was not...
1,apw_eng_19991101_0025_10,AT & claims that limiting the number of access...
2,apw_eng_19991101_0021_10,AT & claims that limiting the number of access...


In [49]:
pd.DataFrame(neg_df.sent_id.apply(lambda i: pud.get(i).meta))

Unnamed: 0,sent_id
0,"{'sent_id': 'apw_eng_19991101_0031_5', 'text':..."
1,"{'sent_id': 'apw_eng_19991101_0025_10', 'text'..."
2,"{'sent_id': 'apw_eng_19991101_0021_10', 'text'..."


In [50]:
hit_meta = pd.json_normalize(neg_df.sent_id.apply(lambda i: pud.get(i).meta)).convert_dtypes()
hit_meta

Unnamed: 0,sent_id,text
0,apw_eng_19991101_0031_5,It was not clear why the Ingush border was not...
1,apw_eng_19991101_0025_10,AT & claims that limiting the number of access...
2,apw_eng_19991101_0021_10,AT & claims that limiting the number of access...


In [51]:
neg_df = neg_df.assign(sent_text=hit_meta.text, 
                    #    new_doc_id=hit_meta['']
                       )
neg_df

Unnamed: 0,sent_id,matching_nodes_NEG,matching_nodes_ADV,matching_nodes_ADJ,matching_edges_neg_source,matching_edges_neg_label,matching_edges_neg_target,matching_edges_mod_source,matching_edges_mod_label,matching_edges_mod_target,sent_text
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11,It was not clear why the Ingush border was not...
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14,AT & claims that limiting the number of access...
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14,AT & claims that limiting the number of access...


In [55]:
neg_df = pd.json_normalize(pud.search(neg_req), sep='_').convert_dtypes()
neg_df.columns = neg_df.columns.str.replace('matching_', '')
neg_df

Unnamed: 0,sent_id,nodes_NEG,nodes_ADV,nodes_ADJ,edges_neg_source,edges_neg_label,edges_neg_target,edges_mod_source,edges_mod_label,edges_mod_target
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14


In [56]:
hit_meta = pd.json_normalize(neg_df.sent_id.apply(lambda i: pud.get(i).meta)).convert_dtypes()
hit_meta

Unnamed: 0,sent_id,text
0,apw_eng_19991101_0031_5,It was not clear why the Ingush border was not...
1,apw_eng_19991101_0025_10,AT & claims that limiting the number of access...
2,apw_eng_19991101_0021_10,AT & claims that limiting the number of access...


In [58]:
neg_df = neg_df.assign(sent_text=hit_meta.text, 
                       doc_id=neg_df.sent_id.str.rsplit('_', 1).str.get(0), 
                       conllu_id=pud_file.stem
                    #    new_doc_id=hit_meta['']
                       )
neg_df

Unnamed: 0,sent_id,nodes_NEG,nodes_ADV,nodes_ADJ,edges_neg_source,edges_neg_label,edges_neg_target,edges_mod_source,edges_mod_label,edges_mod_target,sent_text,doc_id,conllu_id
0,apw_eng_19991101_0031_5,10,11,12,12,neg,10,12,advmod,11,It was not clear why the Ingush border was not...,apw_eng_19991101_0031,apw_eng_199911
1,apw_eng_19991101_0025_10,12,14,15,15,neg,12,15,advmod,14,AT & claims that limiting the number of access...,apw_eng_19991101_0025,apw_eng_199911
2,apw_eng_19991101_0021_10,12,14,15,15,neg,12,15,advmod,14,AT & claims that limiting the number of access...,apw_eng_19991101_0021,apw_eng_199911
