# Lecture 8: Parsing

## Setup

In [1]:
# !pip install benepar
# !pip install svgling

In [2]:
#setup
#%matplotlib notebook
import pandas as pd
import spacy
from spacy import displacy
import benepar
import nltk

from collections import Counter


# Load the case table into a DataFrame; the path is relative to the
# current working directory of the kernel.
df = pd.read_csv('death-penalty-cases.csv')

2023-10-25 19:39:36.038271: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Display the loaded table (pandas elides the middle rows in the output).
df

Unnamed: 0,court_id,author_id,state,year,dateFiled,citeCount,snippet
0,nj,,NJ,1965,1965-09-14T00:00:00Z,8,N.J. ( )\n A. d \nIN RE WAIVER OF DEATH PE...
1,fla,4019.0,FL,1973,1973-07-26T00:00:00Z,552,"whether the death penalty is, per se, unconsti..."
2,texcrimapp,5765.0,TX,1975,1975-04-16T00:00:00Z,143,# ;s contention that the assessment of the dea...
3,nm,,NM,2009,2009-11-30T00:00:00Z,0,. d ( )\n -NMSC- \nIN THE MATTER OF DEATH PE...
4,texcrimapp,5758.0,TX,1944,1944-12-20T00:00:00Z,56,assume the district attorney orally waived the...
...,...,...,...,...,...,...,...
32562,ohioctapp,8055.0,OH,2017,2017-07-20T00:00:00Z,0,of two counts of aggravated murder with deat...
32563,cal,,CA,2017,2017-07-20T00:00:00Z,0,his general views about the death penalty as ...
32564,neb,,NE,2017,2017-07-21T00:00:00Z,0,"been subject to the death\npenalty, because Ne..."
32565,ohio,5374.0,OH,2017,2017-07-25T00:00:00Z,0,that Indiana law permits imposition of the de...


## Dependency Parsing with SpaCy

Let's first look at one example:

In [4]:
# Load the small English spaCy pipeline, then run it on one example sentence.
nlp = spacy.load('en_core_web_sm')
text = 'Science cannot solve the ultimate mystery of nature.'
doc = nlp(text)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Render the dependency parse of `doc` with displaCy (style="dep").
displacy.render(doc, style="dep")

In [6]:
# For every sentence, show its text, its syntactic root and the root's
# direct dependents together with their dependency labels.
# The loop variable keeps the name `sent` because the following cell reads it.
for sent in doc.sents:
    print("sentence:", sent)
    root = sent.root
    print("root:", root)
    labelled_children = [(child, child.dep_) for child in root.children]
    print(labelled_children)
    print()

sentence: Science cannot solve the ultimate mystery of nature.
root: solve
[(Science, 'nsubj'), (can, 'aux'), (not, 'neg'), (mystery, 'dobj'), (., 'punct')]



In [7]:
# Bind the sentence explicitly instead of relying on the loop variable that
# leaked out of the previous cell (the example document has one sentence).
sent = list(doc.sents)[0]

# current sentence
print(sent)
# syntactic root of the sentence (the main verb here)
print(sent.root)
# all direct dependents of the root
print(list(sent.root.children))
# Left children: dependents that precede the root in the sentence
print(list(sent.root.lefts))
# Right children: dependents that follow the root in the sentence
print(list(sent.root.rights))
# first token
print(sent[0])
# first token dependency label (nsubj = nominal subject for this sentence)
print(sent[0].dep_)
# head of the first token, i.e. the token it depends on
print(sent[0].head)

Science cannot solve the ultimate mystery of nature.
solve
[Science, can, not, mystery, .]
[Science, can, not]
[mystery, .]
Science
nsubj
solve


## Constituency Parsing with SpaCy (benepar)

In [8]:
# Make sure the benepar English model is available locally.
benepar.download('benepar_en3')

# Fresh small English pipeline with the benepar component appended.
# Note: this rebinds the notebook-wide `nlp` variable.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

doc = nlp('Science cannot solve the ultimate mystery of nature.')
sentences = list(doc.sents)
sent = sentences[0]

# Bracketed constituency parse of the sentence
print(sent._.parse_string)
# Constituent label(s) of the sentence span itself
print(sent._.labels)
# First child constituent of the sentence
constituents = list(sent._.children)
print(constituents[0])
# Optional (presumably builds an nltk tree from the bracketed string):
#nltk.Tree.fromstring(sent._.parse_string)

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /Users/abombelli/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(S (NP (NN Science)) (VP (MD can) (RB not) (VP (VB solve) (NP (NP (DT the) (JJ ultimate) (NN mystery)) (PP (IN of) (NP (NN nature)))))) (. .))
('S',)
Science




# Application
## Unsupervised Discovery of Gendered Language through Latent-Variable Modeling

[Hoyle et al. (2019)](https://www.aclweb.org/anthology/P19-1167/) study the language use of gendered nouns and proceed to train a generative latent-variable model that jointly represents adjective (or verb) choice, with its sentiment given the (natural) gender of a noun. To this end, they extract noun–adjective pairs, NSUBJ–verb pairs and DOBJ–verb pairs.

In the following, we show how to extract NSUBJ-verb pairs from text.

In [9]:
# Show the full table once more before it is subsampled in the next cell.
df

Unnamed: 0,court_id,author_id,state,year,dateFiled,citeCount,snippet
0,nj,,NJ,1965,1965-09-14T00:00:00Z,8,N.J. ( )\n A. d \nIN RE WAIVER OF DEATH PE...
1,fla,4019.0,FL,1973,1973-07-26T00:00:00Z,552,"whether the death penalty is, per se, unconsti..."
2,texcrimapp,5765.0,TX,1975,1975-04-16T00:00:00Z,143,# ;s contention that the assessment of the dea...
3,nm,,NM,2009,2009-11-30T00:00:00Z,0,. d ( )\n -NMSC- \nIN THE MATTER OF DEATH PE...
4,texcrimapp,5758.0,TX,1944,1944-12-20T00:00:00Z,56,assume the district attorney orally waived the...
...,...,...,...,...,...,...,...
32562,ohioctapp,8055.0,OH,2017,2017-07-20T00:00:00Z,0,of two counts of aggravated murder with deat...
32563,cal,,CA,2017,2017-07-20T00:00:00Z,0,his general views about the death penalty as ...
32564,neb,,NE,2017,2017-07-21T00:00:00Z,0,"been subject to the death\npenalty, because Ne..."
32565,ohio,5374.0,OH,2017,2017-07-25T00:00:00Z,0,that Indiana law permits imposition of the de...


In [10]:
# Work on a random subset of 2000 snippets (presumably to keep parsing time
# manageable). The fixed random_state makes the sample, and therefore the
# pair counts computed below, reproducible across kernel restarts.
df = df.sample(n=2000, random_state=42)

# Parse every snippet. `nlp` is whichever pipeline was bound last when this
# cell runs; in a top-to-bottom run that is en_core_web_sm with benepar added.
# Only dependency labels, heads and lemmas are read from the result later on.
df["processed"] = df["snippet"].apply(lambda x: nlp(x))




In [11]:
def extract_subject_verb_pairs(sent):
    """Return (subject lemma, head lemma) pairs for all nominal subjects.

    Parameters
    ----------
    sent : iterable of tokens
        Any iterable whose items expose ``dep_``, ``lemma_`` and ``head``
        (in this notebook it is called with a whole parsed document).

    Returns
    -------
    list of (str, str)
        One lower-cased ``(subject, head)`` lemma pair per token whose
        dependency label is exactly ``"nsubj"``, in token order.
    """
    return [
        (token.lemma_.lower(), token.head.lemma_.lower())
        for token in sent
        if token.dep_ == "nsubj"
    ]

# Extract the subject-verb pairs of every parsed snippet into a new column.
df["subj-verb-pairs"] = df["processed"].apply(extract_subject_verb_pairs)

In [12]:
# most common pairs
counter = Counter()
for item in df["subj-verb-pairs"]:
    counter.update(item)
    
for pair, counts in counter.most_common(n=25):
    print (pair, counts) # -pron- is a pronoun

('penalty', 'be') 197
('state', 'seek') 166
('it', 'be') 90
('this', 'be') 60
('he', 'be') 58
('statute', 'be') 50
('jury', 'find') 49
('court', 'find') 45
('defendant', 'be') 44
('defendant', 'eligible') 38
('jury', 'recommend') 36
('court', 'impose') 35
('he', 'receive') 29
('that', 'be') 29
('court', 'hold') 28
('case', 'be') 28
('state', 'waive') 28
('court', 'sentence') 26
('jury', 'impose') 25
('we', 'have') 21
('we', 'find') 20
('state', 'file') 20
('defendant', 'argue') 20
('imposition', 'be') 20
('who', 'be') 19


In [13]:
# One-time setup for coreferee (uncomment and run once per environment):
#!pip install coreferee
#!python3 -m spacy download en_core_web_trf en_core_web_lg
#!python3 -m coreferee install en

# install coreference resolution for spacy
# (neuralcoref alternative; not used here because it needs spaCy 2, see the last cell)
# !pip install neuralcoref --no-binary neuralcoref
import spacy_transformers
import coreferee
# Rebind `nlp` to the transformer-based English pipeline and append the
# coreferee component; add_pipe returns the component, which is the cell output.
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x7ff4a6709d00>

In [14]:
# Coreference Resolution
doc = nlp(u'My sister has a dog. She loves him.')
doc._.coref_chains.print()
print(doc._.coref_chains.resolve(doc[8]))

0: sister(1), She(6)
1: dog(4), him(8)
[dog]


In [15]:
# For neuralcoref to work you should downgrade spacy to version 2
# For this reason, we will not be running it today
#!pip install spacy==2.3.5

#!pip install neuralcoref --no-binary neuralcoref
#!python3 -m spacy download en_core_web_md

#import neuralcoref
#neuralcoref.add_to_pipe(nlp)

#df["corefs_resolved"] = df["snippet"].apply(lambda x: nlp(x))

#def extract_subject_verb_pairs_coref(sent):
#    subjs = [w for w in sent if w.dep_ == "nsubj"]
#    pairs = []
#    for w in subjs:
#        # either a subject is part of a coreference chain, then we need to resolve the chain
#        if w._.in_coref:
#            cluster = w._.coref_clusters[0]
#            lemma = cluster.main.root.lemma_.lower()
#            pairs.append((lemma, w.head.lemma_.lower()))
#        # if it's not, we can just do the same as above
#        else:
#            pairs.append((w.lemma_.lower(), w.head.lemma_.lower()))
#    return pairs


#df["subj-verb-pairs-coref"] = df["corefs_resolved"].apply(lambda x: extract_subject_verb_pairs_coref(x))
#counter = Counter()
#for item in df["subj-verb-pairs-coref"]:
#    counter.update(item)
    
#for pair, counts in counter.most_common(n=25):
#    print (pair, counts)

# verbs used with defendant

#for (subject, verb), counts in counter.most_common():
#    if subject == "defendant" and counts > 1:
#        print (subject, verb, counts)

# verbs used with jury

#for (subject, verb), counts in counter.most_common():
#    if subject == "jury" and counts > 1:
#        print (subject, verb, counts)