In [1]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser
from nltk.corpus import brown
from tqdm import tqdm

In [14]:
# Define your grammar
grammar = r"""
    NP: {<DT>?<JJ.*>*<NN.*>+|<NNPS>+}        # Noun Phrase: optional determiner, any number of adjectives, singular or plural nouns (including proper nouns)
        {<PRP>|<PRP$>}                       # Personal pronouns or possessive pronouns
        {<NP><POS><NN.*>+|<NNP>+|<NNPS>+}    # Noun phrases that use possessive endings or include proper nouns (singular/plural)
        {<EX><VB.*>}                         # Existential there constructions, e.g., "There is" or "There are"
        {<NNP>+|<NNPS>+}                     # Proper noun sequences, potentially forming proper noun phrases (singular/plural)
    VP: {<MD>?<VB.*>+<RB.*>*}                # Verb Phrase: optional modal, one or more verbs (including past tense), any number of adverbs
        {<TO><VB>}                           # Infinitive verbs
        {<VBD><RB.*>*}                       # Past tense verb followed by any number of adverbs
        {<VBG><RB.*>*}                       # Present participle verb followed by any number of adverbs
        {<VBP><RB.*>*}                       # Non-3rd person singular present verb followed by any number of adverbs
        {<VBN><RB.*>*}                       # Past participle verb followed by any number of adverbs
        {<VBZ><RB.*>*}                       # 3rd person singular present verb followed by any number of adverbs
    ADJP: {<RB.*>*<JJ.*>+}                   # Adjective Phrase: any number of adverbs followed by one or more adjectives
        {<JJR>|<JJS>}                        # Comparative and Superlative adjectives directly
    ADVP: {<RB.*>+}                          # Adverb Phrase: one or more adverbs
        {<RBR>|<RBS>}                        # Comparative and superlative adverbs
    PP: {<IN><NP>}                           # Prepositional Phrase: preposition followed by a noun phrase
    CONJP: {<CC>}                            # Conjunction Phrase: coordinating conjunction
    INTJ: {<UH>}                             # Interjection
    DP: {<DT>|<PDT>|<WDT>|<EX>}              # Determiner Phrase: determiners, pre-determiners, wh-determiners, or existential "there"
    QP: {<CD><NNS|NN>?}                      # Quantifier Phrase: cardinal number followed optionally by plural or singular noun
    WHNP: {<WP>|<WP$>|<WRB>|<WDT>}           # WH Noun Phrase: wh-pronoun, possessive wh-pronoun, wh-adverb, or wh-determiner
    SYMP: {<SYM>}                            # Symbol Phrase: handling symbols
    CD-NP: {<CD><JJ.*>*<NN.*>+}              # Number followed by adjectives and nouns
    PAST-VP: {<VBD><RB.*>*}                  # Past tense verb followed by any number of adverbs
    PRES-VP: {<VBG><RB.*>*}                  # Present participle verb followed by any number of adverbs
    NON3RD-VP: {<VBP><RB.*>*}                # Non-3rd person singular present verb followed by any number of adverbs
    PERFECT-VP: {<VBN><RB.*>*}               # Past participle verb followed by any number of adverbs
    THIRD-PERSON-VP: {<VBZ><RB.*>*}          # 3rd person singular present verb followed by any number of adverbs
    LIST-NP: {<LS><NP>+}                     # List item markers followed by Noun Phrases, e.g., in enumerated lists
"""

# s = 'My dog with a broken leg I not want'
# tokens = word_tokenize(s)
# tagged_tokens = pos_tag(tokens)
# tree = cp.parse(tagged_tokens)
# tree.draw()

parent_child = {}

# Create a chunk parser
cp = nltk.RegexpParser(grammar)

sentences = brown.sents()
for sentence in tqdm(sentences):

    tagged_tokens = pos_tag(sentence)
    tree = cp.parse(tagged_tokens)
    # tree.draw()
    traverse_tree(tree)

100%|██████████| 57340/57340 [00:32<00:00, 1763.82it/s]


In [15]:
parent_child

{'NP': {'DT',
  'EX',
  'JJ',
  'JJR',
  'JJS',
  'NN',
  'NNP',
  'NNPS',
  'NNS',
  'PRP',
  'VB',
  'VBD',
  'VBG',
  'VBN',
  'VBP',
  'VBZ'},
 'VP': {'MD', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'},
 'PP': {'IN', 'NP'},
 'ADVP': {'RB', 'RBR', 'RBS'},
 'DP': {'DT', 'EX', 'PDT', 'WDT'},
 'CONJP': {'CC'},
 'ADJP': {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'},
 'QP': {'CD'},
 'WHNP': {'WP', 'WRB'},
 'INTJ': {'UH'},
 'SYMP': {'SYM'},
 'LIST-NP': {'LS', 'NP'}}

In [11]:
def traverse_tree(tree):
    for subtree in tree:
        if type(subtree) == nltk.tree.Tree:
            parent = subtree.label()
            if parent not in parent_child:
                parent_child[parent] = set()
            for child in subtree:
                if type(child) == nltk.tree.Tree:
                    parent_child[parent].add(child.label())
                else:
                    parent_child[parent].add(child[1])
            traverse_tree(subtree)


In [58]:
tagged_tokens = pos_tag(sentences[10])
print(tagged_tokens)
tree = cp.parse(tagged_tokens)

tree.draw()

[('It', 'PRP'), ('urged', 'VBD'), ('that', 'IN'), ('the', 'DT'), ('city', 'NN'), ('``', '``'), ('take', 'VB'), ('steps', 'NNS'), ('to', 'TO'), ('remedy', 'VB'), ("''", "''"), ('this', 'DT'), ('problem', 'NN'), ('.', '.')]


: 