In [None]:
import re
from typing import Callable, List, Optional, Tuple

import nltk
from nltk.corpus import wordnet as wn

# Question (a)
WordNet exploration.

In [2]:
# Collect every WordNet synset that lists 'sink' among its lemmas
sink_synsets = wn.synsets("sink")
synset_count = len(sink_synsets)
print(f"There are {synset_count} synsets for 'sink':")

# Show them one per line, in the order WordNet returned them
for synset in sink_synsets:
    print(synset)

There are 13 synsets for 'sink':
Synset('sink.n.01')
Synset('sink.n.02')
Synset('sinkhole.n.01')
Synset('cesspool.n.01')
Synset('sink.v.01')
Synset('sink.v.02')
Synset('sink.v.03')
Synset('sink.v.04')
Synset('sink.v.05')
Synset('dip.v.08')
Synset('slump.v.03')
Synset('slump.v.02')
Synset('bury.v.05')


In [3]:
# First noun sense of 'sink' (sink.n.01) and its immediate hypernyms
synset_noun_sink = wn.synset("sink.n.01")
sink_direct_hypernyms = synset_noun_sink.hypernyms()
sink_direct_hypernyms

[Synset('plumbing_fixture.n.01')]

In [14]:
# Direct troponyms of the first verb sense of 'drink' (drink.v.01);
# they are retrieved through the synset's hyponyms() relation
synset_verb_drink = wn.synset("drink.v.01")
drink_direct_troponyms = synset_verb_drink.hyponyms()
drink_direct_troponyms

[Synset('drain_the_cup.v.01'),
 Synset('guggle.v.03'),
 Synset('suck.v.01'),
 Synset('toss_off.v.02'),
 Synset('lap.v.04'),
 Synset('gulp.v.01'),
 Synset('sip.v.01'),
 Synset('swill.v.02'),
 Synset('guzzle.v.01')]

In [15]:
# Lowest common hypernym of the first noun senses of 'dog' and 'insect'
synset_noun_dog, synset_noun_insect = wn.synset("dog.n.01"), wn.synset("insect.n.01")
dog_insect_common_hypernyms = synset_noun_dog.lowest_common_hypernyms(
    synset_noun_insect
)
dog_insect_common_hypernyms

[Synset('animal.n.01')]

In [16]:
# Named individuals recorded as instances of the first noun sense of 'astronaut'
synset_noun_astronaut = wn.synset("astronaut.n.01")
astronaut_instances = synset_noun_astronaut.instance_hyponyms()
astronaut_instances

[Synset('gagarin.n.01'),
 Synset('armstrong.n.01'),
 Synset('glenn.n.01'),
 Synset('shepard.n.02'),
 Synset('tereshkova.n.01')]

# Question (b)
Implement Hearst Pattern matching.

In [5]:
def identify_noun_phrases(sentence: str) -> List[str]:
    """
    Identify noun phrases in a sentence using NLTK POS tagging and chunking.

    The sentence is tokenized, POS-tagged and chunked with a one-rule
    grammar: an optional determiner, any number of adjectives, then one or
    more nouns. Adjective and noun tags are matched by prefix, so
    comparative/superlative adjectives and plural or proper nouns are
    accepted, and a run of consecutive nouns stays in a single phrase.

    Args:
        sentence: Raw text to search for noun phrases.

    Returns:
        The noun phrases in order of appearance, each as one space-joined
        string. The list is empty when no token sequence fits the grammar.

    Note:
        Relies on the NLTK tokenizer and POS-tagger data packages being
        installed (see nltk.download).
    """
    print("Extracting NPs..")
    # Tokenize, then tag every token with its part of speech
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    # <NN.*>+ rather than <NN>: plural heads such as "animals" (NNS) or
    # proper nouns (NNP) would otherwise never be chunked as an NP.
    np_grammar = r"NP: {<DT>?<JJ.*>*<NN.*>+}"
    chunk_parser = nltk.RegexpParser(np_grammar)
    chunk_tree = chunk_parser.parse(pos_tags)

    # Each NP subtree's leaves are (word, tag) pairs; keep only the words
    return [
        " ".join(word for word, _tag in subtree.leaves())
        for subtree in chunk_tree.subtrees()
        if subtree.label() == "NP"
    ]


def extract_hypernyms(
    sentence: str,
    pattern_type: int,
    np_chunker: Optional[Callable[[str], List[str]]] = None,
) -> Tuple[Optional[List[str]], Optional[List[str]]]:
    """
    Extracts candidate hypernyms from a sentence using Hearst Pattern regex matching.

    Two patterns are supported:

    * pattern_type == 1: ``NP {,} especially NP {, NP}* {{,} or|and NP}``
      (the hypernym precedes "especially").
    * any other value:   ``NP {, NP}* {,} and other NP``
      (the hypernym follows "and other").

    For matching purposes a "noun phrase" is one or two whitespace-separated
    tokens; a token is either hyphenated ("jazz-like") or at least two word
    characters long, and the conjunctions "and"/"or" are never tokens.

    Args:
        sentence: Text to search. Only the first (leftmost) match is used.
        pattern_type: 1 selects the "especially" pattern; anything else
            selects the "and other" pattern.
        np_chunker: Optional callable that maps the matched hypernym phrase
            to a list of noun phrases. Defaults to identify_noun_phrases.

    Returns:
        ``(hypernym, hyponyms)``, both lists of strings. ``hypernym`` is
        the chunker's output for the matched hypernym phrase, or a
        one-element list holding the raw phrase when the chunker finds
        nothing. ``hyponyms`` are the raw list items from the match.
        ``(None, None)`` is returned when the pattern does not occur.
    """
    print(f"Pattern type {pattern_type} chosen...")

    # Each token can be matched in exactly one way. An ambiguous token
    # pattern inside a repeated list makes failed searches backtrack
    # exponentially on long sentences. The lookahead keeps "and"/"or" out
    # of noun phrases so they are always treated as list separators.
    word = r"\b(?!(?:and|or)\b)(?:\w+-\w+|\w{2,})"
    noun_phrase = rf"{word}(?:\s{word})?"

    print("Extracting hypernyms using regex...")
    if pattern_type == 1:
        # Items are split by ", ", ", and ", ", or ", " and " or " or ".
        # Non-capturing groups only, so re.split returns just the items.
        item_separator = r", (?:(?:or|and) )?| (?:or|and) "
        pattern = (
            rf"(?P<hypernym>{noun_phrase}),? especially "
            rf"(?P<hyponyms>{noun_phrase}(?:(?:{item_separator}){noun_phrase})*)"
        )
    else:
        # List items must be comma-separated; a bare space would let the
        # list swallow every preceding word of the sentence.
        item_separator = r", "
        pattern = (
            rf"(?P<hyponyms>{noun_phrase}(?:{item_separator}{noun_phrase})*)"
            rf",? and other (?P<hypernym>{noun_phrase})"
        )

    match = re.search(pattern, sentence)
    if match is None:
        print("No match found.")
        return None, None

    # Resolve the default lazily so the helper is only needed on a match
    if np_chunker is None:
        np_chunker = identify_noun_phrases

    hypernym_phrase = match.group("hypernym")
    # Use entire phrase if no NPs found (wrapped so the type is always a list)
    hypernym = np_chunker(hypernym_phrase) or [hypernym_phrase]

    hyponyms = re.split(item_separator, match.group("hyponyms"))
    return hypernym, hyponyms

In [None]:
# Try hypernym identification (pattern 2, "... and other NP") on a sentence
# picked from the test set
test = (
    "Charles L. Harris, as leader of the Baltimore Colored City Band, "
    "took his group to black neighborhoods across Baltimore, "
    "playing marches, waltzes and other music, "
    "then switch to jazz-like music with an upbeat tempo, meant for dancing."
)
hypernym, hyponym = extract_hypernyms(test, pattern_type=2)

# Report both sides of the extracted relation
print(f"Extracted hypernym: {hypernym}")
print(f"Extracted hyponyms: {hyponym}")

# Question (c)
Implement extension: extracting meronyms.

In [60]:
def identify_noun_phrases(sentence: str) -> List[str]:
    """
    Identify noun phrases in a sentence using NLTK POS tagging and chunking.

    The sentence is tokenized, POS-tagged and chunked with a one-rule
    grammar: an optional determiner, any number of adjectives, then one or
    more nouns. Adjective and noun tags are matched by prefix, so
    comparative/superlative adjectives and plural or proper nouns are
    accepted, and a run of consecutive nouns stays in a single phrase.

    Args:
        sentence: Raw text to search for noun phrases.

    Returns:
        The noun phrases in order of appearance, each as one space-joined
        string. The list is empty when no token sequence fits the grammar.

    Note:
        Relies on the NLTK tokenizer and POS-tagger data packages being
        installed (see nltk.download).
    """
    # NOTE(review): a helper with this same name is also defined in the
    # Question (b) cell; whichever definition runs last is the one in
    # effect, so the two should be kept in sync (or one removed).
    print("Extracting NPs..")
    # Tokenize, then tag every token with its part of speech
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    # <NN.*>+ rather than <NN>: plural heads such as "animals" (NNS) or
    # proper nouns (NNP) would otherwise never be chunked as an NP.
    np_grammar = r"NP: {<DT>?<JJ.*>*<NN.*>+}"
    chunk_parser = nltk.RegexpParser(np_grammar)
    chunk_tree = chunk_parser.parse(pos_tags)

    # Each NP subtree's leaves are (word, tag) pairs; keep only the words
    return [
        " ".join(word for word, _tag in subtree.leaves())
        for subtree in chunk_tree.subtrees()
        if subtree.label() == "NP"
    ]


def extract_meronyms(
    sentence: str,
    np_chunker: Optional[Callable[[str], List[str]]] = None,
) -> Tuple[Optional[List[str]], Optional[List[str]]]:
    """
    Extracts candidate meronyms from a sentence using regex matching.

    The pattern is ``NP {,} {is|as} part of NP``: the phrase before
    "part of" is the meronym (the part) and the phrase after it is the
    holonym (the whole). For matching purposes a "noun phrase" is one or
    two whitespace-separated tokens, where a token is either hyphenated or
    at least two word characters long. As a consequence a single-letter
    word such as "a" cannot start either phrase.

    Args:
        sentence: Text to search. Only the first (leftmost) match is used.
        np_chunker: Optional callable that maps a matched phrase to a list
            of noun phrases. Defaults to identify_noun_phrases.

    Returns:
        ``(meronym, holonym)``, both lists of strings. Each is the
        chunker's output for the corresponding phrase, or a one-element
        list holding the raw phrase when the chunker finds nothing.
        ``(None, None)`` is returned when the pattern does not occur.
    """
    # One token, matchable in exactly one way (hyphenated or >= 2 characters)
    word = r"\b(?:\w+-\w+|\w{2,})"

    print("Extracting meronyms using regex...")
    # The meronym's second word is matched lazily so that a following
    # "is"/"as" is consumed by the copula group instead of being absorbed
    # into the noun phrase. Named groups give both phrases without any
    # leftover copula, comma or whitespace.
    pattern = (
        rf"(?P<meronym>{word}(?:\s{word})??)"
        r",?(?: (?:is|as))? part of "
        rf"(?P<holonym>{word}(?:\s{word})?)"
    )
    match = re.search(pattern, sentence)
    if match is None:
        print("No match found.")
        return None, None

    # Resolve the default lazily so the helper is only needed on a match
    if np_chunker is None:
        np_chunker = identify_noun_phrases

    # Meronym is before "part of"; use entire phrase if no NPs found
    meronym_phrase = match.group("meronym")
    meronym = np_chunker(meronym_phrase) or [meronym_phrase]

    # Holonym is after "part of"; use entire phrase if no NPs found
    holonym_phrase = match.group("holonym")
    holonym = np_chunker(holonym_phrase) or [holonym_phrase]

    return meronym, holonym

In [67]:
# This sentence contains no "part of" phrase, so it exercises the
# no-match branch of extract_meronyms
sentence = (
    "Additional attention has been directed to wastewater treatment, "
    "urban stormwater runoff, and wetland protection."
)
meronym, holonym = extract_meronyms(sentence)

# Report both sides of the extracted relation
print(f"Extracted meronym: {meronym}")
print(f"Extracted holonym: {holonym}")

Extracting meronyms using regex...
No match found.
Extracted meronym: None
Extracted holonym: None
