In [1]:
import nltk
import nltk.tokenize as nt

# The cells below call sent_tokenize / word_tokenize (Punkt sentence models)
# and pos_tag (averaged perceptron tagger). Fetch both resources so the
# notebook also runs on a machine where they were never installed.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Chunking Important Dates Slots
In this notebook, we take a look at Important Date intents and chunk them to find the necessary slot variables so that we may query our database for a response to our user.

### Definitions
intents:

chunking:

slots:

## Below we examine some intents about breaks
"When is fall break?"

"When is spring break?"

"When is the next break?"

In [18]:
# POS-tag the "spring break" intent: split it into sentences, tokenize each
# sentence, then tag every token list.
spring_break = "When is spring break?"
ss = nt.sent_tokenize(spring_break)
tokenized_sent = list(map(nt.word_tokenize, ss))
pos_sentences = list(map(nltk.pos_tag, tokenized_sent))
pos_sentences

[[('When', 'WRB'),
  ('is', 'VBZ'),
  ('spring', 'VBG'),
  ('break', 'NN'),
  ('?', '.')]]

In [16]:
# Same pipeline for the "fall break" intent: sentences -> tokens -> POS tags.
fall_break = "When is fall break?"
ss = nt.sent_tokenize(fall_break)
tokenized_sent = list(map(nt.word_tokenize, ss))
pos_sentences = list(map(nltk.pos_tag, tokenized_sent))
pos_sentences

[[('When', 'WRB'), ('is', 'VBZ'), ('fall', 'DT'), ('break', 'NN'), ('?', '.')]]

In [17]:
# Same pipeline for the "next break" intent: sentences -> tokens -> POS tags.
next_break = "When is the next break?"
ss = nt.sent_tokenize(next_break)
tokenized_sent = list(map(nt.word_tokenize, ss))
pos_sentences = list(map(nltk.pos_tag, tokenized_sent))
pos_sentences

[[('When', 'WRB'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('next', 'JJ'),
  ('break', 'NN'),
  ('?', '.')]]

### Note:
In the above examples, the POS tagger is not correctly tagging Spring and Fall to be JJs, or adjectives. Rather, 'spring' is tagged as VBG (Verb Gerund) and 'fall' is DT (Determiner).

Determiners are words like th

In [21]:
# POS-tag a location question that contains a room number.
text = "Where is science hall 112?"
ss = nt.sent_tokenize(text)
tokenized_sent = list(map(nt.word_tokenize, ss))
pos_sentences = list(map(nltk.pos_tag, tokenized_sent))
pos_sentences

[[('Where', 'WRB'),
  ('is', 'VBZ'),
  ('science', 'NN'),
  ('hall', 'NN'),
  ('112', 'CD'),
  ('?', '.')]]

In [15]:
# POS-tag a location question that names a course.
text = "Where is Intro to Philosophy?"
ss = nt.sent_tokenize(text)
tokenized_sent = list(map(nt.word_tokenize, ss))
pos_sentences = list(map(nltk.pos_tag, tokenized_sent))
pos_sentences

[[('Where', 'WRB'),
  ('is', 'VBZ'),
  ('Intro', 'NNP'),
  ('to', 'TO'),
  ('Philosophy', 'NNP'),
  ('?', '.')]]

In [37]:
def extract_NN(sentence):
    """Return the noun phrases found in ``sentence`` as a set of strings.

    The sentence is tokenized and POS-tagged with NLTK, then chunked in two
    stages:

    * ``NBAR`` groups each run of consecutive tokens whose tag starts with
      ``NN``.
    * ``NP`` wraps either two NBARs joined by an ``IN``-tagged token or a
      single NBAR.

    Parameters
    ----------
    sentence : str
        A single utterance, e.g. "When is the end of the semester?".

    Returns
    -------
    set of str
        One entry per NP chunk, with the chunk's words joined by single
        spaces.
    """
    # Rule order matters inside a stage: rules are applied top to bottom and
    # a rule only sees tokens that are not chunked yet. The joined pattern
    # therefore has to come before the bare <NBAR> rule; in the opposite
    # order every NBAR is already wrapped in its own NP and the joined
    # pattern can never match.
    grammar = r"""
    NBAR:
        # One or more consecutive nouns
        {<NN.*>+}

    NP:
        # Two noun groups connected with in/of/etc...
        {<NBAR><IN><NBAR>}
        # Any remaining noun group on its own
        {<NBAR>}
    """
    chunker = nltk.RegexpParser(grammar)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    parsed = chunker.parse(tagged_tokens)
    # Leaves are (word, tag) pairs; keep only the words of each NP chunk.
    return {
        ' '.join(word for word, _tag in np_tree.leaves())
        for np_tree in parsed.subtrees(filter=lambda t: t.label() == 'NP')
    }

In [47]:
# Demo call; the recorded output lists 'end' and 'semester' as separate noun phrases.
extract_NN("When is the end of the semester?")

{'end', 'semester'}

In [45]:
def extract_cardinal_digit(sent):
    """Collect the number tokens of ``sent``.

    The text is tokenized and POS-tagged with NLTK; every token whose tag
    starts with ``CD`` becomes its own chunk, and the text of each chunk is
    returned.

    Args:
        sent: utterance to scan, e.g. "Where is room 112?".

    Returns:
        A set of str, one entry per CD chunk.
    """
    grammar = r"""
    CD:
        # Cardinal Digits
        {<CD.*>}

    """
    parser = nltk.RegexpParser(grammar)
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sent))
    parse_tree = parser.parse(tagged_words)

    found = set()
    for subtree in parse_tree.subtrees():
        # Only the chunks created by the grammar carry the 'CD' label.
        if subtree.label() != 'CD':
            continue
        words = [leaf[0] for leaf in subtree.leaves()]
        found.add(' '.join(words))
    return found

In [48]:
# Demo call; the recorded output is the room number '112'.
extract_cardinal_digit("Where is room 112?")

{'112'}

In [41]:
# NOTE(review): the recorded output ('Who teaches CS448?') does not match any
# assignment to `text` visible in this notebook, so this cell appears to show
# stale kernel state -- confirm by re-running from a fresh kernel.
text

'Who teaches CS448?'

In [4]:
def extract_adjective_noun(sent):
    """Return every adjective + noun pair in ``sent`` as a set of strings.

    The text is tokenized and POS-tagged with NLTK, and each token tagged
    ``JJ`` that is immediately followed by a token tagged ``NN`` is chunked
    together with it.

    Parameters
    ----------
    sent : str
        Utterance to scan, e.g. "When is the next break?".

    Returns
    -------
    set of str
        One "<adjective> <noun>" string per chunk found.
    """
    # The chunk label is only used below to find the chunks again. It used to
    # be 'CD' (copied from the cardinal-number extractor), which mislabelled
    # these chunks; the returned strings are unaffected by the rename.
    grammar = r"""
    ADJ_NOUN:
        # Adjective followed by noun
        {<JJ><NN>}
    """
    chunker = nltk.RegexpParser(grammar)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    parsed = chunker.parse(tagged_tokens)
    # Leaves are (word, tag) pairs; keep only the words of each chunk.
    return {
        ' '.join(word for word, _tag in pair.leaves())
        for pair in parsed.subtrees(filter=lambda t: t.label() == 'ADJ_NOUN')
    }

In [10]:
# Demo call; the recorded output is the single pair 'next break'.
extract_adjective_noun("When is the next break?")

{'next break'}

In [None]:
# NOTE(review): this cell repeats the `extract_adjective_noun` definition from
# an earlier cell and has no execution count; consider deleting it so the
# function is defined only once.
def extract_adjective_noun(sent):
    """Return every adjective + noun pair in ``sent`` as a set of strings.

    The text is tokenized and POS-tagged with NLTK, and each token tagged
    ``JJ`` that is immediately followed by a token tagged ``NN`` is chunked
    together with it.

    Parameters
    ----------
    sent : str
        Utterance to scan, e.g. "When is the next break?".

    Returns
    -------
    set of str
        One "<adjective> <noun>" string per chunk found.
    """
    # The chunk label is only used below to find the chunks again. It used to
    # be 'CD' (copied from the cardinal-number extractor), which mislabelled
    # these chunks; the returned strings are unaffected by the rename.
    grammar = r"""
    ADJ_NOUN:
        # Adjective followed by noun
        {<JJ><NN>}
    """
    chunker = nltk.RegexpParser(grammar)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    parsed = chunker.parse(tagged_tokens)
    # Leaves are (word, tag) pairs; keep only the words of each chunk.
    return {
        ' '.join(word for word, _tag in pair.leaves())
        for pair in parsed.subtrees(filter=lambda t: t.label() == 'ADJ_NOUN')
    }