 Entity extraction

In [1]:
#extracting city entities

import spacy
nlp = spacy.load("en_core_web_md")

spacy.explain("GPE")

'Countries, cities, states'

In [3]:
doc = nlp("Can you please confirm that you want to book a table for 2 at 11:30 am at the Bird restaurant in Palo Alto for today")
doc.ents

(2, 11:30 am, Bird, Palo Alto, today)

In [4]:
for ent in doc.ents:
    print(ent.text, ent.label_)

2 CARDINAL
11:30 am TIME
Bird LOC
Palo Alto GPE
today DATE


In [5]:
spacy.explain("LOC")

'Non-GPE locations, mountain ranges, bodies of water'

In [7]:
#extarcting date and time entities
import spacy
nlp = spacy.load("en_core_web_md")
sentences = [
    "I will be eating there at 11:30 am so make it for time.",
    "I'll reach there at 1:30 pm.",
    "No, chage it on next friday.",
    "Sure, Please confirm that the date is now next Firday and for 1 person.",
    "I need to make it on Monday next week at half past 12 in the afternoon.",
    "A quarter past 5 in the evening, please."
]
for sent in sentences:
    doc = nlp(sent)
    ents = doc.ents
    print([(ent.text, ent.label_) for ent in ents])

[('11:30 am', 'TIME')]
[('1:30 pm', 'TIME')]
[('next friday', 'DATE')]
[('next Firday', 'DATE'), ('1', 'CARDINAL')]
[('Monday next week', 'DATE'), ('half past 12', 'CARDINAL')]
[('A quarter past 5', 'DATE')]


In [8]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [9]:
from spacy import displacy
displacy.serve(nlp("A quarter past 5"), style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [30]:
#some weird case
sentences = [
    "Have a great day.",
    "Have a nice day.",
    "Have a good day.",
    "Have a wonderful day.",
    "Have a sunny and nice day"
]

for sent in sentences:
    doc = nlp(sent)
    ents = doc.ents
    print([(ent.text, ent.label_) for ent in ents])

[('a great day', 'DATE')]
[('a nice day', 'DATE')]
[]
[]
[]


In [31]:
doc = nlp("Have a great day")
wrong_matches = ["a great day", "a nice day"]
date_ents = [ent for ent in doc.ents if ent.label_ == "DATE"]
date_ents = list(filter(lambda e: e.text not in worng_matches, date_ents))
date_ents

[]

In [4]:
 #Extracting phone number
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_md")
doc = nlp("The phone number is 09-779-229-664.")

matcher = Matcher(nlp.vocab)
pattern = [
    {"SHAPE": "dd"}, {"TEXT": "-"},
    {"SHAPE": "ddd"}, {"TEXT": "-"},
    {"SHAPE": "ddd"}, {"TEXT": "-"},
    {"SHAPE": "ddd"}
]

matcher.add("extractPhoneNo", [pattern])
matches = matcher(doc)
for mid, start, end in matches:
    print(doc[start:end])

09-779-229-664


In [33]:
# extracting cuisine types
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [35]:
sentences = [
    "Is there a specific cuisine type you enjoy, such as Mexican, Italian or something else?",
    "I usually like eating the American type of food.",
    "Find me Ethiopian cuisine in Berkeley.",
    "I'm looking for a Filipino palce to eat.",
    "I would like some Italian food.",
    "Malaysian sounds good right now."
]

for sent in sentences:
    doc = nlp(sent)
    ents = doc.ents
    print([(ent.text, ent.label_) for ent in ents])

[('Mexican', 'NORP'), ('Italian', 'NORP')]
[('American', 'NORP')]
[('Ethiopian', 'NORP'), ('Berkeley', 'GPE')]
[('Filipino', 'NORP')]
[('Italian', 'NORP')]
[('Malaysian', 'NORP')]


 Intent Recognition

In [6]:
#Pattern-Based Text classification
import spacy
from spacy.matcher  import Matcher

nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)
sentences = [
    " No, Thanks.",
    " No, thank you very much.",
    " That is all thank you so much.",
    " No, that is all.",
    " Nope, that'll be all.",
    "Thanks No, that's okay.",
    " No thanks.",
    " That's all I needed help with.",
    " No. This should be enough for now.",
    " No, thanks No, thanks a lot.",
    " No, thats all thanks."
]
pattern1 = [
    {"LOWER": {"IN": ["no", "nope"]}},
    {"TEXT": {"IN": [",", "."]}}
]
pattern2 = [
    {"TEXT": {"REGEX": "[Tt]hanks?"}},
    {"LOWER": {"IN": ["you", "a lot"]}, "OP": "*"}
]
pattern3 = [
    {"LOWER": {"IN": ["that", "that's", "thats", "that'll"]}},
    {"LOWER": {"IN": ["is", "will"]}, "OP": "*"},
    {"LOWER": "all"}
]

matcher.add("textExtract", [pattern1, pattern2, pattern3])

for sent in sentences:
    doc = nlp(sent)
    matches = matcher(doc)
    for mid, start, end in matches:
        print(doc[start:end])
    

No,
Thanks
No,
thank
thank you
That is all
thank
thank you
No,
that is all
Nope,
Thanks
No,
thanks
No.
No,
thanks
No,
thanks
No,
thanks


 Classification text with a character-level LSTM

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, Embedding, Dense, MaxPooling1D, Dropout, Bidirectional, Conv1D
from tensorflow.keras import optimizers
import numpy as np
from tensorflow.keras.callbacks import TensorBoard
from sklearn.utils import shuffle
import json

In [5]:
utterances = []
labels = []

with open("data/restaurants.json", "r") as jfile:
    data = json.load(jfile)
    print(data)

[{'dialogue_id': '1_00000', 'turns': [{'speaker': 'USER', 'utterance': 'I am feeling hungry so I would like to find a place to eat.', 'slots': [], 'intent': 'FindRestaurants'}, {'speaker': 'SYSTEM', 'utterance': 'Do you have a specific which you want the eating place to be located at?', 'slots': []}, {'speaker': 'USER', 'utterance': 'I would like for it to be in San Jose.', 'slots': [{'exclusive_end': 37, 'slot': 'city', 'start': 29}], 'intent': 'FindRestaurants'}, {'speaker': 'SYSTEM', 'utterance': 'Is there a specific cuisine type you enjoy, such as Mexican, Italian or something else?', 'slots': [{'exclusive_end': 59, 'slot': 'cuisine', 'start': 52}, {'exclusive_end': 68, 'slot': 'cuisine', 'start': 61}]}, {'speaker': 'USER', 'utterance': 'I usually like eating the American type of food.', 'slots': [{'exclusive_end': 34, 'slot': 'cuisine', 'start': 26}], 'intent': 'FindRestaurants'}, {'speaker': 'SYSTEM', 'utterance': 'I see that at 71 Saint Peter there is a good restaurant which is 

In [6]:
for dialogue in data:
    turns = dialogue['turns']
    for turn in turns:
        speaker = turn['speaker']
        if speaker == 'USER':
            utterance, intent = turn['utterance'], turn['intent']
            label = 1 if intent == "FindRestaurants" else 0
            utterances.append(utterance)
            labels.append(label)

In [7]:
utterances[: 10]

['I am feeling hungry so I would like to find a place to eat.',
 'I would like for it to be in San Jose.',
 'I usually like eating the American type of food.',
 'Can you give me the address of this restaurant.',
 'Can you give me the phone number that I can contact them with?',
 'Is there some other restaurant which you can suggest?',
 'Do you have another restaurant matching my needs? For example a restaurant which is economical and is located in Palo Alto.',
 'Alright, that seems good. I would like to make a booking at this restaurant.',
 'I will be eating there at 11:30 am so make it for then.',
 'That suits me well. Can you tell me if they feature live music?']

In [8]:
labels[: 10]

[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]

In [9]:
len(utterances), len(labels)

(1233, 1233)

In [10]:
utterances, labels = shuffle(utterances, labels, random_state=1)

In [11]:
tokenizer = Tokenizer(char_level=True, filters=".,;'\"-", lower=True)
tokenizer.fit_on_texts(utterances)

In [36]:
tokenizer.word_index

{' ': 1,
 'e': 2,
 'a': 3,
 't': 4,
 'o': 5,
 'n': 6,
 'i': 7,
 'r': 8,
 's': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'u': 13,
 '.': 14,
 'm': 15,
 'c': 16,
 'y': 17,
 'f': 18,
 'p': 19,
 'k': 20,
 'g': 21,
 'w': 22,
 'v': 23,
 '?': 24,
 ',': 25,
 'b': 26,
 "'": 27,
 '1': 28,
 ':': 29,
 '0': 30,
 '3': 31,
 '5': 32,
 'x': 33,
 '4': 34,
 'q': 35,
 '2': 36,
 '!': 37,
 'z': 38,
 '7': 39,
 '6': 40,
 'j': 41,
 '8': 42,
 '9': 43,
 '-': 44,
 '"': 45,
 '`': 46}

In [14]:
utterances = tokenizer.texts_to_sequences(utterances)

In [16]:
mutt_len = max([len(ans) for ans in utterances])
print(mutt_len)

156


In [17]:
MAX_LEN = 150

In [21]:
utterances[0]

[22,
 10,
 3,
 4,
 27,
 9,
 1,
 4,
 10,
 2,
 1,
 3,
 12,
 12,
 8,
 2,
 9,
 9,
 1,
 3,
 6,
 12,
 1,
 16,
 5,
 6,
 4,
 3,
 16,
 4,
 1,
 6,
 13,
 15,
 26,
 2,
 8,
 24]

In [22]:
utterances = pad_sequences(utterances, MAX_LEN, padding="post")

In [23]:
utterances[0]

array([22, 10,  3,  4, 27,  9,  1,  4, 10,  2,  1,  3, 12, 12,  8,  2,  9,
        9,  1,  3,  6, 12,  1, 16,  5,  6,  4,  3, 16,  4,  1,  6, 13, 15,
       26,  2,  8, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [24]:
utterances, labels = np.array(utterances), np.array(labels)

In [25]:
utterances.shape, labels.shape

((1233, 150), (1233,))

In [28]:
#feeding layers

utt_input = Input(shape=(MAX_LEN,))

embedding_layer = Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim = 100, input_length = MAX_LEN)
lstm = Bidirectional(LSTM(units=100, return_sequences=False))

utt_embedding = embedding_layer(utt_input)
utt_encoded = lstm(utt_embedding)

output = Dense(1, activation='sigmoid')(utt_encoded)

In [29]:
model = Model(utt_input, output)

In [33]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics=["accuracy"])

In [31]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 100)          4700      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 165,701
Trainable params: 165,701
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.fit(utterances, labels, validation_split=0.1, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x18b70fdc970>