#NLP Tutorial - PART I: Basic pre-processing and a very basic *chatbot*

In [2]:
import numpy as np
import nltk
import random
import string
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt') 
nltk.download('wordnet') 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Read in a text file.

In [3]:
f=open('data.txt','r',errors = 'ignore')
raw=f.readlines() #use line break to read in paragraphs

In [58]:
sents0 = [ nltk.sent_tokenize(r) for r in raw ] # converts each paragraph to a list of sentences 
sents0 = [ s for sent in sents0 for s in sent  ] # flatten the list
print(len(sents0)) #how many sentences do we get?
sents0[:10]

['A chatbot (also known as a spy, conversational bot, chatterbot, interactive agent, conversational interface, Conversational AI, talkbot or artificial spy entity) is a computer program or an artificial intelligence which conducts a conversation via auditory or textual methods.',
 '[1] Such programs are often designed to convincingly simulate how a human would behave as a conversational partner, thereby passing the Turing test.',
 'Chatbots are typically used in dialog systems for various practical purposes including customer service or information acquisition.',
 'Some chatbots use sophisticated natural language processing systems, but many simpler ones scan for keywords within the input, then pull a reply with the most matching keywords, or the most similar wording pattern, from a database.',
 'The term "ChatterBot" was originally coined by Michael Mauldin (creator of the first Verbot, Julia) in 1994 to describe these conversational programs.',
 "[2] Today, most chatbots are accessed

In [62]:
# remove short sentences
sents = [ s for s in sents0 if len(nltk.word_tokenize(s)) > 8 ]
print(len(sents))
sents[:10]

121


['A chatbot (also known as a spy, conversational bot, chatterbot, interactive agent, conversational interface, Conversational AI, talkbot or artificial spy entity) is a computer program or an artificial intelligence which conducts a conversation via auditory or textual methods.',
 '[1] Such programs are often designed to convincingly simulate how a human would behave as a conversational partner, thereby passing the Turing test.',
 'Chatbots are typically used in dialog systems for various practical purposes including customer service or information acquisition.',
 'Some chatbots use sophisticated natural language processing systems, but many simpler ones scan for keywords within the input, then pull a reply with the most matching keywords, or the most similar wording pattern, from a database.',
 'The term "ChatterBot" was originally coined by Michael Mauldin (creator of the first Verbot, Julia) in 1994 to describe these conversational programs.',
 "[2] Today, most chatbots are accessed

In [82]:
# Prepare a preprocessing function that will do tokenization,
# case lowering, punctuation removal, stopword removal, and stemming

# create a stemmer
snowball = nltk.SnowballStemmer('english')
# get default stopword list
my_stop_words = text.ENGLISH_STOP_WORDS

def MyNormalize(text):
    tokens=nltk.word_tokenize(text.lower())
    tokens=[ t for t in tokens if t not in string.punctuation ]
    tokens=[ t for t in tokens if t not in my_stop_words ]
    toks = [snowball.stem(t) for t in tokens  ]
    return toks

#test the preprocessing function
MyNormalize(sents[0])

['chatbot',
 'known',
 'spi',
 'convers',
 'bot',
 'chatterbot',
 'interact',
 'agent',
 'convers',
 'interfac',
 'convers',
 'ai',
 'talkbot',
 'artifici',
 'spi',
 'entiti',
 'comput',
 'program',
 'artifici',
 'intellig',
 'conduct',
 'convers',
 'auditori',
 'textual',
 'method']

In [83]:
# preprocess the sentences in data, and create a tf-idf vector
TfidfVec = TfidfVectorizer(tokenizer=MyNormalize)

tfidf = TfidfVec.fit_transform(sents)
tfidf.shape

(121, 808)

In [85]:
#prepare some greeting words
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]
def greeting(sentence): 
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
        

# function to match input to the preprocessed sentences
def response(user_response):
    robo_response=''
    new = TfidfVec.transform([user_response])
    vals = cosine_similarity(new[0], tfidf)
    flat = vals.flatten()
    idx = flat.argsort()[-1]
    sim_max = flat[idx]
    if(sim_max==0):
        robo_response=robo_response+"I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response+sents[idx]
        return robo_response, sim_max


In [86]:
response("How can chatbots help people?")

('Thus, for example, online help systems can usefully employ chatbot techniques to identify the area of help that users require, potentially providing a "friendlier" interface than a more formal search or menu system.',
 0.28540638269572366)

In [87]:
MyNormalize("How can chatbots help people?")

['chatbot', 'help', 'peopl']

In [89]:
#starting the bot
flag=True
print("CHATTY: My name is CHATTY. I will answer your queries about Chatbots. If you want to exit, type Bye!")
while(flag==True):
    user_response = input()
    user_response=user_response.lower()
    if(user_response!='bye'):
        if(user_response=='thanks' or user_response=='thank you' ):
            flag=False
            print("CHATTY: You are welcome..")
        else:
            if(greeting(user_response)!=None):
                print("CHATTY: "+greeting(user_response))
            else:
                print("CHATTY: ",end="")
                print(response(user_response))
    else:
        flag=False
        print("CHATTY: Bye! take care...")
        
# now chat with your bot...
# you may experiment with different similarity functions

CHATTY: My name is CHATTY. I will answer your queries about Chatbots. If you want to exit, type Bye!
bye
CHATTY: Bye! take care...


# NLP Tutorial - PART II: NLU with spaCY

In [31]:
import spacy
from spacy import displacy

#load the required model
nlp = spacy.load("en_core_web_sm")

#process a sentence
eg1 = u"What is the weather in Seattle today?"
doc1 = nlp(eg1)

In [33]:
#visualize the results for NER
displacy.render(doc1, style="ent", jupyter=True)

In [34]:
# visualize the results for dependency parsing
displacy.render(doc1, style="dep", jupyter=True)

In [35]:
# the detailed results behind 
for ent in doc1.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Seattle 23 30 GPE
today 31 36 DATE


In [36]:
for token in doc1:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head,
            token.shape_, token.is_alpha, token.is_stop)

What what PRON WP attr is Xxxx True True
is be AUX VBZ ROOT is xx True True
the the DET DT det weather xxx True True
weather weather NOUN NN nsubj is xxxx True False
in in ADP IN prep weather xx True True
Seattle Seattle PROPN NNP pobj in Xxxxx True False
today today NOUN NN npadvmod is xxxx True False
? ? PUNCT . punct is ? False False


In [37]:
eg2 = u"Apple is looking at buying U.K. startup for $1 billion"
eg3 = u"What's the time now in Singapore?"
eg4 = u"What's the weather now in Singapore?"
doc2 = nlp(eg2)
doc3 = nlp(eg3)
doc4 = nlp(eg4)
print(doc2.similarity(doc1))
print(doc3.similarity(doc1))
print(doc4.similarity(doc1))

0.3832489165356983
0.7573910984233972
0.7977538506573242


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [42]:
#download the model with word vectors， which enables more accurate semantic similarity comparison
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.1MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp37-none-any.whl size=98051305 sha256=5ef670d84cc9611d32cb746f4e5336116889fae80befb1db06d1111bb0615366
  Stored in directory: /tmp/pip-ephem-wheel-cache-6_tw06fh/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [44]:
#load the model   
import en_core_web_md
nlpd = en_core_web_md.load()

In [46]:
# common tokens come with vectors
tokens = nlpd(u'king queen man woman')
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov, 
          token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

king True 7.1417456 False king PROPN NNP compound xxxx True False
queen True 6.8297405 False queen PROPN NNP compound xxxx True False
man True 6.352939 False man PROPN NNP compound xxx True False
woman True 6.8987513 False woman NOUN NN ROOT xxxx True False
king king 1.0
king queen 0.72526103
king man 0.4088461
king woman 0.26556593
queen king 0.72526103
queen queen 1.0
queen man 0.27109137
queen woman 0.40660653
man king 0.4088461
man queen 0.27109137
man man 1.0
man woman 0.7401745
woman king 0.26556593
woman queen 0.40660653
woman man 0.7401745
woman woman 1.0


In [47]:
# more accurate similarity evaluation with the vectors available.
doc1_md = nlpd(eg1)
doc2_md = nlpd(eg2)
doc3_md = nlpd(eg3)
doc4_md = nlpd(eg4)
print(doc2_md.similarity(doc1_md))
print(doc3_md.similarity(doc1_md))
print(doc4_md.similarity(doc1_md))

0.6995620151208429
0.9084785109142424
0.9528884689826492
