## Chatbot

### import necessary libraries

In [1]:
import numpy as np
import random
import string
import nltk

### importing and reading corpus

In [2]:
# Read the whole dialog corpus into memory. The context manager guarantees the
# file handle is closed as soon as the read finishes, even if read() raises.
with open('dialogs.txt', 'r') as chat:
    chat_doc = chat.read()

In [3]:
# Normalize the whole corpus to lowercase so the same word in different cases
# (e.g. "Hello" vs "hello") is not treated as two different words downstream.

chat_doc = chat_doc.lower()

### Tokenization


##### Tokenization is the process of converting raw text strings into a list of tokens, i.e. the units (sentences or words) that we actually want to work with. A sentence tokenizer splits a string into a list of sentences, and a word tokenizer splits a string into a list of words.

In [4]:
# Download the pre-trained Punkt tokenizer for English from the NLTK data package.
# NOTE(review): newer NLTK releases reportedly look up the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adhua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Download WordNet, the semantically-oriented English dictionary shipped with
# NLTK; the WordNetLemmatizer created later in this notebook needs it.

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adhua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Split the lowercased corpus into a list of sentences.
sent_tokens = nltk.sent_tokenize(chat_doc)

# Split the lowercased corpus into a list of word (and punctuation) tokens.
word_tokens = nltk.word_tokenize(chat_doc)

In [7]:
# Peek at the first two sentences to sanity-check the sentence tokenizer.
sent_tokens[:2]

['hi, how are you doing?', "i'm fine."]

In [8]:
# Peek at the first five tokens; note that punctuation is kept as its own token.
word_tokens[:5]

['hi', ',', 'how', 'are', 'you']

### Text preprocessing

##### Lemmatization: lemmatization is a slight variant of stemming.
##### The major difference between the two is that stemming can often produce non-existent words, whereas lemmas are actual words. So the root stem you end up with is not necessarily something you can look up in a dictionary, but a lemma is.

In [21]:
# Create one WordNet lemmatizer instance; Lemmertokens() below reuses it.
lemmer = nltk.stem.WordNetLemmatizer()

In [22]:
# Sanity check: lemmatizing as a verb (pos='v') reduces 'greetings' to 'greet'.
lemmer.lemmatize('greetings', pos = 'v')

'greet'

In [30]:
def Lemmertokens(tokens):
    """Return the lemma of every token, in the same order as the input.

    Each token is passed to the module-level ``lemmer`` using its default
    part-of-speech setting.
    """
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmer.lemmatize(tok))
    return lemmas
# Translation table for str.translate(): maps the code point of every ASCII
# punctuation character to None, which deletes that character.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}

def LemNormalize(text):
    """Lowercase ``text``, delete punctuation, tokenize into words and lemmatize.

    Returns the list of lemmatized word tokens produced by ``Lemmertokens``.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return Lemmertokens(words)

In [28]:
# Quick check: the question mark is removed and the text comes back as word tokens.
LemNormalize('how are you doing?')

['how', 'are', 'you', 'doing']

In [34]:
# Keyword matching: canned replies for user input that contains a known greeting.
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]


def greeting(sentence):
    """Return a random greeting reply if ``sentence`` contains a greeting.

    Matching is case-insensitive and works on whole words/phrases, so "hi"
    does not match inside "this". Punctuation attached to the edges of a word
    is ignored ("hello!" and "hi," both match), and multi-word entries of
    ``GREETING_INPUTS`` such as "what's up" are matched as phrases.

    Parameters
    ----------
    sentence : str
        Raw user input.

    Returns
    -------
    str or None
        A randomly chosen entry of ``GREETING_RESPONSES`` when a greeting is
        found, otherwise ``None``.
    """
    # Strip punctuation only from the edges of each word, so "hi," becomes "hi"
    # while the apostrophe inside "what's" is preserved.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Re-join with single spaces and pad both ends so that every greeting can be
    # searched for as " phrase ", which enforces whole-word boundaries.
    normalized = " " + " ".join(word for word in words if word) + " "
    for phrase in GREETING_INPUTS:
        if " " + phrase + " " in normalized:
            return random.choice(GREETING_RESPONSES)
    return None


In [36]:
# Try the greeting matcher; the reply is picked at random, so the output varies between runs.
greeting('hello')

'I am glad! You are talking to me'