Importing the required libraries

In [1]:
import numpy as np
import nltk
import string
import random

Importing and reading the corpus

In [2]:
# Read the whole corpus. The context manager closes the file handle even if
# reading fails (the original left the handle open for the kernel's lifetime).
with open('chatbot.txt','r', errors = 'ignore') as f:
  raw_doc = f.read()
raw_doc = raw_doc.lower()  # normalise case once, before tokenizing
nltk.download('punkt') #Using Punkt tokenizer
nltk.download('wordnet') #Using WordNet dictionary
sent_tokens = nltk.sent_tokenize(raw_doc) #Converts doc to list of sentences
word_tokens = nltk.word_tokenize(raw_doc) # Converts doc to list of words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
# Preview the first two sentence tokens to sanity-check sentence splitting.
sent_tokens[:2]

['data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data,[1][2] and apply knowledge and actionable insights from data across a broad range of application domains.',
 'data science is related to data mining, machine learning and big data.']

In [4]:
# Preview the first two word tokens to sanity-check word splitting.
word_tokens[:2]

['data', 'science']

Text Preprocessing

In [5]:
# Module-level lemmatizer instance, created once and reused by LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()
# WordNet is a semantically-oriented dictionary of English included in NLTK.
def LemTokens(tokens):
  """Lemmatize every token with the module-level ``lemmer``.

  Args:
    tokens: iterable of word strings.

  Returns:
    A list with one lemmatized string per input token, in input order.
  """
  lemmas = []
  for tok in tokens:
    lemmas.append(lemmer.lemmatize(tok))
  return lemmas
# Translation table for str.translate: maps the code point of every ASCII
# punctuation character to None, so those characters are deleted.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}
def LemNormalize(text):
  """Turn raw text into a list of lemmatized word tokens.

  The text is lower-cased, stripped of punctuation via ``remove_punct_dict``,
  split into words with ``nltk.word_tokenize`` and lemmatized with
  ``LemTokens``.

  Args:
    text: the string to normalise.

  Returns:
    List of lemmatized tokens.
  """
  lowered = text.lower()
  without_punct = lowered.translate(remove_punct_dict)
  words = nltk.word_tokenize(without_punct)
  return LemTokens(words)

**Defining the Greeting Function**

In [6]:
# Greetings the bot recognises (lower-case) and the replies it picks from.
GREET_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREET_RESPONSES = ["hi", "hey", "nods", "hi there", "hello", "I am glad! You are talking to me"]

def greet(sentence):
  """Return a random greeting if ``sentence`` contains a known greeting.

  Matching is case-insensitive and works on whole words. Punctuation attached
  to a word ("hello!", "hey,") is ignored, and multi-word greetings such as
  "what's up" are matched as a phrase. The original word-by-word comparison
  could never match a greeting containing a space.

  Args:
    sentence: the user's message.

  Returns:
    One entry of GREET_RESPONSES chosen at random, or None when the sentence
    contains no greeting.
  """
  # Strip only leading/trailing punctuation so the apostrophe in "what's" survives.
  words = [word.strip(string.punctuation) for word in sentence.lower().split()]
  # Pad with spaces so the phrase test below only matches on word boundaries
  # (e.g. "hi" must not match inside "this").
  padded = " " + " ".join(words) + " "
  for greeting in GREET_INPUTS:
    if " " + greeting + " " in padded:
      return random.choice(GREET_RESPONSES)
  return None

**RESPONSE GENERATION**

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def response(user_response):
  """Return the corpus sentence most similar to ``user_response``.

  Every sentence in the global ``sent_tokens`` plus the query is vectorized
  with TF-IDF (tokenized by ``LemNormalize``, English stop words removed) and
  the query is compared to each corpus sentence by cosine similarity.

  The conversation loop appends the query to ``sent_tokens`` before calling
  this function. That convention is still honoured: when the last element of
  ``sent_tokens`` equals ``user_response`` it is treated as the query.
  Otherwise the query is added to a temporary copy, so the function can also
  be called on its own without mutating ``sent_tokens``.

  Args:
    user_response: the user's (lower-cased) message.

  Returns:
    The best-matching corpus sentence, or the fallback apology when no corpus
    sentence shares any weighted term with the query or the corpus is empty.
  """
  fallback = "I am sorry! I don't understand you"
  if sent_tokens and sent_tokens[-1] == user_response:
    documents = sent_tokens
  else:
    documents = sent_tokens + [user_response]
  # With no corpus sentence besides the query there is nothing to retrieve.
  if len(documents) < 2:
    return fallback
  TfidfVec = TfidfVectorizer(tokenizer = LemNormalize, stop_words='english')
  tfidf = TfidfVec.fit_transform(documents)
  # Compare the query (last row) to the corpus rows only. Excluding the query
  # itself replaces the original "take the second-best match" trick and cannot
  # echo the query back when a corpus sentence ties with it.
  scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
  best = scores.argmax()
  if scores[best] == 0:
    return fallback
  return documents[best]

**Defining conversation start/end protocol**

In [9]:
# Interactive chat loop: reads a line, answers it, and stops on "bye",
# "thanks" or "thank you".
flag = True
print("BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!")
while flag:
  # Strip surrounding whitespace so inputs such as "bye " still end the chat.
  user_response = input().strip().lower()
  if user_response == 'bye':
    flag = False
    print("BOT: Goodbye! Take care <3")
  elif user_response in ('thanks', 'thank you'):
    flag = False
    print("BOT: You are welcome...")
  else:
    # Call greet() once: it picks a random reply, so a second call could
    # return a different one and only repeats the work.
    greeting = greet(user_response)
    if greeting is not None:
      print("BOT: "+greeting)
    else:
      # Kept from the original: these module-level names are updated every
      # turn in case other cells read them (not used within this cell).
      word_tokens = word_tokens+nltk.word_tokenize(user_response)
      final_words = list(set(word_tokens))
      # response() expects the query to be the last element of sent_tokens.
      sent_tokens.append(user_response)
      print("BOT: ", end = "")
      try:
        print(response(user_response))
      finally:
        # pop() removes exactly the element appended above. remove() would
        # drop the first equal element, which may be a corpus sentence. The
        # finally clause keeps the corpus clean if response() raises.
        sent_tokens.pop()

BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type Bye!
Hey
BOT: I am glad! You are talking to me
Hello
BOT: hey
Hello
BOT: hi there
Content
BOT: 

  'stop_words.' % sorted(inconsistent))


[4][5]
contents

    1 foundations
        1.1 relationship to statistics
    2 etymology
        2.1 early usage
        2.2 modern usage
    3 impact
    4 technologies and techniques
        4.1 techniques
    5 references

foundations

data science is an interdisciplinary field focused on extracting knowledge from data sets, which are typically large (see big data), and applying the knowledge and actionable insights from data to solve problems in a wide range of application domains.
foundations


  'stop_words.' % sorted(inconsistent))


BOT: [4][5]
contents

    1 foundations
        1.1 relationship to statistics
    2 etymology
        2.1 early usage
        2.2 modern usage
    3 impact
    4 technologies and techniques
        4.1 techniques
    5 references

foundations

data science is an interdisciplinary field focused on extracting knowledge from data sets, which are typically large (see big data), and applying the knowledge and actionable insights from data to solve problems in a wide range of application domains.
nujsrgbure


  'stop_words.' % sorted(inconsistent))


BOT: I am sorry! I don't understand you
data science


  'stop_words.' % sorted(inconsistent))


BOT: "data science".
reference


  'stop_words.' % sorted(inconsistent))


BOT: machine learning is a technique used to perform tasks by inferencing patterns from data

references

dhar, v. (2013).
3


  'stop_words.' % sorted(inconsistent))


BOT: retrieved 3 april 2020.
bye
BOT: Goodbye! Take care <3
