In [1]:
import nltk

In [2]:
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [3]:
#get multiple ouputs in the same line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#ignore all warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [4]:
#display all rows and cols of a dataframe intead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
#Preprocess

sentence = "The computer was born to solve problems that did not exist before"
sentence2 = "Most of the good programmers do programming not because they expect to get paid or get adulation by the public, but because it is fun to program."

In [6]:
#convert sentence to lower case

'This' == 'this'
print('AbcdEFgH'.lower())
sentence.lower()
sentence2.lower()

False

abcdefgh


'the computer was born to solve problems that did not exist before'

'most of the good programmers do programming not because they expect to get paid or get adulation by the public, but because it is fun to program.'

In [7]:
#tokenize and extrax=ction of individual features

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
tokens
tokens2 = tokenizer.tokenize(sentence2)
tokens2

['The',
 'computer',
 'was',
 'born',
 'to',
 'solve',
 'problems',
 'that',
 'did',
 'not',
 'exist',
 'before']

['Most',
 'of',
 'the',
 'good',
 'programmers',
 'do',
 'programming',
 'not',
 'because',
 'they',
 'expect',
 'to',
 'get',
 'paid',
 'or',
 'get',
 'adulation',
 'by',
 'the',
 'public',
 'but',
 'because',
 'it',
 'is',
 'fun',
 'to',
 'program']

In [8]:
#Stopwords : Filter words to remove non useful words
filtered_words = [w for w in tokens if not w in stopwords.words('english')]
filtered_words

['The', 'computer', 'born', 'solve', 'problems', 'exist']

In [9]:

filtered_words = [w for w in tokens2 if not w in stopwords.words('english')]
filtered_words

['Most',
 'good',
 'programmers',
 'programming',
 'expect',
 'get',
 'paid',
 'get',
 'adulation',
 'public',
 'fun',
 'program']

In [10]:
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    return filtered_words

In [11]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['computer', 'born', 'solve', 'problems', 'exist']


In [12]:
preprocess(sentence2)

['good',
 'programmers',
 'programming',
 'expect',
 'get',
 'paid',
 'get',
 'adulation',
 'public',
 'fun',
 'program']

In [13]:
#tagging

tags = nltk.pos_tag(preprocess(sentence))
print(tags)

[('computer', 'NN'), ('born', 'VBN'), ('solve', 'VB'), ('problems', 'NNS'), ('exist', 'VBP')]


In [14]:
tags = nltk.pos_tag(preprocess(sentence2))
print(tags)

[('good', 'JJ'), ('programmers', 'NNS'), ('programming', 'VBG'), ('expect', 'VBP'), ('get', 'NN'), ('paid', 'VBN'), ('get', 'VB'), ('adulation', 'JJ'), ('public', 'JJ'), ('fun', 'NN'), ('program', 'NN')]


In [15]:
#extracting only nouns and verbs
def extract_tagged(sentence):
    features = []
    for tagged_word in sentence:
        word, tag= tagged_word
        if tag=='NN' or tag=='VBN' or tag=='NNS' or tag=='VBP' or tag=='RB' or tag=='VBZ' or tag=='VBG' or tag=='PRP' or tag=='JJ':
            features.append(word)
    return features

In [16]:
extract_tagged(tags)

['good',
 'programmers',
 'programming',
 'expect',
 'get',
 'paid',
 'adulation',
 'public',
 'fun',
 'program']

In [17]:
#lemmatize word
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('giving'))

foot
giving


In [18]:
#stem words
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems', 'feet', 'willing']

In [19]:
stemmer = SnowballStemmer('english')
[stemmer.stem(x) for x in words_for_stemming]

['stem', 'stem', 'stem', 'stemmer', 'stem', 'feet', 'will']

In [20]:
#putting it all together
def extract_features(text):
    words = preprocess(text)
    tags = nltk.pos_tag(words)
    extracted_features = extract_tagged(tags)
    stemmed_words = [stemmer.stem(x) for x in extracted_features]
    result = [lmtzr.lemmatize(x) for x in stemmed_words]
    return result

In [21]:
sentence

'The computer was born to solve problems that did not exist before'

In [22]:
words = extract_features(sentence)
words

['comput', 'born', 'problem', 'exist']

In [23]:
sentence2

'Most of the good programmers do programming not because they expect to get paid or get adulation by the public, but because it is fun to program.'

In [24]:
words = extract_features(sentence2)
words

['good',
 'programm',
 'program',
 'expect',
 'get',
 'paid',
 'adul',
 'public',
 'fun',
 'program']

In [25]:
extract_features('Everybody should learn to program a computer, because it teaches you how to think')

['everybodi', 'learn', 'program', 'comput', 'teach', 'think']

In [26]:
#implementing bag of words
def word_feats(words):
    return dict([(word, True) for word in words])

In [27]:
word_feats(words)

{'good': True,
 'programm': True,
 'program': True,
 'expect': True,
 'get': True,
 'paid': True,
 'adul': True,
 'public': True,
 'fun': True}

In [28]:
#parsing the whole document
def extract_feature_from_doc(data):
    result =[]
    corpus =[]
    #the responses of chatboat
    answers={}
    for (text,category,answer) in data:
        features = extract_features(text)
        corpus.append(features)
        result.append((word_feats(features), category))
        answers[category]= answer
    return (result, sum(corpus,[]), answers)

In [29]:
extract_feature_from_doc([['This is the input text from user', 'category', 'answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [30]:
def get_content(filename):
    doc = os.path.join(filename)
    with open(doc,'r') as content_file:
        lines = csv.reader(content_file,delimiter='|')
        data = [x for x in lines if len(x) ==3]
        return data

In [31]:
filename= 'D:/MaiTexa/projects/AI_ChatBot_Python-master/kerala tourism sample data.txt'
data = get_content(filename)
data

[['Hello',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hi',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hello, hi',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey, hi',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['hey, hello',
  'Greetings',
  'Hello. I am ABHIbot. I will serve your leave enquiries.'],
 ['Good morning',
  'Morning',
  'Good Morning. I am ABHIbot. I will serve your leave enquiries.'],
 ['Good afternoon'

In [32]:
features_data, corpus, answers = extract_feature_from_doc(data)

In [33]:
print(features_data[50])

({'tell': True, 'hotel': True, 'munnar': True}, 'Munnar-Hotels')


In [34]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hello',
 'hi',
 'hey',
 'hey',
 'hi',
 'hey',
 'hello',
 'good',
 'morn',
 'good',
 'afternoon',
 'good',
 'even',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'need',
 'help',
 'help',
 'want',
 'help',
 'want',
 'assist',
 'help',
 'great',
 'talk',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thank',
 'thank',
 'much',
 'place',
 'place',
 'place',
 'kerala',
 'visit',
 'kerala',
 'tourist',
 'place',
 'place',
 'visit',
 'kerala',
 'place',
 'kerala',
 'locat',
 'place',
 'place',
 'india',
 'place',
 'india',
 'packag',
 'pakag',
 'packag',
 'tell',
 'packag',
 'packag',
 'munnar',
 'packag',
 'munnar',
 'munnar',
 'packag',
 'munnar',
 'pakag',
 'munnar',
 'packag',
 'munnar',
 'packag',
 'munnar',
 'tell',
 'hotel',
 'munnar',
 'hotel',
 'munnar',
 'munnar',
 'hotel',
 'hotel',
 'munnar',
 'restaur',
 'munnar',
 'restaur',
 'munnar',
 'restaur',
 'munnar',
 'restaur',
 'munnar',
 'munna

In [35]:
answers

{'Greetings': 'Hello. I am ABHIbot. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am ABHIbot. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon, I am ABHIbot. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am ABHIbot. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Best-Places': 'Thekkady,Munnar,Alleppey,Kochi,Wayanad,Varkala,Vagamon,Kumarakom,Kovalam,Periyar National Park,Poovar,Kollam,Idukki,Kozhikode,Bekal,Thrissur,Palakkad,Thalassery,Trivandrum,Nelliyampathy,Vythiri,Nilambur,Ponmudi,Kalpetta,Malampuzha,Kannur,Kasaragod,Kottayam',
 'India-Places': 'Currently I know about only kerala places.',
 'Select-Package': 'Select a place.',
 'Munnar-Package': "The packages for munnar are,'Mun

In [36]:
#Training a model using these features
split_ratio = 0.8

def split_dataset(data, split_ratio):
    random.shuffle(data)
    data_length = len(data)
    train_split = int(data_length * split_ratio)
    return (data[:train_split]), (data[train_split:])

In [37]:
training_data, test_data = split_dataset(features_data, split_ratio)
training_data

[({'hotel': True, 'idukki': True}, 'Idukki-Hotels'),
 ({'tell': True, 'hotel': True, 'kozhikod': True}, 'Kozhikode-Hotels'),
 ({'packag': True, 'varkala': True}, 'Varkala-Package'),
 ({'packag': True, 'kovalam': True}, 'Kovalam-Package'),
 ({'varkala': True, 'pakag': True}, 'Varkala-Package'),
 ({'great': True, 'talk': True}, 'Closing'),
 ({'packag': True, 'thiruvananthapuram': True}, 'Trivandrum-Package'),
 ({'hey': True, 'hi': True}, 'Greetings'),
 ({'hey': True}, 'Greetings'),
 ({'packag': True, 'varkala': True}, 'Varkala-Package'),
 ({'hotel': True, 'thrissur': True}, 'Thrissur-Hotels'),
 ({'hey': True, 'hello': True}, 'Greetings'),
 ({'thrissoor': True, 'hotel': True}, 'Thrissur-Hotels'),
 ({'kochi': True, 'pakag': True}, 'Kochi-Package'),
 ({'packag': True, 'thekkadi': True}, 'Thekkady-Package'),
 ({'hi': True}, 'Greetings'),
 ({'hotel': True, 'vagamon': True}, 'Vagamon-Hotels'),
 ({'packag': True, 'munnar': True}, 'Munnar-Package'),
 ({'thank': True}, 'Closing'),
 ({'good': True

In [38]:
#save the data
np.save('training_data', training_data)
np.save('test_data', test_data)

In [39]:
#classification using Decision tree
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy', allow_pickle=True)

In [40]:
def train_using_decision_tree(training_data, test_data):
    classifier =nltk.classify.DecisionTreeClassifier.train(training_data, entropy_cutoff=0.6, support_cutoff=6)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    print('training set accuracy:', training_set_accuracy)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('test set accuracy :', test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

In [41]:
dtclassifier, classifer_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy: 0.9228070175438596
test set accuracy : 0.8055555555555556


In [42]:
def train_using_naiveBayes(training_data, test_data):
    classifier =nltk.classify.NaiveBayesClassifier.train(training_data)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    print('training set accuracy:', training_set_accuracy)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('test set accuracy :', test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

In [43]:
classifier, classifer_name, test_set_accuracy, training_set_accuracy = train_using_naiveBayes(training_data, test_data)

training set accuracy: 0.9228070175438596
test set accuracy : 0.8055555555555556


In [44]:
print(len(classifier.most_informative_features()))

100


In [45]:
classifier.show_most_informative_features()

Most Informative Features
                nilambur = None           Allepp : Nilamb =      6.4 : 1.0
                  packag = None           Allepp : Trivan =      6.4 : 1.0
                  idukki = None           Allepp : Idukki =      5.8 : 1.0
                 restaur = None           Greeti : Kozhik =      5.7 : 1.0
                   kochi = True           Kochi- : Trivan =      5.4 : 1.0
                   kochi = None           Allepp : Kochi- =      5.2 : 1.0
                nilambur = True           Nilamb : Idukki =      5.1 : 1.0
                   place = None           Allepp : Best-P =      4.5 : 1.0
                thrissur = None           Allepp : Thriss =      4.5 : 1.0
                    help = True             Help : Closin =      4.1 : 1.0


In [46]:
classifier.classify(({'mani': True, 'option': True, 'leav': True}))

'Alleppey-Restaurant'

In [47]:
extract_features('hello')

['hello']

In [48]:
word_feats(extract_features('hello'))

{'hello': True}

In [49]:
input_sentence = 'how many balanced leaves do I have?'
classifier.classify(word_feats(extract_features(input_sentence)))

'Alleppey-Restaurant'

In [50]:
def reply(input_sentence):
    category = dtclassifier.classify(word_feats(extract_features(input_sentence)))
    
    return answers[category]

In [51]:
def reply(input_sentence):
    category = dtclassifier.classify(word_feats(extract_features(input_sentence)))
    if category == 'Best-Places':
        places = answers[category].split(',')
        return ', '.join(random.sample(places, 5))
    if category == 'Munnar-package':
        # Split the string into packages
        packages = answers[category].split("', '")
        # Select 4 random packages
        selected_packages = random.sample(packages, min(4, len(packages)))
        # Join the selected packages with newline and return
        return '\n'.join(selected_packages)
    return answers[category]

In [52]:
reply('hello')

'Hello. I am ABHIbot. I will serve your leave enquiries.'

In [70]:
reply('kerala places')

'Periyar National Park, Thekkady, Bekal, Nilambur, Nelliyampathy'

In [80]:
reply('places in kerala')

'Poovar, Vagamon, Kalpetta, Kasaragod, Ponmudi'

In [64]:
reply('Kerala munnar package')

"The packages for munnar are,'Munnar, Alleppey, Kovalam', 'â‚¹ 18,900*', '4 Nights / 5 Days', 'Munnar, Thekkady, Alleppey', 'â‚¹ 19,000*', '4 Nights / 5 Days','Kochi, Munnar, Alleppey', 'â‚¹ 14,300*', '3 Nights / 4 Days','Munnar, Alleppey', 'â‚¹ 12,900*', '3 Nights / 4 Days',Munnar, Thekkady, Kovalam, Kanyakumari, Alleppey', 'â‚¹ 28,700*', '7 Nights / 8 Days'."

In [65]:
reply('munnar')

"The packages for munnar are,'Munnar, Alleppey, Kovalam', 'â‚¹ 18,900*', '4 Nights / 5 Days', 'Munnar, Thekkady, Alleppey', 'â‚¹ 19,000*', '4 Nights / 5 Days','Kochi, Munnar, Alleppey', 'â‚¹ 14,300*', '3 Nights / 4 Days','Munnar, Alleppey', 'â‚¹ 12,900*', '3 Nights / 4 Days',Munnar, Thekkady, Kovalam, Kanyakumari, Alleppey', 'â‚¹ 28,700*', '7 Nights / 8 Days'."

In [72]:
reply('packages for munnar')

"The packages for munnar are,'Munnar, Alleppey, Kovalam', 'â‚¹ 18,900*', '4 Nights / 5 Days', 'Munnar, Thekkady, Alleppey', 'â‚¹ 19,000*', '4 Nights / 5 Days','Kochi, Munnar, Alleppey', 'â‚¹ 14,300*', '3 Nights / 4 Days','Munnar, Alleppey', 'â‚¹ 12,900*', '3 Nights / 4 Days',Munnar, Thekkady, Kovalam, Kanyakumari, Alleppey', 'â‚¹ 28,700*', '7 Nights / 8 Days'."

In [67]:
reply('Idukki package')

'The packages for Nilambur are,"\'Nilambur, Kozhikode\', \'â‚¹ 7,000*\', \'2 Nights / 3 Days\'","\'Ooty, Nilambur, Palakkad\', \'â‚¹ 9,000*\', \'3 Nights / 4 Days\'","\'Ooty, Nilambur\', \'â‚¹ 10,000*\', \'3 Nights / 4 Days\'","\'Kozhikode, Nilambur, Ooty, Mudumalai, Bandipur, Nagarhole, Kalpetta, Wayanad\', \'â‚¹ 18,500*\', \'7 Nights / 8 Days\'","\'Kannur, Thalassery, Kozhikode, Nilambur, Kalpetta, Wayanad\', \'â‚¹ 16,000*\', \'6 Nights / 7 Days\'"'

In [68]:
reply('Nilambur')

'The packages for Nilambur are,"\'Nilambur, Kozhikode\', \'â‚¹ 7,000*\', \'2 Nights / 3 Days\'","\'Ooty, Nilambur, Palakkad\', \'â‚¹ 9,000*\', \'3 Nights / 4 Days\'","\'Ooty, Nilambur\', \'â‚¹ 10,000*\', \'3 Nights / 4 Days\'","\'Kozhikode, Nilambur, Ooty, Mudumalai, Bandipur, Nagarhole, Kalpetta, Wayanad\', \'â‚¹ 18,500*\', \'7 Nights / 8 Days\'","\'Kannur, Thalassery, Kozhikode, Nilambur, Kalpetta, Wayanad\', \'â‚¹ 16,000*\', \'6 Nights / 7 Days\'"'

In [60]:
reply('package for kochi')

"The packages for Kochi are,'Kochi, Munnar', 'â‚¹ 10,000*', '3 Nights / 4 Days','Kochi, Munnar, Alleppey', 'â‚¹ 21,000*', '5 Nights / 6 Days','Kochi, Munnar, Kumarakom', 'â‚¹ 17,350*', '4 Nights / 5 Days','Kochi, Munnar, Thekkady, Kumarakom', 'â‚¹ 27,950*', '6 Nights / 7 Days','Kochi, Munnar, Thekkady, Kumarakom, Alleppey, Kovalam, Trivandrum', 'â‚¹ 34,890*', '8 Nights / 9 Days'"

In [61]:
reply('periyar national park')

'The packages for periyar national park are,"\'Thekkady\', \'â‚¹ 7,525*\', \'2 Nights / 3 Days\'","\'Munnar, Thekkady\', \'â‚¹ 13,700*\', \'3 Nights / 4 Days\'","\'Munnar, Thekkady, Alleppey\', \'â‚¹ 12,000*\', \'4 Nights / 5 Days\'","\'Munnar, Thekkady, Alleppey\', \'â‚¹ 19,000*\', \'4 Nights / 5 Days\'","\'Munnar, Thekkady, Kovalam, Kanyakumari, Alleppey\', \'â‚¹ 28,700*\', \'7 Nights / 8 Days\'"'