In [46]:
import nltk
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [47]:
## Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Ignore all warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [48]:
## Display all rows and columns of a dataframe instead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [49]:
def preprocess(sentence):
    sentence = sentence.lower() #Convert the sentences into lowercase
    tokenizer = RegexpTokenizer(r'\w+') #Tokenize on word charcter
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')] # remove stopwords
    return filtered_words

In [50]:
sentence = "The Big brown fox jumped over a lazy dog."
sentence2 = "This is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

In [51]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']


In [52]:
preprocess(sentence2)

['particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

In [53]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\johnp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [54]:
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)

[('big', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN')]


In [55]:
tags2 = nltk.pos_tag(preprocess(sentence2))
print(tags2)

[('particularly', 'RB'), ('important', 'JJ'), ('today', 'NN'), ('world', 'NN'), ('swamped', 'VBD'), ('unstructured', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('variety', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('platforms', 'NNS'), ('people', 'NNS'), ('engage', 'VBP'), ('days', 'NNS'), ('note', 'VBP'), ('days', 'NNS'), ('decade', 'NN'), ('2010', 'CD'), ('2020', 'CD')]


In [56]:
def extract_tagged(sentences):
    features = []
    for tagged_word in sentences:
        word, tag = tagged_word
        if tag=='NN' or tag == 'VBN' or tag == 'NNS' or tag == 'VBP' or tag == 'RB' or tag == 'VBZ' or tag == 'VBG' or tag =='PRP' or tag == 'JJ':
            features.append(word)
    return features

In [57]:
extract_tagged(tags2)

['particularly',
 'important',
 'today',
 'world',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade']

In [58]:
stemmer = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
def extract_feature(text):
    words = preprocess(text)
#     print('words: ',words)
    tags = nltk.pos_tag(words)
#     print('tags: ',tags)
    extracted_features = extract_tagged(tags)
#     print('Extracted features: ',extracted_features)
    stemmed_words = [stemmer.stem(x) for x in extracted_features]
#     print(stemmed_words)
    result = [lmtzr.lemmatize(x) for x in stemmed_words]
   
    return result

In [59]:
sentence

'The Big brown fox jumped over a lazy dog.'

In [60]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\johnp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [61]:
words = extract_feature(sentence)
print(words)

['big', 'brown', 'fox', 'lazi', 'dog']


In [62]:
words = extract_feature(sentence2)
print(words)

['particular', 'import', 'today', 'world', 'unstructur', 'natur', 'languag', 'data', 'varieti', 'social', 'medium', 'platform', 'peopl', 'engag', 'day', 'note', 'day', 'decad']


In [63]:
extract_feature("He hurt his right foot while he was wearing white shoes on his feet")

['hurt', 'right', 'foot', 'wear', 'white', 'shoe', 'foot']

In [64]:
def word_feats(words):
    return dict([(word, True) for word in words])

In [65]:
word_feats(words)

{'particular': True,
 'import': True,
 'today': True,
 'world': True,
 'unstructur': True,
 'natur': True,
 'languag': True,
 'data': True,
 'varieti': True,
 'social': True,
 'medium': True,
 'platform': True,
 'peopl': True,
 'engag': True,
 'day': True,
 'note': True,
 'decad': True}

In [203]:
def extract_feature_from_doc(data):
    result = []
    corpus = []
    # The responses of the chat bot
    answers = {}
    for (text,category,answer) in data:

        features = extract_feature(text)

        corpus.append(features)
        result.append((word_feats(features), category))
        answers[category] = answer

    return (result, sum(corpus,[]), answers)

In [204]:
extract_feature_from_doc([['this is the input text from the user','category','answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [None]:
import json

# Load the JSON data from a file
with open('intents.json', 'r') as file:
    intents_data = json.load(file)

# Now, 'intents_data' contains your JSON data as a Python dictionary
print(intents_data)

{'intents': [{'tag': 'greeting', 'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day'], 'responses': ['Hello, thanks for visiting', 'Good to see you again', 'Hi there, how can I help?'], 'context_set': ''}, {'tag': 'goodbye', 'patterns': ['Bye', 'See you later', 'Goodbye'], 'responses': ['See you later, thanks for visiting', 'Have a nice day', 'Bye! Come back again soon.']}, {'tag': 'thanks', 'patterns': ['Thanks', 'Thank you', "That's helpful"], 'responses': ['Happy to help!', 'Any time!', 'My pleasure']}, {'tag': 'hours', 'patterns': ['What hours are you open?', 'What are your hours?', 'When are you open?'], 'responses': ["We're open every day 9am-9pm", 'Our hours are 9am-9pm every day']}, {'tag': 'mopeds', 'patterns': ['Which mopeds do you have?', 'What kinds of mopeds are there?', 'What do you rent?'], 'responses': ['We rent Yamaha, Piaggio and Vespa mopeds', 'We have Piaggio, Vespa and Yamaha mopeds']}, {'tag': 'payments', 'patterns': ['Do you take credit card

In [None]:
features_data, corpus, answers = extract_feature_from_doc(intents_data)

In [None]:
print(len(features_data))

27


In [79]:
features_data

[('What do you rent?', 'mopeds'),
 ("That's helpful", 'thanks'),
 ('Hi', 'greeting'),
 ('What are your hours?', 'hours'),
 ('Are you cash only?', 'payments'),
 ('See you later', 'goodbye'),
 ('Good day', 'greeting'),
 ('Do you take credit cards?', 'payments'),
 ('How does this work?', 'rental'),
 ('When are you open?', 'hours'),
 ('When do you open today?', 'opentoday'),
 ('Goodbye', 'goodbye'),
 ('Can we rent a moped?', 'rental'),
 ('Hello', 'greeting'),
 ('today', 'today'),
 ('What hours are you open?', 'hours'),
 ('What are your hours today?', 'opentoday'),
 ('Is anyone there?', 'greeting'),
 ('What kinds of mopeds are there?', 'mopeds'),
 ('Are you open today?', 'opentoday'),
 ('Thank you', 'thanks'),
 ('Which mopeds do you have?', 'mopeds'),
 ('Thanks', 'thanks'),
 ('Do you accept Mastercard?', 'payments'),
 ('How are you', 'greeting'),
 ('Bye', 'goodbye'),
 ("I'd like to rent a moped", 'rental')]

In [None]:
len(corpus)

27

In [None]:
corpus[:5]

['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day']

In [None]:
answers

{'greeting': 'Hi there, how can I help?',
 'goodbye': 'Bye! Come back again soon.',
 'thanks': 'My pleasure',
 'hours': 'Our hours are 9am-9pm every day',
 'mopeds': 'We have Piaggio, Vespa and Yamaha mopeds',
 'payments': 'We accept most major credit cards',
 'opentoday': 'Our hours are 9am-9pm every day',
 'rental': 'Are you looking to rent today or later this week?',
 'today': 'Same-day rentals please call 1-800-MYMOPED'}

# TRAINING

In [71]:
## split data into train and test sets
split_ratio = 0.8

def split_dataset(data, split_ratio):
    random.shuffle(data)
    data_length = len(data)
    train_split = int(data_length * split_ratio)
    return (data[:train_split]), (data[train_split:])

In [72]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [74]:
training_data[:5]

[('What do you rent?', 'mopeds'),
 ("That's helpful", 'thanks'),
 ('Hi', 'greeting'),
 ('What are your hours?', 'hours'),
 ('Are you cash only?', 'payments')]

In [75]:
test_data[:5]

[('Which mopeds do you have?', 'mopeds'),
 ('Thanks', 'thanks'),
 ('Do you accept Mastercard?', 'payments'),
 ('How are you', 'greeting'),
 ('Bye', 'goodbye')]

In [76]:
# save the data
np.save('training_data', training_data)
np.save('test_data', test_data)

# CLASSIFICATION USING NAIVE BAYES

In [77]:
def train_using_naive_bayes(training_data, test_data):
    classifier = nltk.NaiveBayesClassifier.train(training_data)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

In [200]:
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

def extract_features(text):
    return dict([(word, True) for word in word_tokenize(text)])

# Train using Naive Bayes classifier
def train_using_naive_bayes(training_data, test_data):
    # Extract features from training data
    # training_features = [(extract_features(text), category) for text, category in training_data]
    
    # Train the Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(training_data)
    
    classifier_name = type(classifier).__name__
    
    # Evaluate classifier on training set
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    
    # Extract features from test data
    test_features = [(extract_features(text), category) for text, category in test_data]
    
    # Evaluate classifier on test set
    test_set_accuracy = nltk.classify.accuracy(classifier, test_features)
    
    return classifier, classifier_name, training_set_accuracy, test_set_accuracy

In [201]:
# Train the classifier using Naive Bayes
classifier, classifier_name, training_set_accuracy, test_set_accuracy  = train_using_naive_bayes(features_data, test_data)

# Print accuracies and other information
print(training_set_accuracy)
print(test_set_accuracy)
print(len(classifier.most_informative_features()))
classifier.show_most_informative_features()

AttributeError: 'str' object has no attribute 'items'

In [185]:
classifier.classify(({"That's": True, 'helpful': True}))

'thanks'

In [192]:
word_feats(extract_feature("That's helpful"))

{'help': True}

In [193]:
input_sentence = "That's helpful"
classifier.classify(word_feats(extract_feature(input_sentence)))

'greeting'

In [196]:
def get_response(input):
    # Extract classification from the input sentence
    class_input = classifier.classify(word_feats(extract_feature(input)))
    print(class_input)
    # Initialize response variable
    response = None

    # Search for the response associated with the tag
    for intent in intents_data['intents']:
        if intent['tag'] == class_input:
            responses = intent['responses']
            response = random.choice(responses)
            break

    print("Response:", response)

In [198]:
# Input sentence
user_input = "Bye"
get_response(user_input)

greeting
Response: Good to see you again


In [199]:
classifier.classify(({"Bye": True, 'helpful': False}))

'goodbye'