In [1]:
# Install Dependencies

%pip install pandas 

Collecting pandas
  Downloading pandas-2.2.1-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.1-cp39-cp39-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB 2.0 MB/s eta 0:00:06
   ---------------------------------------- 0.1/11.6 MB 1.8 MB/s eta 0:00:07
    --------------------------------------- 0.2/11.6 MB 2.0 MB/s eta 0:00:06
    --------------------------------------- 0.3/11.6 MB 1.7 MB/s eta 0:00:07
   - -------------------------------------- 0.4/11.6 MB 1.8 MB/s eta 0:00:07
   - -------------------------------------- 0.4/11.6 MB 1.7 MB/s eta 0:00:07
   - -------------------------------------- 0.6/11.6 MB 1.9 MB/s eta 0:00:06
   -- -------------------------------

In [1]:
# Import necessary libraries

import numpy as np
import pandas as pd
import json
import string
import random

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [289]:
# Get the data from JSON

# JSON text is UTF-8 by specification; naming the encoding keeps the read from
# depending on the platform's default locale encoding.
with open('../kaggle-dataset.json', encoding='utf-8') as file:
    dataset = json.load(file)

In [290]:
dataset

{'intents': [{'tag': 'goodbye',
   'patterns': ['cya',
    'see you',
    'bye bye',
    'See you later',
    'Goodbye',
    'I am Leaving',
    'Bye',
    'Have a Good day',
    'talk to you later',
    'ttyl',
    'i got to go',
    'gtg'],
   'responses': ['See you later, thanks for visiting',
    'Have a nice day',
    'Bye! Come back again soon.',
    'Sad to see you go :(',
    'Talk to you later',
    'Goodbye!'],
   'context_set': ''},
  {'tag': 'creator',
   'patterns': ['what is the name of your developers',
    'what is the name of your creators',
    'what is the name of the developers',
    'what is the name of the creators',
    'who created you',
    'your developers',
    'your creators',
    'who are your developers',
    'developers',
    'you are made by',
    'you are made by whom',
    'who created you',
    'who create you',
    'creators',
    'who made you',
    'who designed you'],
   'responses': ["I am happy to introduce the creators of this project 'The impl

In [362]:
intents = dataset["intents"]

# Split the intent records into three parallel lists that share one index
tag = [entry['tag'] for entry in intents]
responses = [entry['responses'] for entry in intents]
patterns = [entry['patterns'] for entry in intents]

responses

[['See you later, thanks for visiting',
  'Have a nice day',
  'Bye! Come back again soon.',
  'Sad to see you go :(',
  'Talk to you later',
  'Goodbye!'],
 ["I am happy to introduce the creators of this project 'The implementation of a Virtual Assistant for the Academic Management of Students of TUP-Manila with the use of Naive Bayes Algorithm': Mary Jane Calulang, Jeanne May Carolino, Maria Evita Juan, John Paul Monter, and Vincent Johanne Tenorio."],
 ["I'm your trusty virtual assistant here to help you with anything you need about Technological University of the Philippines (TUP)!",
  "My name's [CHATBOT NAME], and I'm here to answer your curiosity about Technological University of the Philippines (TUP)!"],
 ['College is open 8am-5pm Monday-Friday!'],
 ['You can contact at: [NUMBER]'],
 ['Our university offers Information Technology, computer Engineering, Mechanical engineering,Chemical engineering, Civil engineering and extc Engineering.'],
 ['For Fee detail visit <a target="_bla

In [2]:
# Shared NLP helpers, built once and read by the text_cleaning function below
stemmer = PorterStemmer()  # Porter stemmer instance
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer instance
# English stop-word list converted to a set.
# Presumably needs the NLTK 'stopwords' corpus available locally -- TODO confirm
stop_words = set(stopwords.words('english'))

def text_cleaning(patterns, remove_stopwords=False):
    """Normalise one piece of text into a list of stemmed, lemmatised tokens.

    Steps, in order: lowercase, tokenize with ``word_tokenize``, drop
    punctuation tokens, optionally drop stop words, stem with the Porter
    stemmer, then lemmatise every stem as a verb.

    Parameters
    ----------
    patterns : str
        A single sentence or phrase (despite the plural name).
    remove_stopwords : bool, default False
        When True, tokens found in ``stop_words`` are dropped before stemming.
        The default keeps them, matching this function's previous output: the
        earlier version built a stop-word-free list but never used it.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    tokens = word_tokenize(patterns.lower())

    # Drop tokens made up only of punctuation characters. Checking every
    # character (rather than ``token not in string.punctuation``, which is a
    # substring test) also removes multi-character tokens such as '...'.
    tokens = [
        token for token in tokens
        if not all(char in string.punctuation for char in token)
    ]

    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]

    tokens = [stemmer.stem(token) for token in tokens]
    return [lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens]

In [3]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

: 

In [285]:
user_input = 'what is the name of your developers?'
text_cleaning(user_input)

['what', 'be', 'the', 'name', 'of', 'your', 'develop']

In [398]:
def get_responses(input_text):
    """Return one response for the intent whose pattern best overlaps the input.

    The input and every pattern in ``dataset['intents']`` are passed through
    ``text_cleaning``; a pattern's score is the number of distinct tokens it
    shares with the input. When several patterns tie for the best score, the
    one encountered last wins (``>=`` comparison).

    Parameters
    ----------
    input_text : str
        Raw user message.

    Returns
    -------
    str
        A randomly chosen response of the best-matching intent, or a fixed
        apology message when no pattern shares any token with the input.
    """
    input_tokens = set(text_cleaning(input_text))
    max_intersection = 0
    best_intent = None

    for intent in dataset['intents']:
        for pattern in intent['patterns']:
            pattern_tokens = set(text_cleaning(pattern))
            intersection_size = len(input_tokens & pattern_tokens)
            # A zero overlap is not a match. Without this guard a pattern that
            # shares nothing with the input would "tie" with the initial score
            # of 0 and be treated as the best candidate.
            if intersection_size and intersection_size >= max_intersection:
                max_intersection = intersection_size
                best_intent = intent

    if best_intent is None:
        # Same type as the matched case: a plain string.
        return "I'm sorry, I didn't understand that."

    # Draw the reply once, after the search has finished.
    return random.choice(best_intent['responses'])

In [310]:
def get_response(input_text):
    """Return one response for the intent whose pattern best overlaps the input.

    The input and every pattern in ``dataset['intents']`` are passed through
    ``text_cleaning``; a pattern's score is the number of distinct tokens it
    shares with the input. The first pattern to reach the highest score wins
    (strict ``>`` comparison).

    Parameters
    ----------
    input_text : str
        Raw user message.

    Returns
    -------
    str
        A randomly chosen response of the best-matching intent. When no
        pattern shares a token with the input, the fixed apology message is
        returned as a string as well (it used to come back wrapped in a
        one-element list, unlike the matched case).
    """
    input_tokens = set(text_cleaning(input_text))

    # Best score seen so far and the intent that produced it
    max_intersection = 0
    matching_intent = None

    for intent in dataset['intents']:
        for pattern in intent['patterns']:
            pattern_tokens = set(text_cleaning(pattern))
            intersection_size = len(input_tokens & pattern_tokens)
            if intersection_size > max_intersection:
                max_intersection = intersection_size
                matching_intent = intent

    if matching_intent is None:
        return "I'm sorry, I didn't understand that."

    # Draw the reply once, after the search has finished.
    return random.choice(matching_intent['responses'])

In [403]:
user_input = "where is the location of the college?"
response = get_response(user_input)
print(response)

<a target="_blank" href="ADD YOU GOOGLE MAP LINK HERE"> here</a>


# UNIT TESTING

In [372]:
# Map every intent tag to its list of candidate responses
chatbot_responses = {
    name: responses[position] for position, name in enumerate(tag)
}

print(chatbot_responses)

{'goodbye': ['See you later, thanks for visiting', 'Have a nice day', 'Bye! Come back again soon.', 'Sad to see you go :(', 'Talk to you later', 'Goodbye!'], 'creator': ["I am happy to introduce the creators of this project 'The implementation of a Virtual Assistant for the Academic Management of Students of TUP-Manila with the use of Naive Bayes Algorithm': Mary Jane Calulang, Jeanne May Carolino, Maria Evita Juan, John Paul Monter, and Vincent Johanne Tenorio."], 'name': ["I'm your trusty virtual assistant here to help you with anything you need about Technological University of the Philippines (TUP)!", "My name's [CHATBOT NAME], and I'm here to answer your curiosity about Technological University of the Philippines (TUP)!"], 'hours': ['College is open 8am-5pm Monday-Friday!'], 'number': ['You can contact at: [NUMBER]'], 'course': ['Our university offers Information Technology, computer Engineering, Mechanical engineering,Chemical engineering, Civil engineering and extc Engineering.'

In [400]:
import unittest

class TestChatbotResponses(unittest.TestCase):
    """Checks that sample questions get a reply belonging to the expected intent."""

    def _check_intent(self, user_input, expected_tag):
        """Ask the chatbot, echo its reply, and assert it is listed under ``expected_tag``."""
        response = get_response(user_input)
        print(response)
        self.assertIn(response, chatbot_responses[expected_tag])

    def test_greeting_response(self):
        self._check_intent("Hi!!", "greeting")

    def test_number_response(self):
        self._check_intent(
            "How can a user contact the college based on the chatbot's response?",
            "number",
        )

    def test_creator_response(self):
        self._check_intent(
            "Who are the developers of this chatbot according to the dataset?",
            "creator",
        )

    def test_location_response(self):
        self._check_intent("where is the location of the college?", "location")

    def test_chatbotname_response(self):
        self._check_intent("What is the chatbot's name as per the dataset?", "name")

# Run the unittests using the main function
# argv=[''] gives unittest an explicit argument list instead of the process's
# own command line (presumably the kernel's launch arguments -- TODO confirm);
# exit=False stops unittest.main from calling sys.exit() when the run finishes.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)


...

I'm your trusty virtual assistant here to help you with anything you need about Technological University of the Philippines (TUP)!
I am happy to introduce the creators of this project 'The implementation of a Virtual Assistant for the Academic Management of Students of TUP-Manila with the use of Naive Bayes Algorithm': Mary Jane Calulang, Jeanne May Carolino, Maria Evita Juan, John Paul Monter, and Vincent Johanne Tenorio.
Hello, thanks for visiting


..
----------------------------------------------------------------------
Ran 5 tests in 0.503s

OK


<a target="_blank" href="ADD YOU GOOGLE MAP LINK HERE"> here</a>
You can contact at: [NUMBER]


# Basics of NLP

In [8]:
# Convert to dataframe format

df = pd.DataFrame(dataset['intents'])
df.head()

Unnamed: 0,tag,patterns,responses,context_set
0,goodbye,"[cya, see you, bye bye, See you later, Goodbye...","[See you later, thanks for visiting, Have a ni...",
1,creator,"[what is the name of your developers, what is ...",[I am happy to introduce the creators of this ...,
2,name,"[name, your name, do you have a name, what are...",[I'm your trusty virtual assistant here to hel...,
3,hours,"[timing of college, what is college timing, wo...",[College is open 8am-5pm Monday-Friday!],
4,number,"[more info, contact info, how to contact colle...",[You can contact at: [NUMBER]],


In [182]:
df.shape

(36, 5)

In [39]:
df['tag'].value_counts()

tag
goodbye           1
creator           1
principal         1
sem               1
admission         1
scholarship       1
facilities        1
college intake    1
uniform           1
committee         1
random            1
swear             1
vacation          1
sports            1
salutation        1
greeting          1
task              1
extchod           1
computerhod       1
placement         1
hostel            1
name              1
hours             1
number            1
course            1
fees              1
location          1
event             1
menu              1
document          1
floors            1
syllabus          1
library           1
infrastructure    1
canteen           1
ragging           1
Name: count, dtype: int64

In [86]:
def count_patterns(patterns_list):
    """Return how many entries ``patterns_list`` contains."""
    total = len(patterns_list)
    return total

# Number of example patterns listed for each intent
patterns_count = df['patterns'].apply(count_patterns)

# Two-column summary: each tag next to its pattern count
tags_and_counts_df = pd.DataFrame(
    data={
        'tag': df['tag'],
        'pattern_count': patterns_count,
    }
)

tags_and_counts_df

               tag  pattern_count
0          goodbye             12
1          creator             16
2             name             13
3            hours             17
4           number             15
5           course             27
6             fees             23
7         location             14
8           hostel             22
9            event             11
10        document             13
11          floors              7
12        syllabus              7
13         library             14
14  infrastructure              3
15         canteen             11
16            menu              7
17       placement              9
18     computerhod              4
19         extchod              4
20       principal              7
21             sem             11
22       admission              6
23     scholarship             26
24      facilities              5
25  college intake              9
26         uniform              9
27       committee              6
28          ra

In [93]:
tags_and_counts_df.shape

(36, 2)

In [96]:
import nltk
from nltk import word_tokenize

# NOTE(review): `input` rebinds the name of Python's built-in input() for the
# rest of the session; the clean_text demo cell further down reads this variable.
input = 'who are your developers?'
print(word_tokenize(input))  # tokens, with the '?' split off as its own token

['who', 'are', 'your', 'developers', '?']


In [99]:
'''
    STOPWORDS are common words in a language that are often filtered out during text preprocessing in natural language processing tasks, as they typically do not carry significant meaning on their own. These words include articles (e.g., "a", "an", "the"), prepositions (e.g., "on", "in", "at"), conjunctions (e.g., "and", "but", "or"), and other frequently occurring words (e.g., "is", "are", "to").
'''

from nltk.corpus import stopwords

# Keep the set under its own name. Assigning it to `stopwords` would replace
# the NLTK corpus reader imported above, so any later `stopwords.words(...)`
# call would fail until the import is repeated.
english_stopwords = set(stopwords.words('english'))
print(english_stopwords)

{"didn't", 'once', 'in', "you're", 'further', 'not', 'own', "don't", "couldn't", 'before', 'so', 'after', 'should', "she's", 'other', 'doing', 'is', "mustn't", 'how', 'herself', 'out', "hasn't", 'by', 'shan', 'you', 'and', 'am', 'which', 'while', 'all', 'than', 'o', 'won', "needn't", 'too', 'just', 'didn', 'mightn', 'd', 'very', 'they', 'between', 'had', 'are', 'off', "shouldn't", 'above', 'of', 'such', 'hadn', 'ain', 'any', 'during', 'ma', 'as', "mightn't", "it's", 'me', 'there', 'hers', 'then', 'those', 'who', 'most', 'couldn', 'here', 'from', 'up', 'on', "shan't", 'only', 'y', 's', 'over', 'through', 'wasn', 'her', "aren't", 'when', 'it', 'has', 'myself', 'its', 'themselves', 'an', 'ourselves', 'at', 'was', 'being', 'himself', 'ours', 't', 'will', "haven't", 'were', "isn't", 'doesn', 'about', 'him', 'we', 'weren', 'have', 'haven', 'nor', 'she', 'for', 'isn', 'having', 'yourselves', 'these', 'more', 'because', 'below', 'each', 'into', 'our', 'their', "hadn't", 'why', "wasn't", 'with'

In [105]:
# Clean the input
import string

def clean_text(text):
    """Lowercase ``text`` and return its characters with punctuation removed.

    The result is a list of single-character strings (spaces included), not a
    joined string.
    """
    kept = []
    for symbol in text.lower():
        if symbol in string.punctuation:
            continue
        kept.append(symbol)
    return kept

# `input` is the sample sentence assigned in an earlier cell
data_1 = clean_text(input)
print(data_1)  # list of individual characters
print(''.join(data_1))  # the same characters joined back into one string

['w', 'h', 'o', ' ', 'a', 'r', 'e', ' ', 'y', 'o', 'u', 'r', ' ', 'd', 'e', 'v', 'e', 'l', 'o', 'p', 'e', 'r', 's']
who are your developers


In [152]:
from sklearn.feature_extraction.text import CountVectorizer

# One-document corpus used to show the vocabulary CountVectorizer learns
data_2 = ["what is the name of your developers or developer"]

# fit() returns the vectorizer itself, so construction and fitting are chained
vectorizer = CountVectorizer().fit(data_2)
print(vectorizer.vocabulary_)

{'what': 7, 'is': 2, 'the': 6, 'name': 3, 'of': 4, 'your': 8, 'developers': 1, 'or': 5, 'developer': 0}


In [153]:
vector = vectorizer.transform(data_2)
print(vector)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1


In [179]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def text_cleaning(patterns):
    """Tokenize each pattern and drop punctuation tokens and stop words.

    NOTE: this reuses the name of the single-string ``text_cleaning`` defined
    earlier in the notebook and replaces it once this cell runs. This version
    takes an iterable of strings and does no stemming or lemmatization.

    Parameters
    ----------
    patterns : iterable of str
        Sentences or phrases to clean.

    Returns
    -------
    list of list of str
        One list of lowercase tokens per input pattern, in input order.
    """
    def _clean_one(sentence):
        # Lowercase, tokenize, then keep only real, non-stop-word tokens
        words = word_tokenize(sentence.lower())
        return [
            word for word in words
            if word not in string.punctuation and word not in stop_words
        ]

    return [_clean_one(sentence) for sentence in patterns]
    

In [180]:
data_2 = ["what is the name of your developers or developer",
            "what is the name of your creators"]
text_cleaning(data_2)

[['name', 'developers', 'developer'], ['name', 'creators']]

In [161]:
df.head()

Unnamed: 0,tag,patterns,responses,context_set,patterns_count
0,goodbye,"[cya, see you, bye bye, See you later, Goodbye...","[See you later, thanks for visiting, Have a ni...",,12
1,creator,"[what is the name of your developers, what is ...",[I am happy to introduce the creators of this ...,,16
2,name,"[name, your name, do you have a name, what are...",[I'm your trusty virtual assistant here to hel...,,13
3,hours,"[timing of college, what is college timing, wo...",[College is open 8am-5pm Monday-Friday!],,17
4,number,"[more info, contact info, how to contact colle...",[You can contact at: [NUMBER]],,15


In [162]:
print(df.iloc[:,1])

0     [cya, see you, bye bye, See you later, Goodbye...
1     [what is the name of your developers, what is ...
2     [name, your name, do you have a name, what are...
3     [timing of college, what is college timing, wo...
4     [more info, contact info, how to contact colle...
5     [list of courses, list of courses offered, lis...
6     [information about fee, information on fee, te...
7     [where is the college located, college is loca...
8     [hostel facility, hostel servive, hostel locat...
9     [events organised, list of events, list of eve...
10    [document to bring, documents needed for admis...
11    [size of campus, building size, How many floor...
12    [Syllabus for IT, what is the Information Tech...
13    [is there any library, library facility, libra...
14    [how is college infrastructure, infrastructure...
15    [food facilities, canteen facilities, canteen ...
16    [food menu, food in canteen, Whats there on me...
17    [What is college placement, Which companie