# AI Chatbot Using NLP

In [1]:
import json, re, ast, operator as op, random
import numpy as np
from tqdm import tqdm
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tkinter as tk
from tkinter import scrolledtext
import warnings
warnings.filterwarnings('ignore')



In [2]:
with open(r"intents.json", "r") as f:
    intents = json.load(f)
print("Number of intents:", len(intents["intents"]))
print("First intent example:", intents["intents"][0].keys())


Number of intents: 23
First intent example: dict_keys(['intent', 'text', 'responses', 'extension', 'context', 'entityType', 'entities'])


In [3]:
texts = []
for it in intents["intents"]:
    texts.extend([t.lower() for t in it["text"]])

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)


# DO NOT RUN THIS CELL

**Glove.6B.50d.txt** was >100MB therefore this cell was written to achieve the size limit of below 100 MB for each Project.

In [4]:
""" glove_full = "embeddings/glove.6B.50d.txt"
glove_trim = "embeddings/glove_70mb_50d.txt"

dataset_vocab = set(tokenizer.word_index.keys())
target_words = 150000   # ~60–70 MB

kept = 0
with open(glove_full, "r", encoding="utf-8") as fin, \
     open(glove_trim, "w", encoding="utf-8") as fout:
    for i, line in enumerate(fin):
        word = line.split(" ", 1)[0]
        if i < target_words or word in dataset_vocab:
            fout.write(line)
            kept += 1

print(f"Trimmed GloVe saved: {glove_trim}, words kept = {kept}")
 """

' glove_full = "embeddings/glove.6B.50d.txt"\nglove_trim = "embeddings/glove_70mb_50d.txt"\n\ndataset_vocab = set(tokenizer.word_index.keys())\ntarget_words = 150000   # ~60–70 MB\n\nkept = 0\nwith open(glove_full, "r", encoding="utf-8") as fin,      open(glove_trim, "w", encoding="utf-8") as fout:\n    for i, line in enumerate(fin):\n        word = line.split(" ", 1)[0]\n        if i < target_words or word in dataset_vocab:\n            fout.write(line)\n            kept += 1\n\nprint(f"Trimmed GloVe saved: {glove_trim}, words kept = {kept}")\n '

In [5]:
MAX_LEN = 20

texts, labels = [], []
tag_index, tags = {}, []

for it in intents["intents"]:
    tag = it["intent"]  # use your field name
    if tag not in tag_index:
        tag_index[tag] = len(tag_index)
        tags.append(tag)
    for pat in it["text"]:   # your field for training phrases
        texts.append(pat.lower())
        labels.append(tag_index[tag])

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=MAX_LEN, padding="post")
y = to_categorical(labels, num_classes=len(tag_index))

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (151, 20)
Shape of y: (151, 23)


In [6]:
EMBED_DIM = 50
embeddings_index = {}

with open(r"embeddings\glove_70mb_50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

print("Loaded word vectors:", len(embeddings_index))

num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, EMBED_DIM))
for word, i in tokenizer.word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec


Loaded word vectors: 150002


In [7]:
model = Sequential()
model.add(Embedding(input_dim=num_words,
                    output_dim=EMBED_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_LEN,
                    trainable=True))
model.add(GlobalAveragePooling1D())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dense(len(tags), activation="softmax"))

model.compile(optimizer=Adam(1e-3),
              loss="categorical_crossentropy",
              metrics=["accuracy"])

history = model.fit(X, y, epochs=500, batch_size=32,
                    validation_split=0.1, verbose=1)


Epoch 1/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.0519 - loss: 3.1304 - val_accuracy: 0.0000e+00 - val_loss: 3.2764
Epoch 2/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0444 - loss: 3.1053 - val_accuracy: 0.0000e+00 - val_loss: 3.3081
Epoch 3/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0889 - loss: 3.0913 - val_accuracy: 0.0000e+00 - val_loss: 3.3546
Epoch 4/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1407 - loss: 3.0718 - val_accuracy: 0.0000e+00 - val_loss: 3.4086
Epoch 5/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1037 - loss: 3.0492 - val_accuracy: 0.0000e+00 - val_loss: 3.4776
Epoch 6/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1111 - loss: 3.0273 - val_accuracy: 0.0000e+00 - val_loss: 3.5564
Epoch 7/500
[1m5/5

In [8]:

SAFE_OPS = {
    ast.Add: op.add, ast.Sub: op.sub, ast.Mult: op.mul,
    ast.Div: op.truediv, ast.Pow: op.pow, ast.USub: op.neg,
    ast.Mod: op.mod, ast.FloorDiv: op.floordiv
}

def safe_eval(expr):
    def _eval(node):
        if isinstance(node, ast.Num): return node.n
        if isinstance(node, ast.BinOp): return SAFE_OPS[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp): return SAFE_OPS[type(node.op)](_eval(node.operand))
        raise ValueError("Unsupported")
    return _eval(ast.parse(expr, mode="eval").body)


MATH_WORDS = {
    "plus": "+",
    "minus": "-",
    "times": "*",
    "multiplied by": "*",
    "multiply": "*",
    "x": "*",
    "divide": "/",
    "divided by": "/",
    "over": "/",
    "mod": "%",
    "modulus": "%",
    "remainder": "%",
    "power": "**",
    "to the power of": "**"
}

def normalize_math(text):
    text = text.lower()
    # replace word-based math operators
    for word, sym in MATH_WORDS.items():
        text = text.replace(word, f" {sym} ")
    return text

def is_math_query(text):
    text = normalize_math(text)
    return bool(re.search(r'\d', text) and re.search(r'[\+\-\*\/\%\^]', text))

def parse_math(text):
    text = normalize_math(text)
    # keep only valid characters
    expr = re.sub(r"[^\d\.\+\-\*\/\^\%\(\)]", " ", text)
    expr = expr.replace("^", "**")
    return expr.strip()



In [9]:
CONF_THRESHOLD = 0.1

def predict_intent(text):
    seq = tokenizer.texts_to_sequences([text.lower()])
    X_inp = pad_sequences(seq, maxlen=MAX_LEN, padding="post")
    preds = model.predict(X_inp, verbose=0)[0]
    idx = int(np.argmax(preds))
    return tags[idx], float(preds[idx])

def get_response(text):
    if is_math_query(text):
        try:
            expr = parse_math(text)
            val = safe_eval(expr)
            return f"{expr} = {val}"
        except:
            return "Sorry, I couldn’t solve that."

    tag, conf = predict_intent(text)
    if conf < CONF_THRESHOLD:
        return "Sorry, I didn't understand. Can you rephrase?"

    for it in intents["intents"]:
        if it["intent"] == tag:   # match with your key
            return random.choice(it["responses"])
    return "Hmm."


In [10]:
covered = 0
for word in tokenizer.word_index.keys():
    if word in embeddings_index:
        covered += 1
print("Words in tokenizer:", len(tokenizer.word_index))
print("Covered by GloVe:", covered)


Words in tokenizer: 139
Covered by GloVe: 133


In [11]:
for it in intents["intents"]:
    print(it["intent"], ":", len(it["text"]))


Greeting : 7
GreetingResponse : 8
CourtesyGreeting : 7
CourtesyGreetingResponse : 8
CurrentHumanQuery : 7
NameQuery : 6
RealNameQuery : 7
TimeQuery : 7
Thanks : 6
NotTalking2U : 7
UnderstandQuery : 6
Shutup : 7
Swearing : 5
GoodBye : 4
CourtesyGoodBye : 6
WhoAmI : 6
Clever : 7
Gossip : 6
Jokes : 6
PodBayDoor : 7
PodBayDoorResponse : 7
SelfAware : 7
Math : 7


In [12]:
def debug_intent(text):
    seq = tokenizer.texts_to_sequences([text.lower()])
    X_inp = pad_sequences(seq, maxlen=MAX_LEN, padding="post")
    preds = model.predict(X_inp, verbose=0)[0]
    for i, tag in enumerate(tags):
        print(f"{tag}: {preds[i]:.3f}")
    idx = np.argmax(preds)
    print("→ Predicted:", tags[idx], "Confidence:", preds[idx])


In [13]:
#debug_intent("hi")
#debug_intent("bye")
#debug_intent("hello there")
debug_intent("thank you")

Greeting: 0.000
GreetingResponse: 0.000
CourtesyGreeting: 0.000
CourtesyGreetingResponse: 0.000
CurrentHumanQuery: 0.000
NameQuery: 0.000
RealNameQuery: 0.000
TimeQuery: 0.000
Thanks: 0.999
NotTalking2U: 0.000
UnderstandQuery: 0.000
Shutup: 0.000
Swearing: 0.000
GoodBye: 0.000
CourtesyGoodBye: 0.001
WhoAmI: 0.000
Clever: 0.000
Gossip: 0.000
Jokes: 0.000
PodBayDoor: 0.000
PodBayDoorResponse: 0.000
SelfAware: 0.000
Math: 0.000
→ Predicted: Thanks Confidence: 0.99903786


In [14]:
""" while True:
    user = input("You: ")
    if user.lower() in ["quit","exit"]: break
    print("Bot:", get_response(user))
 """

' while True:\n    user = input("You: ")\n    if user.lower() in ["quit","exit"]: break\n    print("Bot:", get_response(user))\n '

In [15]:


def send_message():
    user_msg = entry.get()
    if not user_msg.strip():
        return
    chat_window.config(state=tk.NORMAL)
    chat_window.insert(tk.END, "You: " + user_msg + "\n")
    chat_window.insert(tk.END, "Bot: " + get_response(user_msg) + "\n\n")
    chat_window.config(state=tk.DISABLED)
    chat_window.see(tk.END)
    entry.delete(0, tk.END)

# Create window
root = tk.Tk()
root.title("AI Chatbot")
root.geometry("500x500")

# Chat display area
chat_window = scrolledtext.ScrolledText(root, wrap=tk.WORD)
chat_window.config(state=tk.DISABLED)
chat_window.pack(padx=10, pady=10, fill=tk.BOTH, expand=True)

# User entry
entry_frame = tk.Frame(root)
entry_frame.pack(fill=tk.X, padx=10, pady=5)

entry = tk.Entry(entry_frame, font=("Arial", 12))
entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0,5))
entry.bind("<Return>", lambda event: send_message())  # send on Enter key

send_button = tk.Button(entry_frame, text="Send", command=send_message)
send_button.pack(side=tk.RIGHT)

root.mainloop()
