In [1]:
#!/usr/bin/env python3
"""
NLP Chatbot (NLTK + optional spaCy)
===================================
A single-file, working chatbot that uses classic NLP techniques to answer user queries.

Features
--------
- Text preprocessing (lowercasing, tokenization, stopword removal, lemmatization)
- Intent recognition via TF-IDF + cosine similarity (scikit-learn if available)
  * Graceful fallback to keyword scoring when scikit-learn isn't installed.
- Small, embedded knowledge base for FAQs
- Rule-handled utilities: greetings, goodbye, thanks, time/date, simple calc
- Optional Named Entity Recognition (NER) via spaCy if installed (adds entities to responses)
- Clear, commented code suitable for coursework submissions

Run
---
1) CLI chatbot (recommended):
   python nlp_chatbot.py

2) Change configuration (optional):
   - Edit INTENTS, KNOWLEDGE_BASE, and THRESHOLDS below.

Dependencies
------------
- Python 3.8+
- NLTK (recommended)
  pip install nltk
  (The script tries to auto-download 'punkt', 'wordnet', 'omw-1.4', 'stopwords' if missing.
   If your environment blocks downloads, it will still run with simpler tokenization.)
- scikit-learn (optional, for TF-IDF):
  pip install scikit-learn
- spaCy (optional, for NER):
  pip install spacy
  python -m spacy download en_core_web_sm

Notes
-----
This is not a generative LLM; it's a classic retrieval/rule-based chatbot—perfect for demonstrating NLP fundamentals.
"""

from __future__ import annotations
import math
import re
import sys
import time
import json
import datetime as dt
from typing import List, Dict, Tuple, Optional

# -------------------- Optional imports with graceful fallback --------------------
# NLTK (preferred)
nltk_available = True
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
except Exception:
    nltk_available = False

# scikit-learn (for TF-IDF)
sklearn_available = True
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except Exception:
    sklearn_available = False

# spaCy (optional NER)
spacy_nlp = None
try:
    import spacy
    try:
        spacy_nlp = spacy.load("en_core_web_sm")
    except Exception:
        spacy_nlp = None
except Exception:
    spacy_nlp = None

# -------------------- Data: intents + knowledge base --------------------

# Intent definitions consumed by IntentMatcher and NLPChatbot.
#   "tag"         - identifier that NLPChatbot._handle_intent dispatches on.
#   "patterns"    - example utterances; each one is preprocessed and compared
#                   against the user's message by IntentMatcher.
#   "responses"   - canned replies; NLPChatbot._pick_response picks one at random.
#                   An empty list means the reply is generated in code instead.
#   "context_set" - not read anywhere in the code visible in this file.
# NOTE(review): patterns made only of stopwords (e.g. "what is") may preprocess
# to an empty string once NLTK stopwords are loaded - confirm they still match.
INTENTS: List[Dict] = [
    {
        "tag": "greeting",
        "patterns": [
            "hello", "hi", "hey", "good morning", "good afternoon", "good evening",
            "yo", "howdy", "namaste", "hola"
        ],
        "responses": [
            "Hello! How can I help you today?",
            "Hi there! What can I do for you?",
            "Hey! Ask me anything."
        ],
        "context_set": ""
    },
    {
        "tag": "goodbye",
        "patterns": ["bye", "goodbye", "see you", "catch you later", "quit", "exit"],
        "responses": [
            "Goodbye! Have a great day.",
            "See you later!",
            "Bye! Come back anytime."
        ],
        "context_set": ""
    },
    {
        "tag": "thanks",
        "patterns": ["thanks", "thank you", "ty", "thx", "appreciate it"],
        "responses": [
            "You're welcome!",
            "Anytime!",
            "Glad I could help."
        ],
        "context_set": ""
    },
    {
        "tag": "time",
        "patterns": ["what time is it", "current time", "tell me the time", "time now"],
        "responses": [],  # built in NLPChatbot._handle_intent from the current clock time
        "context_set": ""
    },
    {
        "tag": "date",
        "patterns": ["what's the date", "today's date", "date today"],
        "responses": [],  # built in NLPChatbot._handle_intent from today's date
        "context_set": ""
    },
    {
        "tag": "calc",
        "patterns": ["calculate", "compute", "solve", "what is", "evaluate"],
        "responses": [],  # built by NLPChatbot._safe_calculate from the expression in the message
        "context_set": ""
    },
    {
        "tag": "help",
        "patterns": ["help", "what can you do", "commands", "how to use"],
        "responses": [
            # Adjacent string literals: this is a single two-line response.
            "I can handle greetings, time/date, simple calculations, and FAQs.\n"
            "Try: 'what time is it', 'what is 22/7', or ask about the topics I know."
        ],
        "context_set": ""
    },
]

# Simple, embedded FAQ knowledge base.
# Keys are example questions (lowercase, no punctuation); FAQMatcher preprocesses
# each key and compares it with the user's message. Values are the answers
# returned verbatim when the best match reaches THRESHOLDS["faq_confidence"].
KNOWLEDGE_BASE: Dict[str, str] = {
    "what is nlp": "NLP (Natural Language Processing) is a field of AI focused on enabling computers to understand and generate human language.",
    "what is nltk": "NLTK is a leading Python library for working with human language data—tokenization, stemming, tagging, parsing, and more.",
    "what is spacy": "spaCy is a modern NLP library offering fast tokenization, POS tagging, NER, and other pipelines optimized for production.",
    "difference between nltk and spacy": "NLTK is great for education and research with many resources; spaCy focuses on production performance and modern pipelines (like efficient NER).",
    "what is tfidf": "TF-IDF (Term Frequency–Inverse Document Frequency) scores words by how important they are to a document within a corpus.",
    "who created you": "I was created as a demo chatbot using Python and classic NLP techniques.",
    "what can you do": "I can answer FAQs, tell time/date, do simple calculations, and show extracted entities if spaCy is available."
}

# Similarity thresholds.
# NLPChatbot.respond accepts a match only when its score is >= the value here.
# Scores come from IntentMatcher.match / FAQMatcher.best_answer: a TF-IDF cosine
# similarity when scikit-learn is importable, otherwise the fraction of query
# tokens that also occur in the candidate text.
THRESHOLDS = {
    "intent_confidence": 0.35,  # min score to accept an intent
    "faq_confidence": 0.25,     # min score to answer from the FAQ knowledge base
}

# -------------------- NLP utilities --------------------

_STOPWORDS = set()
_LEMMA = None
def _setup_nltk():
    """Prepare the NLTK resources used by ``tokenize`` and ``normalize``.

    Does nothing when NLTK could not be imported. Otherwise, for every
    required resource, checks whether it is already installed and tries a
    quiet download when it is not. Download failures are ignored on purpose:
    the rest of the module falls back to plain whitespace tokenization, an
    empty stopword set and no lemmatization.

    Side effects: rebinds the module globals ``_STOPWORDS`` (set of English
    stopwords, or an empty set) and ``_LEMMA`` (a ``WordNetLemmatizer`` or
    ``None``).
    """
    global _STOPWORDS, _LEMMA
    if not nltk_available:
        return

    # (lookup path for nltk.data.find, package id for nltk.download)
    # "punkt_tab" is the tokenizer data that recent NLTK releases load for
    # word_tokenize; "punkt" is kept for older releases.
    required_resources = (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("corpora/stopwords", "stopwords"),
        ("corpora/wordnet", "wordnet"),
        ("corpora/omw-1.4", "omw-1.4"),
    )
    for lookup_path, package_id in required_resources:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            try:
                nltk.download(package_id, quiet=True)
            except Exception:
                # Best effort only (offline / blocked network): keep going
                # and let the fallbacks below take over.
                pass

    try:
        _STOPWORDS = set(stopwords.words("english"))
    except Exception:
        # Corpus still unavailable: disable stopword filtering.
        _STOPWORDS = set()

    try:
        _LEMMA = WordNetLemmatizer()
    except Exception:
        # Lemmatizer unavailable: normalize() will keep tokens unchanged.
        _LEMMA = None

def tokenize(text: str) -> List[str]:
    """Lowercase *text*, blank out unwanted punctuation and split it into tokens.

    Word characters, whitespace and the symbols ``. + - * / ( ) %`` are kept so
    arithmetic survives; every other character is replaced by a space. NLTK's
    ``word_tokenize`` is used when NLTK is importable and working; on any
    failure the text is split on whitespace instead.
    """
    lowered = text.lower().strip()
    cleaned = re.sub(r"[^\w\s\.\+\-\*\/\(\)%]", " ", lowered)

    tokens = None
    if nltk_available:
        try:
            from nltk.tokenize import word_tokenize
            tokens = [tok for tok in word_tokenize(cleaned) if tok.strip()]
        except Exception:
            # Missing tokenizer data or similar: use the plain split below.
            tokens = None

    if tokens is None:
        # str.split() with no arguments never yields empty or blank pieces.
        tokens = cleaned.split()
    return tokens

def normalize(tokens: List[str]) -> List[str]:
    """Drop stopwords from *tokens* and lemmatize what remains.

    Uses the module globals ``_STOPWORDS`` and ``_LEMMA``. When no lemmatizer
    is configured, or lemmatizing a token raises, that token is kept as is.
    An empty input is returned unchanged (the same object).
    """
    if not tokens:
        return tokens

    def _lemma_of(token: str) -> str:
        # Lemmatization is optional; any failure leaves the token untouched.
        if not _LEMMA:
            return token
        try:
            return _LEMMA.lemmatize(token)
        except Exception:
            return token

    return [_lemma_of(token) for token in tokens if token not in _STOPWORDS]

def preprocess(text: str) -> str:
    """Return *text* tokenized, normalized and re-joined with single spaces."""
    raw_tokens = tokenize(text)
    kept_tokens = normalize(raw_tokens)
    return " ".join(kept_tokens)

# -------------------- TF-IDF pipeline (with fallback) --------------------

class IntentMatcher:
    """Pick the intent tag whose example pattern best matches a user message.

    Every pattern of every intent is preprocessed once at construction time.
    When scikit-learn is importable (and there is at least one pattern) the
    patterns are indexed with a uni/bi-gram TF-IDF vectorizer and queries are
    scored by cosine similarity; otherwise a token-overlap score is used.
    """

    def __init__(self, intents: List[Dict]):
        self.intents = intents
        self.use_sklearn = sklearn_available
        # Parallel lists: tags[i] is the intent that pattern_texts[i] belongs to.
        self.tags: List[str] = []
        self.pattern_texts: List[str] = []

        for entry in intents:
            label = entry["tag"]
            processed = [preprocess(raw) for raw in entry.get("patterns", [])]
            self.tags.extend([label] * len(processed))
            self.pattern_texts.extend(processed)

        if not (self.use_sklearn and self.pattern_texts):
            self.vectorizer = None
            self.tfidf = None
        else:
            self.vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1)
            self.tfidf = self.vectorizer.fit_transform(self.pattern_texts)

    def match(self, user_text: str) -> Tuple[Optional[str], float]:
        """Return ``(tag, score)`` for the best-matching pattern.

        Returns ``(None, 0.0)`` when the message is empty after preprocessing.
        In the TF-IDF path the highest-scoring tag is returned even if its
        score is 0.0; in the fallback path the tag is ``None`` unless some
        pattern shares at least one token with the message.
        """
        cleaned = preprocess(user_text)
        if not cleaned:
            return None, 0.0

        if self.use_sklearn and self.vectorizer is not None:
            query_vector = self.vectorizer.transform([cleaned])
            scores = cosine_similarity(query_vector, self.tfidf)[0]
            if len(scores) == 0:
                return None, 0.0
            best = int(scores.argmax())
            return self.tags[best], float(scores[best])

        # Fallback: share of the query's tokens that also appear in the pattern.
        query_terms = set(cleaned.split())
        denominator = len(query_terms) + 1e-9
        winner, top_score = None, 0.0
        for label, pattern in zip(self.tags, self.pattern_texts):
            overlap = len(query_terms & set(pattern.split())) / denominator
            if overlap > top_score:
                winner, top_score = label, overlap
        return winner, top_score

class FAQMatcher:
    """Retrieve the knowledge-base answer whose question best matches a message.

    The knowledge-base keys (questions) are preprocessed and, when scikit-learn
    is importable and the knowledge base is non-empty, indexed with a
    uni/bi-gram TF-IDF vectorizer. Otherwise a token-overlap score is used.
    """

    def __init__(self, kb: Dict[str, str]):
        self.kb = kb
        self.keys = list(kb.keys())
        self.use_sklearn = sklearn_available and len(self.keys) > 0
        if not self.use_sklearn:
            self.vectorizer = None
            self.tfidf = None
        else:
            self.vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1)
            questions = [preprocess(question) for question in self.keys]
            self.tfidf = self.vectorizer.fit_transform(questions)

    def best_answer(self, user_text: str) -> Tuple[Optional[str], float]:
        """Return ``(answer, score)`` for the closest question.

        Returns ``(None, 0.0)`` when the message is empty after preprocessing.
        In the fallback path the answer is ``None`` when no question shares a
        token with the message.
        """
        cleaned = preprocess(user_text)
        if not cleaned:
            return None, 0.0

        if self.use_sklearn and self.vectorizer is not None:
            query_vector = self.vectorizer.transform([cleaned])
            scores = cosine_similarity(query_vector, self.tfidf)[0]
            if len(scores) == 0:
                return None, 0.0
            best = int(scores.argmax())
            return self.kb[self.keys[best]], float(scores[best])

        # Fallback: share of the query's tokens that also appear in the question.
        query_terms = set(cleaned.split())
        denominator = len(query_terms) + 1e-9
        winner, top_score = None, 0.0
        for question in self.keys:
            question_terms = set(preprocess(question).split())
            overlap = len(query_terms & question_terms) / denominator
            if overlap > top_score:
                winner, top_score = question, overlap

        if winner:
            return self.kb[winner], top_score
        return None, top_score

# -------------------- Core chatbot --------------------

class NLPChatbot:
    """Rule/retrieval chatbot: intents first, then FAQ lookup, then arithmetic.

    ``respond`` is the single entry point. It tries, in order:

    1. intent matching (greetings, time/date, calculator, ...),
    2. FAQ retrieval from the knowledge base,
    3. a last-resort calculator pass when the message visibly contains an
       arithmetic expression (so "what is 22/7" works even when the words
       "what is" are removed as stopwords),
    4. a generic "not sure" reply.
    """

    class _UnsupportedExpression(ValueError):
        """Internal signal: the expression uses something outside + - * / // % ( )."""

    def __init__(self,
                 intents: List[Dict],
                 knowledge_base: Dict[str, str],
                 thresholds: Dict[str, float]):
        """Load NLTK resources and build the intent and FAQ matchers.

        Args:
            intents: intent definitions with "tag", "patterns" and "responses".
            knowledge_base: mapping of FAQ question -> answer.
            thresholds: must contain "intent_confidence" and "faq_confidence".
        """
        # Must run before the matchers are built: they preprocess their
        # patterns with whatever stopwords/lemmatizer are loaded at that time.
        _setup_nltk()
        self.intents = intents
        self.kb = knowledge_base
        self.thresholds = thresholds
        self.intent_matcher = IntentMatcher(intents)
        self.faq_matcher = FAQMatcher(knowledge_base)

    def respond(self, message: str) -> str:
        """Return the bot's reply to *message* (never raises for bad input)."""
        if not message or not message.strip():
            return "I didn't catch that. Could you rephrase?"

        # Try intent detection
        tag, score = self.intent_matcher.match(message)
        if tag and score >= self.thresholds["intent_confidence"]:
            resp = self._handle_intent(tag, message)
            if resp:
                return self._maybe_add_entities(message, resp)

        # Try FAQ retrieval
        answer, faq_score = self.faq_matcher.best_answer(message)
        if answer and faq_score >= self.thresholds["faq_confidence"]:
            return self._maybe_add_entities(message, answer)

        # Last resort before giving up: bare arithmetic such as "what is 22/7"
        # or "2+2", which the matchers above cannot score.
        if self._looks_like_arithmetic(message):
            return self._maybe_add_entities(message, self._safe_calculate(message))

        # Fallback
        return ("I'm not sure about that yet. Try asking in a different way, "
                "or type 'help' to see what I can do.")

    def _handle_intent(self, tag: str, message: str) -> Optional[str]:
        """Produce the reply for an accepted intent *tag*.

        "time", "date" and "calc" are generated dynamically; every other tag
        gets one of its canned responses (an empty string when it has none).
        """
        if tag == "time":
            now = dt.datetime.now().strftime("%H:%M:%S")
            return f"The current time is {now}."
        if tag == "date":
            today = dt.date.today().strftime("%A, %B %d, %Y")
            return f"Today is {today}."
        if tag == "calc":
            return self._safe_calculate(message)
        return self._pick_response(tag)

    def _pick_response(self, tag: str) -> str:
        """Return a random canned response for *tag*, or "" when there is none."""
        import random

        for intent in self.intents:
            if intent["tag"] == tag:
                responses = intent.get("responses", [])
                if responses:
                    return random.choice(responses)
        return ""

    def _looks_like_arithmetic(self, text: str) -> bool:
        """Return True when *text* contains a ``<digit> <operator> <operand>`` run."""
        return re.search(r"\d\s*[\+\-\*\/%]\s*[\d\(\.]", text) is not None

    def _safe_calculate(self, text: str) -> str:
        """
        Evaluate the arithmetic expression found in *text* and describe the result.

        Allowed: numbers, + - * / // % ( ) and unary sign. Anything else
        (including ``**``, which could otherwise be used to hang the process
        with huge powers) is rejected with an explanatory message. The
        expression is parsed with ``ast`` and evaluated node by node; Python's
        ``eval`` is never called on user input.

        Returns a user-facing string in every case: "<expr> = <value>" on
        success, or a message explaining what went wrong.
        """
        expr = self._extract_expression(text)
        if not expr:
            return "Please provide an expression, e.g., 'what is (2+3)*4'."
        if not re.fullmatch(r"[0-9\.\+\-\*\/%\(\)\s]+", expr):
            return "Sorry, I only support basic arithmetic (+, -, *, /, %, parentheses)."

        try:
            result = self._eval_arithmetic(expr)
            return f"{expr.strip()} = {result}"
        except ZeroDivisionError:
            return "Division by zero is undefined."
        except self._UnsupportedExpression:
            return "Sorry, I only support basic arithmetic (+, -, *, /, %, parentheses)."
        except Exception:
            # Syntax errors, absurdly deep nesting, etc.
            return "I couldn't evaluate that. Check the expression and try again."

    def _eval_arithmetic(self, expr: str):
        """Evaluate *expr* by walking its AST with a fixed operator whitelist.

        Raises:
            SyntaxError: *expr* is not valid Python expression syntax.
            ZeroDivisionError: division or modulo by zero.
            NLPChatbot._UnsupportedExpression: a node outside the whitelist.
        """
        import ast
        import operator

        binary_ops = {
            ast.Add: operator.add,
            ast.Sub: operator.sub,
            ast.Mult: operator.mul,
            ast.Div: operator.truediv,
            ast.FloorDiv: operator.floordiv,
            ast.Mod: operator.mod,
        }
        unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
        unsupported = self._UnsupportedExpression

        def evaluate(node):
            if isinstance(node, ast.Expression):
                return evaluate(node.body)
            # type() rather than isinstance(): bool and other constants are excluded.
            if isinstance(node, ast.Constant) and type(node.value) in (int, float):
                return node.value
            if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
                return binary_ops[type(node.op)](evaluate(node.left), evaluate(node.right))
            if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
                return unary_ops[type(node.op)](evaluate(node.operand))
            raise unsupported(type(node).__name__)

        return evaluate(ast.parse(expr.strip(), mode="eval"))

    def _extract_expression(self, text: str) -> Optional[str]:
        """Return the longest arithmetic-looking substring of *text*, or None.

        A candidate is a run of at least three characters drawn from digits,
        ``. + - * / % ( )`` and whitespace. Candidates without a single digit
        (runs of spaces, "...", " - ") are ignored because they cannot be a
        calculation. The returned string is not stripped.
        """
        candidates = [
            chunk
            for chunk in re.findall(r"([0-9\.\+\-\*\/%\(\)\s]{3,})", text)
            if any(ch.isdigit() for ch in chunk)
        ]
        if not candidates:
            return None
        # Pick the longest
        return max(candidates, key=len)

    def _maybe_add_entities(self, user_text: str, response: str) -> str:
        """Append spaCy named entities found in *user_text* to *response*.

        Returns *response* unchanged when spaCy is not loaded, when no entity
        is found, or when the spaCy pipeline raises.
        """
        if spacy_nlp is None:
            return response
        try:
            doc = spacy_nlp(user_text)
            ents = [(ent.text, ent.label_) for ent in doc.ents]
            if ents:
                ent_strs = [f"{ent_text} ({ent_label})" for ent_text, ent_label in ents]
                response += "\n\n[Detected entities: " + ", ".join(ent_strs) + "]"
        except Exception:
            # Entity display is a cosmetic extra; never let it break a reply.
            pass
        return response

# -------------------- CLI loop --------------------

def main() -> None:
    """Run the interactive command-line chat loop.

    Reads one line at a time from standard input and prints the bot's reply.
    The loop ends on "exit", "quit" or "bye" (case-insensitive), on end of
    input, or on Ctrl-C.
    """
    leave_words = {"exit", "quit", "bye"}

    print("NLP Chatbot (NLTK + optional spaCy)")
    print("Type 'exit' or 'quit' to end.\n")
    bot = NLPChatbot(INTENTS, KNOWLEDGE_BASE, THRESHOLDS)

    while True:
        try:
            user = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: say goodbye without calling the bot.
            print("\nBot: Goodbye!")
            return

        if user == "":
            print("Bot: I didn't catch that. Could you rephrase?")
            continue

        leaving = user.lower() in leave_words
        # Every leave word is answered as if the user had typed "bye".
        print("Bot:", bot.respond("bye" if leaving else user))
        if leaving:
            return

if __name__ == "__main__":
    main()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Durga Pr

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Durga Pr

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Durga Pr

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Durga Prasad\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Durga Pr

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



NLP Chatbot (NLTK + optional spaCy)
Type 'exit' or 'quit' to end.



You:  exit


Bot: Goodbye! Have a great day.
