In [1]:
"""
AI Chatbot with NLP - Internship Task 3
File: internship_task_3_chatbot.py

Description:
A simple, well-commented Python chatbot that uses NLP to answer user queries.
This script includes two modes:
  1) Command-line chat loop (default)
  2) Optional Flask web endpoint (uncomment to enable)

Key features:
 - Prepares training data (intents + example phrases + responses)
 - Cleans and lemmatizes input text using NLTK
 - Vectorizes text using TF-IDF (scikit-learn)
 - Uses cosine similarity to find the best-matching response
 - Has a fallback answer when confidence is low
 - Easy to extend: add intents/responses or switch to spaCy

Dependencies:
 - Python 3.8+
 - nltk
 - scikit-learn
 - flask (optional, for web mode)

Install dependencies (recommended):
 pip install nltk scikit-learn flask

You must also download some NLTK resources the first time you run:
 In Python REPL or at top of script (first run):
 >>> import nltk
 >>> nltk.download('punkt')
 >>> nltk.download('punkt_tab')
 >>> nltk.download('wordnet')
 >>> nltk.download('omw-1.4')


How to run (command line):
 python internship_task_3_chatbot.py

How to use (web mode):
 1) Uncomment the FLASK block at the bottom of this file.
 2) Run the script. Then POST /chat with JSON {"message": "Hi"}


Author: CodTech Intern
Date: 2025
"""

'\nAI Chatbot with NLP - Internship Task 3\nFile: internship_task_3_chatbot.py\n\nDescription:\nA simple, well-commented Python chatbot that uses NLP to answer user queries.\nThis script includes two modes:\n  1) Command-line chat loop (default)\n  2) Optional Flask web endpoint (uncomment to enable)\n\nKey features:\n - Prepares training data (intents + example phrases + responses)\n - Cleans and lemmatizes input text using NLTK\n - Vectorizes text using TF-IDF (scikit-learn)\n - Uses cosine similarity to find the best-matching response\n - Has a fallback answer when confidence is low\n - Easy to extend: add intents/responses or switch to spaCy\n\nDependencies:\n - Python 3.8+\n - nltk\n - scikit-learn\n - flask (optional, for web mode)\n\nInstall dependencies (recommended):\n pip install nltk scikit-learn flask\n\nYou must also download some NLTK corpora the first time you run:\n In Python REPL or at top of script (first run):\n >>> import nltk\n >>> nltk.download(\'punkt\')\n >>> nl

In [2]:
# -------------------------
# Imports
# -------------------------
import re
import random
import json
from typing import List, Tuple

In [3]:
# NLP libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
# Machine learning / similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Optional: Flask for a simple web API (uncomment if using web mode)
# from flask import Flask, request, jsonify


# -------------------------
# Helper functions for NLP preprocessing
# -------------------------
# Single WordNetLemmatizer instance, created once at import time and reused by
# lemmatize_text() for every token it processes.
lemmatizer = WordNetLemmatizer()

def clean_text(text: str) -> str:
    """Normalize raw text for matching.

    The input is lowercased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space, and runs of whitespace are
    collapsed to a single space with leading/trailing whitespace removed.

    Args:
        text: Raw input string.

    Returns:
        The normalized string (may be empty).
    """
    # Replace punctuation/symbols with spaces rather than deleting them so
    # that "a--b" becomes two tokens instead of being glued into "ab".
    without_symbols = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    return re.sub(r"\s+", " ", without_symbols).strip()


def lemmatize_text(text: str) -> str:
    """Lemmatize every token of ``text`` and return them space-joined.

    The text is split with NLTK's ``word_tokenize`` and each token is passed
    through the module-level ``lemmatizer`` (default part-of-speech).

    Args:
        text: Text to lemmatize; callers in this file pass already-cleaned text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(map(lemmatizer.lemmatize, word_tokenize(text)))


def preprocess(text: str) -> str:
    """Run the full text pipeline: ``clean_text`` followed by ``lemmatize_text``.

    Args:
        text: Raw text (a training pattern or a user message).

    Returns:
        The cleaned and lemmatized text.
    """
    return lemmatize_text(clean_text(text))

In [11]:
# -------------------------
# Ensure NLTK resources are available
# -------------------------
# Fetch the NLTK resources this notebook downloads before tokenizing and
# lemmatizing. The calls run on every execution of this cell; packages that
# are already present are reported as up-to-date by the downloader.
for resource_name in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource_name)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
# -------------------------
# Training data (intents)
# -------------------------
# This is a small example dataset. You can expand it with more intents and examples.
# Each intent is a dict with:
#   "tag"       - identifier of the intent
#   "patterns"  - example user phrases; each one becomes a TF-IDF document
#   "responses" - candidate replies; one is picked at random on a match
intents = [
    {
        "tag": "greeting",
        "patterns": [
            "hi",
            "hello",
            "hey",
            "good morning",
            "good evening",
            "hey there"
        ],
        "responses": [
            "Hello! How can I help you today?",
            "Hi there — what can I do for you?",
            "Hey! Ask me anything about this project."
        ]
    },
    {
        "tag": "goodbye",
        "patterns": ["bye", "see you", "goodbye", "catch you later"],
        "responses": ["Goodbye!", "See you later.", "Have a nice day!"]
    },
    {
        "tag": "thanks",
        "patterns": ["thanks", "thank you", "thx", "thanks a lot"],
        "responses": ["You're welcome!", "Anytime!", "Happy to help."]
    },
    {
        "tag": "project_info",
        "patterns": [
            "what is this project",
            "tell me about the project",
            "what does this do",
            "explain the internship task",
            "describe the chatbot"
        ],
        "responses": [
            "This is a sample NLP chatbot built for the CodTech internship task. It uses NLTK and TF-IDF to match user queries to intents.",
            "A small chatbot demo using preprocessing (lemmatization) and TF-IDF + cosine similarity to pick responses."
        ]
    },
    {
        "tag": "how_to_run",
        "patterns": [
            "how to run",
            "how do i run this",
            "run the script",
            "execute"
        ],
        "responses": [
            "Run the script with: python internship_task_3_chatbot.py. Make sure dependencies are installed and NLTK corpora downloaded.",
            "Install requirements with pip, download NLTK resources (punkt, wordnet) and run the .py file."
        ]
    },
    {
        # Fallback intent: get_response() looks this up by its tag when the best
        # similarity score is below the threshold. The "*" pattern is only a
        # placeholder - clean_text() replaces it with whitespace, so it
        # preprocesses to an empty string and cannot be matched by similarity.
        "tag": "default_fallback",
        "patterns": ["*"],
        "responses": [
            "Sorry, I didn't understand that. Can you phrase it differently?",
            "I am not sure I follow — try rephrasing or ask something else."
        ]
    }
]

# -------------------------
# Build the knowledge base
# -------------------------

# We will construct two lists:
#  - documents: example phrases (patterns) after preprocessing
#  - responses_map: maps each document index to a list of possible responses

# documents[i] is a preprocessed pattern; responses_map[i] is the response list
# of the intent that pattern came from. The two lists are appended to together
# so they always stay index-aligned.
documents: List[str] = []
responses_map: List[List[str]] = []

for intent in intents:
    for pattern in intent['patterns']:
        processed = preprocess(pattern)
        if not processed:
            # Patterns made only of symbols (e.g. the fallback's "*") are
            # stripped to an empty string by preprocessing. Indexing them would
            # add an all-zero TF-IDF row that can never match, and would let an
            # all-empty corpus slip past the sanity check below.
            continue
        documents.append(processed)
        # store the responses for this pattern (could map by tag too)
        responses_map.append(intent['responses'])

# sanity check: the vectorizer needs at least one non-empty document to fit on
if not documents:
    raise ValueError("No training documents found. Add some patterns to the intents list.")

In [13]:
# -------------------------
# TF-IDF Vectorizer
# -------------------------
# Fit the vectorizer on the preprocessed training patterns (documents).
# X_docs has one TF-IDF row per entry of `documents`, in the same order, so a
# row index can be used directly to index `responses_map`.
# NOTE(review): TfidfVectorizer is used with its defaults; per the scikit-learn
# docs the default token_pattern only keeps tokens of 2+ characters, so
# single-letter words such as "i" or "a" do not contribute to the match.
vectorizer = TfidfVectorizer()
X_docs = vectorizer.fit_transform(documents)

# -------------------------
# Chatbot logic: respond function
# -------------------------

def get_response(
    user_message: str,
    top_n: int = 1,
    threshold: float = 0.35,
) -> Tuple[str, float]:
    """Return the best response for ``user_message`` and its similarity score.

    Steps:
      1) Preprocess user input
      2) Vectorize with the same TF-IDF vectorizer used for the patterns
      3) Compute cosine similarity to every training pattern
      4) Pick the highest scoring pattern
      5) If the score is below ``threshold``, return a fallback response

    Args:
        user_message: Raw text typed by the user. ``None``, empty and
            whitespace-only input is answered without running the model.
        top_n: Currently unused; kept so existing callers that pass it keep
            working.
        threshold: Minimum cosine similarity required to accept the best
            match. Scores below it yield a fallback response. Defaults to
            0.35, the value previously hard-coded in this function.

    Returns:
        A ``(response_text, score)`` tuple. ``score`` is the cosine similarity
        of the best-matching pattern, or ``0.0`` for empty input.
    """
    if not user_message or not user_message.strip():
        return ("Please say something — I didn't get any input.", 0.0)

    # preprocess + vectorize with the already-fitted vectorizer
    processed = preprocess(user_message)
    user_vec = vectorizer.transform([processed])

    # cosine similarity against every training pattern (one score per row)
    sims = cosine_similarity(user_vec, X_docs).flatten()
    # argmax returns a NumPy integer; cast to a plain int before list indexing
    best_idx = int(sims.argmax())
    best_score = float(sims[best_idx])

    if best_score < threshold:
        # low confidence: answer with a random response of the intent tagged
        # "default_fallback", if one exists and actually has responses
        fallback_intent = next((i for i in intents if i['tag'] == 'default_fallback'), None)
        if fallback_intent and fallback_intent.get('responses'):
            return (random.choice(fallback_intent['responses']), best_score)
        # random.choice() raises IndexError on an empty list, so a missing or
        # empty fallback intent gets this built-in message instead
        return ("I'm not sure I understand — can you try rephrasing?", best_score)

    # high confidence: pick one response from the matched pattern's response list
    return (random.choice(responses_map[best_idx]), best_score)

In [14]:
# -------------------------
# Command-line chat loop
# -------------------------

def chat_loop():
    """Run an interactive console chat until the user leaves.

    Reads lines with ``input``; blank lines get a reminder, the words
    "quit", "exit" or "bye" (any letter case) end the session, and anything
    else is answered via ``get_response`` together with its confidence score.
    Ctrl-C or end-of-input while waiting for a line also ends the session.
    """
    exit_words = ("quit", "exit", "bye")
    print("\n=== CodTech Internship Chatbot (type 'quit' to exit) ===\n")

    running = True
    while running:
        try:
            user = input("You: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting. Bye!")
            running = False
        else:
            if user == "":
                print("Bot: Please type something.")
            elif user.lower() in exit_words:
                print("Bot: Goodbye! Good luck with your internship.")
                running = False
            else:
                response, score = get_response(user)
                # the score is printed as a debugging aid
                print(f"Bot: {response}  (confidence={score:.2f})")

In [15]:
# -------------------------
# Optional Flask Web API
# -------------------------
# If you prefer a small web endpoint, uncomment and run. Example request:
#   POST /chat  JSON body: {"message": "hello"}
#
# app = Flask(__name__)
#
# @app.route('/chat', methods=['POST'])
# def chat_api():
#     data = request.get_json(force=True)
#     message = data.get('message', '')
#     response_text, score = get_response(message)
#     return jsonify({'response': response_text, 'confidence': score})
#
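# if __name__ == '__main__':
#     # NOTE: debug=True enables Flask's interactive debugger; combined with
#     # host='0.0.0.0' it is reachable from other machines, so only use these
#     # settings on a trusted network (set debug=False otherwise).
#     app.run(host='0.0.0.0', port=5000, debug=True)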

# -------------------------
# Entry point
# -------------------------
if __name__ == '__main__':
    # Start the interactive command-line chat when this code is executed
    # directly (as a script or in a notebook cell). chat_loop() blocks until
    # the user quits. The Flask alternative is the commented-out block above.
    chat_loop()



=== CodTech Internship Chatbot (type 'quit' to exit) ===

You: Hi
Bot: Hi there — what can I do for you?  (confidence=1.00)
You: How is the weather today?
Bot: This is a sample NLP chatbot built for the CodTech internship task. It uses NLTK and TF-IDF to match user queries to intents.  (confidence=0.37)
You: How are you?
Bot: Goodbye!  (confidence=0.42)
You: What is my name?
Bot: This is a sample NLP chatbot built for the CodTech internship task. It uses NLTK and TF-IDF to match user queries to intents.  (confidence=0.75)
You: Thank you
Bot: Anytime!  (confidence=1.00)
You: Bye
Bot: Goodbye! Good luck with your internship.
