In [1]:
import json
import os
import time
import re
from collections import Counter
from sentence_transformers import SentenceTransformer, util
import numpy as np
from pinecone import Pinecone, ServerlessSpec
from tqdm.notebook import tqdm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Divy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = "owasp-chatbot-index"
EMBEDDING_DIM = 768

In [3]:
# Update the Pinecone index check section
print("Initializing Pinecone client and connecting to index...")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to the existing Pinecone index
try:
    # List all indexes and check if our index exists
    indexes = pc.list_indexes()
    index_names = [index.name for index in indexes.indexes] if hasattr(indexes, 'indexes') else []
    
    if PINECONE_INDEX_NAME not in index_names:
        raise ValueError(f"Pinecone index '{PINECONE_INDEX_NAME}' does not exist. Please run the data population notebook first.")
    
    # Connect to the index
    pinecone_index = pc.Index(PINECONE_INDEX_NAME)
    print(f"Successfully connected to Pinecone index '{PINECONE_INDEX_NAME}'.")
    
    # Get index stats
    index_stats = pinecone_index.describe_index_stats()
    print(f"Index stats: Dimension={index_stats.dimension}, Total Vectors={index_stats.total_vector_count}")
    
except Exception as e:
    print(f"CRITICAL ERROR: Failed to connect to Pinecone index '{PINECONE_INDEX_NAME}'.")
    print(f"Reason: {e}")
    print("Please ensure:")
    print("1. Your Pinecone API key is correct")
    print("2. The index name is correct")
    print("3. The index exists in your Pinecone project")
    print("4. Your network connection is stable")
    raise

Initializing Pinecone client and connecting to index...
Successfully connected to Pinecone index 'owasp-chatbot-index'.
Index stats: Dimension=768, Total Vectors=2160


In [4]:
import pinecone
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
print("Available indexes:", [idx.name for idx in pc.list_indexes()])

Available indexes: ['owasp-chatbot-index']


In [6]:
model_path = './fine_tuned_owasp_model_advanced'
print(f"Initializing FINE-TUNED BERT model from {model_path}...")
try:
    model = SentenceTransformer(model_path)
    print("Fine-tuned BERT model loaded successfully.")
except Exception as e:
    print(f"Error loading fine-tuned model from {model_path}: {e}")
    print("Please ensure you've run Notebook 1 successfully and the model path is correct.")
    print("Falling back to default 'all-mpnet-base-v2' model. This will likely reduce accuracy.")
    model = SentenceTransformer('all-mpnet-base-v2')
    print("Default 'all-mpnet-base-v2' model loaded.")

print("\nInitializing Pinecone client...")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define the index name directly since it's not in .env
INDEX_NAME = "owasp-chatbot-index"

try:
    print(f"\nConnecting to Pinecone index: {INDEX_NAME}")
    
    # List all indexes for debugging
    print("\nAvailable indexes in your project:")
    indexes = pc.list_indexes()
    index_names = [index.name for index in indexes]
    print(index_names)
    
    # Check if index exists
    if INDEX_NAME not in index_names:
        raise ValueError(f"Index '{INDEX_NAME}' not found. Available indexes: {index_names}")
    
    # Connect to the index
    pinecone_index = pc.Index(INDEX_NAME)
    
    # Verify connection by getting stats
    try:
        stats = pinecone_index.describe_index_stats()
        print(f"\nSuccessfully connected to index: {INDEX_NAME}")
        print(f"Index stats: {stats.dimension} dimensions, {stats.total_vector_count} vectors")
    except Exception as e:
        print(f"\nWarning: Connected to index but could not retrieve stats: {e}")
        print("The index might be initializing. Waiting 10 seconds and retrying...")
        time.sleep(10)
        stats = pinecone_index.describe_index_stats()
        print(f"Retry successful! Index stats: {stats.dimension} dimensions, {stats.total_vector_count} vectors")
    
except Exception as e:
    print(f"\nERROR: Failed to connect to Pinecone index '{INDEX_NAME}'")
    print(f"Reason: {str(e)}")
    print("\nTroubleshooting steps:")
    print("1. Verify the index name is correct (case-sensitive)")
    print("2. Check your Pinecone dashboard to confirm the index exists")
    print("3. Ensure your API key has permissions to access the index")
    print("4. Try accessing the index directly with: pc.Index('owasp-chatbot-index')")
    raise

print("\nPinecone connection established successfully!")

Initializing FINE-TUNED BERT model from ./fine_tuned_owasp_model_advanced...
Fine-tuned BERT model loaded successfully.

Initializing Pinecone client...

Connecting to Pinecone index: owasp-chatbot-index

Available indexes in your project:
['owasp-chatbot-index']

Successfully connected to index: owasp-chatbot-index
Index stats: 768 dimensions, 2160 vectors

Pinecone connection established successfully!


In [7]:
# Cell 4: Define keyword extraction and query expansion functions

def extract_keywords(text):
    """Extract important keywords from text for query expansion."""
    stopwords = {"a", "an", "the", "and", "or", "but", "if", "then", "else", "when", 
                "at", "by", "for", "with", "about", "against", "between", "into", 
                "through", "during", "before", "after", "above", "below", "to", "from", 
                "up", "down", "in", "out", "on", "off", "over", "under", "again", 
                "further", "then", "once", "here", "there", "all", "any", "both", 
                "each", "few", "more", "most", "other", "some", "such", "no", "nor", 
                "not", "only", "own", "same", "so", "than", "too", "very", "can", 
                "will", "just", "should", "now", "what", "which", "how", "where", "is", "are"}
    
    security_terms = {"vulnerability", "exploit", "cve", "attack", "threat", "risk", 
                     "compromise", "security", "breach", "patch", "fix", "update", 
                     "mitigation", "remediation", "severity", "impact", "unauthorized", 
                     "access", "disclosure", "injection", "overflow", "credentials"}
    
    stopwords = stopwords - security_terms
    
    words = text.lower().split()
    important_words = [word for word in words if word not in stopwords and len(word) > 2]
    
    word_counts = Counter(important_words)
    keywords = [word for word, count in word_counts.most_common(7)]
    
    all_keywords = list(set(important_words + keywords))
    
    return all_keywords

def expand_query(question):
    """Generate multiple query variations to improve retrieval based on keywords and common patterns."""
    print(f"  - Original query for expansion: '{question}'")
    keywords = extract_keywords(question)
    
    queries = [question]
    
    if len(keywords) > 0:
        keyword_query = " ".join(keywords)
        if keyword_query.lower() != question.lower() and keyword_query not in queries:
            queries.append(keyword_query)
    
    question_lower = question.lower()
    for q_word in ["what is ", "what does ", "how to ", "how do i ", "tell me about ", "explain "]:
        if question_lower.startswith(q_word):
            clean_q = question[len(q_word):].strip()
            if clean_q and clean_q.lower() not in [q.lower() for q in queries]:
                queries.append(clean_q)
    
    if any(term in question_lower for term in ["vulnerability", "security", "risk", "threat", "issue"]):
        if "fix" in question_lower or "remediation" in question_lower or "solution" in question_lower:
            if "how to fix" not in question_lower and "remediation steps" not in question_lower:
                queries.append(f"how to fix {question_lower}")
                queries.append(f"remediation steps for {question_lower}")
        if "prevent" in question_lower or "avoid" in question_lower:
            queries.append(f"prevent {question_lower}")
        if "detect" in question_lower or "find" in question_lower:
            queries.append(f"detect {question_lower}")

    unique_queries = list(dict.fromkeys(queries))
    
    print(f"  - Expanded queries ({len(unique_queries)}):")
    for i, q in enumerate(unique_queries):
        print(f"    {i+1}. {q}")
    
    return unique_queries

In [8]:
# Cell 5: Define the find_answer_from_pinecone function (MODIFIED for context)

def find_answer_from_pinecone(user_query, pinecone_index, model, chat_history=[], top_k_per_query=3, final_top_k=1, similarity_threshold=0.6):
    """
    Finds the most semantically similar question in Pinecone, retrieving from expanded queries,
    and includes proactive suggestions if available. Now incorporates chat history for context.
    
    Args:
        user_query (str): The current question asked by the user.
        pinecone_index (Pinecone.Index): The initialized Pinecone index.
        model (SentenceTransformer): The loaded (fine-tuned) BERT model.
        chat_history (list): A list of recent (user, bot) conversation turns.
        top_k_per_query (int): Number of top results to retrieve for each expanded query.
        final_top_k (int): Number of overall best results to consider.
        similarity_threshold (float): Minimum similarity score to consider a match valid.

    Returns:
        str: The answer to the most similar question, or a fallback message if no good match is found.
    """
    if not user_query:
        return "Please type a question to get started!"

    # 1. Contextualize the user query
    contextual_query = user_query
    if chat_history:
        # Combine the last few turns of conversation to provide context for the current query.
        # We limit the history to avoid making the query too long for BERT.
        # Format: "(user: user_message) (bot: bot_response) current_user_query"
        recent_history_parts = []
        for sender, message in chat_history[-4:]: # Consider last 2 user turns and 2 bot turns (4 items total)
            # Shorten message if very long to prevent input token limits
            short_message = message[:100] + "..." if len(message) > 100 else message
            recent_history_parts.append(f"({sender}: {short_message})")
        
        contextual_query = f"{' '.join(recent_history_parts)} {user_query}"
        print(f"\n--- Contextualized Query (with history): '{contextual_query}' ---")

    # 2. Expand the contextualized query
    expanded_queries = expand_query(contextual_query) # Use contextual_query for expansion
    
    all_query_results = []

    # 3. Generate embeddings and query Pinecone for each expanded query
    for query_text in expanded_queries:
        query_embedding = model.encode(query_text).tolist()
        try:
            results = pinecone_index.query(
                vector=query_embedding,
                top_k=top_k_per_query,
                include_metadata=True
            )
            all_query_results.extend(results.matches)
        except Exception as e:
            print(f"Error querying Pinecone with expanded query '{query_text}': {e}")

    if not all_query_results:
        return "Sorry, I'm having trouble retrieving information from my knowledge base right now, or no results were found for any query variations."

    # 4. Consolidate and rank all results from expanded queries
    all_query_results.sort(key=lambda x: x.score, reverse=True)
    
    unique_results = []
    seen_ids = set()
    for match in all_query_results:
        original_id = match.metadata.get('id_original')
        if original_id and original_id not in seen_ids:
            unique_results.append(match)
            seen_ids.add(original_id)
    
    if not unique_results:
        return "I couldn't find any relevant information for your query in the knowledge base after trying multiple approaches."

    best_match = unique_results[0]
    best_score = best_match.score
    
    matched_question = best_match.metadata.get('question', 'N/A')
    matched_answer = best_match.metadata.get('answer', 'N/A')
    related_topics = best_match.metadata.get('related_topics', [])

    print(f"\n--- Debug Info ---")
    print(f"Original user query: '{user_query}'")
    print(f"Contextualized query used for search: '{contextual_query}'") # Show the actual query used for search
    print(f"Overall best matched question (from Pinecone): '{matched_question}'")
    print(f"Overall best similarity score: {best_score:.4f} (Threshold: {similarity_threshold})")
    print(f"Related topics found: {related_topics}")
    print(f"--- End Debug Info ---\n")

    response_text = ""
    if best_score >= similarity_threshold:
        response_text = matched_answer
        
        if related_topics:
            response_text += "\n\n**You might also be interested in:**\n"
            for topic in related_topics:
                response_text += f"- {topic}\n"
    else:
        response_text = (f"I'm sorry, I couldn't find a direct answer to '{user_query}' "
                         f"in my knowledge base. The closest match I found was "
                         f"'{matched_question}' with a similarity score of {best_score:.2f}. "
                         f"Perhaps try rephrasing your question or asking about a more specific OWASP Top 10 topic.")
    
    return response_text

In [9]:
# Cell 6: Define the question segmentation function (MODIFIED to use NLTK)

def segment_questions(user_input):
    """
    Segments a user's multi-question input into individual sentences/questions using NLTK.
    Requires 'punkt' tokenizer to be downloaded (nltk.download('punkt')).
    """
    if not user_input.strip():
        return []

    # Use NLTK's sentence tokenizer for more robust segmentation
    # It handles various punctuation and sentence structures better than simple regex.
    sentences = nltk.sent_tokenize(user_input)
    
    # Filter out empty strings and strip whitespace from each segmented sentence
    final_questions = [q.strip() for q in sentences if q.strip()]
    
    # Fallback: if NLTK fails to segment (e.g., if punkt not downloaded or unusual input),
    # revert to a simpler regex-based or single-question approach.
    if not final_questions:
        # Original regex-based segmentation (kept for robustness if NLTK isn't available/working)
        segments_regex = re.split(r'(\?|\!|\.|\band\b|\bor\b)', user_input, flags=re.IGNORECASE)
        temp_questions = []
        current_q_part = ""
        for i, segment in enumerate(segments_regex):
            segment = segment.strip()
            if not segment: continue
            is_delimiter = False
            if segment in ['?', '!', '.'] or segment.lower() in ['and', 'or']: is_delimiter = True
            if is_delimiter:
                if current_q_part:
                    if segment in ['?', '!', '.']: current_q_part += segment
                    temp_questions.append(current_q_part.strip())
                    current_q_part = ""
            else:
                if current_q_part: current_q_part += " "
                current_q_part += segment
        if current_q_part: temp_questions.append(current_q_part.strip())
        
        final_questions = [q for q in temp_questions if q]
        
    # If still no questions, return the original input as a single question
    if not final_questions and user_input.strip():
        final_questions = [user_input.strip()]

    return final_questions

In [11]:
# Cell 7: Main Execution - Connect to Pinecone and Start Chatbot

print("Starting Main Chatbot Backend...")

print("\n=======================================================")
print("   OWASP Top 10 Chatbot Ready for Interaction! ")
print("=======================================================")
print("Type your questions about OWASP vulnerabilities (e.g., 'What is injection?').")
print("You can now ask follow-up questions like 'How to prevent that?'")
print("Type 'exit' or 'quit' to end the session.")
print("-------------------------------------------------------\n")

# --- Initialize chat history ---
chat_history = [] # Stores (sender, message) tuples, typically (role, text)

# --- Start the interactive chatbot session ---
while True:
    user_input = input("You: ").strip()
    
    if user_input.lower() in ['exit', 'quit']:
        print("Bot: Goodbye! Stay secure.")
        break
    
    if not user_input:
        print("Bot: Please enter a question.")
        continue

    # Add user's raw input to history *before* segmentation for full context
    chat_history.append(("user", user_input))

    # Segment the user's input into individual questions
    # NLTK's sent_tokenize is now the primary method
    questions = segment_questions(user_input)
    
    if not questions:
        print("Bot: I didn't detect any valid questions in your input. Please try again.")
        # Remove the last user input from history if no valid questions were found
        if chat_history and chat_history[-1][0] == "user":
            chat_history.pop()
        continue

    full_response_parts = []
    # Loop through each segmented question to get an individual response
    for i, q in enumerate(questions):
        # Pass the current chat_history to the find_answer_from_pinecone function for context
        individual_response = find_answer_from_pinecone(q, pinecone_index, model, chat_history=chat_history)
        
        if len(questions) > 1:
            full_response_parts.append(f"**Question {i+1}:** _{q}_") # Markdown for bold and italics
            full_response_parts.append(individual_response)
            full_response_parts.append("\n---") # Separator between answers
        else:
            full_response_parts.append(individual_response) # For single questions, no prefix needed

    # Join all individual responses into a single output
    final_bot_response = "\n".join(full_response_parts).strip()
    # Remove trailing separator if it's the last one in the list
    if final_bot_response.endswith("\n---"):
        final_bot_response = final_bot_response[:-4].strip()
    elif final_bot_response.endswith("---"):
        final_bot_response = final_bot_response[:-3].strip()

    print(f"Bot: {final_bot_response}\n")
    
    # Add the bot's final combined response for this turn to history
    chat_history.append(("bot", final_bot_response))

    # Optional: Limit chat history to a few turns to prevent it from growing too large
    # and to keep context relevant to recent interactions.
    # Keep last 2 user messages and 2 bot responses (total 4 elements in history)
    max_history_elements = 4 
    if len(chat_history) > max_history_elements:
        chat_history = chat_history[-max_history_elements:]

Starting Main Chatbot Backend...

   OWASP Top 10 Chatbot Ready for Interaction! 
Type your questions about OWASP vulnerabilities (e.g., 'What is injection?').
You can now ask follow-up questions like 'How to prevent that?'
Type 'exit' or 'quit' to end the session.
-------------------------------------------------------


--- Contextualized Query (with history): '(user: "If my system's internal configuration files are accessible through a web browser, what vulnerabilit...) "If my system's internal configuration files are accessible through a web browser, what vulnerability applies?"' ---
  - Original query for expansion: '(user: "If my system's internal configuration files are accessible through a web browser, what vulnerabilit...) "If my system's internal configuration files are accessible through a web browser, what vulnerability applies?"'
  - Expanded queries (2):
    1. (user: "If my system's internal configuration files are accessible through a web browser, what vulnerabilit...) 