In [11]:
import os
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from typing import Any, Dict, List, Optional, Union
import uuid # Ensure uuid is imported for generating IDs if needed


# Load environment variables
load_dotenv()

# Configuration
# This path should point to the parent directory of your OWASP_2021_Data and PORT_Scanning_QA folders.
ENHANCED_QA_DIR = r'D:\VulnScanAI_Chatbot\Data\QA_Pairs' 

MODEL_PATH = r'd:\OWASP_BERT\fine_tuned_owasp_model_advanced'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 32
MAX_TEXT_LENGTH = 512
PINECONE_INDEX_NAME = "owasp-qa" # This remains the overall index name
EMBEDDING_DIM = 768  # For BERT-based models

# Define the single, unified namespace for all your knowledge base data
UNIFIED_PINECONE_NAMESPACE = "owasp-cybersecurity-kb" # <-- NEW: This is your chosen unified namespace

# OWASP Category Mapping - ALL JSON files will now be upserted into the UNIFIED_PINECONE_NAMESPACE
# Keys are relative paths from ENHANCED_QA_DIR
OWASP_CATEGORY_MAP = {
    'OWASP_Top10_QA/A01_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A02_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A03_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A04_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A05_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A06_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A07_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A08_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A09_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'OWASP_Top10_QA/A10_2021.json': UNIFIED_PINECONE_NAMESPACE,
    'PORT_Scanning_QA/port_scanning.json': UNIFIED_PINECONE_NAMESPACE
}

In [12]:
def load_model(model_path: str) -> SentenceTransformer:
    """Load the fine-tuned SentenceTransformer model."""
    print(f"Loading fine-tuned model from: {model_path}")
    print(f"Using device: {DEVICE.upper()}")
    
    try:
        model = SentenceTransformer(model_path, device=DEVICE)
        model.max_seq_length = MAX_TEXT_LENGTH
        return model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

# Load the model
model = load_model(MODEL_PATH)

Loading fine-tuned model from: d:\OWASP_BERT\fine_tuned_owasp_model_advanced
Using device: CUDA


In [13]:
def initialize_pinecone(index_name: str, dimension: int) -> Any:
    """Initialize Pinecone and return the index."""
    # Initialize Pinecone
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    
    # Check if index exists, create if not
    if index_name not in [index.name for index in pc.list_indexes()]:
        print(f"Creating new index: {index_name}")
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
        print(f"Index '{index_name}' created.")
    
    # Connect to the index
    
    index = pc.Index(index_name)
    print(f"Connected to index: {index_name}")
    print(f"Index stats: {index.describe_index_stats()}")
    return index

# Initialize Pinecone
index = initialize_pinecone(PINECONE_INDEX_NAME, EMBEDDING_DIM)

Connected to index: owasp-qa
Index stats: {'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [14]:
def process_and_upsert_file(file_path: str) -> dict: # Removed namespace parameter
    """Process a single JSON file and upsert its contents to Pinecone."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            qa_data = json.load(f)
        
        # Handle both list and dict formats (OWASP 2021 JSONs are often categorized under keys)
        if isinstance(qa_data, dict):
            qa_list = []
            for category, items in qa_data.items():
                if isinstance(items, list):
                    qa_list.extend(items)
                elif isinstance(items, dict): # Fallback, though typically not needed for OWASP 2021 structure
                    qa_list.extend(items.values())
            qa_data = qa_list
        
        # Determine a more specific source for metadata
        # This will be the relative path from ENHANCED_QA_DIR (e.g., 'OWASP_2021_Data/A01_2021.json')
        # We can also infer a higher-level category from the path.
        relative_file_path = os.path.relpath(file_path, ENHANCED_QA_DIR).replace("\\", "/")
        
        # Extract the primary category (e.g., 'OWASP_2021_Data' or 'PORT_Scanning_QA')
        top_level_category = relative_file_path.split('/')[0] if '/' in relative_file_path else ""


        vectors = []
        for item in tqdm(qa_data, desc=f"Processing {os.path.basename(file_path)}"):
            try:
                question = item.get('question', '').strip()
                answer = item.get('answer', '').strip()

                if not question or not answer: # Ensure both question and answer are present
                    # print(f"Skipping item with missing question or answer in {os.path.basename(file_path)} (ID: {item.get('id', 'N/A')})")
                    continue
                
                # Use the question as the main text for embedding
                text_to_embed = question

                # Create metadata
                metadata = {
                    'question': question, # Store original question for display
                    'answer': answer,     # Store original answer for display
                    'id': item.get('id', str(uuid.uuid4())), # Use existing ID or generate new UUID
                    'type': item.get('type', 'N/A'),
                    'intent': item.get('intent', 'N/A'),
                    'source_file': relative_file_path, # e.g., 'OWASP_2021_Data/A01_2021.json'
                    'top_level_category': top_level_category # e.g., 'OWASP_2021_Data'
                }
                
                # Add related topics if they exist and are not empty
                if 'related_topics' in item and item['related_topics'] and len(item['related_topics']) > 0:
                    metadata['related_topics'] = ', '.join(item['related_topics'])
                else:
                    metadata['related_topics'] = 'N/A' # Explicitly set N/A if not present

                # Generate embedding for the question
                embedding = model.encode(text_to_embed, convert_to_tensor=True).cpu().numpy().tolist()
                
                vectors.append({
                    'id': metadata['id'],
                    'values': embedding,
                    'metadata': metadata
                })
                
            except Exception as e:
                print(f"Error processing item (ID: {item.get('id', 'N/A')}) in {os.path.basename(file_path)}: {str(e)}")
                continue
        
        # Upsert in batches to the UNIFIED_PINECONE_NAMESPACE
        batch_size = 100
        if vectors: # Only upsert if there are vectors to prevent errors
            for i in range(0, len(vectors), batch_size):
                batch = vectors[i:i + batch_size]
                try:
                    # Always upsert to the UNIFIED_PINECONE_NAMESPACE
                    index.upsert(vectors=batch, namespace=UNIFIED_PINECONE_NAMESPACE) 
                except Exception as e:
                    print(f"Error upserting batch {i//batch_size + 1} to namespace {UNIFIED_PINECONE_NAMESPACE}: {str(e)}")
            
            print(f"Upserted {len(vectors)} vectors to unified namespace: {UNIFIED_PINECONE_NAMESPACE}")
            return {'status': 'success', 'count': len(vectors)}
        else:
            print(f"No valid vectors to upsert from {os.path.basename(file_path)}.")
            return {'status': 'skipped', 'count': 0, 'message': 'No valid Q&A pairs found to embed.'}

    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return {'status': 'error', 'error': str(e)}

In [15]:
def process_all_files():
    """Process all JSON files specified in OWASP_CATEGORY_MAP and upsert to the unified namespace."""
    results = {}
    
    # Iterate through the OWASP_CATEGORY_MAP to get relative paths and their target namespace (which is now unified)
    for relative_path, target_namespace in OWASP_CATEGORY_MAP.items():
        file_path = os.path.join(ENHANCED_QA_DIR, relative_path) # Construct the full file path

        print(f"\n{'='*80}")
        print(f"Processing file: {relative_path} into namespace: {target_namespace}")
        print('='*80)
        
        if os.path.exists(file_path):
            # Pass only file_path, as target_namespace is handled globally now
            result = process_and_upsert_file(file_path) 
            results[relative_path] = result
        else:
            print(f"File not found: {file_path}")
            results[relative_path] = {'status': 'error', 'error': 'File not found'}
    
    print("\nProcessing complete. Summary:")
    for filename, result in results.items():
        status = result.get('status', 'unknown')
        count = result.get('count', 0)
        print(f"{filename}: {status} - {count} vectors processed")
    
    return results

# Run the processing
process_all_files()


Processing file: OWASP_Top10_QA/A01_2021.json into namespace: owasp-cybersecurity-kb


Processing A01_2021.json:   0%|          | 0/350 [00:00<?, ?it/s]

Upserted 350 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A02_2021.json into namespace: owasp-cybersecurity-kb


Processing A02_2021.json:   0%|          | 0/372 [00:00<?, ?it/s]

Upserted 372 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A03_2021.json into namespace: owasp-cybersecurity-kb


Processing A03_2021.json:   0%|          | 0/333 [00:00<?, ?it/s]

Upserted 333 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A04_2021.json into namespace: owasp-cybersecurity-kb


Processing A04_2021.json:   0%|          | 0/350 [00:00<?, ?it/s]

Upserted 350 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A05_2021.json into namespace: owasp-cybersecurity-kb


Processing A05_2021.json:   0%|          | 0/336 [00:00<?, ?it/s]

Upserted 336 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A06_2021.json into namespace: owasp-cybersecurity-kb


Processing A06_2021.json:   0%|          | 0/368 [00:00<?, ?it/s]

Upserted 368 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A07_2021.json into namespace: owasp-cybersecurity-kb


Processing A07_2021.json:   0%|          | 0/400 [00:00<?, ?it/s]

Upserted 400 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A08_2021.json into namespace: owasp-cybersecurity-kb


Processing A08_2021.json:   0%|          | 0/420 [00:00<?, ?it/s]

Upserted 420 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A09_2021.json into namespace: owasp-cybersecurity-kb


Processing A09_2021.json:   0%|          | 0/408 [00:00<?, ?it/s]

Upserted 408 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: OWASP_Top10_QA/A10_2021.json into namespace: owasp-cybersecurity-kb


Processing A10_2021.json:   0%|          | 0/439 [00:00<?, ?it/s]

Upserted 439 vectors to unified namespace: owasp-cybersecurity-kb

Processing file: PORT_Scanning_QA/port_scanning.json into namespace: owasp-cybersecurity-kb


Processing port_scanning.json:   0%|          | 0/310 [00:00<?, ?it/s]

Upserted 310 vectors to unified namespace: owasp-cybersecurity-kb

Processing complete. Summary:
OWASP_Top10_QA/A01_2021.json: success - 350 vectors processed
OWASP_Top10_QA/A02_2021.json: success - 372 vectors processed
OWASP_Top10_QA/A03_2021.json: success - 333 vectors processed
OWASP_Top10_QA/A04_2021.json: success - 350 vectors processed
OWASP_Top10_QA/A05_2021.json: success - 336 vectors processed
OWASP_Top10_QA/A06_2021.json: success - 368 vectors processed
OWASP_Top10_QA/A07_2021.json: success - 400 vectors processed
OWASP_Top10_QA/A08_2021.json: success - 420 vectors processed
OWASP_Top10_QA/A09_2021.json: success - 408 vectors processed
OWASP_Top10_QA/A10_2021.json: success - 439 vectors processed
PORT_Scanning_QA/port_scanning.json: success - 310 vectors processed


{'OWASP_Top10_QA/A01_2021.json': {'status': 'success', 'count': 350},
 'OWASP_Top10_QA/A02_2021.json': {'status': 'success', 'count': 372},
 'OWASP_Top10_QA/A03_2021.json': {'status': 'success', 'count': 333},
 'OWASP_Top10_QA/A04_2021.json': {'status': 'success', 'count': 350},
 'OWASP_Top10_QA/A05_2021.json': {'status': 'success', 'count': 336},
 'OWASP_Top10_QA/A06_2021.json': {'status': 'success', 'count': 368},
 'OWASP_Top10_QA/A07_2021.json': {'status': 'success', 'count': 400},
 'OWASP_Top10_QA/A08_2021.json': {'status': 'success', 'count': 420},
 'OWASP_Top10_QA/A09_2021.json': {'status': 'success', 'count': 408},
 'OWASP_Top10_QA/A10_2021.json': {'status': 'success', 'count': 439},
 'PORT_Scanning_QA/port_scanning.json': {'status': 'success', 'count': 310}}

In [16]:
def query_pinecone(query: str, top_k: int = 3): # Simplified signature
    """Query the Pinecone index using the unified namespace."""
    try:
        # Generate query embedding
        query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy().tolist()
        
        # Query parameters - always use the UNIFIED_PINECONE_NAMESPACE
        query_params = {
            'vector': query_embedding,
            'top_k': top_k,
            'include_metadata': True,
            'namespace': UNIFIED_PINECONE_NAMESPACE # Always query the unified namespace
        }
        
        print(f"Searching in unified namespace: {UNIFIED_PINECONE_NAMESPACE}")
        
        # Execute query
        results = index.query(**query_params)
        
        # Print results
        print(f"\nTop {top_k} results for query: '{query}'\n")
        if not results.matches:
            print("No matches found in the unified Pinecone knowledge base.")
            return

        for i, match in enumerate(results.matches, 1):
            print(f"Result {i}:")
            print(f"  ID: {match.id}")
            print(f"  Score: {match.score:.4f}")
            
            # CRITICAL DEBUGGING: Print the raw metadata first
            print(f"  DEBUG - Raw match metadata: {match.metadata}")

            # Print the actual question and answer from metadata
            question_text = match.metadata.get('question', 'N/A')
            answer_text = match.metadata.get('answer', 'N/A')

            print(f"  Question: {question_text}")
            print(f"  Answer: {answer_text[:200]}...") # Truncate long answers for display

            # Print other relevant metadata for context
            print(f"  Source File: {match.metadata.get('source_file', 'N/A')}")
            print(f"  Top-Level Category: {match.metadata.get('top_level_category', 'N/A')}")
            if 'related_topics' in match.metadata and match.metadata['related_topics'] != 'N/A':
                print(f"  Related Topics: {match.metadata['related_topics']}")
            print(f"  Type: {match.metadata.get('type', 'N/A')}") # Added type
            print(f"  Intent: {match.metadata.get('intent', 'N/A')}") # Added intent
            
            print("-" * 80)
            
    except Exception as e:
        print(f"Error querying Pinecone: {str(e)}")

In [17]:
# Test queries for port scanning (will now search in unified namespace)
print("Testing Port Scanning Q&A (now unified):")
test_queries_port_scanning = [
    "What is port scanning?",
    "How to detect port scans?",
    "What are the different types of port scans?",
    "How to prevent port scanning?",
    "What is the difference between TCP and UDP scanning?"
]

for query in test_queries_port_scanning:
    query_pinecone(query, top_k=3)

# Test general cybersecurity queries (will now search in unified namespace)
print("\nTesting General Cybersecurity Q&A (now unified):")
test_queries_general_cybersecurity = [
    "What is a SQL injection vulnerability?",
    "How do I prevent cross-site scripting (XSS)?",
    "Explain Broken Access Control.",
    "What are cryptographic failures?",
    "Describe insecure design principles.",
    "How can I secure vulnerable components?",
    "What are common security misconfigurations?",
    "How does server-side request forgery (SSRF) work?",
    "What are the remediation steps for a weak SSH configuration?", # From your app.py test
    "What solutions exist for protecting against SYN floods?" # From your app.py test
]

for query in test_queries_general_cybersecurity:
    query_pinecone(query, top_k=5) # Can increase top_k for general queries if desired

print("\nAll tests complete for unified namespace.")

Testing Port Scanning Q&A (now unified):
Searching in unified namespace: owasp-cybersecurity-kb

Top 3 results for query: 'What is port scanning?'

Result 1:
  ID: PS-Q001
  Score: 0.9298
  DEBUG - Raw match metadata: {'answer': 'Port scanning is a technique used to identify open, closed, or filtered ports on a target system. It helps determine which services or applications are running and accessible over the network.', 'id': 'PS-Q001', 'intent': 'define_port_scanning', 'question': 'What is port scanning in network security?', 'related_topics': 'Network Security, Nmap, Penetration Testing, Ports', 'source_file': 'PORT_Scanning_QA/port_scanning.json', 'top_level_category': 'PORT_Scanning_QA', 'type': 'basic_understanding'}
  Question: What is port scanning in network security?
  Answer: Port scanning is a technique used to identify open, closed, or filtered ports on a target system. It helps determine which services or applications are running and accessible over the network....
  Sour