In [None]:
import pandas as pd
import torch
import json
import os
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Maps each Q&A source file to its OWASP Top 10 (2021) category label.
# The labels are used verbatim as Pinecone namespaces further down, so they
# must be spelled exactly the same way at indexing time and at query time.
owasp_category_map = {
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A01_2021.json': 'A01:2021 – Broken Access Control',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A02_2021.json': 'A02:2021 – Cryptographic Failures',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A03_2021.json': 'A03:2021 – Injection',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A04_2021.json': 'A04:2021 – Insecure Design',
    # Fixed typo: was 'Security Misconfigurationn' (would create a misspelled namespace).
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A05_2021.json': 'A05:2021 – Security Misconfiguration',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A06_2021.json': 'A06:2021 – Vulnerable and Outdated Components',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A07_2021.json': 'A07:2021 – Identification and Authentication Failures',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A08_2021.json': 'A08:2021 – Software and Data Integrity Failures',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A09_2021.json': 'A09:2021 – Security Logging and Monitoring Failures',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A10_2021.json': 'A10:2021 – Server-Side Request Forgery (SSRF)',
}

In [None]:
# Step 3: Load and Parse JSON Data
def load_json_data_for_qa(file_paths, owasp_map):
    """
    Load Q&A records from JSON files into a DataFrame.

    Every dict found anywhere in a file (at any nesting depth) that has both a
    'question' and an 'answer' key becomes one row. Dicts and lists are
    searched recursively, so Q&A pairs nested inside other Q&A pairs are
    collected too.

    Args:
        file_paths: Iterable of paths to the JSON files to read.
        owasp_map: Dict mapping a file to its OWASP category label. Keys may
            be the exact path passed in ``file_paths`` or just the file name
            (e.g. 'A01_2021.json'); both '/' and '\\' separators are understood.
            Files without a match are labelled 'General'.

    Returns:
        pandas.DataFrame with columns 'question', 'answer', 'id', 'intent',
        'type', 'related_topics' and 'owasp_category'. The frame is empty
        (no columns) when nothing could be loaded.

    Files that are missing, are not valid JSON, or fail for any other reason
    are reported with a printed message and skipped; no exception propagates.
    """

    def _basename(path):
        # Separator-agnostic basename so Windows-style keys also match when
        # the notebook runs on a POSIX system (os.path.basename would not).
        return str(path).replace('\\', '/').rsplit('/', 1)[-1]

    # Secondary lookup table keyed by bare file name.
    category_by_name = {_basename(key): label for key, label in owasp_map.items()}

    all_data = []

    def extract_qa_items(obj, category):
        if isinstance(obj, dict):
            # A dict with both core fields is a Q&A record.
            if 'question' in obj and 'answer' in obj:
                all_data.append({
                    'question': obj['question'],
                    'answer': obj['answer'],
                    'id': obj.get('id', None),
                    'intent': obj.get('intent', None),
                    'type': obj.get('type', None),
                    'related_topics': obj.get('related_topics', []),
                    'owasp_category': category,
                })
            # Keep descending: records may be nested inside other structures.
            for value in obj.values():
                extract_qa_items(value, category)
        elif isinstance(obj, list):
            for element in obj:
                extract_qa_items(element, category)

    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                data = json.load(handle)

            # Prefer an exact-path match, then fall back to the file name.
            category = owasp_map.get(
                file_path,
                category_by_name.get(_basename(file_path), 'General'),
            )
            extract_qa_items(data, category)
        except FileNotFoundError:
            print(f"Error: {file_path} not found.")
        except json.JSONDecodeError:
            print(f"Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            print(f"An unexpected error occurred while processing {file_path}: {e}")

    return pd.DataFrame(all_data)


# Define the JSON file paths to load (ensure these match where your Q&A files are stored)
# One Q&A file per OWASP Top 10 (2021) category, A01 through A10.
file_paths = [
    rf'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A{number:02d}_2021.json'
    for number in range(1, 11)
]


# Load every Q&A pair from the configured files into one DataFrame.
df_qa = load_json_data_for_qa(file_paths, owasp_category_map)

if df_qa.empty:
    # Raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down and discards all state, whereas an exception stops the run
    # at this cell with a readable message.
    raise RuntimeError(
        "No Q&A data loaded for Pinecone. Please ensure JSON files are correct and present."
    )

# Drop rows missing any field that embeddings / Pinecone indexing rely on.
# The result is re-assigned (no inplace mutation) and the index is reset so
# that row labels are again 0..n-1 and line up with positional batching later.
required_columns = ['question', 'answer', 'id', 'type', 'owasp_category']
df_qa = df_qa.dropna(subset=required_columns).reset_index(drop=True)

print(f"Loaded and filtered {len(df_qa)} Q&A entries for Pinecone indexing.")
print("\nDataFrame Head:")
print(df_qa.head())
print(f"\nUnique OWASP Categories (for namespaces): {df_qa['owasp_category'].nunique()}")
print(df_qa['owasp_category'].value_counts())

In [None]:
# Step 4: Initialize Pinecone
# Credentials are read from the PINECONE_API_KEY / PINECONE_ENVIRONMENT
# environment variables so that real keys are never saved inside the notebook.
# You can find these in your Pinecone dashboard: https://app.pinecone.io/
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "YOUR_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "YOUR_ENVIRONMENT")  # e.g. "us-east-1" or "gcp-starter"

if PINECONE_API_KEY == "YOUR_API_KEY" or PINECONE_ENVIRONMENT == "YOUR_ENVIRONMENT":
    print("\nWARNING: Please replace 'YOUR_API_KEY' and 'YOUR_ENVIRONMENT' with your actual Pinecone credentials.")
    print("Set the PINECONE_API_KEY and PINECONE_ENVIRONMENT environment variables before starting the kernel.")
    # Execution continues, but index creation/upsert will fail without valid credentials.

try:
    pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    print("\nPinecone initialized successfully.")
except Exception as e:
    print(f"Error initializing Pinecone: {e}")
    print("Please check your API key, environment, and network connectivity.")
    # Re-raise: without a client the code below would only fail later with a
    # confusing NameError on `pc`.
    raise


# Define Pinecone index details
index_name = "security-qa-chatbot"
dimension = 768 # all-mpnet-base-v2 embeddings are 768-dimensional
metric = "cosine" # Cosine similarity is standard for sentence embeddings

# Check if index exists, create if not.
# list_indexes() yields index descriptions, not plain strings, so membership
# must be tested against the list of names.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    print(f"Creating new Pinecone index: {index_name}...")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud='aws', region='us-west-2') # Example spec, adjust region/cloud as needed
    )
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

index = pc.Index(index_name)
print(f"Pinecone index info: {index.describe_index_stats()}")

In [None]:
# Step 5: Load Semantic Model (from Stage 2)
# Ensure you have the fine_tuned_semantic_model directory from Stage 2 run
semantic_model_path = "./fine_tuned_semantic_model"
if not os.path.exists(semantic_model_path):
    print(f"\nError: Fine-tuned semantic model not found at '{semantic_model_path}'.")
    print("Please ensure Stage 2 (Semantic Search Embedding Fine-tuning) was run successfully and saved the model.")
    # Raise rather than exit(): exit() would shut down the Jupyter kernel and
    # throw away the DataFrame and Pinecone client built in earlier cells.
    raise FileNotFoundError(f"Fine-tuned semantic model not found at '{semantic_model_path}'.")

try:
    semantic_model = SentenceTransformer(semantic_model_path)
    semantic_model.to(device) # Move model to the selected device (GPU when available)
    print(f"\nSemantic embedding model loaded successfully from {semantic_model_path}.")
except Exception as e:
    print(f"Error loading semantic model: {e}")
    print("Ensure 'sentence-transformers' library is correctly installed and the model path is valid.")
    # Propagate the original error so the run stops here with a full traceback.
    raise

In [None]:
# Step 6: Prepare Data for Indexing & Generate Embeddings
# Each row is embedded as "question: ... answer: ..." for richer retrieval context.
# Each vector in Pinecone needs a unique ID and optional metadata.
# The 'id' from the JSON data is used as the vector ID.
# Metadata carries 'question', 'answer', 'intent', 'type', 'related_topics', and 'owasp_category'.

vectors_to_upsert = []
batch_size = 100 # Number of rows embedded per model call
max_text_length = 512 # Cap, in characters, on the text handed to the embedding model

print("\nGenerating embeddings and preparing data for Pinecone upsert...")
# Iterate through DataFrame in batches
for i in range(0, len(df_qa), batch_size):
    batch_df = df_qa.iloc[i : i + batch_size]

    # Concatenate question and answer for a rich embedding context
    texts_to_embed = [
        f"question: {row['question']} answer: {row['answer']}"[:max_text_length]
        for _, row in batch_df.iterrows()
    ]

    # Generate embeddings for the batch
    batch_embeddings = semantic_model.encode(texts_to_embed, convert_to_tensor=True, device=device).tolist()

    # Pair rows with embeddings by position. Using the DataFrame label
    # (idx - i) breaks as soon as rows were dropped and labels have gaps.
    for (_, row), embedding in zip(batch_df.iterrows(), batch_embeddings):
        vector_id = str(row['id']) # Ensure ID is string

        raw_metadata = {
            "question": row['question'],
            "answer": row['answer'],
            "intent": row['intent'],
            "type": row['type'],
            "related_topics": row['related_topics'],
            "owasp_category": row['owasp_category']
        }

        # Pinecone rejects null metadata values, so missing fields (e.g. an
        # absent 'intent') are omitted; list values are sent as lists of strings.
        metadata = {}
        for key, value in raw_metadata.items():
            if isinstance(value, (list, tuple)):
                metadata[key] = [str(element) for element in value if element is not None]
            elif not pd.isna(value):
                metadata[key] = value

        vectors_to_upsert.append({
            "id": vector_id,
            "values": embedding,
            "metadata": metadata
        })

print(f"Prepared {len(vectors_to_upsert)} vectors for upsert.")

In [None]:
# Step 7: Upsert to Pinecone Index with Namespaces
# Namespaces logically partition an index; each vector goes into the
# namespace named after its 'owasp_category' metadata field.

vectors_by_namespace = {}
for vec in vectors_to_upsert:
    # Group vectors under their OWASP category.
    vectors_by_namespace.setdefault(vec['metadata']['owasp_category'], []).append(vec)

print("\nUpserting vectors to Pinecone index with OWASP categories as namespaces...")
upsert_batch_size = 100 # Pinecone recommended batch size for upsert

for namespace, vectors in vectors_by_namespace.items():
    print(f"Upserting {len(vectors)} vectors to namespace: '{namespace}'")
    # Send the namespace's vectors in fixed-size slices.
    batch_starts = range(0, len(vectors), upsert_batch_size)
    for i in batch_starts:
        batch = vectors[i : i + upsert_batch_size]
        try:
            index.upsert(vectors=batch, namespace=namespace)
        except Exception as e:
            # Best effort: report the failed batch and carry on with the next one.
            print(f"Error during upsert to namespace '{namespace}': {e}")

print("\nPinecone upsert complete.")
print(f"Final index stats after upsert: {index.describe_index_stats()}")

In [None]:
# Step 8: Test Retrieval (Basic Semantic Search Query)
print("\n--- Testing Basic Pinecone Retrieval ---")

# Sample question, searched within a single OWASP category namespace.
query_text = "How can I prevent unauthorized access in my web application?"
query_namespace = 'A01:2021 – Broken Access Control' # Example namespace

# Embed the query with the same model used for indexing.
query_embedding = semantic_model.encode(query_text, convert_to_tensor=True, device=device).tolist()

try:
    print(f"\nQuerying Pinecone in namespace '{query_namespace}' for: \"{query_text}\"")
    query_results = index.query(
        vector=query_embedding,
        top_k=5, # Five nearest neighbours
        include_metadata=True, # Needed to get question, answer, etc. back
        namespace=query_namespace,
    )

    print("\nTop 5 Retrieved Results:")
    for rank, match in enumerate(query_results.matches, start=1):
        print(f"\n--- Result {rank} ---")
        print(f"Score: {match.score:.4f}")
        print(f"Vector ID: {match.id}")

        meta = match.metadata
        if not meta:
            print("No metadata found.")
            continue

        print(f"Question: {meta.get('question', 'N/A')}")
        print(f"Answer: {meta.get('answer', 'N/A')[:100]}...") # Show only the first 100 characters
        print(f"Intent: {meta.get('intent', 'N/A')}")
        print(f"Type: {meta.get('type', 'N/A')}")
        print(f"OWASP Category: {meta.get('owasp_category', 'N/A')}")
        print(f"Related Topics: {meta.get('related_topics', 'N/A')}")

except Exception as e:
    print(f"Error during Pinecone query: {e}")
    print("Please ensure your Pinecone index is active and correctly configured.")


print("\nStage 4: Embedding and Pinecone indexing complete with OWASP categories as namespaces.")