In [8]:
import json
import os
import uuid
from typing import Any, Dict, List, Optional, Union

import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm


# Load environment variables from a local .env file (if one exists) so the
# os.getenv lookups below, and the PINECONE_API_KEY lookup later on, can see them.
load_dotenv()

# Configuration
# The filesystem locations and the index name default to the original values
# but can be overridden through environment variables (or .env), so the
# notebook can run on another machine without editing this cell.
ENHANCED_QA_DIR = os.getenv('OWASP_QA_DIR', r'D:\CyberSec_Report_Parser\QA_Pairs\PORT_Scanning_QA')
MODEL_PATH = os.getenv('OWASP_MODEL_PATH', r'd:\OWASP_BERT\fine_tuned_owasp_model_advanced')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 32
MAX_TEXT_LENGTH = 512  # applied to the model as its max_seq_length
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME', "owasp-qa")
EMBEDDING_DIM = 768  # For BERT-based models

# OWASP Category Mapping: JSON file name (inside ENHANCED_QA_DIR) -> Pinecone namespace.
# The OWASP Top 10 entries are kept commented out for reference; only the
# port-scanning file is processed by this notebook run.
OWASP_CATEGORY_MAP = {
    #'A01_2021.json': 'A01_2021_Broken_Access_Control',
    #'A02_2021.json': 'A02_2021_Cryptographic_Failures',
    #'A03_2021.json': 'A03_2021_Injection',
    #'A04_2021.json': 'A04_2021_Insecure_Design',
    #'A05_2021.json': 'A05_2021_Security_Misconfiguration',
    #'A06_2021.json': 'A06_2021_Vulnerable_Components',
    #'A07_2021.json': 'A07_2021_Identification_Failures',
    #'A08_2021.json': 'A08_2021_Software_Integrity_Failures',
    #'A09_2021.json': 'A09_2021_Logging_Monitoring_Failures',
    #'A10_2021.json': 'A10_2021_Server_Side_Request_Forgery',
    'port_scanning.json': 'Nmap_Port_Scanning'
}

In [9]:
def load_model(model_path: str) -> SentenceTransformer:
    """Instantiate the fine-tuned SentenceTransformer stored at ``model_path``.

    The model is created on the module-level ``DEVICE`` and its
    ``max_seq_length`` is set to ``MAX_TEXT_LENGTH``.

    Args:
        model_path: Location of the saved model passed to ``SentenceTransformer``.

    Returns:
        The configured ``SentenceTransformer`` instance.

    Raises:
        Exception: Any error raised while loading is printed and re-raised unchanged.
    """
    print(f"Loading fine-tuned model from: {model_path}")
    print(f"Using device: {DEVICE.upper()}")

    try:
        encoder = SentenceTransformer(model_path, device=DEVICE)
        encoder.max_seq_length = MAX_TEXT_LENGTH
    except Exception as exc:
        # Report the failure for the notebook reader, then let it propagate.
        print(f"Error loading model: {exc}")
        raise
    else:
        return encoder

# Load the model once at notebook level; the embedding code in later cells
# reads this global `model`.
model = load_model(MODEL_PATH)

Loading fine-tuned model from: d:\OWASP_BERT\fine_tuned_owasp_model_advanced
Using device: CUDA


In [10]:
def initialize_pinecone(index_name: str, dimension: int) -> Any:
    """Return a handle to the Pinecone index ``index_name``, creating it if absent.

    The client is built with the ``PINECONE_API_KEY`` environment variable.
    When no existing index carries the requested name, a serverless index
    (cosine metric, AWS ``us-east-1``) with the given dimension is created
    first. The index statistics are printed before the handle is returned.

    Args:
        index_name: Name of the index to connect to.
        dimension: Vector dimension used only when the index has to be created.

    Returns:
        The index object obtained from ``Pinecone.Index``.
    """
    client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

    # Create the index only when none of the existing ones has this name.
    already_exists = any(existing.name == index_name for existing in client.list_indexes())
    if not already_exists:
        print(f"Creating new index: {index_name}")
        serverless = ServerlessSpec(cloud='aws', region='us-east-1')
        client.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=serverless,
        )
        print(f"Index '{index_name}' created.")

    handle = client.Index(index_name)
    print(f"Connected to index: {index_name}")
    stats = handle.describe_index_stats()
    print(f"Index stats: {stats}")
    return handle

# Initialize Pinecone once at notebook level; the upsert and query cells
# below use this global `index` handle.
index = initialize_pinecone(PINECONE_INDEX_NAME, EMBEDDING_DIM)

Connected to index: owasp-qa
Index stats: {'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'A01_2021_Broken_Access_Control': {'vector_count': 350},
                'A02_2021_Cryptographic_Failures': {'vector_count': 362},
                'A03_2021_Injection': {'vector_count': 333},
                'A04_2021_Insecure_Design': {'vector_count': 350},
                'A05_2021_Security_Misconfiguration': {'vector_count': 336},
                'A06_2021_Vulnerable_Components': {'vector_count': 368},
                'A07_2021_Identification_Failures': {'vector_count': 400},
                'A08_2021_Software_Integrity_Failures': {'vector_count': 420},
                'A09_2021_Logging_Monitoring_Failures': {'vector_count': 388},
                'A10_2021_Server_Side_Request_Forgery': {'vector_count': 439}},
 'total_vector_count': 3746,
 'vector_type': 'dense'}


In [11]:
def _flatten_qa_items(qa_data: Any) -> list:
    """Return the Q&A entries of a parsed JSON document as one flat list."""
    if isinstance(qa_data, dict):
        # A dict is treated as {category: entries}; entries may be a list or a dict.
        flattened = []
        for entries in qa_data.values():
            if isinstance(entries, list):
                flattened.extend(entries)
            elif isinstance(entries, dict):
                flattened.extend(entries.values())
        return flattened
    if isinstance(qa_data, list):
        return qa_data
    raise ValueError(f"Unsupported JSON root type: {type(qa_data).__name__}")


def _build_qa_metadata(item: dict, namespace: str) -> Optional[dict]:
    """Build the Pinecone metadata for one Q&A entry, or None if it has no question."""
    question = (item.get('question') or '').strip()
    if not question:
        return None

    item_id = item.get('id')
    if item_id is None or item_id == '':
        # Derive the fallback id from namespace + question so that re-running
        # the notebook overwrites the same vector instead of inserting a
        # duplicate under a fresh random id.
        item_id = uuid.uuid5(uuid.NAMESPACE_URL, f"{namespace}/{question}")

    metadata = {
        'text': question,
        # `or` also covers explicit nulls in the JSON, which would break .strip().
        'answer': (item.get('answer') or '').strip(),
        'type': item.get('type') or 'N/A',
        'intent': item.get('intent') or 'N/A',
        'id': str(item_id),
        'source': 'port_scanning' if 'port_scanning' in namespace.lower() else 'owasp'
    }

    topics = item.get('related_topics')
    if topics:
        # A plain string is stored as-is; joining it would split it into characters.
        if isinstance(topics, str):
            metadata['related_topics'] = topics
        else:
            metadata['related_topics'] = ', '.join(str(topic) for topic in topics)
    return metadata


def process_and_upsert_file(file_path: str, namespace: str) -> dict:
    """Process a single JSON file and upsert its contents to Pinecone.

    The file is parsed, flattened to a list of Q&A entries, and every entry
    with a non-empty question is embedded with the global ``model`` and
    upserted into the global ``index`` under ``namespace``. Work is done in
    chunks of 100 records; questions within a chunk are encoded together
    using ``BATCH_SIZE``.

    Args:
        file_path: Path of the JSON file to read (UTF-8).
        namespace: Pinecone namespace that receives the vectors.

    Returns:
        A dict with ``status`` and, unless the file itself could not be
        processed, ``count`` (vectors actually upserted):
        * ``{'status': 'success', 'count': n}`` when every chunk was stored;
        * ``{'status': 'partial', 'count': n, 'failed': m}`` when some chunks failed;
        * ``{'status': 'error', 'count': 0, 'failed': m, 'error': msg}`` when all failed;
        * ``{'status': 'error', 'error': msg}`` when reading/parsing failed.
        No exception escapes; failures are printed and reflected in the result.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            qa_items = _flatten_qa_items(json.load(f))

        # Build metadata first so one malformed entry only skips itself.
        records = []
        for item in qa_items:
            try:
                metadata = _build_qa_metadata(item, namespace)
            except Exception as e:
                print(f"Error processing item: {str(e)}")
                continue
            if metadata is not None:
                records.append(metadata)

        # Embed and upsert chunk by chunk, counting only what was really stored.
        upsert_batch_size = 100
        upserted = 0
        failed = 0
        last_error = ''
        chunk_starts = range(0, len(records), upsert_batch_size)
        for start in tqdm(chunk_starts, desc=f"Processing {os.path.basename(file_path)}"):
            chunk = records[start:start + upsert_batch_size]
            try:
                embeddings = model.encode(
                    [metadata['text'] for metadata in chunk],
                    batch_size=BATCH_SIZE,
                    show_progress_bar=False,
                )
                vectors = [
                    {
                        'id': metadata['id'],
                        'values': np.asarray(embedding).tolist(),
                        'metadata': metadata
                    }
                    for metadata, embedding in zip(chunk, embeddings)
                ]
                index.upsert(vectors=vectors, namespace=namespace)
                upserted += len(vectors)
            except Exception as e:
                failed += len(chunk)
                last_error = str(e)
                print(f"Error upserting batch {start // upsert_batch_size + 1}: {last_error}")

        print(f"Upserted {upserted} vectors to namespace: {namespace}")
        if failed:
            print(f"Failed to upsert {failed} vectors to namespace: {namespace}")
            if upserted:
                return {'status': 'partial', 'count': upserted, 'failed': failed}
            return {'status': 'error', 'count': 0, 'failed': failed, 'error': last_error}
        return {'status': 'success', 'count': upserted}

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return {'status': 'error', 'error': str(e)}

In [13]:
import uuid
def process_all_files():
    """Upsert every file listed in ``OWASP_CATEGORY_MAP`` and summarise the outcome.

    Each mapped file name is resolved against ``ENHANCED_QA_DIR``. Files that
    exist are handed to ``process_and_upsert_file`` with their namespace;
    missing files are recorded as errors. A per-file summary is printed at
    the end.

    Returns:
        Dict mapping each file name to its result dict.
    """
    outcomes = {}
    for filename, namespace in OWASP_CATEGORY_MAP.items():
        file_path = os.path.join(ENHANCED_QA_DIR, filename)
        print(f"\n{'='*80}")
        print(f"Processing: {filename} -> {namespace}")
        print('='*80)

        # Guard clause: record the missing file and move on to the next entry.
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            outcomes[filename] = {'status': 'error', 'error': 'File not found'}
            continue

        outcomes[filename] = process_and_upsert_file(file_path, namespace)

    print("\nProcessing complete. Summary:")
    for filename, outcome in outcomes.items():
        status = outcome.get('status', 'unknown')
        count = outcome.get('count', 0)
        print(f"{filename}: {status} - {count} vectors processed")

    return outcomes

# Run the processing; as the cell's last expression, the returned
# per-file summary dict is also displayed as the cell output.
process_all_files()


Processing: port_scanning.json -> Nmap_Port_Scanning


Processing port_scanning.json:   0%|          | 0/310 [00:00<?, ?it/s]

Upserted 310 vectors to namespace: Nmap_Port_Scanning

Processing complete. Summary:
port_scanning.json: success - 310 vectors processed


{'port_scanning.json': {'status': 'success', 'count': 310}}

In [14]:
def _list_index_namespaces() -> list:
    """Return the namespace names reported by the global index's statistics."""
    stats = index.describe_index_stats()
    # The stats object may expose `namespaces` as an attribute or as a mapping key.
    namespaces = getattr(stats, 'namespaces', None)
    if namespaces is None:
        try:
            namespaces = stats['namespaces']
        except (KeyError, TypeError):
            namespaces = {}
    return list(namespaces or {})


def query_pinecone(query: str, namespace: str = None, top_k: int = 3, is_port_scanning: bool = False):
    """Query the Pinecone index and print the best matches.

    The query text is embedded with the global ``model`` and searched in the
    global ``index``. A single Pinecone query targets one namespace only, so
    when ``namespace`` is omitted this function queries every namespace
    listed by ``describe_index_stats`` and keeps the ``top_k`` highest-scoring
    matches overall (the index in this notebook is created with the cosine
    metric, where a higher score is a closer match).

    Args:
        query: Natural-language query text.
        namespace: Namespace to search; ``None``/empty searches all namespaces.
        top_k: Number of results to print (and to request per namespace).
        is_port_scanning: Print type/intent/related-topics fields instead of
            the OWASP category field.

    Returns:
        None. Results are printed; any error is caught and printed as well.
    """
    try:
        # Generate query embedding
        query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy().tolist()

        base_params = {
            'vector': query_embedding,
            'top_k': top_k,
            'include_metadata': True
        }

        if namespace:
            print(f"Searching in namespace: {namespace}")
            targets = [namespace]
        else:
            print("Searching across all namespaces")
            # Fall back to one query without a namespace argument when the
            # index reports no namespaces at all.
            targets = _list_index_namespaces() or [None]

        # Collect (namespace, match) pairs from every targeted namespace.
        found = []
        for target in targets:
            params = dict(base_params)
            if target is not None:
                params['namespace'] = target
            response = index.query(**params)
            found.extend((target, match) for match in response.matches)

        if len(targets) > 1:
            # Merge the per-namespace rankings into one global top_k.
            found.sort(key=lambda pair: pair[1].score, reverse=True)
            found = found[:top_k]

        # Print results
        print(f"\nTop {top_k} results for query: '{query}'\n")
        for position, (source_namespace, match) in enumerate(found, 1):
            metadata = match.metadata or {}
            print(f"Result {position}:")
            print(f"  ID: {match.id}")
            print(f"  Score: {match.score:.4f}")
            if not namespace and source_namespace is not None:
                print(f"  Namespace: {source_namespace}")
            print(f"  Question: {metadata.get('text', 'N/A')}")
            print(f"  Answer: {metadata.get('answer', 'N/A')[:200]}...")

            # Handle different metadata structures
            if is_port_scanning:
                print(f"  Type: {metadata.get('type', 'N/A')}")
                print(f"  Intent: {metadata.get('intent', 'N/A')}")
                if 'related_topics' in metadata:
                    print(f"  Related Topics: {metadata['related_topics']}")
            else:
                print(f"  OWASP Category: {metadata.get('owasp_category', 'N/A')}")

            print("-" * 80)

    except Exception as e:
        print(f"Error querying Pinecone: {str(e)}")

In [15]:
# Smoke-test the port-scanning namespace with a few representative questions;
# each call prints its top matches (default top_k) with the port-scanning fields.
print("Testing Port Scanning Q&A:")
test_queries = [
    "What is port scanning?",
    "How to detect port scans?",
    "What are the different types of port scans?",
    "How to prevent port scanning?",
    "What is the difference between TCP and UDP scanning?"
]

for query in test_queries:
    query_pinecone(query, namespace="Nmap_Port_Scanning", is_port_scanning=True)

# No namespace is passed here, so the search scope is whatever query_pinecone
# does for namespace=None.
# NOTE(review): confirm it really covers every namespace and not only the default one.
print("\nTesting across all namespaces:")
query_pinecone("security best practices", top_k=5)

Testing Port Scanning Q&A:
Searching in namespace: Nmap_Port_Scanning

Top 3 results for query: 'What is port scanning?'

Result 1:
  ID: PS-Q001
  Score: 0.9298
  Question: What is port scanning in network security?
  Answer: Port scanning is a technique used to identify open, closed, or filtered ports on a target system. It helps determine which services or applications are running and accessible over the network....
  Type: basic_understanding
  Intent: define_port_scanning
  Related Topics: Network Security, Nmap, Penetration Testing, Ports
--------------------------------------------------------------------------------
Result 2:
  ID: PS-Q002
  Score: 0.7232
  Question: Why is port scanning performed during a security assessment?
  Answer: Port scanning is performed to discover active services and potential entry points for an attacker. It helps security professionals understand a system's exposure and vulnerabilities....
  Type: basic_understanding
  Intent: purpose_of_port_scann