In [1]:
import json
import os
import random
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from datasets import Dataset 

In [2]:
# --- 1. Configuration ---
# Model produced by the earlier fine-tuning run; it is the starting point for this notebook.
PREVIOUSLY_FINE_TUNED_MODEL_PATH = r'D:\OWASP_BERT\fine_tuned_owasp_model_advanced'
# The further fine-tuned model is written to its OWN directory. Pointing this at the
# same directory as the model being loaded would overwrite the base model in place and
# make the final verification step reload the old model if training is interrupted.
NEW_FINE_TUNED_MODEL_SAVE_PATH = r'D:\OWASP_BERT\fine_tuned_owasp_port_scan_model'
PORT_SCAN_QA_FILE_PATHS = [
    r'D:\CyberSec_Report_Parser\QA_Pairs\PORT_Scanning_QA\port_scanning.json'
]

# Training parameters
BATCH_SIZE = 16  # Adjust based on your GPU memory. Lower if you get OOM errors.
NUM_EPOCHS = 10  # Further fine-tuning often needs fewer epochs than the initial run;
                 # consider lowering (e.g. 1-5) if the validation metrics stop moving.
LEARNING_RATE = 2e-5  # Common learning rate for fine-tuning BERT models
WARMUP_STEPS_RATIO = 0.1  # 10% of total training steps for warmup
EVALUATION_STEPS_RATIO = 0.5  # Evaluate after every 50% of an epoch
CORPUS_CHUNK_SIZE = 500  # Adjust based on GPU memory for evaluation.
CHECKPOINT_SAVE_LIMIT = 3  # Maximum number of checkpoints kept on disk

# Seed the RNGs this notebook touches so shuffling is repeatable after Restart & Run All.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
import uuid  # stdlib; load_qa_data uses it to mint ids for entries that have none


def load_qa_data(json_file_paths: list) -> list[dict]:
    """
    Load Q&A pairs from a list of JSON files into one flat list.

    Each file is expected to hold a JSON object whose keys are categories and
    whose values are lists of Q&A objects (e.g. {"question": "...", "answer": "..."}).

    Args:
        json_file_paths: Paths of the JSON files to read.

    Returns:
        A list of dicts with the keys 'question', 'answer', 'id' and 'type'.
        'id' falls back to a freshly generated UUID string and 'type' to 'N/A'
        when the source entry does not provide them.

    Problems are reported with print() and never raised: a missing or
    unreadable file is skipped as a whole, while a malformed category or entry
    is skipped on its own so the remaining entries of that file still load.
    """
    all_qa_data = []
    for file_path in json_file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: Q&A file not found at {file_path}. Skipping.")
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error: Could not decode JSON from {file_path}. Check for syntax errors. Skipping.")
            continue
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}. Skipping.")
            continue

        if not isinstance(data, dict):
            print(f"Error processing {file_path}: expected a JSON object mapping categories to lists of Q&A entries. Skipping.")
            continue

        # Collect per file so the summary line reports this file's own count
        # rather than the running total across all files.
        file_entries = []
        skipped = 0
        for category, items in data.items():
            if not isinstance(items, list):
                skipped += 1
                continue
            for item in items:
                # A plain string such as "question and answer" would pass the
                # `in` checks below and then fail on indexing, so require a dict.
                if not isinstance(item, dict) or 'question' not in item or 'answer' not in item:
                    skipped += 1
                    continue
                file_entries.append({
                    'question': item['question'],
                    'answer': item['answer'],
                    'id': item.get('id', str(uuid.uuid4())),
                    'type': item.get('type', 'N/A')
                })

        all_qa_data.extend(file_entries)
        if skipped:
            print(f"Warning: skipped {skipped} malformed or incomplete entries in {os.path.basename(file_path)}.")
        print(f"Successfully loaded {len(file_entries)} Q&A entries from {os.path.basename(file_path)}")

    return all_qa_data

In [4]:
# --- 2. Load and Prepare Data ---
print("\n--- Loading and preparing Port Scanning Q&A data ---")
qa_data = load_qa_data(PORT_SCAN_QA_FILE_PATHS)

# Nothing to train on: stop here instead of failing later in the split.
if len(qa_data) == 0:
    raise ValueError("No Port Scanning Q&A data loaded. Please check your file paths and content.")

# Hold out 10% of the pairs for validation; the fixed random_state keeps the split stable.
train_qa_data, val_qa_data = train_test_split(qa_data, test_size=0.1, random_state=42)

# Training side: every (question, answer) pair becomes one positive InputExample.
train_examples = [
    InputExample(texts=[pair['question'], pair['answer']])
    for pair in train_qa_data
]
print(f"Prepared {len(train_examples)} training examples for further fine-tuning.")

train_dataloader = DataLoader(train_examples, batch_size=BATCH_SIZE, shuffle=True)

# Validation side: the three mappings consumed by InformationRetrievalEvaluator.
#   eval_queries       : query id  -> question text
#   eval_corpus        : corpus id -> answer text
#   eval_relevant_docs : query id  -> set of corpus ids holding the correct answer
eval_queries = {}
eval_corpus = {}
eval_relevant_docs = {}

# Identical answer texts share a single corpus id ("ans_0", "ans_1", ...).
answer_id_map = {}

for item in val_qa_data:
    answer_text = item['answer']
    # The default id is only stored when the answer has not been seen before.
    corpus_id = answer_id_map.setdefault(answer_text, f"ans_{len(answer_id_map)}")

    eval_queries[item['id']] = item['question']
    eval_corpus[corpus_id] = answer_text
    eval_relevant_docs.setdefault(item['id'], set()).add(corpus_id)

# Number of distinct answers that received a corpus id.
answer_id_counter = len(answer_id_map)

print(f"Prepared {len(eval_queries)} validation queries and {len(eval_corpus)} validation corpus entries.")


--- Loading and preparing Port Scanning Q&A data ---
Successfully loaded 310 Q&A entries from port_scanning.json
Prepared 279 training examples for further fine-tuning.
Prepared 31 validation queries and 31 validation corpus entries.


In [5]:
# --- 3. Load Previously Fine-tuned Model for Further Fine-tuning ---
print(f"\n--- Loading previously fine-tuned model from: {PREVIOUSLY_FINE_TUNED_MODEL_PATH} ---")
try:
    # Load the already fine-tuned model from the first training run
    model = SentenceTransformer(PREVIOUSLY_FINE_TUNED_MODEL_PATH)
except Exception as e:
    print(f"ERROR: Could not load model from {PREVIOUSLY_FINE_TUNED_MODEL_PATH}: {e}")
    print("Please ensure the path is correct and the directory contains a valid SentenceTransformer model.")
    # Raise instead of calling exit(): exit() is meant for the interactive shell and,
    # inside a notebook, tears down the kernel rather than reporting a failed cell.
    # Chaining with `from e` keeps the original traceback visible.
    raise RuntimeError(
        f"FATAL: Cannot proceed without a loaded model from {PREVIOUSLY_FINE_TUNED_MODEL_PATH}."
    ) from e
print("Previously fine-tuned model loaded successfully.")

# Move model to the detected device (GPU or CPU)
model.to(DEVICE)
print(f"Model moved to {DEVICE}.")


--- Loading previously fine-tuned model from: D:\OWASP_BERT\fine_tuned_owasp_model_advanced ---
Previously fine-tuned model loaded successfully.
Model moved to cuda.


In [6]:
# --- 4. Define Loss Function ---
# The training pairs are (question, answer) positives, which is the input format
# MultipleNegativesRankingLoss is used with here. The loss is bound to `model`,
# i.e. the model that was just loaded for further fine-tuning.
train_loss = losses.MultipleNegativesRankingLoss(model)
print("Loss function defined.")


Loss function defined.


In [7]:
# --- 5. Define Evaluator ---
# Information-retrieval evaluator over the held-out validation pairs: each
# validation question is a query, and the validation answers form the corpus.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    name='port_scan_validation',          # label used for this domain in the evaluation log
    queries=eval_queries,
    corpus=eval_corpus,
    relevant_docs=eval_relevant_docs,
    corpus_chunk_size=CORPUS_CHUNK_SIZE,  # corpus entries processed per chunk
    show_progress_bar=True,
)
print("InformationRetrievalEvaluator defined for port scan validation.")

InformationRetrievalEvaluator defined for port scan validation.


In [8]:
# --- 6. Train the Model ---
print(f"\n--- Starting further fine-tuning for {NUM_EPOCHS} epochs ---")

# Number of optimizer steps (batches) in one epoch.
steps_per_epoch = len(train_dataloader)

# Warmup covers WARMUP_STEPS_RATIO of ALL training steps (all epochs).
warmup_steps = int(steps_per_epoch * NUM_EPOCHS * WARMUP_STEPS_RATIO)

# Evaluate every EVALUATION_STEPS_RATIO of an epoch, but at least every step
# so that very small datasets never end up with an interval of 0.
evaluation_steps = max(1, int(steps_per_epoch * EVALUATION_STEPS_RATIO))

# Checkpoint CHECKPOINT_SAVE_LIMIT times per epoch (every step when an epoch is shorter than that).
checkpoint_save_steps = max(1, steps_per_epoch // CHECKPOINT_SAVE_LIMIT)

# Intermediate checkpoints go to a sub-directory so they do not mix with the
# files of the final model saved in NEW_FINE_TUNED_MODEL_SAVE_PATH.
checkpoint_dir = os.path.join(NEW_FINE_TUNED_MODEL_SAVE_PATH, 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)  # also creates the output directory itself

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=ir_evaluator,
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    output_path=NEW_FINE_TUNED_MODEL_SAVE_PATH,
    save_best_model=True,  # Keep the model that scores best on the evaluator
    optimizer_params={'lr': LEARNING_RATE},
    # Mixed precision is only requested on CUDA: DEVICE can fall back to 'cpu',
    # where asking for AMP is at best a no-op and may be rejected outright.
    use_amp=(DEVICE == 'cuda'),
    checkpoint_path=checkpoint_dir,
    checkpoint_save_steps=checkpoint_save_steps,
    checkpoint_save_total_limit=CHECKPOINT_SAVE_LIMIT,  # Keep only the last few checkpoints
    evaluation_steps=evaluation_steps,  # How often to run evaluation
    show_progress_bar=True,
)

print("\n--- Further fine-tuning complete! ---")
print(f"Best fine-tuned model saved to: {NEW_FINE_TUNED_MODEL_SAVE_PATH}")


--- Starting further fine-tuning for 10 epochs ---


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Port Scan Validation Cosine Accuracy@1,Port Scan Validation Cosine Accuracy@3,Port Scan Validation Cosine Accuracy@5,Port Scan Validation Cosine Accuracy@10,Port Scan Validation Cosine Precision@1,Port Scan Validation Cosine Precision@3,Port Scan Validation Cosine Precision@5,Port Scan Validation Cosine Precision@10,Port Scan Validation Cosine Recall@1,Port Scan Validation Cosine Recall@3,Port Scan Validation Cosine Recall@5,Port Scan Validation Cosine Recall@10,Port Scan Validation Cosine Ndcg@10,Port Scan Validation Cosine Mrr@10,Port Scan Validation Cosine Map@100
9,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
18,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
27,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
36,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
45,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
54,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
63,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
72,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
81,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871
90,No log,No log,0.967742,1.0,1.0,1.0,0.967742,0.333333,0.2,0.1,0.967742,1.0,1.0,1.0,0.988095,0.983871,0.983871


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.80s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  2.00s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.80s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.01s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:01<00:00,  1.82s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# --- 7. Verify the Newly Fine-tuned Model ---
print(f"\n--- Verifying the newly fine-tuned model from: {NEW_FINE_TUNED_MODEL_SAVE_PATH} ---")
try:
    # Reload from disk so the check exercises what was actually saved.
    final_fine_tuned_model = SentenceTransformer(NEW_FINE_TUNED_MODEL_SAVE_PATH)
    print("Newly fine-tuned model loaded successfully for verification.")

    # Probe texts: two port-scanning Q/A pairs (q1/a1, q2/a2) plus one pair
    # from a different domain (q3/a3) that serves as the less relevant contrast.
    probe_texts = {
        'q1': "What is the status of port 22 on 192.168.1.5?",
        'a1': "Port 22 (SSH) is open on 192.168.1.5. It is typically used for secure remote access.",
        'q2': "Explain TCP SYN scan.",
        'a2': "A TCP SYN scan sends a SYN packet and waits for a SYN-ACK. If received, it sends an RST, preventing a full connection, indicating an open port.",
        'q3': "What is Cross-Site Scripting?",
        'a3': "Cross-Site Scripting (XSS) is a vulnerability where attackers inject malicious client-side scripts into web pages viewed by other users.",
    }

    # One encode call per text, in insertion order (q1, a1, q2, a2, q3, a3).
    probe_embeddings = {
        key: final_fine_tuned_model.encode(text, convert_to_tensor=True)
        for key, text in probe_texts.items()
    }

    # (left key, right key, report line) for every similarity that is reported.
    comparisons = [
        ('q1', 'a1', "Similarity (Port Scan Q1, Port Scan A1): {:.4f} (Expected High)"),
        ('q2', 'a2', "Similarity (Port Scan Q2, Port Scan A2): {:.4f} (Expected High)"),
        ('q1', 'q3', "Similarity (Port Scan Q1, XSS Q3): {:.4f} (Expected Lower, but still related to security)"),
        ('q1', 'a3', "Similarity (Port Scan Q1, XSS A3): {:.4f} (Expected Low)"),
    ]

    # Compute every score before printing anything, so a failure never leaves a partial report.
    scores = [
        util.cos_sim(probe_embeddings[left], probe_embeddings[right]).item()
        for left, right, _ in comparisons
    ]

    print("\nVerification Results:")
    for (_, _, report_line), score in zip(comparisons, scores):
        print(report_line.format(score))

    print("\nIf fine-tuning was successful, you should observe higher similarity scores for questions and answers within the Port Scanning domain.")
    print("The model should now be better at understanding and retrieving information specifically related to port scanning.")

except Exception as e:
    # Best-effort check: report the problem instead of failing the cell.
    print(f"An error occurred during final model loading or testing: {e}")
    print("Please check the output path and ensure the model was saved correctly.")
