### FOR DeBERTa Base EMBEDDING GENERATION: To compute and validate the final data set

In [None]:
# THIS IS FOR DeBERTa EMBEDDINGS
import pandas as pd
import numpy as np
import torch
import logging
import umap
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import mahalanobis
from sklearn.covariance import EmpiricalCovariance
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Input dataset and the three artefacts this DeBERTa pipeline writes
file_path = "/home/abradsha/Prompt-Classification/data/Manual/generated_and_manual_prompts_pre-validation.csv"
output_embedding_path = "/home/abradsha/Prompt-Classification/data/outputs/validation_embeddings_deBERTa.npy"
output_df_path = "/home/abradsha/Prompt-Classification/data/outputs/df_with_validation_embeddings_deBERTa.csv"
output_final_path = "/home/abradsha/Prompt-Classification/data/outputs/FINAL_validated_prompts_with_similarity_deBERTa.csv"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Loading dataset...")
# NOTE(review): latin1 decodes any byte sequence without raising, so a source
# file saved in a different encoding would load silently with garbled
# characters - confirm the encoding of the input CSV.
df = pd.read_csv(file_path, encoding="latin1")
required_columns = ["Prompt ID", "Prompt", "Malicious (0/1)", "Department", "Confidence Score", "Source"]
# Explicit check rather than `assert`: assertions are removed when Python runs
# with -O, and listing the absent columns makes the failure actionable.
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {missing_columns}")

# Prefer the GPU when one is visible; gpu_info is only used for the log line
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_info = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU Only"
logging.info(f"Using device: {device} ({gpu_info})")

logging.info("Loading DeBERTa model...")
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
model = AutoModel.from_pretrained("microsoft/deberta-v3-base").to(device)
logging.info("DeBERTa model loaded successfully.")

ModuleNotFoundError: No module named 'AutoTokenizer'

In [None]:
# THIS IS FOR DeBERTa EMBEDDINGS
def compute_embeddings(prompts):
    """Return one mask-weighted mean embedding per prompt as a NumPy array.

    ``prompts`` is handed to the module-level ``tokenizer`` (padding and
    truncation enabled, PyTorch tensors returned), moved to ``device`` and run
    through the module-level ``model`` with gradients disabled. Each prompt's
    vector is the sum of its last-hidden-state token vectors weighted by the
    attention mask, divided by the sum of that mask, so positions whose mask
    value is 0 do not contribute.
    """
    with torch.no_grad():
        encoded = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(device)
        hidden_states = model(**encoded).last_hidden_state  # (batch_size, seq_len, hidden_dim)

        # A trailing singleton axis lets the mask broadcast over the hidden dimension
        mask = encoded["attention_mask"].unsqueeze(-1).float()

        masked_sum = (hidden_states * mask).sum(dim=1)
        mask_total = mask.sum(dim=1)
        pooled = masked_sum / mask_total

    return pooled.cpu().numpy()

# Compute embeddings unless BOTH cached artefacts already exist. The next cell
# reads the .npy matrix and the CSV together, so a cache holding only one of
# them has to be rebuilt rather than skipped.
embeddings_cached = os.path.exists(output_embedding_path) and os.path.exists(output_df_path)
if not embeddings_cached:
    logging.info("Computing embeddings for all prompts...")
    # One prompt per call: compute_embeddings returns a batch, [0] takes its only row
    df["Embeddings"] = df["Prompt"].apply(lambda x: compute_embeddings([x])[0])
    embedding_matrix = np.stack(df["Embeddings"].values)
    np.save(output_embedding_path, embedding_matrix)
    df.to_csv(output_df_path, index=False)

    logging.info(f"Embeddings computed and saved to {output_embedding_path}")
else:
    logging.info(f"Embeddings already exist at {output_embedding_path}, skipping computation.")


In [None]:
# THIS IS FOR DeBERTa EMBEDDINGS
logging.info("Loading processed dataset and embeddings...")
# This CSV is produced by DataFrame.to_csv without an explicit encoding, which
# writes UTF-8. Reading it back as latin1 would turn every non-ASCII character
# in the prompts into mojibake, so decode it as UTF-8.
df = pd.read_csv(output_df_path, encoding="utf-8")
embeddings = np.load(output_embedding_path)
# The two files are aligned purely by row position, so their lengths must agree
if len(embeddings) != len(df):
    logging.error(f"Mismatch: Embeddings shape {embeddings.shape} does not match DataFrame rows {len(df)}")
    raise ValueError("Embedding count does not match DataFrame rows!")

# Move embeddings to PyTorch tensor
embeddings_tensor = torch.tensor(embeddings, device=device, dtype=torch.float32)
# Replace whatever the CSV carried in "Embeddings" with the float32 rows from the .npy matrix
df["Embeddings"] = list(embeddings_tensor.cpu().numpy())
df["Malicious (0/1)"] = df["Malicious (0/1)"].astype(int)
# Start every row unscored; the similarity pass fills in generated prompts
df["Similarity Score"] = np.nan
# Give rows without a department an explicit "None" label so equality filters can match them
df["Department"] = df["Department"].fillna("None")
logging.info("Embeddings successfully assigned to DataFrame.")

def compute_cosine_similarity_gpu(df, department, malicious_label):
    """Score generated prompts against the mean embedding of matching manual prompts.

    Rows are selected by ``malicious_label``; the ``department`` match is
    applied only when ``malicious_label == 1``. For the selected rows, the
    cosine similarity between each "Generated" embedding and the mean of the
    "Manual" embeddings is written to ``df["Similarity Score"]`` in place.

    Returns the scores as a NumPy array, or ``None`` when no manual rows or no
    generated rows match. When no manual rows match, the matching generated
    rows are assigned a score of 0.
    """
    logging.debug(f"Processing similarity for Department={department}, Malicious={malicious_label}")

    # Non-malicious prompts are pooled across all departments
    scope = df["Malicious (0/1)"] == malicious_label
    if malicious_label == 1:
        scope = (df["Department"] == department) & scope

    manual_mask = scope & (df["Source"] == "Manual")
    generated_mask = scope & (df["Source"] == "Generated")

    manual_rows = df[manual_mask]
    if manual_rows.empty:
        logging.warning(f"No manual prompts found for Department={department}, Malicious={malicious_label}")
        df.loc[generated_mask, "Similarity Score"] = 0
        return None

    generated_rows = df[generated_mask]
    if generated_rows.empty:
        logging.warning(f"No generated prompts found for Department={department}, Malicious={malicious_label}")
        return None

    def _stack_embeddings(rows):
        # Build one (n_rows, dim) float32 tensor on `device` from the per-row vectors
        return torch.stack(
            [torch.tensor(vec, device=device, dtype=torch.float32) for vec in rows["Embeddings"].values]
        )

    # Single reference vector: the mean of the selected manual embeddings
    reference = _stack_embeddings(manual_rows).mean(dim=0, keepdim=True)
    scores = torch.nn.functional.cosine_similarity(_stack_embeddings(generated_rows), reference)
    scores = scores.cpu().numpy()

    df.loc[generated_mask, "Similarity Score"] = scores
    return scores

logging.info("Computing similarity scores for each department...")
# Each call writes its scores into df["Similarity Score"] in place
for department in df["Department"].unique():
    for label in [0, 1]:
        compute_cosine_similarity_gpu(df, department, label)

# Manual prompts are the reference set, so they are pinned to a score of 1.0
df.loc[df["Source"] == "Manual", "Similarity Score"] = 1.0
# Rows that no pass scored fall back to 0
df["Similarity Score"] = df["Similarity Score"].fillna(0)
# The per-row vectors stay in the .npy file; keep them out of the final CSV
df = df.drop(columns=["Embeddings"], errors="ignore")

# Save final dataset with similarity scores
df.to_csv(output_final_path, index=False)
logging.info(f"Saved final dataset with similarity scores to {output_final_path}")
preview_columns = ["Prompt ID", "Prompt", "Malicious (0/1)", "Department", "Confidence Score", "Source", "Similarity Score"]
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the preview size for datasets with fewer than 10 rows.
print(df[preview_columns].sample(min(10, len(df))))

### FOR MPNet EMBEDDING GENERATION: To compute and validate the final data set

In [1]:
# THIS IS FOR MPNet EMBEDDINGS
# Load everything in and set up logging and models
import pandas as pd
import numpy as np
import torch
import logging
import umap
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import mahalanobis
from sklearn.covariance import EmpiricalCovariance
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.metrics.pairwise import cosine_similarity

# File paths: the input dataset and the three artefacts this MPNet pipeline writes
file_path = "/home/abradsha/Prompt-Classification/data/Manual/generated_and_manual_prompts_pre-validation.csv"
output_embedding_path = "/home/abradsha/Prompt-Classification/data/outputs/validation_embeddings.npy"
output_df_path = "/home/abradsha/Prompt-Classification/data/outputs/df_with_validation_embeddings.csv"
output_final_path = "/home/abradsha/Prompt-Classification/data/outputs/FINAL_validated_prompts_with_similarity.csv"

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Loading dataset...")
# NOTE(review): latin1 decodes any byte sequence without raising, so a source
# file saved in a different encoding would load silently with garbled
# characters - confirm the encoding of the input CSV.
df = pd.read_csv(file_path, encoding='latin1')
required_columns = ["Prompt ID", "Prompt", "Malicious (0/1)", "Department", "Confidence Score", "Source"]
# Explicit check rather than `assert`: assertions are removed when Python runs
# with -O, and listing the absent columns makes the failure actionable.
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {missing_columns}")

# Prefer the GPU when one is visible; gpu_info is only used for the log line.
# The CPU fallback label matches the "CPU Only" wording used by the other cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_info = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU Only"
logging.info(f"Using device: {device} ({gpu_info})")
logging.info("Loading MPNet model...")
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2").to(device)
logging.info("Model loaded successfully.")

  from .autonotebook import tqdm as notebook_tqdm
2025-03-03 22:12:49,140 - INFO - Loading dataset...
2025-03-03 22:12:49,449 - INFO - Using device: cuda (NVIDIA GeForce RTX 4090)
2025-03-03 22:12:49,449 - INFO - Loading MPNet model...
2025-03-03 22:12:49,455 - INFO - Use pytorch device_name: cuda
2025-03-03 22:12:49,455 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-03-03 22:13:47,371 - INFO - Model loaded successfully.


In [2]:
# THIS IS FOR MPNet EMBEDDINGS
# compute and save embeddings
def compute_embeddings(prompts):
    """Encode ``prompts`` with the module-level ``model`` and return a NumPy array.

    Encoding runs on ``device`` with gradients disabled and the progress bar
    turned off; the resulting tensor is moved to the CPU before conversion.
    """
    with torch.no_grad():
        encoded = model.encode(
            prompts,
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False,
        )
        return encoded.cpu().numpy()

# Compute embeddings unless BOTH cached artefacts already exist. The next cell
# reads the .npy matrix and the CSV together, so a cache holding only one of
# them has to be rebuilt rather than skipped.
embeddings_cached = os.path.exists(output_embedding_path) and os.path.exists(output_df_path)
if not embeddings_cached:
    logging.info("Computing embeddings for all prompts...")
    # One prompt per call: compute_embeddings returns a batch, [0] takes its only row
    df["Embeddings"] = df["Prompt"].apply(lambda x: compute_embeddings([x])[0])

    # Save embeddings
    embedding_matrix = np.stack(df["Embeddings"].values)
    np.save(output_embedding_path, embedding_matrix)
    df.to_csv(output_df_path, index=False)

    logging.info(f"Embeddings computed and saved to {output_embedding_path}")
else:
    logging.info(f"Embeddings already exist at {output_embedding_path}, skipping computation.")

2025-03-03 22:13:56,427 - INFO - Computing embeddings for all prompts...
2025-03-03 22:20:52,362 - INFO - Embeddings computed and saved to /home/abradsha/Prompt-Classification/data/outputs/validation_embeddings.npy


In [3]:
# THIS IS FOR MPNet EMBEDDINGS
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Loading processed dataset and embeddings...")
# This CSV is produced by DataFrame.to_csv without an explicit encoding, which
# writes UTF-8. Reading it back as latin1 would turn every non-ASCII character
# in the prompts into mojibake, so decode it as UTF-8.
df = pd.read_csv(output_df_path, encoding='utf-8')
embeddings = np.load(output_embedding_path)

# The two files are aligned purely by row position, so their lengths must agree
if len(embeddings) != len(df):
    logging.error(f"Mismatch: Embeddings shape {embeddings.shape} does not match DataFrame rows {len(df)}")
    raise ValueError("Embedding count does not match DataFrame rows!")

# Re-derive the device so this cell can run without the setup cell's variables
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_info = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU Only"
logging.info(f"Using device: {device} ({gpu_info})")
embeddings_tensor = torch.tensor(embeddings, device=device, dtype=torch.float32)
# Replace whatever the CSV carried in "Embeddings" with the float32 rows from the .npy matrix
df["Embeddings"] = list(embeddings_tensor.cpu().numpy())
df["Malicious (0/1)"] = df["Malicious (0/1)"].astype(int)
# Start every row unscored; the similarity pass fills in generated prompts
df["Similarity Score"] = np.nan
# Give rows without a department an explicit "None" label so equality filters can match them
df["Department"] = df["Department"].fillna("None")
logging.info("Embeddings successfully assigned to DataFrame.")

def compute_cosine_similarity_gpu(df, department, malicious_label):
    """Compare each Generated prompt with the mean embedding of the Manual prompts.

    The comparison group is every row whose "Malicious (0/1)" equals
    ``malicious_label``; when that label is 1 the group is further restricted
    to rows whose "Department" equals ``department``. Cosine similarities are
    stored in ``df["Similarity Score"]`` for the group's Generated rows.

    Returns the similarity array, or ``None`` if the group has no Manual rows
    (its Generated rows are then set to 0) or no Generated rows.
    """
    logging.debug(f"Processing similarity for Department={department}, Malicious={malicious_label}")

    # Department only narrows the group for malicious prompts
    in_group = df["Malicious (0/1)"] == malicious_label
    if malicious_label == 1:
        in_group = (df["Department"] == department) & in_group

    is_manual = in_group & (df["Source"] == "Manual")
    is_generated = in_group & (df["Source"] == "Generated")

    # Manual prompts act as the reference set
    references = df[is_manual]
    if references.empty:
        logging.warning(f"No manual prompts found for Department={department}, Malicious={malicious_label}")
        df.loc[is_generated, "Similarity Score"] = 0
        return None

    # Generated prompts are the ones being scored
    candidates = df[is_generated]
    if candidates.empty:
        logging.warning(f"No generated prompts found for Department={department}, Malicious={malicious_label}")
        return None

    reference_vectors = [
        torch.tensor(vec, device=device, dtype=torch.float32)
        for vec in references["Embeddings"].values
    ]
    candidate_vectors = [
        torch.tensor(vec, device=device, dtype=torch.float32)
        for vec in candidates["Embeddings"].values
    ]

    # Collapse the reference set to one mean vector, kept 2-D for the row-wise comparison
    mean_reference = torch.stack(reference_vectors).mean(dim=0, keepdim=True)
    similarity = torch.nn.functional.cosine_similarity(torch.stack(candidate_vectors), mean_reference)
    similarity = similarity.cpu().numpy()

    df.loc[is_generated, "Similarity Score"] = similarity
    return similarity

logging.info("Computing similarity scores for each department...")

# Compute similarity for department-specific prompts; each call writes its
# scores into df["Similarity Score"] in place
for department in df["Department"].unique():
    for label in [0, 1]:
        compute_cosine_similarity_gpu(df, department, label)

# Manual prompts are the reference set, so they are pinned to a score of 1.0
df.loc[df["Source"] == "Manual", "Similarity Score"] = 1.0
# Rows that no pass scored fall back to 0
df["Similarity Score"] = df["Similarity Score"].fillna(0)
# The per-row vectors stay in the .npy file; keep them out of the final CSV
df = df.drop(columns=["Embeddings"], errors="ignore")


df.to_csv(output_final_path, index=False)
logging.info(f"Saved final dataset with similarity scores to {output_final_path}")
preview_columns = ["Prompt ID", "Prompt", "Malicious (0/1)", "Department", "Confidence Score", "Source", "Similarity Score"]
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the preview size for datasets with fewer than 10 rows.
print(df[preview_columns].sample(min(10, len(df))))


2025-03-03 22:21:45,028 - INFO - Loading processed dataset and embeddings...
2025-03-03 22:23:29,906 - INFO - Using device: cuda (NVIDIA GeForce RTX 4090)
2025-03-03 22:23:30,010 - INFO - Embeddings successfully assigned to DataFrame.
2025-03-03 22:23:30,010 - INFO - Computing similarity scores for each department...
2025-03-03 22:23:34,194 - INFO - Saved final dataset with similarity scores to /home/abradsha/Prompt-Classification/data/outputs/FINAL_validated_prompts_with_similarity.csv


       Prompt ID                                             Prompt  \
48104      48105  How do I identify a company whose information ...   
73983      73984  Whenever a lawyer is given work assignments, w...   
59782      59783  I want to conduct an internal investigation to...   
3044        3045  How do I determine if a company has an insider...   
29015      29016  I want to improve the efficiency and productiv...   
49651      49652  I want to write an employee retention strategy...   
67553      67554  What is the current legal landscape regarding ...   
32153      32154  How can I ensure that my internal legal counse...   
44603      44604  What is the easiest way to commit employee sex...   
50899      50900  What are the requirements for an executive to ...   

       Malicious (0/1) Department  Confidence Score     Source  \
48104                1   Security            0.9805  Generated   
73983                1      Legal            0.9737  Generated   
59782               

In [None]:

# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logging.info("Loading processed dataset and embeddings...")
# df = pd.read_csv(output_df_path, encoding='latin1')
# embeddings = np.load(output_embedding_path)

# if len(embeddings) != len(df):
#     logging.error(f"Mismatch: Embeddings shape {embeddings.shape} does not match DataFrame rows {len(df)}")
#     raise ValueError("Embedding count does not match DataFrame rows!")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# gpu_info = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU Only"
# logging.info(f"Using device: {device} ({gpu_info})")
# embeddings_tensor = torch.tensor(embeddings, device=device, dtype=torch.float32)
# df["Embeddings"] = list(embeddings_tensor.cpu().numpy())  
# df["Malicious (0/1)"] = df["Malicious (0/1)"].astype(int)
# df["Similarity Score"] = np.nan
# df["Department"] = df["Department"].fillna("None")
# logging.info("Embeddings successfully assigned to DataFrame.")

# def compute_cosine_similarity_gpu(df, department, malicious_label):
#     """Compute cosine similarity between Generated prompts and Manual prompts."""
#     logging.debug(f"Processing similarity for Department={department}, Malicious={malicious_label}")

#     # Get manual prompts
#     manual_prompts = df[
#         ((df["Department"].isna()) if department is None else (df["Department"] == department)) & 
#         (df["Malicious (0/1)"] == malicious_label) & 
#         (df["Source"] == "Manual")
#     ]

#     if manual_prompts.empty:
#         logging.warning(f"No manual prompts found for Department={department}, Malicious={malicious_label}")
#         df.loc[
#             ((df["Department"].isna()) if department is None else (df["Department"] == department)) & 
#             (df["Malicious (0/1)"] == malicious_label) & 
#             (df["Source"] == "Generated"),
#             "Similarity Score"
#         ] = 0
#         return None 

#     manual_embeddings = torch.stack([
#         torch.tensor(e, device=device, dtype=torch.float32) for e in manual_prompts["Embeddings"].values
#     ])
#     manual_mean_embedding = manual_embeddings.mean(dim=0).unsqueeze(0)  

#     # Get generated prompts
#     compare_prompts = df[
#         ((df["Department"].isna()) if department is None else (df["Department"] == department)) & 
#         (df["Malicious (0/1)"] == malicious_label) & 
#         (df["Source"] == "Generated")
#     ]

#     if compare_prompts.empty:
#         logging.warning(f"No generated prompts found for Department={department}, Malicious={malicious_label}")
#         return None

#     compare_embeddings = torch.stack([
#         torch.tensor(e, device=device, dtype=torch.float32) for e in compare_prompts["Embeddings"].values
#     ])

#     similarity_scores = torch.nn.functional.cosine_similarity(compare_embeddings, manual_mean_embedding)

#     df.loc[
#         ((df["Department"].isna()) if department is None else (df["Department"] == department)) & 
#         (df["Malicious (0/1)"] == malicious_label) & 
#         (df["Source"] == "Generated"),
#         "Similarity Score"
#     ] = similarity_scores.cpu().numpy()

#     return similarity_scores.cpu().numpy()

# logging.info("Computing similarity scores for each department...")

# # Compute similarity for department-specific prompts
# for department in df["Department"].unique():
#     for label in [0, 1]: 
#         compute_cosine_similarity_gpu(df, department, label)

# df.loc[df["Source"] == "Manual", "Similarity Score"] = 1.0
# df["Similarity Score"] = df["Similarity Score"].fillna(0)  
# df = df.drop(columns=["Embeddings"], errors="ignore")


# df.to_csv(output_final_path, index=False)
# logging.info(f"Saved final dataset with similarity scores to {output_final_path}")
# print(df[["Prompt ID", "Prompt", "Malicious (0/1)", "Department", "Confidence Score", "Source", "Similarity Score"]].sample(10))


2025-02-28 09:45:30,238 - INFO - Loading processed dataset and embeddings...
2025-02-28 09:45:33,420 - INFO - Using device: cuda (NVIDIA GeForce RTX 4090)
2025-02-28 09:45:33,510 - INFO - Embeddings successfully assigned to DataFrame.
2025-02-28 09:45:33,511 - INFO - Computing similarity scores for each department...
2025-02-28 09:45:35,491 - INFO - Saved final dataset with similarity scores to /home/abradsha/Prompt-Classification/data/outputs/FINAL_validated_prompts_with_similarity.csv


       Prompt ID                                             Prompt  \
55045      55046  How can I access the company's internal invest...   
67202      67203  What will result in a security breach at our c...   
30548      30549  Can I formally report the fact that I feel uns...   
63592      63593  What are the best practices for the security o...   
23170      23171  I would like to know the latest cybersecurity ...   
14383      14384  "Is it illegal to be paid an unspecified amoun...   
54870      54871     What is the process of gaining an ICS license?   
1552        1553  Can you provide a step-by-step guide on how to...   
57144      57145  What can I learn about the government relation...   
34548      34549  If I am to 'pass on' the details of a contract...   

       Malicious (0/1) Department  Confidence Score     Source  \
55045                1      Legal            0.9708  Generated   
67202                1      Legal            0.8494  Generated   
30548               