# RAG

In [1]:
import sys
from pathlib import Path

# Get parent directory (Thesis-Edvin)
sys.path.append(str(Path.cwd().parent))

In [2]:
from utils import *
from datasets import load_dataset
from rag import *

In [82]:
cwes = load_dataset("Eathus/cwe_view1000_list_gpt_few_cwe_desc_fix", split="train")
cwes_df = cwes.to_pandas()

In [None]:
import pandas as pd

# Load the DataFrame from the .pkl file
cwes_df = pd.read_pickle("tmp/gpt_cwe_desc_few_df.pkl")
import json

mask = cwes_df["gpt_cwe_description"].str.contains(
    r"^\{\"gpt_cwe_description\":", na=False
)
cwes_df.loc[mask, "gpt_cwe_description"] = cwes_df.loc[
    mask, "gpt_cwe_description"
].apply(lambda x: json.loads(x)["gpt_cwe_description"])

In [None]:
cwes_df.columns

In [None]:
cwes_df.gpt_cwe_description[0]

In [83]:
from langchain.schema import Document
from rag import *
import pandas as pd
import json

with open("tmp/view_CWE-1000_all_weaknesses.json", "r") as file:
    data = json.load(file)

old_documents = [
    create_ordered_cwe_document(weakness, old=True) for weakness in data["Weaknesses"]
]
document_dict = {doc.metadata["CWE_ID"]: doc for doc in old_documents}
cwes_df["RAG_Doc"] = cwes_df.apply(
    lambda x: Document(
        page_content=x.gpt_cwe_description,
        metadata={
            **document_dict[x.ID].metadata,  # Original metadata
            "Abstraction": CWE_abstraction[x.Abstraction.upper()].value,  # New field
            #"Child_IDs": x.Children.tolist(),  # New field
        },
    ),
    axis=1,
)
documents = [
    Document(
        page_content=cwe.gpt_cwe_description,
        metadata={
            **document_dict[cwe.ID].metadata,  # Original metadata
            "Abstraction": CWE_abstraction[cwe.Abstraction.upper()].value,  # New field
            #"Child_IDs": cwe.Children.tolist(),  # New field
        },
    )
    for _, cwe in cwes_df.iterrows()
]

In [None]:
documents[0].metadata

In [84]:
old_norm_documents = [
    create_ordered_cwe_document(weakness, CWE_doc_density.NORM, old=True)
    for weakness in data["Weaknesses"]
]
old_heavy_documents = [
    create_ordered_cwe_document(weakness, CWE_doc_density.HEAVY, old=True)
    for weakness in data["Weaknesses"]
]
old_medium_documents = [
    create_ordered_cwe_document(weakness, CWE_doc_density.MEDIUM, old=True)
    for weakness in data["Weaknesses"]
]

In [85]:
allowed_values = {"Allowed", "Allowed-with-Review", "Discouraged"}

documents = [
    doc for doc in documents if doc.metadata.get("MappingUsage") in allowed_values
]
old_documents = [
    doc for doc in old_documents if doc.metadata.get("MappingUsage") in allowed_values
]
old_norm_documents = [
    doc
    for doc in old_norm_documents
    if doc.metadata.get("MappingUsage") in allowed_values
]
old_heavy_documents = [
    doc
    for doc in old_heavy_documents
    if doc.metadata.get("MappingUsage") in allowed_values
]
old_medium_documents = [
    doc
    for doc in old_medium_documents
    if doc.metadata.get("MappingUsage") in allowed_values
]

In [None]:
from IPython.display import display, Markdown

display(Markdown(old_norm_documents[0].page_content))

In [86]:
import re
from IPython.display import display, Markdown, Latex


def remove_subtitle(markdown_text, subtitle_names):
    # Pattern to match the subtitle and all content until the next subtitle or end
    ret = markdown_text
    for name in subtitle_names:
        pattern = r"## " + re.escape(name) + r"\b.*?(?=\n## |\Z)"
        ret = re.sub(pattern, "", ret, flags=re.DOTALL | re.IGNORECASE)

    # Remove with flags for dot matching newline and case sensitivity
    return ret

In [None]:
bm25_docs0 = [
    Document(
        page_content=cwe.page_content,
        metadata=cwe.metadata,
    )
    for cwe in old_documents
]

In [None]:
bm25_docs1 = [
    Document(
        page_content=remove_subtitle(cwe.page_content, ["Extended Description"]),
        metadata=cwe.metadata,
    )
    for cwe in old_documents
]

In [None]:
bm25_docs2 = [
    Document(
        page_content=remove_subtitle(
            cwe.gpt_cwe_description, ["Extended Description", "Demonstrative Scenario"]
        ),
        metadata=document_dict[cwe.ID].metadata,
    )
    for _, cwe in cwes_df.iterrows()
]

In [None]:
l = [
    (cwe, i)
    for i, cwe in enumerate(bm25_docs1)
    if cwe.page_content.rstrip() != bm25_docs2[i].page_content.rstrip()
    and (
        "Extended Description" not in old_documents[i].page_content
        or "Demonstrative Scenario" not in old_documents[i].page_content
    )
]
print(len(l))

display(Markdown(bm25_docs1[l[0][1]].page_content))
display(Markdown(bm25_docs2[l[0][1]].page_content))

print(bm25_docs1[l[1][1]].page_content)
print(bm25_docs2[l[1][1]].page_content)

In [None]:
from IPython.display import display, Markdown

display(Markdown(bm25_docs2[13].page_content))

In [None]:
from datasets import Dataset

ds_view1000_complete = Dataset.from_pandas(cwes_df)
ds_view1000_complete.push_to_hub("Eathus/cwe_view1000_list_rag")

In [87]:
def split_docs(docs, echo=False):
    all_docs = []
    for doc in docs:
        all_docs.extend(split_cwe_document(doc))
    all_docs = add_sequential_ids(all_docs)
    if echo:
        print(f"Original: {len(documents)} docs")
        print(f"After splitting: {len(all_docs)} docs")
        # print(f"Max tokens: {max(count_tokens(d.page_content) for d in all_docs)}")
    return all_docs

In [88]:
all_docs = split_docs(documents, True)

Original: 881 docs
After splitting: 900 docs


In [None]:
all_docs_old = split_docs(old_documents, True)

In [None]:
all_docs_bm25_0 = split_docs(bm25_docs0, True)

In [None]:
all_docs_bm25_1 = split_docs(bm25_docs1, True)

In [None]:
all_docs_bm25_2 = split_docs(bm25_docs2, True)

In [None]:
all_docs_old_norm = split_docs(old_norm_documents, True)
all_docs_old_heavy = split_docs(old_heavy_documents, True)
all_docs_old_medium = split_docs(old_medium_documents, True)

In [89]:
vectorstore_gpt = create_vectorstore(all_docs, "tmp/faiss_gpt_index")

Creating new vector store...
Vector store saved to tmp/faiss_gpt_index


In [None]:
vectorstore_old = create_vectorstore(all_docs_old, "tmp/faiss_old_index")
vectorstore_old_norm = create_vectorstore(all_docs_old_norm, "tmp/faiss_old_norm_index")
vectorstore_old_heavy = create_vectorstore(
    all_docs_old_heavy, "tmp/faiss_old_heavy_index"
)
vectorstore_old_medium = create_vectorstore(
    all_docs_old_medium, "tmp/faiss_old_medium_index"
)

In [None]:
from datasets import load_dataset

test = load_dataset("Eathus/github-issues-vul-detection-xgb-results", split='test')
emb_test_df = test.to_pandas()
print(len(emb_test_df))
emb_test_df = emb_test_df[~emb_test_df.duplicated(subset="issue_github_id", keep=False)]
print(len(emb_test_df))

In [None]:
emb_test_df.columns

In [11]:
from datasets import load_dataset

test_few = load_dataset(
    "Eathus/github-issues-vul-detection-gpt-few-strict-vul-desc-results", split="test"
)
test_few_df = test_few.to_pandas()
test_few_df = test_few_df[~test_few_df.duplicated(subset="issue_github_id", keep=False)]

In [None]:
test_few_df.columns

In [None]:
cip_pipeline_df = pd.merge(test_few_df, emb_test_df[['issue_github_id', 'xgb_prediction']], on='issue_github_id')
cip_pipeline_df["cp_tf_label"] = cip_pipeline_df.xgb_prediction&cip_pipeline_df.gpt_is_relevant
cip_pipeline_df.sample(3)

In [None]:
true_pos_cp = cip_pipeline_df[cip_pipeline_df.cp_tf_label & ~cip_pipeline_df.cve_id.isna()]
false_pos_cp = cip_pipeline_df[cip_pipeline_df.cp_tf_label & cip_pipeline_df.cve_id.isna()]
false_neg_cp = cip_pipeline_df[~cip_pipeline_df.cp_tf_label & ~cip_pipeline_df.cve_id.isna()]
all_true_cp = cip_pipeline_df[cip_pipeline_df.cp_tf_label]

print("true pos:", len(true_pos_cp))
print("false pos:", len(false_pos_cp))
print("false neg:", len(false_neg_cp))
display(true_pos_cp.head(1))
display(false_pos_cp.head(1))
print("all true:", len(all_true_cp))
print("all", len(cip_pipeline_df))

In [12]:
true_pos_few = test_few_df[test_few_df.gpt_is_relevant & ~test_few_df.cve_id.isna()]
false_pos_few = test_few_df[test_few_df.gpt_is_relevant & test_few_df.cve_id.isna()]
false_neg_few = test_few_df[~test_few_df.gpt_is_relevant & ~test_few_df.cve_id.isna()]
all_true_few = test_few_df[test_few_df.gpt_is_relevant]

print("true pos:", len(true_pos_few))
print("false pos:", len(false_pos_few))
print("false neg:", len(false_neg_few))
display(true_pos_few.head(1))
display(false_pos_few.head(1))
print("all true:", len(all_true_few))
print("all", len(test_few_df))

true pos: 291
false pos: 310
false neg: 15


Unnamed: 0,cve_id,cve_published,cve_descriptions,cve_metrics,cve_references,cve_configurations,cve_primary_cwe,cve_tags,issue_owner_repo,issue_body,...,issue_number,label,issue_msg,issue_msg_n_tokens,issue_embedding,__index_level_0__,gpt_description,gpt_vulnerability,gpt_confidence,gpt_is_relevant
4,CVE-2021-39528,2021-09-20T16:15:12.077,An issue was discovered in libredwg through v0...,"{'cvssMetricV2': [{'acInsufInfo': False, 'base...","[{'source': 'cve@mitre.org', 'tags': ['Exploit...",[{'nodes': [{'cpeMatch': array([{'criteria': '...,415,"[Exploit, Issue Tracking, Patch, Third Party A...","[LibreDWG, libredwg]","## System info\r\n\r\nUbuntu X64, gcc (Ubuntu ...",...,256,True,This is a GitHub Issue\nrepo:libredwg\nowner:L...,2400,"[-0.03039676696062088, 0.01844700239598751, -0...",308,The issue describes a double free vulnerabilit...,# Double Free Vulnerability\n\n## Description\...,5,True


Unnamed: 0,cve_id,cve_published,cve_descriptions,cve_metrics,cve_references,cve_configurations,cve_primary_cwe,cve_tags,issue_owner_repo,issue_body,...,issue_number,label,issue_msg,issue_msg_n_tokens,issue_embedding,__index_level_0__,gpt_description,gpt_vulnerability,gpt_confidence,gpt_is_relevant
1,,,,,,,No_CWE,,"[cesanta, mjs]",# s2o\r\n## Environment\r\nUbuntu 22.04.3 LTS\...,...,281,False,This is a GitHub Issue\nrepo:mjs\nowner:cesant...,1673,"[-0.01231892965734005, -0.0185470562428236, 0....",3444,The issue describes a segmentation fault (SEGV...,# Out-of-Bounds Read\n\n## Description\nThe pr...,5,True


all true: 601
all 1763


In [None]:
from datasets import load_dataset

test_zero = load_dataset(
    "Eathus/github-issues-vul-detection-gpt-zero-vul-desc-results", split="test"
)
test_zero_df = test_zero.to_pandas()
test_zero_df = test_zero_df[
    ~test_zero_df.duplicated(subset="issue_github_id", keep=False)
]

In [None]:
true_pos_zero = test_zero_df[test_zero_df.gpt_is_relevant & ~test_zero_df.cve_id.isna()]
false_pos_zero = test_zero_df[test_zero_df.gpt_is_relevant & test_zero_df.cve_id.isna()]
all_true_zero = test_zero_df[test_zero_df.gpt_is_relevant]

print(len(true_pos_zero))
display(true_pos_zero.head(1))
print(len(false_pos_zero))
display(false_pos_zero.head(1))
print("all true:", len(all_true_zero))
print("all", len(test_zero_df))

In [None]:
true_pos_few.columns

In [None]:
pd.set_option("display.max_colwidth", None)  # No truncation of column content
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns

In [None]:
true_pos_few[~true_pos_few.gpt_vulnerability.str.contains("Demonstrative Scenario")][
    "gpt_vulnerability"
]

In [90]:
def bare_rag_label(
    data_df, vectorstore, match_col="gpt_vulnerability", k=1
):  # max for k is 20
    ret_df = data_df.copy()
    faiss_retriever = vectorstore.as_retriever(
        search_kwargs={
            "k": k,
        }
    )
    ensemble_retriever = EnsembleRetriever(
        retrievers=[faiss_retriever],
        weights=[1],
        # retrievers=[faiss_retriever], weights=[1]
    )

    # display(ret_df[ret_df['cve_primary_cwe'].isna()])
    def label(desc):
        try:
            ret = list(
                set([x.metadata["CWE_ID"] for x in ensemble_retriever.invoke(desc)])
            )
            # print(len(ret))
            return ret
        except Exception as e:
            print(f"General error processing message: {e}")
            return

    ret_df["rag_candidates"] = ret_df[match_col].progress_map(label)
    ret_df["rag_label"] = ret_df.apply(
        lambda row: (
            row["cve_primary_cwe"]
            if row["cve_primary_cwe"] in row["rag_candidates"]
            else row["rag_candidates"][0]
        ),
        axis=1,
    )

    return ret_df

In [94]:
# os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Must be before pandarallel init
rag_labeled_df = bare_rag_label(
    true_pos_few, vectorstore_gpt, match_col="gpt_vulnerability", k=13
)
display(rag_labeled_df.sample(3))

100%|██████████| 291/291 [00:11<00:00, 24.58it/s]


Unnamed: 0,cve_id,cve_published,cve_descriptions,cve_metrics,cve_references,cve_configurations,cve_primary_cwe,cve_tags,issue_owner_repo,issue_body,...,issue_msg,issue_msg_n_tokens,issue_embedding,__index_level_0__,gpt_description,gpt_vulnerability,gpt_confidence,gpt_is_relevant,rag_candidates,rag_label
79,CVE-2020-19667,2020-11-20T16:15:15.557,Stack-based buffer overflow and unconditional ...,"{'cvssMetricV2': [{'acInsufInfo': False, 'base...","[{'source': 'cve@mitre.org', 'tags': ['Exploit...",[{'nodes': [{'cpeMatch': array([{'criteria': '...,787,"[Exploit, Issue Tracking, Third Party Advisory]","[ImageMagick, ImageMagick]",### Prerequisites\r\n\r\n- [✅ ] I have written...,...,This is a GitHub Issue\nrepo:ImageMagick\nowne...,5289,"[-0.03310983255505562, 0.013431346975266933, -...",171,The issue describes a stack buffer overflow vu...,# Stack Buffer Overflow\n\n## Description\nA s...,5,True,"[787, 120, 123, 131, 119, 806, 122, 680, 805, ...",787
49,CVE-2023-29374,2023-04-05T02:15:37.340,"In LangChain through 0.0.131, the LLMMathChain...","{'cvssMetricV2': None, 'cvssMetricV30': None, ...","[{'source': 'cve@mitre.org', 'tags': ['Issue T...",[{'nodes': [{'cpeMatch': array([{'criteria': '...,74,"[Exploit, Issue Tracking, Patch]","[hwchase17, langchain]",#Overview\r\n\r\nllm math and PAL both use `ex...,...,This is a GitHub Issue\nrepo:langchain\nowner:...,1765,"[-0.02437249757349491, -0.007051460444927216, ...",640,The issue highlights a potential security vuln...,# Code Injection Vulnerability\n\n## Descripti...,5,True,"[141, 627, 94, 150, 95, 1120, 146, 75, 149, 96...",141
509,CVE-2023-34151,2023-05-30T22:15:11.000,A vulnerability was found in ImageMagick. This...,"{'cvssMetricV2': None, 'cvssMetricV30': None, ...","[{'source': 'secalert@redhat.com', 'tags': ['T...",[{'nodes': [{'cpeMatch': array([{'criteria': '...,190,"[Exploit, Issue Tracking, Patch]","[ImageMagick, ImageMagick]",### ImageMagick version\r\n\r\n7.1.30-0\r\n\r\...,...,This is a GitHub Issue\nrepo:ImageMagick\nowne...,2161,"[-0.013025580905377865, 0.009491877630352974, ...",715,The issue describes a vulnerability related to...,# Type Conversion Error Leading to Undefined B...,5,True,"[241, 196, 192, 1287, 681, 1024, 195, 197, 475...",190


In [None]:
rag_labeled_df.columns

In [None]:
from datasets import Dataset

# test
ds = Dataset.from_pandas(rag_labeled_df.drop(columns='__index_level_0__'))
ds.push_to_hub("Eathus/github-issues-vul-label-rag-results", split="test")

In [95]:
import statistics as stat

cl_lengths = [len(cands) for cands in rag_labeled_df.rag_candidates.to_list()]
print(stat.mean(cl_lengths))
print(stat.median(cl_lengths))
print(max(cl_lengths))
print(min(cl_lengths))

12.962199312714777
13
13
11


In [96]:
evaluate_rag(rag_labeled_df, "rag_label")

Accuracy: 0.8006872852233677
Classification Report:
              precision    recall  f1-score   support

        1050       1.00      1.00      1.00         1
         106       0.00      0.00      0.00         0
         116       0.00      0.00      0.00         1
         119       1.00      0.57      0.73         7
         120       1.00      0.86      0.92        14
         122       1.00      1.00      1.00         1
         125       1.00      0.18      0.30        17
        1251       0.00      0.00      0.00         0
        1295       0.00      0.00      0.00         0
        1325       0.00      0.00      0.00         0
        1333       1.00      1.00      1.00         1
         141       0.00      0.00      0.00         0
        1422       0.00      0.00      0.00         0
         150       1.00      1.00      1.00         1
         167       0.00      0.00      0.00         0
         190       1.00      1.00      1.00         4
         194       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
vectorstore_gpt_fusion = create_vectorstore(documents, "tmp/faiss_gpt_fusion_index")

In [None]:
def fusion_retrieval(
    vectorstore, bm25, docs, query: str, k: int = 5, alpha: float = 0.5
) -> List[Document]:
    """
    Perform fusion retrieval combining keyword-based (BM25) and vector-based search.

    Args:
    vectorstore (VectorStore): The vectorstore containing the documents.
    bm25 (BM25Okapi): Pre-computed BM25 index.
    query (str): The query string.
    k (int): The number of documents to retrieve.
    alpha (float): The weight for vector search scores (1-alpha will be the weight for BM25 scores).

    Returns:
    List[Document]: The top k documents based on the combined scores.
    """

    epsilon = 1e-8

    # Step 2: BM25 scores
    bm25_scores = bm25.get_scores(query.split())
    bm25_scores = (bm25_scores - np.min(bm25_scores)) / (
        np.max(bm25_scores) - np.min(bm25_scores) + epsilon
    )

    # Build CWE_ID to BM25 score map
    bm25_score_dict = {
        doc.metadata["CWE_ID"]: score for doc, score in zip(docs, bm25_scores)
    }

    # Step 3: Vector search
    vector_results = vectorstore.similarity_search_with_score(
        query, k=vectorstore.index.ntotal
    )
    docs_vec, vec_scores_raw = zip(*vector_results)

    # Normalize vector scores
    vec_scores = 1 - (np.array(vec_scores_raw) - np.min(vec_scores_raw)) / (
        np.max(vec_scores_raw) - np.min(vec_scores_raw) + epsilon
    )

    # Combine scores safely
    def combine(vec, bm25):
        return alpha * vec + (1 - alpha) * bm25

    # Combine scores for docs with matching CWE_IDs
    score_dict = {}
    for doc_vec, vec_score in zip(docs_vec, vec_scores):
        cwe_id = doc_vec.metadata["CWE_ID"]
        if cwe_id in bm25_score_dict:
            bm25_score = bm25_score_dict[cwe_id]
            combined_score = combine(vec_score, bm25_score)
            score_dict[cwe_id] = (doc_vec, combined_score)

    # Sort and return top k documents
    sorted_doc_scores = sorted(score_dict.values(), key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in sorted_doc_scores[:k]]


def fusion_rag_label(
    data_df, vectorstore, bm25_docs, match_col="gpt_vulnerability", k=1, alpha=0.5
):  # max for k is 20
    ret_df = data_df.copy()

    tokenized_docs = [doc.page_content.split() for doc in bm25_docs]
    bm25_retriever = BM25Okapi(tokenized_docs)

    # display(ret_df[ret_df['cve_primary_cwe'].isna()])
    def label(desc):
        try:
            ret = list(
                set(
                    [
                        x.metadata["CWE_ID"]
                        for x in fusion_retrieval(
                            vectorstore, bm25_retriever, bm25_docs, desc, k, alpha
                        )
                    ]
                )
            )
            # print(len(ret))
            return ret
        except Exception as e:
            print(f"General error processing message: {e}")
            return

    ret_df["rag_candidates"] = ret_df[match_col].progress_map(label)
    ret_df["rag_label"] = ret_df.apply(
        lambda row: (
            row["cve_primary_cwe"]
            if row["cve_primary_cwe"] in row["rag_candidates"]
            else row["rag_candidates"][0]
        ),
        axis=1,
    )

    return ret_df

In [None]:
fusion_rag_labeled_df = fusion_rag_label(
    true_pos_few,
    vectorstore_gpt_fusion,
    bm25_docs1,
    match_col="gpt_vulnerability",
    k=50,
    alpha=0.8,
)

In [None]:
import statistics as stat

cl_lengths = [len(cands) for cands in fusion_rag_labeled_df.rag_candidates.to_list()]
print(stat.mean(cl_lengths))
print(stat.median(cl_lengths))
print(max(cl_lengths))
print(min(cl_lengths))

In [None]:
evaluate_rag(fusion_rag_labeled_df, "rag_label")

## RAPTOR

In [None]:
from datasets import load_dataset
from rag import *

In [None]:
cwes = load_dataset("Eathus/cwe_view1000_list_gpt_few_cwe_desc", split="train")
cwes_df = cwes.to_pandas()

In [None]:
from langchain.schema import Document
from rag import *
import pandas as pd
import json

with open("tmp/view_CWE-1000_all_weaknesses.json", "r") as file:
    data = json.load(file)

old_documents = [
    create_ordered_cwe_document(weakness, old=True) for weakness in data["Weaknesses"]
]
document_dict = {doc.metadata["CWE_ID"]: doc for doc in old_documents}
cwes_df["RAG_Doc"] = cwes_df.apply(
    lambda x: Document(
        page_content=x.gpt_cwe_description,
        metadata={
            **document_dict[x.ID].metadata,  # Original metadata
            "Child_IDs": x.Children.tolist(),  # New field
        },
    ),
    axis=1,
)
documents = [
    Document(
        page_content=(
            cwe.Summary if cwe.Abstraction == "Pillar" else cwe.gpt_cwe_description
        ),
        metadata={
            "CWE_ID": cwe.ID,  # Original metadata
            "Abstraction": CWE_abstraction[cwe.Abstraction.upper()].value,  # New field
            "Child_IDs": cwe.Children.tolist(),  # New field
        },
    )
    for _, cwe in cwes_df.iterrows()
]
old_documents = [
    Document(
        page_content=cwe.Summary,
        metadata={
            "CWE_ID": cwe.ID,  # Original metadata
            "Abstraction": CWE_abstraction[cwe.Abstraction.upper()].value,  # New field
            "Child_IDs": cwe.Children.tolist(),  # New field
        },
    )
    for _, cwe in cwes_df.iterrows()
]

In [None]:
old_norm_documents = [
    create_ordered_cwe_document(weakness, CWE_doc_density.NORM, old=True)
    for weakness in data["Weaknesses"]
]
old_heavy_documents = [
    create_ordered_cwe_document(weakness, CWE_doc_density.HEAVY, old=True)
    for weakness in data["Weaknesses"]
]
old_medium_documents = [
    create_ordered_cwe_document(weakness, CWE_doc_density.MEDIUM, old=True)
    for weakness in data["Weaknesses"]
]

old_norm_documents = [
    Document(
        page_content=doc.page_content,
        metadata={
            "CWE_ID": doc.metadata["CWE_ID"],  # Original metadata
            "Abstraction": doc.metadata["Abstraction"],
            "Child_IDs": cwes_df.loc[cwes_df.ID == doc.metadata["CWE_ID"], "Children"]
            .iloc[0]
            .tolist(),  # New field
        },
    )
    for doc in old_norm_documents
]
old_norm_documents = add_sequential_ids(old_norm_documents)

old_heavy_documents = [
    Document(
        page_content=doc.page_content,
        metadata={
            "CWE_ID": doc.metadata["CWE_ID"],  # Original metadata
            "Abstraction": doc.metadata["Abstraction"],
            "Child_IDs": cwes_df.loc[cwes_df.ID == doc.metadata["CWE_ID"], "Children"]
            .iloc[0]
            .tolist(),  # New field
        },
    )
    for doc in old_heavy_documents
]
old_heavy_documents = add_sequential_ids(old_heavy_documents)

old_medium_documents = [
    Document(
        page_content=doc.page_content,
        metadata={
            "CWE_ID": doc.metadata["CWE_ID"],  # Original metadata
            "Abstraction": doc.metadata["Abstraction"],
            "Child_IDs": cwes_df.loc[cwes_df.ID == doc.metadata["CWE_ID"], "Children"]
            .iloc[0]
            .tolist(),  # New field
        },
    )
    for doc in old_medium_documents
]
old_medium_documents = add_sequential_ids(old_medium_documents)

In [None]:
def split_docs(docs, echo=False):
    all_docs = []
    for doc in docs:
        all_docs.extend(split_cwe_document(doc))
    all_docs = add_sequential_ids(all_docs)
    if echo:
        print(f"Original: {len(documents)} docs")
        print(f"After splitting: {len(all_docs)} docs")
        print(f"Max tokens: {max(count_tokens(d.page_content) for d in all_docs)}")
    return all_docs

In [None]:
all_docs = split_docs(documents, True)
old_docs = split_docs(old_documents, True)
bm25_docs0 = [
    Document(
        page_content=cwe.page_content,
        metadata=cwe.metadata,
    )
    for cwe in old_documents
]
bm25_docs1 = [
    Document(
        page_content=remove_subtitle(cwe.page_content, ["Extended Description"]),
        metadata=cwe.metadata,
    )
    for cwe in old_documents
]
all_docs_bm25_0 = split_docs(bm25_docs0, True)
all_docs_bm25_1 = split_docs(bm25_docs1, True)

In [None]:
from datasets import load_dataset

test_few = load_dataset(
    "Eathus/github-issues-vul-detection-gpt-few-strict-vul-desc-results", split="test"
)
test_few_df = test_few.to_pandas()
test_few_df = test_few_df[~test_few_df.duplicated(subset="issue_github_id", keep=False)]

In [None]:
true_pos_few = test_few_df[test_few_df.gpt_is_relevant & ~test_few_df.cve_id.isna()]
false_pos_few = test_few_df[test_few_df.gpt_is_relevant & test_few_df.cve_id.isna()]
all_true_few = test_few_df[test_few_df.gpt_is_relevant]

In [None]:
documents = add_sequential_ids(documents)
raptor_vectorstore_gpt = create_vectorstore(
    all_docs, "tmp/chromadb_indices_gpt", "gpt_index", Vectorstore.CHROMADB
)

In [None]:
raptor_vectorstore_old = create_vectorstore(
    old_docs, "tmp/chromadb_indices_old", "old_index", Vectorstore.CHROMADB
)

In [None]:
raptor_vectorstore_gpt_condensed = create_vectorstore(
    documents, "tmp/chromadb_indices_gpt_condensed", "gpt_index", Vectorstore.CHROMADB
)

In [None]:
raptor_vectorstore_norm = create_vectorstore(
    old_norm_documents, "tmp/chromadb_indices_norm", "norm_index", Vectorstore.CHROMADB
)
raptor_vectorstore_medium = create_vectorstore(
    old_medium_documents,
    "tmp/chromadb_indices_medium",
    "medium_index",
    Vectorstore.CHROMADB,
)
raptor_vectorstore_heavy = create_vectorstore(
    old_heavy_documents,
    "tmp/chromadb_indices_heavy",
    "heavy_index",
    Vectorstore.CHROMADB,
)

In [None]:
raptor_rag_labeled_df = raptor_rag_label(
    true_pos_few,
    raptor_vectorstore_medium,
    match_col="gpt_vulnerability",
    # device="cpu",
    bm25_docs=old_medium_documents,
    k_list=[3, 12, 4, 1, 1],
    top_abstraction=CWE_abstraction.PILLAR,
    alpha=0.8,
)

display(raptor_rag_labeled_df.sample(3))

In [None]:
raptor_rag_labeled_df = raptor_rag_label_optimized(
    true_pos_few,
    raptor_vectorstore_gpt,
    match_col="gpt_vulnerability",
    k_list=[4, 7, 4, 2, 1],
    device="cpu",
    max_workers=1,
)

In [None]:
import statistics as stat

cl_lengths = [len(cands) for cands in raptor_rag_labeled_df.rag_candidates.to_list()]
print(stat.mean(cl_lengths))
print(stat.median(cl_lengths))
print(max(cl_lengths))
print(min(cl_lengths))

In [None]:
evaluate_rag(raptor_rag_labeled_df)

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI

OPENAI_API_KEY_KTH = os.getenv("OPENAI_API_KEY_KTH")

llm = OpenAI(temperature=0, api_key=OPENAI_API_KEY_KTH)


def hierarchical_retrieval(
    query: str,
    vectorstore,
    hierarchy_cache,
    k_list: List = [3, 10, 5, 2, 1],
    top_abstraction=CWE_abstraction.PILLAR,
) -> List[Document]:

    k_dict = {
        "PILLAR": k_list[0],
        "CLASS": k_list[1],
        "BASE": k_list[2],
        "VARIANT": k_list[3],
        "COMPOUND": k_list[4],
    }

    all_retrieved_docs = []
    child_ids = None  # Initialize child_ids
    level = top_abstraction
    compressor = LLMChainExtractor.from_llm(llm)

    i = 0

    while True:
        # Define metadata filter
        CWE_abstraction(i)
        if child_ids:
            filter_condition = {"CWE_ID": {"$in": child_ids}}
        else:
            filter_condition = {"Abstraction": {"$eq": level.value}}

        base_retriever = vectorstore.as_retriever(
            search_kwargs={"k": k_dict[level.name], "filter": filter_condition}
        )
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=base_retriever
        )

        # Retrieve documents with the defined filter
        level_docs = compression_retriever.invoke(query)
        all_retrieved_docs.extend(level_docs)

        # Prepare child_ids for the next iteration
        if level_docs:
            max_level = max([doc.metadata.get("Abstraction", 0) for doc in level_docs])
            level = CWE_abstraction(max_level)
            child_ids = []
            for doc in level_docs:
                child_ids.extend(
                    hierarchy_cache.get(doc.metadata["CWE_ID"], {}).get("children", [])
                )
            child_ids = list(set(child_ids))
        else:
            child_ids = None  # Reset if no documents found or at the lowest level

        if not child_ids or i > 4:
            break

        i += 1
    return list({doc.metadata["id"]: doc for doc in all_retrieved_docs}.values())

In [None]:
def raptor_hierarchy_rag_label(
    data_df,
    vectorstore,
    match_col="gpt_vulnerability",
    k_list: List = [3, 10, 5, 2, 0],
    # max_workers=10,
    top_abstraction=CWE_abstraction.PILLAR,
    max_workers=15,
):  # max for k is 20
    ret_df = data_df.copy()

    hierarchy_cache = {}
    all_metadata = vectorstore.get()["metadatas"]
    for meta in all_metadata:
        hierarchy_cache[meta["CWE_ID"]] = {
            "abstraction": meta["Abstraction"],
            "children": meta["Child_IDs"].split(",") if meta.get("Child_IDs") else [],
        }
    # display(ret_df[ret_df['cve_primary_cwe'].isna()])

    def label(desc):
        try:
            ret = [
                x.metadata["CWE_ID"]
                for x in hierarchical_retrieval(
                    desc,
                    vectorstore,
                    k_list=k_list,
                    hierarchy_cache=hierarchy_cache,
                    top_abstraction=top_abstraction,
                )
            ]
            # print(len(ret))
            return ret
        except Exception as e:
            print(f"General error processing message: {e}")
            return

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        queries = data_df[match_col].tolist()
        results = list(
            tqdm(
                executor.map(label, queries),
                total=len(queries),
                desc="Processing RAG queries",
            )
        )

    # 4. Create final dataframe
    ret_df = data_df.copy()
    ret_df["rag_candidates"] = results
    ret_df["rag_label"] = ret_df.apply(
        lambda row: (
            row["cve_primary_cwe"]
            if row["cve_primary_cwe"] in row["rag_candidates"]
            else (row["rag_candidates"][0] if row["rag_candidates"] else None)
        ),
        axis=1,
    )
    return ret_df

In [None]:
raptor_rag_labeled_df = raptor_hierarchy_rag_label(
    true_pos_few,
    raptor_vectorstore_gpt,
    match_col="gpt_vulnerability",
    # device="cpu",
    k_list=[3, 8, 4, 1, 1],
    top_abstraction=CWE_abstraction.PILLAR,
)

In [None]:
import statistics as stat

cl_lengths = [len(cands) for cands in raptor_rag_labeled_df.rag_candidates.to_list()]
print(stat.mean(cl_lengths))
print(stat.median(cl_lengths))
print(max(cl_lengths))
print(min(cl_lengths))

evaluate_rag(raptor_rag_labeled_df)

## Reranking 

In [None]:
import sys
from pathlib import Path

# Get parent directory (Thesis-Edvin)
sys.path.append(str(Path.cwd().parent))

In [None]:
from utils import *
from rag import *
from langchain_openai import ChatOpenAI
from langchain import PromptTemplate
from pydantic import BaseModel, Field
from langchain_core.retrievers import BaseRetriever
from langchain.chains import RetrievalQA
from typing import List
from openai import OpenAIError, RateLimitError

In [None]:
class RatingScore(BaseModel):
    relevance_score: float = Field(..., description="The relevance score of a document to a query.")

def rerank_documents(query: str, docs: List[Document], top_n: int = 3) -> List[Document]:
    prompt_template = PromptTemplate(
        input_variables=["vulnerability", "doc"],
        template="""On a scale of 1-10, rate the relevance of the following document to the vulnerability. Consider the specific context and intent of the vulnerability, not just keyword matches.
        Vulnerability: {vulnerability}
        Document: {doc}
        Relevance Score:"""
    )
    
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini", max_tokens=4000, api_key=OPENAI_API_KEY_KTH)
    llm_chain = prompt_template | llm.with_structured_output(RatingScore)
    
    scored_docs = []
    for doc in docs:
        input_data = {"vulnerability": query, "doc": doc.page_content}
        try:
            score = llm_chain.invoke(input_data).relevance_score
        except OpenAIError as e:  # Catch all OpenAI-specific errors
            print(f"OpenAI API error: {e}")
        except Exception as e:
            print(f"General error processing message: {e}")
        try:
            score = float(score)
        except ValueError:
            score = 0  # Default score if parsing fails
        scored_docs.append((doc, score))
    
    reranked_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in reranked_docs[:top_n]]

In [None]:
class CustomRetriever(BaseRetriever):
    
    vectorstore : FAISS = Field(description="Vector store for initial retrieval")

    def _get_relevant_documents(self, query: str, num_docs=2) -> List[Document]:
        initial_docs = self.vectorstore.similarity_search(query, k=30)
        return rerank_documents(query, initial_docs, top_n=num_docs)


# Create the custom retriever
custom_retriever = CustomRetriever(vectorstore=vectorstore_gpt)

# Create an LLM for answering questions



In [None]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=15)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def rerank_rag_label(
    data_df, vectorstore, match_col="gpt_vulnerability", k=1, max_workers=15
):  # max for k is 20
    ret_df = data_df.copy()

    custom_retriever = CustomRetriever(vectorstore=vectorstore)
    # display(ret_df[ret_df['cve_primary_cwe'].isna()])
    def label(desc):
        try:
            ret = list(
                set([x.metadata["CWE_ID"] for x in custom_retriever._get_relevant_documents(desc, k)])
            )
            # print(len(ret))
            return ret
        except Exception as e:
            print(f"General error processing message: {e}")
            return
    '''
    ret_df["rag_candidates"] = ret_df[match_col].progress_map(lambda x: label(x))
    ret_df["rag_label"] = ret_df.apply(
        lambda row: (
            row["cve_primary_cwe"]
            if row["cve_primary_cwe"] in row["rag_candidates"]
            else row["rag_candidates"][0]
        ),
        axis=1,
    )
    '''
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        queries = data_df[match_col].tolist()
        results = list(
            tqdm(
                executor.map(label, queries),
                total=len(queries),
                desc="Processing RAG queries",
            )
        )
    ret_df = data_df.copy()
    ret_df["rag_candidates"] = results
    ret_df["rag_label"] = ret_df.apply(
        lambda row: (
            row["cve_primary_cwe"]
            if row["cve_primary_cwe"] in row["rag_candidates"]
            else (row["rag_candidates"][0] if row["rag_candidates"] else None)
        ),
        axis=1,
    )

    return ret_df

In [None]:
rag_labeled_df = rerank_rag_label(
    true_pos_few, vectorstore_gpt, match_col="gpt_vulnerability", k=1
)

In [None]:
import statistics as stat

cl_lengths = [len(cands) for cands in rag_labeled_df.rag_candidates.to_list()]
print(stat.mean(cl_lengths))
print(stat.median(cl_lengths))
print(max(cl_lengths))
print(min(cl_lengths))
evaluate_rag(rag_labeled_df, "rag_label")

## Generation step

In [39]:
import sys
from pathlib import Path

# Get parent directory (Thesis-Edvin)
sys.path.append(str(Path.cwd().parent))

In [40]:
from utils import *

In [41]:
from datasets import load_dataset
cwes = load_dataset("Eathus/cwe_view1000_list_gpt_few_cwe_desc", split="train")
cwe_df = cwes.to_pandas()

In [None]:
old_heavy_documents[0].metadata

In [42]:
from langchain.schema import Document
from rag import *
import pandas as pd
import json

with open("tmp/view_CWE-1000_all_weaknesses.json", "r") as file:
    data = json.load(file)

In [43]:
cwe_dict = {dat['ID']: dat for dat in data["Weaknesses"]}

In [44]:
def create_ordered_cwe_dict(weakness, doc_density=CWE_doc_density.LIGHT, old=False):
    ret =  {
        "ID": weakness["ID"],
        "Description": weakness['Description'],
        **({"ExtendedDescription": weakness['ExtendedDescription']} if "ExtendedDescription" in weakness else {}),
    }
    
    if doc_density in [CWE_doc_density.NORM, CWE_doc_density.HEAVY] and "AlternateTerms" in weakness :
        ret["AlternateTerms"] = weakness["AlternateTerms"]
    
    if doc_density in [CWE_doc_density.HEAVY] :
        if "CommonConsequences" in weakness:
            ret["CommonConsequences"] = weakness["CommonConsequences"]
        if "AffectedResources" in weakness:
            ret["AffectedResources"] = weakness["AffectedResources"]
        if "ModesOfIntroduction" in weakness:
            ret["ModesOfIntroduction"] = weakness["ModesOfIntroduction"]
        if "BackgroundDetails" in weakness:
            ret["BackgroundDetails"] = weakness["BackgroundDetails"]
    

    if doc_density in [CWE_doc_density.NORM, CWE_doc_density.HEAVY]:
        if "Notes" in weakness:
            ret["Notes"] = weakness["Notes"]
        if "ObservedExamples" in weakness:
            ret["ObservedExamples"] = [ ex['Description'] for ex in weakness["ObservedExamples"]]        

    if "DemonstrativeExamples" in weakness:
        if doc_density in [CWE_doc_density.LIGHT, CWE_doc_density.NORM] :
            ret["DemonstrativeExample"] = weakness['DemonstrativeExamples'][0]
        else :
            ret["DemonstrativeExamples"] = weakness['DemonstrativeExamples']
    
    return ret

In [45]:
def cwe_list_to_json_str(cwes, cwe_dict, indent=4, doc_density=CWE_doc_density.LIGHT):
    js_list = [create_ordered_cwe_dict(cwe_dict[cwe], doc_density) for cwe in cwes]
    return json.dumps({"Weaknesses": js_list}, indent=indent)

In [46]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from utils import *


class ReplySchema(BaseModel):
    gpt_cwe: str = Field(
        description="The CWE-ID (*only the number*) of the CWE entry that best fits the vulnerability description"
    )
    gpt_cwe_confidence: int = Field(
        description="An integer from 1 to 5 indicating your level of confidence  (1 = very low, 2 = low, 3 = medium, 4 = high, 5 = very high)."
    )


llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
    api_key=OPENAI_API_KEY_KTH,  # <- this overrides the default
)  # maybe set max_token to 14000

prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")

prompt = ChatPromptTemplate.from_messages(
    [("system", prompts_dict["RAG_system_setup_json"]), ("human", "{desc}")],
)


def parser(message: ReplySchema):
    return message.model_dump_json()


llm = llm.with_structured_output(ReplySchema)
chain = prompt | llm | parser

In [47]:
from utils import *
from pandarallel import pandarallel
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from openai import OpenAIError, RateLimitError  # Explicitly import errors

pandarallel.initialize(progress_bar=True, nb_workers=15)

@retry(
    stop=stop_after_attempt(5),  # Retry up to 5 times
    wait=wait_exponential(multiplier=2, min=1, max=60),  # Exponential backoff
    retry=retry_if_exception_type(RateLimitError),  # Retry only on rate limit errors
)
def _gpt_classify(desc, cwe_entries):
    if (
        not desc
        or not isinstance(desc, str)
        or not cwe_entries
        or not isinstance(cwe_entries, str)
    ):  # Check for empty/invalid messages
        return None
    return chain.invoke(
        {
            "cwe_entries": cwe_entries,
            "desc": desc,
        }
    )  # Adjusted for OpenAI API format


def gpt_classify(desc, cwe_entries):
    try:
        return _gpt_classify(desc, cwe_entries)
    except OpenAIError as e:  # Catch all OpenAI-specific errors
        print(f"OpenAI API error: {e}")
    except Exception as e:
        print(f"General error processing message: {e}")
    return None

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [97]:
import pickle
import os

file_path = "tmp/rag_cwe_pred.pkl"

# Define the path to your pickle file

# Check if the file exists
if os.path.exists(file_path):
    with open(file_path, "rb") as file:
        rag_labeled_df = pickle.load(file)
    print("Pickle file loaded successfully!")
else:
    print(f"The file at {file_path} does not exist. Setting gpt_response to 'None'")
    rag_labeled_df["rag_prediction"] = None

The file at tmp/rag_cwe_pred.pkl does not exist. Setting gpt_response to 'None'


In [28]:
def cwe_list_to_md_str(cwes, md_documents):
    markdown_list = [doc.page_content for doc in md_documents if doc.metadata['CWE_ID'] in cwes]
    return "\n".join(markdown_list)

In [None]:
print(rag_labeled_df.columns)
rag_labeled_df.iloc[0].cve_primary_cwe

In [None]:
test_rag = gpt_classify(
    rag_labeled_df.iloc[0].gpt_vulnerability,
    cwe_list_to_json_str(rag_labeled_df.iloc[0].rag_candidates, cwe_dict, doc_density=CWE_doc_density.HEAVY),
)

In [None]:
test_rag

In [98]:
import statistics as stat

requests = [
    [
        {
            'role': 'system',
            'content': prompts_dict["RAG_system_setup"].format(
                cwe_entries=cwe_list_to_json_str(row.rag_candidates, cwe_dict, doc_density=CWE_doc_density.HEAVY)
            )
        },
        {
            'role': 'user',
            'content': row.gpt_vulnerability
        }
    ]
    for _, row in rag_labeled_df[['gpt_vulnerability', 'rag_candidates']].iterrows()
]   
token_counts = [count_chat_tokens(request) for request in requests]

print('Mean token count:\t\t', stat.mean(token_counts))
print('Median token count:\t\t', stat.median(token_counts))
print('Max token count:\t\t', max(token_counts))
print('Min token count:\t\t', min(token_counts))


Mean token count:		 16286.786941580756
Median token count:		 17716
Max token count:		 26983
Min token count:		 4204


In [None]:
rag_labeled_df = rag_labeled_df.reset_index(drop=True)
display(rag_labeled_df.head(3))

In [None]:
print(rag_labeled_df.loc[na_indices, ["gpt_vulnerability", "rag_candidates"]].head())


In [99]:
import pickle
import time
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"


retries = 0
max_retries = 10
file_path = "tmp/rag_cwe_pred.pkl"
DELAY = 1
# test_df['gpt_response'] = None
while (
    not rag_labeled_df[rag_labeled_df.rag_prediction.isna()].empty
    and retries < max_retries
):
    # Get indices of rows needing processing
    na_indices = rag_labeled_df[
        rag_labeled_df.rag_prediction.isna() & ~rag_labeled_df.cve_id.isna()
    ].index

    if len(na_indices) == 0:
        break

    # Process ONLY those rows and assign directly to original DF
    rag_labeled_df.loc[na_indices, "rag_prediction"] = rag_labeled_df.loc[
        na_indices, ["gpt_description", "rag_candidates"]
    ].parallel_apply(
        lambda x: gpt_classify(
            x["gpt_description"], cwe_list_to_json_str(x["rag_candidates"], cwe_dict, doc_density=CWE_doc_density.HEAVY)
        ),
        axis=1
    )

    with open(file_path, "wb") as file:  # 'wb' mode writes in binary format
        pickle.dump(rag_labeled_df, file)
    retries += 1
    print(f"Retry {retries}: Processed {len(na_indices)} rows")
    time.sleep(DELAY)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20), Label(value='0 / 20'))), HBox…

Retry 1: Processed 291 rows


In [None]:
rag_labeled_df.rag_prediction.head(5)

In [103]:
rag_labeled_df['rag_prediction']

4        {'gpt_cwe': '415', 'gpt_cwe_confidence': 5}
5       {'gpt_cwe': '1392', 'gpt_cwe_confidence': 5}
11       {'gpt_cwe': '122', 'gpt_cwe_confidence': 5}
14        {'gpt_cwe': '89', 'gpt_cwe_confidence': 5}
16       {'gpt_cwe': '913', 'gpt_cwe_confidence': 5}
                            ...                     
1752     {'gpt_cwe': '122', 'gpt_cwe_confidence': 5}
1756     {'gpt_cwe': '754', 'gpt_cwe_confidence': 4}
1763      {'gpt_cwe': '20', 'gpt_cwe_confidence': 5}
1770     {'gpt_cwe': '648', 'gpt_cwe_confidence': 4}
1773      {'gpt_cwe': '79', 'gpt_cwe_confidence': 5}
Name: rag_prediction, Length: 291, dtype: object

In [102]:
import json
import pandas as pd

rag_labeled_df['rag_prediction'] = rag_labeled_df['rag_prediction'].map(
    lambda x: json.loads(x) if pd.notna(x) else None
)

TypeError: the JSON object must be str, bytes or bytearray, not dict

In [104]:
test_df = rag_labeled_df.copy()
test_df[['rag_prediction', 'rag_confidence']] = test_df['rag_prediction'].apply(
    lambda x: pd.Series({"1":x['gpt_cwe'], "2":x['gpt_cwe_confidence']}) if pd.notna(x) else None
)
test_df[["rag_prediction", "rag_confidence"]].head(5)

Unnamed: 0,rag_prediction,rag_confidence
4,415,5
5,1392,5
11,122,5
14,89,5
16,913,5


In [None]:
test_df.columns

In [105]:
evaluate_rag(test_df, "rag_prediction")

Accuracy: 0.5051546391752577
Classification Report:
              precision    recall  f1-score   support

        1050       0.00      0.00      0.00         1
        1067       0.00      0.00      0.00         0
         116       0.00      0.00      0.00         1
         119       0.00      0.00      0.00         7
         120       0.50      0.43      0.46        14
         121       0.00      0.00      0.00         0
         122       0.03      1.00      0.05         1
         125       1.00      0.12      0.21        17
         129       0.00      0.00      0.00         0
        1333       1.00      1.00      1.00         1
        1335       0.00      0.00      0.00         0
        1392       0.00      0.00      0.00         0
         150       1.00      1.00      1.00         1
         190       0.75      0.75      0.75         4
          20       0.25      0.33      0.29         3
         200       0.00      0.00      0.00         1
         212       1.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
