In [3]:
pip install pandas faiss-cpu sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Location of the categorised CVE export (absolute local path)
csv_file_path = "E:/NLP/CVE_Database-main/cve_data_with_categories.csv"

# Read the CSV and normalise 'Description' in the same chain: missing values
# become '' and every entry is cast to str, so later text handling only sees strings
df = pd.read_csv(csv_file_path).assign(
    Description=lambda frame: frame['Description'].fillna('').astype(str)
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Show the column labels of the loaded frame; later cells select columns by these exact names
df.columns

Index(['CVE ID', 'Source Identifier', 'Published Date', 'Last Modified Date',
       'Vulnerability Status', 'Description', 'CVSS Score', 'Weaknesses',
       'Configuration', 'References', 'Entities', 'Dependencies', 'Category'],
      dtype='object')

df

In [5]:
# Display the loaded DataFrame (the notebook renders a truncated head/tail preview)
df

Unnamed: 0,CVE ID,Source Identifier,Published Date,Last Modified Date,Vulnerability Status,Description,CVSS Score,Weaknesses,Configuration,References,Entities,Dependencies,Category
0,CVE-1999-0001,cve@mitre.org,1999-12-30 05:00:00.000,2010-12-16 05:00:00.000,analyzed,ip_inputc bsdderiv tcpip implement allow remot...,5.0,cwe,cpeobsdibsd_o cpeofreebsdfreebsd cpeofreebsdfr...,httpwwwopenbsdorgerratahtmltcpfix httpwwwosvdborg,[],"[('ip_inputc', 'compound', 'bsdderiv'), ('bsdd...",vulnerability
1,CVE-1999-0002,cve@mitre.org,1998-10-12 04:00:00.000,2009-01-26 05:00:00.000,analyzed,buffer overflow nf mountd give root access rem...,10.0,cwe,cpeobsdibsd_o cpeocalderaopenlinux cpeoredhatl...,ftppatchessgicomsupportfreesecurityadvisoriesi...,"[('mostli', 'ORG')]","[('buffer', 'compound', 'overflow'), ('overflo...",vulnerability
2,CVE-1999-0003,cve@mitre.org,1998-04-01 05:00:00.000,2018-10-30 16:26:22.357,analyzed,execut command root via buffer overflow toolta...,10.0,nvdcweother,cpeatritrealted_cd cpeosgiirix cpeosgiirix cpe...,ftppatchessgicomsupportfreesecurityadvisoriesa...,[],"[('execut', 'ROOT', 'execut'), ('command', 'co...",vulnerability
3,CVE-1999-0004,cve@mitre.org,1997-12-16 05:00:00.000,2018-10-12 21:29:02.120,modified,mime buffer overflow email client eg solari ma...,5.0,nvdcweother,cpeahpdtmail cpeauniversity_of_washingtonpin c...,httpsdocsmicrosoftcomenussecurityupdatessecuri...,[],"[('mime', 'compound', 'overflow'), ('buffer', ...",vulnerability
4,CVE-1999-0005,cve@mitre.org,1998-07-20 04:00:00.000,2008-09-09 12:33:31.117,analyzed,arbitrari command execut via imap buffer overf...,10.0,nvdcweother,cpeanetscapemessaging_serv cpeauniversity_of_w...,httpsunsolvesuncompubcgiretrievepldoctypecolld...,[],"[('arbitrari', 'compound', 'command'), ('comma...",vulnerability
...,...,...,...,...,...,...,...,...,...,...,...,...,...
238603,CVE-2024-27133,reefs@jfrog.com,2024-02-23 22:15:55.287,2024-02-23 22:15:55.287,received,insuffici sanit mlflow lead xss run recip use ...,,cwe,,httpsgithubcommlflowmlflowpul httpsresearchjfr...,"[('insuffici sanit mlflow', 'ORG'), ('xss', 'G...","[('insuffici', 'compound', 'sanit'), ('sanit',...",vulnerability
238604,CVE-2024-27215,cve@mitre.org,2024-02-21 16:15:50.657,2024-02-21 18:15:52.060,rejected,reject reason use candid number consultid cve ...,,,,,[],"[('reject', 'csubj', 'refer'), ('reason', 'nsu...",vulnerability
238605,CVE-2024-27283,cve@mitre.org,2024-02-22 05:15:10.087,2024-02-22 19:07:27.197,awaiting analysis,vulner discov verita ediscoveri platform appli...,,,,httpswwwveritascomsupporten_ussecurityvt,"[('verita ediscoveri', 'PERSON'), ('applic adm...","[('vulner', 'npadvmod', 'discov'), ('discov', ...",vulnerability
238606,CVE-2024-27318,6f8de1f0-f67e-45a6-b68f-98777fdb759c,2024-02-23 18:15:50.767,2024-02-23 19:31:25.817,awaiting analysis,version packag onnx includ vulner directori tr...,,cwe,,httpsgithubcomonnxonnxcommitbfbfdcfebbddea htt...,"[('userprovid directori', 'PERSON')]","[('version', 'compound', 'onnx'), ('packag', '...",patch


In [None]:
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each description in the dataset
descriptions = df['Description'].fillna("").tolist()  # Fill NaNs with empty strings to avoid issues

# Ask encode() for a NumPy array directly. Requesting a tensor and calling
# .numpy() on each row fails when the model runs on a GPU (CUDA tensors must be
# moved to the CPU first) and needlessly loops in Python over every row.
description_embeddings = model.encode(descriptions, convert_to_numpy=True)

# FAISS expects a C-contiguous float32 matrix of shape (n_rows, embedding_dim)
description_embeddings = np.ascontiguousarray(description_embeddings, dtype=np.float32)

# Set up the FAISS index
embedding_dimension = description_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dimension)  # flat index: exhaustive L2 comparison
index.add(description_embeddings)  # Add all embeddings to the index

# Function to retrieve similar CVE entries based on a given CVE ID
def retrieve_cve_info(cve_id, top_k=5):
    """Return the rows of `df` whose descriptions are closest to that of `cve_id`.

    Args:
        cve_id: CVE identifier to look up in the 'CVE ID' column. Surrounding
            whitespace on a string ID is ignored.
        top_k: Number of neighbours requested from the FAISS index (default 5).
            The queried CVE is itself stored in the index, so it is normally
            the first hit.

    Returns:
        A DataFrame with up to `top_k` rows of `df`, in the order returned by
        the index, or the string "No data found for CVE ID: ..." when the ID
        is not present.
    """
    # Tolerate stray whitespace (e.g. a trailing tab pasted together with the ID)
    if isinstance(cve_id, str):
        cve_id = cve_id.strip()

    # Locate the row by *position*: the embedding matrix and the ids returned by
    # FAISS are positional, so using the row label is only correct when `df`
    # still has its default RangeIndex.
    positions = np.flatnonzero((df['CVE ID'] == cve_id).to_numpy())
    if positions.size == 0:
        return f"No data found for CVE ID: {cve_id}"

    # FAISS searches a 2-D batch of queries, hence the (1, dim) reshape
    embedding = description_embeddings[positions[0]].reshape(1, -1)

    # Search for similar entries in the FAISS index
    _, similar_indices = index.search(embedding, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors are available;
    # drop those so iloc does not silently wrap around to the last row.
    hits = similar_indices[0]
    hits = hits[hits >= 0]

    # Retrieve CVE information for similar entries
    return df.iloc[hits]

# Example usage
# Keep the literal free of stray whitespace so it matches the 'CVE ID' column
cve_id = 'CVE-1999-0005'  # Replace with an actual CVE ID present in your dataset
cve_info = retrieve_cve_info(cve_id)
print(cve_info)

              CVE ID Source Identifier           Published Date  \
4      CVE-1999-0005     cve@mitre.org  1998-07-20 04:00:00.000   
12011  CVE-2005-1014     cve@mitre.org  2005-05-02 04:00:00.000   
15300  CVE-2005-4402     cve@mitre.org  2005-12-20 11:03:00.000   
7072   CVE-2003-0319     cve@mitre.org  2003-06-09 04:00:00.000   
2530   CVE-2000-0961     cve@mitre.org  2000-12-19 05:00:00.000   

            Last Modified Date Vulnerability Status  \
4      2008-09-09 12:33:31.117             analyzed   
12011  2017-07-11 01:32:29.627             modified   
15300  2016-10-18 03:38:08.317             modified   
7072   2016-10-18 02:32:22.863             modified   
2530   2017-10-10 01:29:24.780             modified   

                                             Description  CVSS Score  \
4      arbitrari command execut via imap buffer overf...        10.0   
12011  buffer overflow imap servic mailen enterpris e...         7.5   
15300  buffer overflow mailen profession earlier e

In [7]:
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each description in the dataset
descriptions = df['Description'].fillna("").tolist()  # Fill NaNs with empty strings to avoid issues

# Ask encode() for a NumPy array directly. Requesting a tensor and calling
# .numpy() on each row fails when the model runs on a GPU (CUDA tensors must be
# moved to the CPU first) and needlessly loops in Python over every row.
description_embeddings = model.encode(descriptions, convert_to_numpy=True)

# FAISS expects a C-contiguous float32 matrix of shape (n_rows, embedding_dim)
description_embeddings = np.ascontiguousarray(description_embeddings, dtype=np.float32)

# Set up the FAISS index
embedding_dimension = description_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dimension)  # flat index: exhaustive L2 comparison
index.add(description_embeddings)  # Add all embeddings to the index

def retrieve_cve_info(cve_id, top_k=5):
    """Return a compact view of the CVEs most similar to `cve_id`.

    Args:
        cve_id: CVE identifier to look up in the 'CVE ID' column. Surrounding
            whitespace on a string ID is ignored.
        top_k: Number of neighbours requested from the FAISS index (default 5).
            The queried CVE is itself stored in the index, so it is normally
            the first hit.

    Returns:
        A DataFrame with up to `top_k` rows restricted to the columns
        'CVE ID', 'Published Date', 'Description', 'CVSS Score' and 'Category',
        or the string "No data found for CVE ID: ..." when the ID is absent.
    """
    # Tolerate stray whitespace (e.g. a trailing tab pasted together with the ID)
    if isinstance(cve_id, str):
        cve_id = cve_id.strip()

    # Locate the row by *position*: the embedding matrix and the ids returned by
    # FAISS are positional, so using the row label is only correct when `df`
    # still has its default RangeIndex.
    positions = np.flatnonzero((df['CVE ID'] == cve_id).to_numpy())
    if positions.size == 0:
        return f"No data found for CVE ID: {cve_id}"

    # FAISS searches a 2-D batch of queries, hence the (1, dim) reshape
    embedding = description_embeddings[positions[0]].reshape(1, -1)

    # Search for similar entries in the FAISS index
    _, similar_indices = index.search(embedding, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors are available;
    # drop those so iloc does not silently wrap around to the last row.
    hits = similar_indices[0]
    hits = hits[hits >= 0]

    # Select specific columns for display
    columns_to_display = ['CVE ID', 'Published Date', 'Description', 'CVSS Score', 'Category']
    return df.iloc[hits][columns_to_display]

# Example usage: look up one CVE and print the entries most similar to it
cve_id = 'CVE-1999-0005'  # Replace with an actual CVE ID present in your dataset
cve_info = retrieve_cve_info(cve_id)
# The result is either a DataFrame of matches or a "No data found" string; print handles both
print(cve_info)


              CVE ID           Published Date  \
4      CVE-1999-0005  1998-07-20 04:00:00.000   
12011  CVE-2005-1014  2005-05-02 04:00:00.000   
15300  CVE-2005-4402  2005-12-20 11:03:00.000   
7072   CVE-2003-0319  2003-06-09 04:00:00.000   
2530   CVE-2000-0961  2000-12-19 05:00:00.000   

                                             Description  CVSS Score  \
4      arbitrari command execut via imap buffer overf...        10.0   
12011  buffer overflow imap servic mailen enterpris e...         7.5   
15300  buffer overflow mailen profession earlier ente...         6.5   
7072   buffer overflow imap server imapmax smartmax m...         7.5   
2530   buffer overflow imap server netscap messag ser...        10.0   

            Category  
4      vulnerability  
12011  vulnerability  
15300  vulnerability  
7072   vulnerability  
2530           patch  


In [None]:
def retrieve_cve_info(cve_id):
    """Look up a single CVE by exact ID and return a five-column summary.

    Returns the matching row(s) of `df` limited to 'CVE ID', 'Published Date',
    'Description', 'CVSS Score' and 'Category', or a "No data found" message
    string when no row carries that ID.
    """
    summary_columns = ['CVE ID', 'Published Date', 'Description', 'CVSS Score', 'Category']

    # Boolean mask over all rows; nothing set means the ID is unknown
    is_match = df['CVE ID'] == cve_id
    if not is_match.any():
        return f"No data found for CVE ID: {cve_id}"

    return df.loc[is_match, summary_columns]

# Example usage: fetch the summary row for one CVE and print it
cve_id = 'CVE-1999-0005'  # Replace with an actual CVE ID present in your dataset
cve_info = retrieve_cve_info(cve_id)
# The result is either a DataFrame or a "No data found" string; print handles both
print(cve_info)


          CVE ID           Published Date  \
4  CVE-1999-0005  1998-07-20 04:00:00.000   

                                         Description  CVSS Score  \
4  arbitrari command execut via imap buffer overf...        10.0   

        Category  
4  vulnerability  


In [11]:
def retrieve_cve_info(cve_id):
    """Look up a single CVE by exact ID and return its detailed record.

    The returned DataFrame holds the matching row(s) of `df` restricted to the
    twelve columns listed below (the 'Entities' column is not included). When
    no row carries the ID, a "No data found" message string is returned.
    """
    matches = df.loc[df['CVE ID'].eq(cve_id)]
    if len(matches) == 0:
        return f"No data found for CVE ID: {cve_id}"

    # Columns shown to the caller, in display order
    columns_to_display = [
        'CVE ID',
        'Source Identifier',
        'Published Date',
        'Last Modified Date',
        'Vulnerability Status',
        'Description',
        'CVSS Score',
        'Weaknesses',
        'Configuration',
        'References',
        'Dependencies',
        'Category',
    ]
    return matches.loc[:, columns_to_display]

# Example usage: fetch the detailed record for one CVE and print it
cve_id = 'CVE-1999-0005'  # Replace with an actual CVE ID present in your dataset
cve_info = retrieve_cve_info(cve_id)
# The result is either a DataFrame or a "No data found" string; print handles both
print(cve_info)


          CVE ID Source Identifier           Published Date  \
4  CVE-1999-0005     cve@mitre.org  1998-07-20 04:00:00.000   

        Last Modified Date Vulnerability Status  \
4  2008-09-09 12:33:31.117             analyzed   

                                         Description  CVSS Score   Weaknesses  \
4  arbitrari command execut via imap buffer overf...        10.0  nvdcweother   

                                       Configuration  \
4  cpeanetscapemessaging_serv cpeauniversity_of_w...   

                                          References  \
4  httpsunsolvesuncompubcgiretrievepldoctypecolld...   

                                        Dependencies       Category  
4  [('arbitrari', 'compound', 'command'), ('comma...  vulnerability  


In [13]:
# Interactive prompt for user input
# Strip surrounding whitespace so an accidental space or tab typed with the ID
# does not make the exact-match lookup on 'CVE ID' miss
cve_id = input("Enter the CVE ID you want information for: ").strip()
cve_info = retrieve_cve_info(cve_id)
print("\nRetrieved CVE Information:")
print(cve_info)


Retrieved CVE Information:
           CVE ID Source Identifier           Published Date  \
13  CVE-1999-0014     cve@mitre.org  1998-01-21 05:00:00.000   

         Last Modified Date Vulnerability Status  \
13  2008-09-09 12:33:32.087             analyzed   

                                          Description  CVSS Score  \
13  unauthor privileg access denial servic via dta...         7.2   

     Weaknesses                                      Configuration  \
13  nvdcweother  cpeacdecd cpeacdecde_x cpeacdecd cpeacdecd cpe...   

                                           References  \
13  httpsunsolvesuncompubcgiretrievepldoctypecolld...   

                                         Dependencies       Category  
13  [('unauthor', 'compound', 'denial'), ('privile...  vulnerability  


In [14]:
def retrieve_cve_info(cve_id):
    """Look up a single CVE by exact ID and render it as plain text.

    Returns the matching row(s) of `df`, with all thirteen dataset columns,
    formatted by `DataFrame.to_string` without the index column. When no row
    carries the ID, a "No data found" message string is returned instead.
    """
    # Columns included in the rendered text, in display order
    columns_to_display = [
        'CVE ID',
        'Source Identifier',
        'Published Date',
        'Last Modified Date',
        'Vulnerability Status',
        'Description',
        'CVSS Score',
        'Weaknesses',
        'Configuration',
        'References',
        'Entities',
        'Dependencies',
        'Category',
    ]

    found = df.loc[df['CVE ID'] == cve_id]
    if found.empty:
        return f"No data found for CVE ID: {cve_id}"

    # Keep only the display columns and renumber rows from 0 before rendering
    report = found.loc[:, columns_to_display].reset_index(drop=True)
    return report.to_string(index=False)

# Example usage: render one CVE as text and print it under a heading
cve_id = 'CVE-1999-0014'  # Replace with an actual CVE ID present in your dataset
cve_info = retrieve_cve_info(cve_id)
print("Retrieved CVE Information:")
# cve_info is already a string here (the formatted row or the "No data found" message)
print(cve_info)


Retrieved CVE Information:
       CVE ID Source Identifier          Published Date      Last Modified Date Vulnerability Status                                                      Description  CVSS Score  Weaknesses                                                                                                                                    Configuration                                                                                               References Entities                                                                                                                                                                                                                                                                                  Dependencies      Category
CVE-1999-0014     cve@mitre.org 1998-01-21 05:00:00.000 2008-09-09 12:33:32.087             analyzed unauthor privileg access denial servic via dtappgath program cde         7.2 nvdcweother cpeacdecd cpeacdecde_x cpeacdecd c

In [15]:
pip install tabulate


Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from tabulate import tabulate

def retrieve_cve_info(cve_id):
    """Look up a single CVE by exact ID and render it as a grid table.

    Returns the string produced by `tabulate` (grid format, column names as
    headers) for the matching row(s) of `df`, limited to the eleven columns
    listed below. When no row carries the ID, a "No data found" message
    string is returned instead.
    """
    is_match = df['CVE ID'] == cve_id
    if not is_match.any():
        return f"No data found for CVE ID: {cve_id}"

    # Columns included in the table, in display order
    columns_to_display = [
        'CVE ID',
        'Source Identifier',
        'Published Date',
        'Last Modified Date',
        'Vulnerability Status',
        'Description',
        'CVSS Score',
        'Weaknesses',
        'Configuration',
        'References',
        'Category',
    ]

    # Renumber rows from 0 so the table does not expose the original row labels
    table_rows = df.loc[is_match, columns_to_display].reset_index(drop=True)

    return tabulate(table_rows, headers='keys', tablefmt='grid')



In [3]:
# Interactive lookup. This cell depends on `df` from the data-loading cell: run
# that cell first in the current kernel session, otherwise retrieve_cve_info
# raises NameError because `df` is not defined.
# Strip surrounding whitespace so an accidental space or tab typed with the ID
# does not make the exact-match lookup on 'CVE ID' miss.
cve_id = input("Enter the CVE ID you want information for: ").strip()
cve_info = retrieve_cve_info(cve_id)
print("\nRetrieved CVE Information:")
print(cve_info)

NameError: name 'df' is not defined

In [None]:


# Initialize the SentenceTransformer model for embedding generation
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each description in the dataset
descriptions = df['Description'].tolist()

# Ask encode() for a NumPy array directly. Requesting a tensor and calling
# .numpy() on each row fails when the model runs on a GPU (CUDA tensors must be
# moved to the CPU first) and needlessly loops in Python over every row.
description_embeddings = model.encode(descriptions, convert_to_numpy=True)

# FAISS expects a C-contiguous float32 matrix of shape (n_rows, embedding_dim)
description_embeddings = np.ascontiguousarray(description_embeddings, dtype=np.float32)

# Set up the FAISS index. IndexFlatL2 is a flat index that compares the query
# against every stored vector, i.e. an exact (not approximate) L2 search.
embedding_dim = description_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(description_embeddings)

def retrieve_cve_info(cve_id, top_k=5):
    """Return the rows of `df` whose descriptions are closest to that of `cve_id`.

    Args:
        cve_id: CVE identifier to look up in the 'CVE ID' column. Surrounding
            whitespace on a string ID is ignored.
        top_k: Number of neighbours requested from the FAISS index (default 5).
            The queried CVE is itself stored in the index, so it is normally
            the first hit.

    Returns:
        A DataFrame with up to `top_k` rows of `df`, in the order returned by
        the index, or the string "No data found for CVE ID: ..." when the ID
        is not present.
    """
    # Tolerate stray whitespace (e.g. a trailing tab pasted together with the ID)
    if isinstance(cve_id, str):
        cve_id = cve_id.strip()

    # Locate the row by *position*: the embedding matrix and the ids returned by
    # FAISS are positional, so using the row label is only correct when `df`
    # still has its default RangeIndex.
    positions = np.flatnonzero((df['CVE ID'] == cve_id).to_numpy())
    if positions.size == 0:
        return f"No data found for CVE ID: {cve_id}"

    # FAISS searches a 2-D batch of queries, hence the (1, dim) reshape
    embedding = description_embeddings[positions[0]].reshape(1, -1)

    # Search for similar entries in the FAISS index
    _, similar_indices = index.search(embedding, top_k)

    # FAISS pads the result with -1 when fewer than top_k vectors are available;
    # drop those so iloc does not silently wrap around to the last row.
    hits = similar_indices[0]
    hits = hits[hits >= 0]

    # Retrieve CVE information for similar entries
    return df.iloc[hits]


# Example usage
# "CVE-XXXX-YYYY" is a placeholder; unless the dataset contains that exact ID,
# retrieve_cve_info returns its "No data found" message and that is what gets printed
cve_id = "CVE-XXXX-YYYY"  # Replace with a specific CVE ID
cve_info = retrieve_cve_info(cve_id)
print(cve_info)
