In [2]:
import requests
import json
import pandas as pd
import os
import time

In [5]:
def save_data_to_csv(file_name, data):
    """Write ``data`` as ``<file_name>.csv`` inside the raw-data folder.

    Parameters
    ----------
    file_name : str
        Base name of the output file, without the ``.csv`` extension.
    data : object
        Anything the ``pandas.DataFrame`` constructor accepts (a list of
        dicts, a dict of columns, an existing DataFrame, ...).

    Notes
    -----
    The target folder is resolved relative to the current working directory
    and is created when it does not exist yet. The row index is not written.
    """
    folder_path = os.path.join("..//data", "raw")
    # Create the folder up front so the write does not fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(folder_path, exist_ok=True)

    csv_file = os.path.join(folder_path, file_name + ".csv")
    df = pd.DataFrame(data)
    df.to_csv(csv_file, index=False)

    print(f"Data saved to {csv_file}")

def save_data_to_json(file_name, data):
    """Write ``data`` as ``<file_name>.json`` inside the raw-data folder.

    Parameters
    ----------
    file_name : str
        Base name of the output file, without the ``.json`` extension.
    data : object
        Any JSON-serialisable object; it is dumped with a 4-space indent.

    Notes
    -----
    The target folder is resolved relative to the current working directory
    and is created when it does not exist yet.
    """
    folder_path = os.path.join("..//data", "raw")
    # Create the folder up front so the write does not fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(folder_path, exist_ok=True)

    json_file = os.path.join(folder_path, file_name + ".json")
    # Pin the encoding so the file does not depend on the platform default.
    with open(json_file, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"Data saved to {json_file}")

In [29]:
def fetch_all_cves(base_url, params=None, max_retries=5, retry_delay=5, timeout=60, session=None):
    """Download every record from a paginated CVE endpoint.

    Pages are requested one after another by advancing the ``startIndex``
    query parameter until the service returns no more records.

    Parameters
    ----------
    base_url : str
        URL of the CVE endpoint.
    params : dict, optional
        Extra query parameters sent with every request (for example
        ``{'resultsPerPage': 2000}``). The dict is copied, never modified.
    max_retries : int, optional
        Maximum number of attempts per page before giving up.
    retry_delay : float, optional
        Seconds to wait after a failed attempt.
    timeout : float, optional
        Per-request timeout in seconds, forwarded to ``get``.
    session : object, optional
        Object exposing a ``requests``-compatible ``get(url, params=...,
        timeout=...)`` method, e.g. a ``requests.Session``. Defaults to the
        ``requests`` module itself.

    Returns
    -------
    list
        The concatenated ``vulnerabilities`` entries of every page fetched.
        If a page still fails after ``max_retries`` attempts the records
        collected so far are returned (best effort, no exception).
    """
    # Work on a private copy: writing 'startIndex' into the caller's dict
    # would leak paging state into later calls that reuse the same dict.
    query = dict(params or {})
    http = requests if session is None else session

    all_cves = []
    start_index = 0
    page_number = 0

    while True:
        query['startIndex'] = start_index

        response = None
        for retry_count in range(max_retries):
            try:
                candidate = http.get(base_url, params=query, timeout=timeout)
            except requests.RequestException as exc:
                # Connection errors and timeouts are retried like HTTP errors
                # instead of aborting and losing the pages already fetched.
                print(f"Failed to retrieve data: {exc}, {retry_count}")
            else:
                if candidate.status_code == 200:
                    response = candidate
                    break
                print(f"Failed to retrieve data: {candidate.status_code}, {retry_count}")
            time.sleep(retry_delay)  # Wait before retrying

        if response is None:
            print("Max retries reached. Exiting.")
            break

        data = response.json()
        vulnerabilities = data.get('vulnerabilities', [])

        # If no more CVEs are returned, stop paging
        if not vulnerabilities:
            print("No more CVEs to fetch.")
            break

        all_cves.extend(vulnerabilities)

        page_number += 1
        print(f"Fetched page {page_number} with {len(vulnerabilities)} CVEs")

        # Advance by what was actually received rather than by an assumed
        # page size, so a smaller 'resultsPerPage' (or a server-side cap)
        # neither skips nor duplicates records.
        start_index += len(vulnerabilities)

        # When the payload reports the total, stop as soon as it is reached
        # and save the extra request that would only return an empty page.
        total_results = data.get('totalResults')
        if isinstance(total_results, int) and start_index >= total_results:
            print("No more CVEs to fetch.")
            break

    return all_cves

# Define the base API endpoint
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0"

# Optional: Define query parameters 
# 'resultsPerPage' is sent along with every page request made by fetch_all_cves.
params = {'resultsPerPage': 2000}

# Fetch all CVEs
# This walks the whole dataset over the network, so the cell is slow to run.
all_cves = fetch_all_cves(base_url, params)
 
# Persist the raw records twice under ../data/raw: all_cves.json and all_cves.csv.
# NOTE(review): the records look like nested dicts, so the CSV presumably holds
# stringified dicts rather than flat columns — confirm; the JSON dump is the
# copy that is re-read and flattened later in this notebook.
save_data_to_json("all_cves", all_cves)
save_data_to_csv("all_cves", all_cves)

Fetched page 1 with 2000 CVEs
Fetched page 2 with 2000 CVEs
Fetched page 3 with 2000 CVEs
Fetched page 4 with 2000 CVEs
Fetched page 5 with 2000 CVEs
Fetched page 6 with 2000 CVEs
Fetched page 7 with 2000 CVEs
Fetched page 8 with 2000 CVEs
Fetched page 9 with 2000 CVEs
Fetched page 10 with 2000 CVEs
Fetched page 11 with 2000 CVEs
Failed to retrieve data: 503, 0
Failed to retrieve data: 503, 1
Failed to retrieve data: 503, 2
Fetched page 12 with 2000 CVEs
Fetched page 13 with 2000 CVEs
Fetched page 14 with 2000 CVEs
Fetched page 15 with 2000 CVEs
Fetched page 16 with 2000 CVEs
Fetched page 17 with 2000 CVEs
Fetched page 18 with 2000 CVEs
Fetched page 19 with 2000 CVEs
Fetched page 20 with 2000 CVEs
Fetched page 21 with 2000 CVEs
Fetched page 22 with 2000 CVEs
Fetched page 23 with 2000 CVEs
Fetched page 24 with 2000 CVEs
Failed to retrieve data: 503, 0
Fetched page 25 with 2000 CVEs
Fetched page 26 with 2000 CVEs
Fetched page 27 with 2000 CVEs
Fetched page 28 with 2000 CVEs
Fetched page 

In [3]:
# Sanity check: reload the raw CSV export and display its (rows, columns) shape.
df = pd.read_csv('..//data/raw/all_cves.csv')
df.shape

(269509, 1)

In [6]:
# Function to process each CVE JSON record
def restructure_cve_data(cve_data):
    """Flatten one raw CVE record into a single-level dict (one table row).

    Parameters
    ----------
    cve_data : dict
        One raw record; the fields are read from its ``'cve'`` entry.

    Returns
    -------
    dict
        26 keys in a fixed order: identification fields, the English
        description, CVSS v2 data, the first English weakness value, all
        reference URLs and the URLs tagged ``'Patch'`` (both joined with
        ``';'``).

    Notes
    -----
    Only the first ``cvssMetricV2`` entry is read. Records without one get
    empty strings, ``0.0`` scores and ``False`` flags for the CVSS columns.
    Missing or empty nested fields fall back to those same defaults instead
    of raising ``KeyError`` / ``IndexError``.
    """
    cve = cve_data.get('cve') or {}

    # First English description, or '' when there is none
    desc_en = next(
        (item.get('value', '') for item in cve.get('descriptions') or []
         if item.get('lang') == 'en'),
        '',
    )

    # Resolve the first CVSS v2 metric once. `or [{}]` also covers a key that
    # is present but holds an empty list, which would otherwise make `[0]`
    # raise IndexError.
    v2_metrics = (cve.get('metrics') or {}).get('cvssMetricV2') or [{}]
    metric = v2_metrics[0] or {}
    cvss = metric.get('cvssData') or {}
    cvss_data = {
        "CVSS Version": cvss.get("version", ""),
        "CVSS Vector String": cvss.get("vectorString", ""),
        "Access Vector": cvss.get("accessVector", ""),
        "Access Complexity": cvss.get("accessComplexity", ""),
        "Authentication": cvss.get("authentication", ""),
        "Confidentiality Impact": cvss.get("confidentialityImpact", ""),
        "Integrity Impact": cvss.get("integrityImpact", ""),
        "Availability Impact": cvss.get("availabilityImpact", ""),
        "Base Score": cvss.get("baseScore", 0.0)
    }

    # First weakness whose leading description is English; weaknesses with an
    # empty description list are skipped rather than indexed.
    weakness_desc = ''
    for weakness in cve.get('weaknesses') or []:
        descriptions = weakness.get('description') or []
        if descriptions and descriptions[0].get('lang') == 'en':
            weakness_desc = descriptions[0].get('value', '')
            break

    # Single pass over the references: every URL, plus the 'Patch'-tagged ones
    reference_urls = []
    patch_urls = []
    for reference in cve.get('references') or []:
        url = reference.get('url')
        if not url:
            continue  # a reference without a URL contributes nothing
        reference_urls.append(url)
        if 'Patch' in (reference.get('tags') or []):
            patch_urls.append(url)

    # Key order below defines the column order of the resulting table
    return {
        "CVE ID": cve.get('id', ''),
        "Source Identifier": cve.get('sourceIdentifier', ''),
        "Published Date": cve.get('published', ''),
        "Last Modified Date": cve.get('lastModified', ''),
        "Vulnerability Status": cve.get('vulnStatus', ''),
        "Description": desc_en,
        **cvss_data,
        "Base Severity": metric.get("baseSeverity", ""),
        "Exploitability Score": metric.get("exploitabilityScore", 0.0),
        "Impact Score": metric.get("impactScore", 0.0),
        "acInsufInfo": metric.get("acInsufInfo", False),
        "Obtain All Privilege": metric.get("obtainAllPrivilege", False),
        "Obtain User Privilege": metric.get("obtainUserPrivilege", False),
        "Obtain Other Privilege": metric.get("obtainOtherPrivilege", False),
        "User Interaction Required": metric.get("userInteractionRequired", False),
        "CWE ID": weakness_desc,
        "Reference URLs": ";".join(reference_urls),
        "Patch URL": ";".join(patch_urls)
    }

# Load the raw JSON dump of CVE records from disk
with open('..//data/raw/all_cves.json', 'r') as file:
    cve_data = json.load(file)

# Flatten each record into one row and build the frame in a single call
restructured_df = pd.DataFrame(list(map(restructure_cve_data, cve_data)))

# Persist the flattened table next to the raw files
save_data_to_csv("restructured_all_cves", restructured_df)

# Cell output: (rows, columns) of the flattened table
restructured_df.shape

Data saved to ..//data\raw\restructured_all_cves.csv


(269509, 26)