# Get CVSS Data

1. Download the NVD JSON files per year
2. Extract the fields we want from JSON to CSV
3. Save the file

In [1]:
#based on code contributed via https://gist.github.com/jgamblin/7a927997b1f1e35cc7f4f1788ee5eae1

In [1]:

%%capture
!mkdir -p jsondata
%cd jsondata
!rm *.json 
!rm *.zip 
!wget https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{2000..2024}.json.zip 
#!wget https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2024.json.zip 
!unzip -o "*.zip" 
!date > date.txt 


In [3]:
import json
import logging
import glob
import csv
import pandas as pd
from typing import List, Dict, Any, Optional
from pathlib import Path
from datetime import datetime

# Configure logging
def setup_logging(log_dir: str = "logs") -> None:
    """Configure logging with both file and console handlers."""
    Path(log_dir).mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = Path(log_dir) / f"nvd_parser_{timestamp}.log"
    
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )

def safe_get_value(data: Dict[str, Any], keys: List[str], default: Any = 'Missing_Data') -> Any:
    """Safely navigate nested dictionary structure and return value."""
    try:
        result = data
        for key in keys:
            result = result[key]
        return result if result is not None else default
    except (KeyError, IndexError, TypeError):
        return default

def extract_cwe_values(entry: Dict[str, Any]) -> List[str]:
    """Extract CWE values from entry, handling multiple CWEs."""
    try:
        return [desc['value'] for desc in 
                entry['cve']['problemtype']['problemtype_data'][0]['description']]
    except (KeyError, IndexError, TypeError):
        return ['Missing_Data']

def process_nvd_entry(entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Process a single NVD entry and return structured data."""
    try:
        description = safe_get_value(entry, ['cve', 'description', 'description_data', 0, 'value'], '')
        
        # Skip rejected or disputed entries
        if description.startswith(('** REJECT **', 'Rejected reason:')):
            return None
            
        cve = safe_get_value(entry, ['cve', 'CVE_data_meta', 'ID'])
        if cve == 'Missing_Data':
            logging.warning(f"Missing CVE ID in entry")
            return None

        metrics_v3 = entry.get('impact', {}).get('baseMetricV3', {}).get('cvssV3', {})
        metrics_v2 = entry.get('impact', {}).get('baseMetricV2', {}).get('cvssV2', {})
        
        return {
            'CVE': cve,
            'Published': safe_get_value(entry, ['publishedDate']),
            'Description': description,
            # CVSS3 metrics
            'AttackVector CVSS3': safe_get_value(metrics_v3, ['attackVector']),
            'AttackComplexity CVSS3': safe_get_value(metrics_v3, ['attackComplexity']),
            'PrivilegesRequired CVSS3': safe_get_value(metrics_v3, ['privilegesRequired']),
            'UserInteraction CVSS3': safe_get_value(metrics_v3, ['userInteraction']),
            'Scope CVSS3': safe_get_value(metrics_v3, ['scope']),
            'ConfidentialityImpact CVSS3': safe_get_value(metrics_v3, ['confidentialityImpact']),
            'IntegrityImpact CVSS3': safe_get_value(metrics_v3, ['integrityImpact']),
            'AvailabilityImpact CVSS3': safe_get_value(metrics_v3, ['availabilityImpact']),
            'BaseScore CVSS3': safe_get_value(metrics_v3, ['baseScore'], '0.0'),
            'BaseSeverity CVSS3': safe_get_value(metrics_v3, ['baseSeverity']),
            'ExploitabilityScore CVSS3': safe_get_value(entry, ['impact', 'baseMetricV3', 'exploitabilityScore'], '0.0'),
            'ImpactScore CVSS3': safe_get_value(entry, ['impact', 'baseMetricV3', 'impactScore'], '0.0'),
            # CVSS2 metrics
            'AccessVector CVSS2': safe_get_value(metrics_v2, ['accessVector']),
            'AccessComplexity CVSS2': safe_get_value(metrics_v2, ['accessComplexity']),
            'Authentication CVSS2': safe_get_value(metrics_v2, ['authentication']),
            'ConfidentialityImpact CVSS2': safe_get_value(metrics_v2, ['confidentialityImpact']),
            'IntegrityImpact CVSS2': safe_get_value(metrics_v2, ['integrityImpact']),
            'AvailabilityImpact CVSS2': safe_get_value(metrics_v2, ['availabilityImpact']),
            'BaseScore CVSS2': safe_get_value(metrics_v2, ['baseScore'], '0.0'),
            'BaseSeverity CVSS2': safe_get_value(metrics_v2, ['severity']),
            'ExploitabilityScore CVSS2': safe_get_value(entry, ['impact', 'baseMetricV2', 'exploitabilityScore'], '0.0'),
            'ImpactScore CVSS2': safe_get_value(entry, ['impact', 'baseMetricV2', 'impactScore'], '0.0'),
            'CWEs': extract_cwe_values(entry)
        }
    except Exception as e:
        logging.error(f"Error processing entry: {str(e)}")
        return None

def process_nvd_files(file_pattern: str = 'nvdcve-1.1-*.json', output_dir: str = 'data_out') -> pd.DataFrame:
    """Process all NVD JSON files matching the pattern and return a DataFrame."""
    row_accumulator = []
    files = glob.glob(file_pattern)
    
    if not files:
        logging.error(f"No files found matching pattern: {file_pattern}")
        return pd.DataFrame()

    for filename in files:
        logging.info(f"Processing file: {filename}")
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                nvd_data = json.load(f)
                
            if 'CVE_Items' not in nvd_data:
                logging.error(f"Invalid file format: {filename} - Missing CVE_Items")
                continue
                
            for entry in nvd_data['CVE_Items']:
                processed_entry = process_nvd_entry(entry)
                if processed_entry:
                    row_accumulator.append(processed_entry)
                    
            logging.info(f"Successfully processed {len(nvd_data['CVE_Items'])} entries from {filename}")
                    
        except json.JSONDecodeError as e:
            logging.error(f"JSON parsing error in {filename}: {str(e)}")
            continue
        except Exception as e:
            logging.error(f"Unexpected error processing {filename}: {str(e)}")
            continue

    if not row_accumulator:
        logging.warning("No valid entries were processed")
        return pd.DataFrame()

    return pd.DataFrame(row_accumulator)

def clean_description(text: str) -> str:
    """Clean description text by removing newlines and non-ASCII characters."""
    try:
        # Replace newlines with spaces
        text = ' '.join(text.splitlines())
        # Encode as ASCII and decode back to string, replacing non-ASCII characters
        text = text.encode('ascii', errors='ignore').decode()
        # Remove multiple spaces
        text = ' '.join(text.split())
        return text
    except (AttributeError, UnicodeError) as e:
        logging.warning(f"Error cleaning description text: {str(e)}")
        return text

def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Process the DataFrame by converting dates, cleaning descriptions, and sorting."""
    try:
        logging.info("Processing DataFrame dates and sorting")
        # Convert Published column to datetime
        df['Published'] = pd.to_datetime(df['Published'], errors='coerce')
        
        # Log any rows where date conversion failed
        invalid_dates = df[df['Published'].isna()]
        if not invalid_dates.empty:
            logging.warning(f"Found {len(invalid_dates)} rows with invalid dates")
            
        # Sort by published date
        df = df.sort_values(by=['Published'])
        df = df.reset_index(drop=True)
        
        # Convert to date only
        df['Published'] = df['Published'].apply(lambda x: x.date() if pd.notnull(x) else None)
        
        # Clean Description column
        logging.info("Cleaning Description column")
        df['Description'] = df['Description'].apply(clean_description)
        
        logging.info("Successfully processed DataFrame")
        return df
    except Exception as e:
        logging.error(f"Error processing DataFrame: {str(e)}")
        raise

def export_dataframe(df: pd.DataFrame, output_dir: str = '../data_out/') -> None:
    """Export DataFrame to a compressed CSV file with proper escaping."""
    try:
        # Create output directory if it doesn't exist
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        
        output_file = output_path / 'CVSSData.csv.gz'
        logging.info(f"Exporting data to {output_file}")
        
        df.to_csv(
            output_file,
            index=False,
            quoting=csv.QUOTE_ALL,
            escapechar='\\',
            compression='gzip'
        )
        
        logging.info(f"Successfully exported {len(df)} records to {output_file}")
    except Exception as e:
        logging.error(f"Error exporting DataFrame: {str(e)}")
        raise




In [4]:
"""Main function to run the NVD data processing."""
setup_logging()
logging.info("Starting NVD data processing")

try:
    nvd_df = process_nvd_files()
    if not nvd_df.empty:
        nvd_df = process_dataframe(nvd_df)
        export_dataframe(nvd_df)
        #nvd_df.to_csv('./data_out/CVSSData.csv.gz', index=False, quoting=csv.QUOTE_ALL, escapechar='\\', compression='gzip')
    else:
        logging.error("No data was processed")
except Exception as e:
    logging.error(f"Fatal error in main process: {str(e)}")
    raise



2024-12-19 11:17:00,748 - INFO - Starting NVD data processing
2024-12-19 11:17:00,749 - INFO - Processing file: nvdcve-1.1-2004.json
2024-12-19 11:17:00,905 - INFO - Successfully processed 2707 entries from nvdcve-1.1-2004.json
2024-12-19 11:17:00,905 - INFO - Processing file: nvdcve-1.1-2018.json
2024-12-19 11:17:03,678 - INFO - Successfully processed 17455 entries from nvdcve-1.1-2018.json
2024-12-19 11:17:03,679 - INFO - Processing file: nvdcve-1.1-2006.json
2024-12-19 11:17:04,850 - INFO - Successfully processed 7142 entries from nvdcve-1.1-2006.json
2024-12-19 11:17:04,850 - INFO - Processing file: nvdcve-1.1-2019.json
2024-12-19 11:17:07,171 - INFO - Successfully processed 17017 entries from nvdcve-1.1-2019.json
2024-12-19 11:17:07,171 - INFO - Processing file: nvdcve-1.1-2022.json
2024-12-19 11:17:10,902 - INFO - Successfully processed 25280 entries from nvdcve-1.1-2022.json
2024-12-19 11:17:10,903 - INFO - Processing file: nvdcve-1.1-2012.json
2024-12-19 11:17:11,847 - INFO - S

In [5]:
nvd_df

Unnamed: 0,CVE,Published,Description,AttackVector CVSS3,AttackComplexity CVSS3,PrivilegesRequired CVSS3,UserInteraction CVSS3,Scope CVSS3,ConfidentialityImpact CVSS3,IntegrityImpact CVSS3,...,AccessComplexity CVSS2,Authentication CVSS2,ConfidentialityImpact CVSS2,IntegrityImpact CVSS2,AvailabilityImpact CVSS2,BaseScore CVSS2,BaseSeverity CVSS2,ExploitabilityScore CVSS2,ImpactScore CVSS2,CWEs
0,CVE-1999-0095,1988-10-01,"The debug command in Sendmail is enabled, allo...",Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,LOW,NONE,COMPLETE,COMPLETE,COMPLETE,10.0,Missing_Data,10.0,10.0,[NVD-CWE-Other]
1,CVE-1999-0082,1988-11-11,CWD ~root command in ftpd allows root access.,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,LOW,NONE,COMPLETE,COMPLETE,COMPLETE,10.0,Missing_Data,10.0,10.0,[NVD-CWE-Other]
2,CVE-1999-1471,1989-01-01,Buffer overflow in passwd in BSD based operati...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,LOW,NONE,COMPLETE,COMPLETE,COMPLETE,7.2,Missing_Data,3.9,10.0,[NVD-CWE-Other]
3,CVE-1999-1122,1989-07-26,Vulnerability in restore in SunOS 4.0.3 and ea...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,LOW,NONE,PARTIAL,PARTIAL,PARTIAL,4.6,Missing_Data,3.9,6.4,[NVD-CWE-Other]
4,CVE-1999-1467,1989-10-26,Vulnerability in rcp on SunOS 4.0.x allows rem...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,LOW,NONE,COMPLETE,COMPLETE,COMPLETE,10.0,Missing_Data,10.0,10.0,[NVD-CWE-Other]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259837,CVE-2022-27600,2024-12-19,An uncontrolled resource consumption vulnerabi...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,0.0,Missing_Data,0.0,0.0,"[CWE-400, CWE-798]"
259838,CVE-2024-11984,2024-12-19,A unrestricted upload of file with dangerous t...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,0.0,Missing_Data,0.0,0.0,[]
259839,CVE-2024-11768,2024-12-19,The Download Manager plugin for WordPress is v...,NETWORK,LOW,NONE,NONE,UNCHANGED,LOW,NONE,...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,0.0,Missing_Data,0.0,0.0,[CWE-285]
259840,CVE-2024-11740,2024-12-19,The The Download Manager plugin for WordPress ...,NETWORK,LOW,NONE,NONE,UNCHANGED,LOW,LOW,...,Missing_Data,Missing_Data,Missing_Data,Missing_Data,Missing_Data,0.0,Missing_Data,0.0,0.0,[CWE-94]
