In [1]:
import json
import os
from tqdm import tqdm
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd

# CVE Data Class

The class below has properties for each attribute that will exist in the dataset. There is at least one url above each property where you can find additional information for that attribute. You may need to do additional googling to get a verbose description.

In [2]:
@dataclass
class CVEData():
    """
    CVE Data
    """
    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_cveMetadata_cveId
    cve_id: str

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_cveMetadata_dateUpdated
    date_updated: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_cveMetadata_dateReserved
    date_reserved: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_cveMetadata_datePublished
    date_published: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_cveMetadata_state
    state: str

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_cveMetadata_assignerShortName
    assigner: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_affected_items_vendor
    affected: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_problemTypes
    problem_types: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_baseScore
    # https://www.first.org/cvss/v4.0/specification-document
    v4_score: Optional[float]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_baseSeverity
    v4_severity: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV3_1_baseScore
    v31_score: Optional[float]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV3_1_baseSeverity
    v31_severity: Optional[str]
    
    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV3_0_baseScore
    v30_score: Optional[float]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV3_0_baseSeverity
    v30_severity: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV2_0_baseScore
    v20_score: Optional[float]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV2_0_exploitability
    v20_severity: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_attackVector
    attack_vector: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_attackComplexity
    attack_complexity: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_attackRequirements
    attack_requirements: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_privilegesRequired
    privs_required: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_userInteraction
    user_interaction: Optional[str]

    # https://cveproject.github.io/cve-schema/schema/docs/#oneOf_i0_containers_cna_metrics_items_cvssV4_0_exploitMaturity
    exploit_maturity: Optional[str]

In [3]:
def get_metric_scores(metric: Dict[Any, Any]) -> Any:
    """
    Gets metrics scores

    Args:
        metric (Dict[Any, Any]): Metrics dict

    Returns:
        Any: Metric Scores
    """
    score = metric["baseScore"]
    severity = metric.get("baseSeverity", None)
    attack_vector = metric.get("attackVector", None)
    attack_requirements = metric.get("attackRequirements", None)
    user_interaction = metric.get("userInteraction", None)
    privs_required = metric.get("privilegesRequired", None)

    if "attackComplexity" in metric:
        attack_complexity = metric["attackComplexity"]
    elif "accessComplexity" in metric:
        attack_complexity = metric["accessComplexity"]
    else:
        attack_complexity = None

    if "exploitMaturity" in metric:
        exploit_maturity = metric["exploitMaturity"]
    elif "exploitCodeMaturity" in metric:
        exploit_maturity = metric["exploitCodeMaturity"]
    elif "exploitability" in metric:
        exploit_maturity = metric["exploitability"]
    else:
        exploit_maturity = None

    return (score, severity, attack_vector,
            attack_complexity, attack_requirements,
            privs_required, user_interaction, exploit_maturity)

def get_metrics_from_container(container: Dict[Any, Any]) -> Any:
    """
    Gets metrics from container

    Args:
        container (Dict[Any, Any]): Container

    Returns:
        Any: Metrics
    """

    metrics = container["metrics"]

    v4_score = None
    v4_severity = None
    v4_attack_vector = None
    v4_attack_complexity = None
    v4_attack_requirements = None
    v4_privs_required = None
    v4_user_interaction = None
    v4_exploit_maturity = None

    v31_score = None
    v31_severity = None
    v31_attack_vector = None
    v31_attack_complexity = None
    v31_attack_requirements = None
    v31_privs_required = None
    v31_user_interaction = None
    v31_exploit_maturity = None

    v30_score = None
    v30_severity = None
    v30_attack_vector = None
    v30_attack_complexity = None
    v30_attack_requirements = None
    v30_privs_required = None
    v30_user_interaction = None
    v30_exploit_maturity = None

    v20_score = None
    v20_severity = None
    v20_attack_vector = None
    v20_attack_complexity = None
    v20_attack_requirements = None
    v20_privs_required = None
    v20_user_interaction = None
    v20_exploit_maturity = None

    for metric in metrics:
        if "cvssV4_0" in metric:
            (score, severity, attack_vector,
            attack_complexity, attack_requirements,
            privs_required, user_interaction, exploit_maturity) = get_metric_scores(metric["cvssV4_0"])
            v4_score = score
            v4_severity = severity
            v4_attack_vector = attack_vector
            v4_attack_complexity = attack_complexity
            v4_attack_requirements = attack_requirements
            v4_privs_required = privs_required
            v4_user_interaction = user_interaction
            v4_exploit_maturity = exploit_maturity

        if "cvssV3_1" in metric:
            (score, severity, attack_vector,
            attack_complexity, attack_requirements,
            privs_required, user_interaction, exploit_maturity) = get_metric_scores(metric["cvssV3_1"])
            v31_score = score
            v31_severity = severity
            v31_attack_vector = attack_vector
            v31_attack_complexity = attack_complexity
            v31_attack_requirements = attack_requirements
            v31_privs_required = privs_required
            v31_user_interaction = user_interaction
            v31_exploit_maturity = exploit_maturity

        if "cvssV3_0" in metric:
            (score, severity, attack_vector,
            attack_complexity, attack_requirements,
            privs_required, user_interaction, exploit_maturity) = get_metric_scores(metric["cvssV3_0"])
            v30_score = score
            v30_severity = severity
            v30_attack_vector = attack_vector
            v30_attack_complexity = attack_complexity
            v30_attack_requirements = attack_requirements
            v30_privs_required = privs_required
            v30_user_interaction = user_interaction
            v30_exploit_maturity = exploit_maturity

        if "cvssV2_0" in metric:
            (score, severity, attack_vector,
            attack_complexity, attack_requirements,
            privs_required, user_interaction, exploit_maturity) = get_metric_scores(metric["cvssV2_0"])
            v20_score = score
            v20_severity = severity
            v20_attack_vector = attack_vector
            v20_attack_complexity = attack_complexity
            v20_attack_requirements = attack_requirements
            v20_privs_required = privs_required
            v20_user_interaction = user_interaction
            v20_exploit_maturity = exploit_maturity
    
    attack_vector = next((item
                            for item in [v4_attack_vector, v31_attack_vector, v30_attack_vector, v20_attack_vector] 
                            if item is not None), None)

    attack_complexity = next((item
                                for item in [v4_attack_complexity, v31_attack_complexity, v30_attack_complexity, v20_attack_complexity] 
                                if item is not None), None)

    attack_requirements = next((item
                                for item in [v4_attack_requirements, v31_attack_requirements, v30_attack_requirements, v20_attack_requirements] 
                                if item is not None), None)

    privs_required = next((item 
                                for item in [v4_privs_required, v31_privs_required, v30_privs_required, v20_privs_required] 
                                if item is not None), None)

    user_interaction = next((item
                                for item in [v4_user_interaction,
                                            v31_user_interaction,
                                            v30_user_interaction,
                                            v20_user_interaction] 
                                if item is not None), None)

    exploit_maturity = next((item
                                for item in [v4_exploit_maturity, v31_exploit_maturity, v30_exploit_maturity, v20_exploit_maturity] 
                                if item is not None), None)
    
    return (
        v4_score,
        v4_severity,
        v31_score,
        v31_severity,
        v30_score,
        v30_severity,
        v20_score,
        v20_severity,
        attack_vector,
        attack_complexity,
        attack_requirements,
        privs_required,
        user_interaction,
        exploit_maturity
    )


def get_metrics(json_dict: Dict[Any, Any]) -> Any:
    """
    Get CVE metrics

    Args:
        json_dict (Dict[Any, Any]): CVE Json Object

    Returns:
        Any: Metrics
    """
    container = json_dict["containers"]["cna"]

    cna_v4_score = None
    cna_v4_severity = None
    cna_v31_score = None
    cna_v31_severity = None
    cna_v30_score = None
    cna_v30_severity = None
    cna_v20_score = None
    cna_v20_severity = None
    cna_attack_vector = None
    cna_attack_complexity = None
    cna_attack_requirements = None
    cna_privs_required = None
    cna_user_interaction = None
    cna_exploit_maturity = None

    adp_v4_score = None
    adp_v4_severity = None
    adp_v31_score = None
    adp_v31_severity = None
    adp_v30_score = None
    adp_v30_severity = None
    adp_v20_score = None
    adp_v20_severity = None
    adp_attack_vector = None
    adp_attack_complexity = None
    adp_attack_requirements = None
    adp_privs_required = None
    adp_user_interaction = None
    adp_exploit_maturity = None

    if "metrics" in container:
        (
            v4_score,
            v4_severity,
            v31_score,
            v31_severity,
            v30_score,
            v30_severity,
            v20_score,
            v20_severity,
            attack_vector,
            attack_complexity,
            attack_requirements,
            privs_required,
            user_interaction,
            exploit_maturity
        ) = get_metrics_from_container(container)
        cna_v4_score = v4_score
        cna_v4_severity = v4_severity
        cna_v31_score = v31_score
        cna_v31_severity = v31_severity
        cna_v30_score = v30_score
        cna_v30_severity = v30_severity
        cna_v20_score = v20_score
        cna_v20_severity = v20_severity
        cna_attack_vector = attack_vector
        cna_attack_complexity = attack_complexity
        cna_attack_requirements = attack_requirements
        cna_privs_required = privs_required
        cna_user_interaction = user_interaction
        cna_exploit_maturity = exploit_maturity


    if "adp" in json_dict["containers"]:
        for adp_container in json_dict["containers"]['adp']:

            if 'metrics' in adp_container:
                container = adp_container
                (
                    v4_score,
                    v4_severity,
                    v31_score,
                    v31_severity,
                    v30_score,
                    v30_severity,
                    v20_score,
                    v20_severity,
                    attack_vector,
                    attack_complexity,
                    attack_requirements,
                    privs_required,
                    user_interaction,
                    exploit_maturity
                ) = get_metrics_from_container(container)
                adp_v4_score = v4_score
                adp_v4_severity = v4_severity
                adp_v31_score = v31_score
                adp_v31_severity = v31_severity
                adp_v30_score = v30_score
                adp_v30_severity = v30_severity
                adp_v20_score = v20_score
                adp_v20_severity = v20_severity
                adp_attack_vector = attack_vector
                adp_attack_complexity = attack_complexity
                adp_attack_requirements = attack_requirements
                adp_privs_required = privs_required
                adp_user_interaction = user_interaction
                adp_exploit_maturity = exploit_maturity
    
    v4_score = next((item
                    for item in [cna_v4_score, adp_v4_score] 
                    if item is not None), None)
    v4_severity = next((item
                    for item in [cna_v4_severity, adp_v4_severity] 
                    if item is not None), None)

    v31_score = next((item
                    for item in [cna_v31_score, adp_v31_score] 
                    if item is not None), None)
    v31_severity = next((item
                    for item in [cna_v31_severity, adp_v31_severity] 
                    if item is not None), None)

    v30_score = next((item
                    for item in [cna_v30_score, adp_v30_score] 
                    if item is not None), None)
    v30_severity = next((item
                    for item in [cna_v30_severity, adp_v30_severity] 
                    if item is not None), None)

    v20_score = next((item
                    for item in [cna_v20_score, adp_v20_score] 
                    if item is not None), None)
    v20_severity = next((item
                    for item in [cna_v20_severity, adp_v20_severity] 
                    if item is not None), None)
    
    attack_vector = next((item
            for item in [cna_attack_vector, adp_attack_vector] 
            if item is not None), None)

    attack_complexity = next((item
            for item in [cna_attack_complexity, adp_attack_complexity] 
            if item is not None), None)

    attack_requirements = next((item
            for item in [cna_attack_requirements, adp_attack_requirements] 
            if item is not None), None)

    privs_required = next((item
            for item in [cna_privs_required, adp_privs_required] 
            if item is not None), None)

    user_interaction = next((item
            for item in [cna_user_interaction, adp_user_interaction] 
            if item is not None), None)

    exploit_maturity = next((item
            for item in [cna_exploit_maturity, adp_exploit_maturity] 
            if item is not None), None)
        
    return (
        v4_score,
        v4_severity,
        v31_score,
        v31_severity,
        v30_score,
        v30_severity,
        v20_score,
        v20_severity,
        attack_vector,
        attack_complexity,
        attack_requirements,
        privs_required,
        user_interaction,
        exploit_maturity
    )


def get_cve_data(file: str) -> Optional[List[CVEData]]:
    """
    Gets CVE Object given CVE formatted json file

    Args:
        file (str): Path to CVE formatted json file

    Returns:
        CVEData: CVE Data Object
    """
    with open(file, "r", encoding="utf-8") as json_file:
        json_dict = json.load(json_file)
    
    if "cveMetadata" in json_dict:
        cve_metadata = json_dict["cveMetadata"]
        cve_id = cve_metadata["cveId"]
        assigner = cve_metadata.get("assignerShortName", None)
        state = cve_metadata["state"]
        date_updated = cve_metadata.get("dateUpdated", None)
        date_reserved = cve_metadata.get("dateReserved", None)
        date_published = cve_metadata.get("datePublished", None)

        container = json_dict["containers"]["cna"]

        if "affected" in container and len(container["affected"]) > 0:
            affected = " ".join([f'{org["vendor"]}::{org["product"]}' 
                                for org in container["affected"]
                                if ("vendor" in org
                                     and "product" in org 
                                     and org["vendor"].lower() != "n/a" 
                                     and org["product"].lower() != "n/a")])
        else:
            affected = None
        
        if "problemTypes" in container and len(container["problemTypes"]) > 0:
            problem_types_list = []

            for problem_type in container["problemTypes"]:
                for desc in problem_type["descriptions"]:
                    problem_types_list.append(desc["description"])
            
            problem_types_list = list(set(problem_types_list))
            problem_types = " ".join(problem_types_list)
        else:
            problem_types = None

        (
            v4_score,
            v4_severity,
            v31_score,
            v31_severity,
            v30_score,
            v30_severity,
            v20_score,
            v20_severity,
            attack_vector,
            attack_complexity,
            attack_requirements,
            privs_required,
            user_interaction,
            exploit_maturity
        ) = get_metrics(json_dict)

        return CVEData(
            cve_id, date_updated, date_reserved, date_published,
            state, assigner, affected, problem_types, v4_score,
            v4_severity, v31_score, v31_severity, v30_score, v30_severity,
            v20_score, v20_severity, attack_vector, attack_complexity,
            attack_requirements, privs_required, user_interaction, exploit_maturity
        )

    return None

In [4]:
parent_folder = "data/cves/cves"
file_list = []
for dirpath, dirnames, filenames in os.walk(parent_folder):
    for filename in filenames:
        file_list.append(os.path.join(dirpath, filename))

cves: List[CVEData] = []
for file_path in tqdm(file_list, desc="Parsing Json Files..."):
    cve = get_cve_data(file_path)

    if cve is not None:
        cves.append(cve)

Parsing Json Files...: 100%|██████████| 263422/263422 [34:06<00:00, 128.69it/s]


In [5]:
cve_ids = []
dates_updated = []
dates_reserved = []
dates_published = []
states = []
assigners = []
affected = []
problem_types = []
v4_scores = []
v4_severities = []
v31_scores = []
v31_severities = []
v30_scores = []
v30_severities = []
v20_scores = []
v20_severities = []
attack_vectors = []
attack_complexities = []
attack_requirements = []
privs_required = []
user_interactions = []
exploit_maturities = []

for cve in tqdm(cves, desc="Generating Dataset..."):
    cve_ids.append(cve.cve_id)
    dates_updated.append(cve.date_updated)
    dates_reserved.append(cve.date_reserved)
    dates_published.append(cve.date_published)
    states.append(cve.state)
    assigners.append(cve.assigner)
    affected.append(cve.affected)
    problem_types.append(cve.problem_types)
    v4_scores.append(cve.v4_score)
    v4_severities.append(cve.v4_severity)
    v31_scores.append(cve.v31_score)
    v31_severities.append(cve.v31_severity)
    v30_scores.append(cve.v30_score)
    v30_severities.append(cve.v30_severity)
    v20_scores.append(cve.v20_score)
    v20_severities.append(cve.v20_severity)
    attack_vectors.append(cve.attack_vector)
    attack_complexities.append(cve.attack_complexity)
    attack_requirements.append(cve.attack_requirements)
    privs_required.append(cve.privs_required)
    user_interactions.append(cve.user_interaction)
    exploit_maturities.append(cve.exploit_maturity)

data = {
    "cve": cve_ids,
    "date_updated": dates_updated,
    "date_reserved": dates_reserved,
    "date_published": dates_published,
    "state": states,
    "assigner": assigners,
    "affected": affected,
    "problem_types": problem_types,
    "cvss_v4_score": v4_scores,
    "cvss_v4_severity": v4_severities,
    "cvss_v3_1_score": v31_scores,
    "cvss_v3_1_severity": v31_severities,
    "cvss_v3_score": v30_scores,
    "cvss_v3_severity": v30_severities,
    "cvss_v2_score": v20_scores,
    "cvss_v2_severity": v20_severities,
    "attack_vector": attack_vectors,
    "attack_complexity": attack_complexities,
    "attack_requirements": attack_requirements,
    "privileges_required": privs_required,
    "user_interaction": user_interactions,
    "exploit_maturity": exploit_maturities
}

mitre_df = pd.DataFrame.from_dict(data)

print(mitre_df.shape)
mitre_df.head()
    

Generating Dataset...: 100%|██████████| 263420/263420 [00:00<00:00, 400608.99it/s]


In [None]:
epss_list = pd.read_csv('data/epss_scores-2024-09-17.csv')

print(epss_list.shape)
epss_list.head()

In [None]:
dataset = pd.merge(mitre_df, epss_list, on='cve', how='left')
print(dataset.shape)
dataset.head()

In [6]:
null_counts = dataset.isna().sum()
null_counts

(263420, 22)


cve_id                      0
date_updated                0
date_reserved             506
date_published           3511
state                       0
assigner                    5
affected                14427
problem_types           20561
cvss_v4_score          261640
cvss_v4_severity       261640
cvss_v3_1_score        207945
cvss_v3_1_severity     207945
cvss_v3_score          246556
cvss_v3_severity       246556
cvss_v2_score          259286
cvss_v2_severity       263420
attack_vector          208643
attack_complexity      208642
attack_requirements    262790
privileges_required    208643
user_interaction       208643
exploit_maturity       259676
dtype: int64

In [None]:
dataset.to_csv("daan881_group4_dataset.csv", index=False)