In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

# Load the cleaned dataset CSV (path is relative to the notebook's working directory).
df = pd.read_csv('daan881_group4_dataset.final.cleaned.csv')

# Show (rows, columns) first, then preview the first rows as the cell output.
print(df.shape)
df.head()

(93697, 13)


Unnamed: 0,cve,date_reserved,date_published,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity
0,CVE-2024-8969,2024-09-18 04:19:44.810000+00:00,2024-09-18T06:53:53.016Z,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0
1,CVE-2024-8957,2024-09-17 19:08:48.129000+00:00,2024-09-17T20:08:25.588Z,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0
2,CVE-2024-8956,2024-09-17 19:08:47.005000+00:00,2024-09-17T19:59:27.205Z,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0
3,CVE-2024-8951,2024-09-17 15:24:05.559000+00:00,2024-09-17T20:00:08.078Z,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0
4,CVE-2024-47059,2024-09-17 13:41:00.585000+00:00,2024-09-18T21:19:26.951Z,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0


In [21]:
# Summarize each column's dtype and non-null count to spot missing values and unparsed dates.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93697 entries, 0 to 93696
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   cve                  93697 non-null  object 
 1   date_reserved        93697 non-null  object 
 2   date_published       93697 non-null  object 
 3   attack_vector        93697 non-null  float64
 4   attack_complexity    93697 non-null  float64
 5   attack_requirements  93697 non-null  float64
 6   privileges_required  93697 non-null  float64
 7   user_interaction     93697 non-null  float64
 8   exploit_maturity     93697 non-null  float64
 9   epss                 93697 non-null  float64
 10  percentile           93697 non-null  float64
 11  cvss_score           93697 non-null  float64
 12  cvss_severity        93697 non-null  float64
dtypes: float64(10), object(3)
memory usage: 9.3+ MB


## Converting datetimes

A better way to analyze the dates is to use the number of days since the timestamp rather than the timestamp itself. This provides a numeric value that holds more information. For example, we would want to give high priority to any high-severity vulnerability that was published long ago. Note that these values change depending on when the analysis runs, so we should convert the raw timestamps into day counts as a preprocessing step prior to feeding the data into the model.

In [22]:
def parse_timestamp(ts):
    """
    Parse a timestamp and make sure the result is timezone-aware.

    Values that parse to a timezone-naive timestamp are localized to UTC;
    values that already carry timezone info are returned as parsed.

    Parameters
    ----------
    ts : any value accepted by ``pd.to_datetime`` (e.g. an ISO-8601 string)

    Returns
    -------
    The parsed timestamp, or ``pd.NaT`` if parsing fails. Failures are
    printed instead of raised so one bad value does not abort a whole column.
    """
    try:
        parsed_ts = pd.to_datetime(ts)
        if parsed_ts.tzinfo is None:
            # If timezone-naive, localize to UTC
            parsed_ts = parsed_ts.tz_localize('UTC')
        return parsed_ts
    except Exception as e:
        print(f"Error parsing timestamp: {ts} - {e}")
        return pd.NaT


def reformat_dates(data, column, new_column_name):
    """
    Turn a timestamp column into an integer "days since" feature.

    ``data`` is modified in place: ``column`` is overwritten with the parsed
    UTC datetimes, and ``new_column_name`` is added with the whole number of
    days between each timestamp and the current UTC time. Because the
    reference point is "now", the resulting values depend on when this runs.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing ``column`` as strings.
    column : str
        Name of the timestamp column to convert.
    new_column_name : str
        Name of the "days since" column to create.

    Returns
    -------
    None
    """
    # Strip fractional seconds using string manipulation
    data[column] = data[column].str.replace(r'\.\d+', '', regex=True)

    # Parse the frame that was passed in. Reading the notebook-global `df`
    # here would break (or silently touch the wrong frame) for any other frame.
    parsed = data[column].apply(parse_timestamp)

    # Coerce to a single UTC datetime dtype so the subtraction and `.dt`
    # below also work for mixed UTC offsets, all-NaT columns and empty columns.
    data[column] = pd.to_datetime(parsed, utc=True)

    # Use a timezone-aware current time
    current_time = pd.Timestamp.now(tz="UTC")

    # Create a new column of the days since the original timestamp
    data[new_column_name] = (current_time - data[column]).dt.days


# Build both "days since" features first; each call also overwrites its raw
# timestamp column in place with parsed datetimes.
reformat_dates(df, "date_reserved", "days_since_reserved")
reformat_dates(df, "date_published", "days_since_published")

# Discard the raw timestamp columns, then add the reserved-to-published gap
# in days (difference of the two "days since" columns).
df = (
    df
    .drop(columns=["date_reserved", "date_published"])
    .assign(
        date_reserved_published_delta=lambda frame: (
            frame["days_since_reserved"] - frame["days_since_published"]
        )
    )
)

df.head()


Unnamed: 0,cve,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity,days_since_reserved,days_since_published,date_reserved_published_delta
0,CVE-2024-8969,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0,58,58,0
1,CVE-2024-8957,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0,59,59,0
2,CVE-2024-8956,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0,59,59,0
3,CVE-2024-8951,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0,59,59,0
4,CVE-2024-47059,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0,59,58,1


## Adding Heuristic/Rules

Since we need supervised (labeled) data, we need to come up with an algorithm that outputs a priority score, which will be the target variable for our model. To do this, we will normalize our numeric data between 0 and 1 and apply a weight to each of the features to generate a priority score. First, we must normalize the data.

In [25]:
# Work on a copy so columns added to data_copy later do not modify df.
data_copy = df.copy()
data_copy.head()

Unnamed: 0,cve,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity,days_since_reserved,days_since_published,date_reserved_published_delta
0,CVE-2024-8969,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0,58,58,0
1,CVE-2024-8957,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0,59,59,0
2,CVE-2024-8956,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0,59,59,0
3,CVE-2024-8951,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0,59,59,0
4,CVE-2024-47059,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0,59,58,1


In [26]:
def generate_priority_score(row):
    """
    Compute a heuristic priority score for one CVE row.

    The two scoring paths are mutually exclusive:

    * ``cvss_score`` > 0: score = (5 - ``cvss_severity``) + ``cvss_score``
    * otherwise, ``epss`` > 0: score = (10 * ``percentile``) / 2 + 10 * ``epss``
    * otherwise: score = 0

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide "cvss_score"; the other keys are read only on the
        branch that uses them.

    Returns
    -------
    The numeric score (the integer 0 when neither branch applies).
    """
    cvss = row["cvss_score"]
    if cvss > 0:
        # Severity term: a smaller cvss_severity code contributes more.
        # NOTE(review): presumably lower codes mean more severe — confirm the encoding.
        severity_term = 5 - row["cvss_severity"]

        # The raw CVSS score is added unscaled; it separates CVEs that share
        # the same severity code.
        return severity_term + cvss

    epss = row["epss"]
    if epss > 0:
        # Percentile is scaled by 10 and halved so it plays the role the
        # severity term has on the CVSS path (assumes a 0-1 value — TODO confirm).
        percentile_term = (10 * row["percentile"]) / 2

        # EPSS is scaled by 10 so it plays the role the CVSS score has on the
        # CVSS path (assumes a 0-1 probability — TODO confirm).
        return percentile_term + 10 * epss

    # TODO: Add additional weights/logic for other columns
    return 0

# Score every row (axis=1 passes each row to the function) and store the result
# in a new priority_score column.
data_copy["priority_score"] = data_copy.apply(generate_priority_score, axis=1)

# Preview the frame including the new priority_score column.
data_copy.head()

Unnamed: 0,cve,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity,days_since_reserved,days_since_published,date_reserved_published_delta,priority_score
0,CVE-2024-8969,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0,58,58,0,8.5
1,CVE-2024-8957,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0,59,59,0,10.2
2,CVE-2024-8956,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0,59,59,0,13.1
3,CVE-2024-8951,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0,59,59,0,7.3
4,CVE-2024-47059,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0,59,58,1,0.0
