In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import random

# Load the cleaned dataset CSV into a DataFrame.
DATASET_PATH = 'daan881_group4_dataset.final.cleaned.csv'
df = pd.read_csv(DATASET_PATH)

# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(93697, 13)


Unnamed: 0,cve,date_reserved,date_published,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity
0,CVE-2024-8969,2024-09-18 04:19:44.810000+00:00,2024-09-18T06:53:53.016Z,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0
1,CVE-2024-8957,2024-09-17 19:08:48.129000+00:00,2024-09-17T20:08:25.588Z,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0
2,CVE-2024-8956,2024-09-17 19:08:47.005000+00:00,2024-09-17T19:59:27.205Z,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0
3,CVE-2024-8951,2024-09-17 15:24:05.559000+00:00,2024-09-17T20:00:08.078Z,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0
4,CVE-2024-47059,2024-09-17 13:41:00.585000+00:00,2024-09-18T21:19:26.951Z,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0


In [2]:
# Summarize each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93697 entries, 0 to 93696
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   cve                  93697 non-null  object 
 1   date_reserved        93697 non-null  object 
 2   date_published       93697 non-null  object 
 3   attack_vector        93697 non-null  float64
 4   attack_complexity    93697 non-null  float64
 5   attack_requirements  93697 non-null  float64
 6   privileges_required  93697 non-null  float64
 7   user_interaction     93697 non-null  float64
 8   exploit_maturity     93697 non-null  float64
 9   epss                 93697 non-null  float64
 10  percentile           93697 non-null  float64
 11  cvss_score           93697 non-null  float64
 12  cvss_severity        93697 non-null  float64
dtypes: float64(10), object(3)
memory usage: 9.3+ MB


## Converting datetimes

A better way to analyze the dates is to use the number of days elapsed since each timestamp rather than the timestamp itself. This provides a numeric value that carries more information. For example, we would want to assign a high priority to any high-severity vulnerability that was published long ago. Note that these values change depending on when the analysis is run, so we should convert them from the raw data as a preprocessing step before feeding the data into the model.

In [3]:
def parse_timestamp(ts):
    """
    Parse a single timestamp value into a timezone-aware pandas Timestamp.

    A timestamp without timezone information is localized to UTC; one that
    already carries timezone information is returned exactly as parsed.
    If anything goes wrong, the error is printed and pd.NaT is returned.
    """
    try:
        result = pd.to_datetime(ts)
        # Already timezone-aware: nothing left to do
        if result.tzinfo is not None:
            return result
        # Timezone-naive: interpret the value as UTC
        return result.tz_localize('UTC')
    except Exception as err:
        print(f"Error parsing timestamp: {ts} - {err}")
        return pd.NaT

def reformat_dates(data, column, new_column_name, current_time=None):
    """
    Convert a timestamp column to datetimes and add a "days since" column.

    ``data`` is modified in place: ``column`` is overwritten with the parsed
    timestamps and ``new_column_name`` is created with the number of whole
    days between each timestamp and ``current_time``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column holding timestamp strings.
    new_column_name : str
        Name of the column that receives the elapsed days.
    current_time : pandas.Timestamp, optional
        Timezone-aware reference instant. Defaults to the current UTC time.
        Pass the same value to several calls so every derived column is
        measured against the same instant.

    Returns
    -------
    None
    """
    # Strip fractional seconds using string manipulation
    data[column] = data[column].str.replace(r'\.\d+', '', regex=True)

    # Convert the cleaned column to datetime. This must operate on the
    # `data` argument (not a module-level frame) so the function works for
    # whichever DataFrame the caller passes in.
    data[column] = data[column].apply(parse_timestamp)

    # Use a timezone-aware current time unless the caller supplied one
    if current_time is None:
        current_time = pd.Timestamp.now(tz="UTC")

    # Create a new column of the days since the original timestamp
    data[new_column_name] = (current_time - data[column]).dt.days


# Turn each raw timestamp column into a "days since" column, dropping the
# raw column as soon as it has been converted.
date_columns = (
    ("date_reserved", "days_since_reserved"),
    ("date_published", "days_since_published"),
)
for raw_column, days_column in date_columns:
    reformat_dates(df, raw_column, days_column)
    df = df.drop(columns=[raw_column])

# Days between the reservation timestamp and the publication timestamp
reserved_to_published = df["days_since_reserved"] - df["days_since_published"]
df["date_reserved_published_delta"] = reserved_to_published

df.head()


Unnamed: 0,cve,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity,days_since_reserved,days_since_published,date_reserved_published_delta
0,CVE-2024-8969,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0,58,58,0
1,CVE-2024-8957,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0,59,59,0
2,CVE-2024-8956,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0,59,59,0
3,CVE-2024-8951,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0,59,59,0
4,CVE-2024-47059,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0,59,58,1


## Adding Heuristic/Rules

Since we need supervised data, we need to come up with an algorithm that outputs a priority score, which will be the target variable for our model. To do this, we will normalize our numeric data between 0 and 1 and apply a weight to each of the features to generate a priority score. First, we must normalize the data.

In [4]:
# Work on an independent (deep) copy so later steps leave `df` untouched.
data_copy = df.copy(deep=True)
data_copy.head()

Unnamed: 0,cve,attack_vector,attack_complexity,attack_requirements,privileges_required,user_interaction,exploit_maturity,epss,percentile,cvss_score,cvss_severity,days_since_reserved,days_since_published,date_reserved_published_delta
0,CVE-2024-8969,1.0,0.0,0.0,0.0,0.0,3.0,-1.0,-1.0,6.5,3.0,58,58,0
1,CVE-2024-8957,1.0,0.0,0.0,1.0,0.0,3.0,-1.0,-1.0,7.2,2.0,59,59,0
2,CVE-2024-8956,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,9.1,1.0,59,59,0
3,CVE-2024-8951,1.0,0.0,0.0,2.0,0.0,3.0,-1.0,-1.0,5.3,3.0,59,59,0
4,CVE-2024-47059,1.0,1.0,0.0,2.0,0.0,3.0,-1.0,-1.0,0.0,-1.0,59,58,1


In [5]:
def generate_priority_score(row, current_data):
    """
    Compute the heuristic priority score for a single CVE row.

    The score is the sum of:
      * a base term built from cvss_score / cvss_severity and/or
        epss / percentile, depending on which of them are defined (> 0);
      * an age bonus for rows whose base term exceeds 10;
      * a bonus proportional to the reserved-to-published delta;
      * one small bonus per categorical attribute.

    Parameters
    ----------
    row : mapping (e.g. pandas.Series or dict)
        Must provide the keys read below: cvss_score, cvss_severity, epss,
        percentile, days_since_reserved, date_reserved_published_delta,
        attack_vector, attack_complexity, attack_requirements,
        privileges_required, user_interaction, exploit_maturity.
    current_data : pandas.DataFrame
        Reference frame; the maxima of its "days_since_reserved" and
        "date_reserved_published_delta" columns are used to scale the
        age and delta bonuses.

    Returns
    -------
    int or float
        The priority score (higher means more urgent).
    """
    score = 0

    has_cvss = row["cvss_score"] > 0
    has_epss = row["epss"] > 0

    # Base term: cvss_score is on a 0-10 scale; epss and percentile are
    # scaled by 10 and 10/2 respectively before being combined with it.
    if has_cvss and has_epss:
        # Both defined: average the CVSS-based and EPSS-based values
        score += (row["cvss_score"] + (10 * row["epss"])) / 2
        score += ((5 - row["cvss_severity"]) + ((10 * row["percentile"]) / 2)) / 2
    elif has_cvss:
        score += row["cvss_score"]
        # A lower severity code contributes more (5 - code).
        # NOTE(review): the original comment described the codes as running
        # from LOW (4) to Critical (0) - confirm against the encoding step.
        score += 5 - row["cvss_severity"]
    elif has_epss:
        # epss is a probability; multiply by 10 to put it on the CVSS scale
        score += 10 * row["epss"]
        # percentile is 0-1; multiply by 10 and halve it to match the
        # range of the severity term
        score += (10 * row["percentile"]) / 2

    # Age bonus: only for rows whose base term is already above 10.
    # NOTE(review): the age is measured with days_since_reserved, although
    # the surrounding narrative talks about the publication date - confirm
    # which column is intended.
    max_days_since_reserved = current_data["days_since_reserved"].max()

    if score > 10 and max_days_since_reserved > 0 and row["days_since_reserved"] > 0:
        age_ratio = row["days_since_reserved"] / max_days_since_reserved
        # Bucket the ratio into tenths, worth 5 points per bucket
        score += 5 * float(int(10 * age_ratio))

    # Delta bonus: up to 10 points, growing with the number of days
    # between reservation and publication.
    # NOTE(review): the original comment said quickly published items
    # should be prioritized, but a larger delta scores higher - confirm.
    max_days_reserved_published_delta = current_data["date_reserved_published_delta"].max()

    if max_days_reserved_published_delta > 0 and row["date_reserved_published_delta"] > 0:
        delta_ratio = row["date_reserved_published_delta"] / max_days_reserved_published_delta
        score += 10 * delta_ratio

    # Attack vector: code 1 (network, per the original comment) weighs
    # more than any other vector
    if row["attack_vector"] == 1:
        score += 1
    else:
        score += 0.5

    # Attack complexity: code 0 (low, per the original comment) -> 1,
    # anything else -> 0.5
    if row["attack_complexity"] == 0:
        score += 1
    else:
        score += 0.5

    # Attack requirements: code 0 (none, per the original comment) -> 1,
    # anything else -> 0.5
    if row["attack_requirements"] == 0:
        score += 1
    else:
        score += 0.5

    # Privileges required: code 2 -> 1, code 0 -> 0.5, anything else -> 0.25
    # (none / low / high respectively, per the original comment)
    if row["privileges_required"] == 2:
        score += 1
    elif row["privileges_required"] == 0:
        score += 0.5
    else:
        score += 0.25

    # User interaction: code 0 -> 1, code 3 -> 0.5, anything else -> 0.25
    # (none / passive / required-or-active, per the original comment).
    # This must read the user_interaction column, not privileges_required.
    if row["user_interaction"] == 0:
        score += 1
    elif row["user_interaction"] == 3:
        score += 0.5
    else:
        score += 0.25

    # Exploit maturity: codes 0 and 3 -> 1 (high / not defined, per the
    # original comment), anything else -> 0.25.
    # NOTE(review): the original also had a 0.5 tier for "functional", but
    # it tested code 3 again and so could never run. Confirm the code used
    # for "functional" before reinstating that tier.
    if row["exploit_maturity"] in (0, 3):
        score += 1
    else:
        score += 0.25

    return score

In [17]:

# Fixed seed so the generated train/test/validation groups are identical
# on every Restart & Run All.
RANDOM_SEED = 42

number_of_train_groups = 400
number_of_test_groups = 200
number_of_validation_groups = 100
max_number_of_cves = 500
min_number_of_cves = 100

train_dataset_list = []
test_dataset_list = []
val_dataset_list = []

# A dedicated generator drives both the group size and the row sampling,
# so the result does not depend on any other use of the global RNG.
rng = random.Random(RANDOM_SEED)

for group_number in range(number_of_test_groups + number_of_train_groups + number_of_validation_groups):
    # Each group holds a random number of CVEs drawn from the full dataset
    number_of_cves = rng.randint(min_number_of_cves, max_number_of_cves)
    random_rows = data_copy.sample(n=number_of_cves, random_state=rng.randrange(2**32))

    # Score every sampled row against the full dataset, then rank the
    # group from highest to lowest priority
    random_rows["priority_score"] = random_rows.apply(generate_priority_score, current_data=data_copy, axis=1)
    random_rows = random_rows.sort_values(by="priority_score", ascending=False)
    random_rows["group_id"] = group_number+1

    # Group ids 1..400 go to train, 401..600 to test, 601..700 to validation
    if (group_number+1) > (number_of_train_groups + number_of_test_groups):
        val_dataset_list.append(random_rows)
    elif (group_number+1) > number_of_train_groups:
        test_dataset_list.append(random_rows)
    else:
        train_dataset_list.append(random_rows)

    print(random_rows.head(n=5))

train_dataset = pd.concat(train_dataset_list)
test_dataset = pd.concat(test_dataset_list)
val_dataset = pd.concat(val_dataset_list)
    

                  cve  attack_vector  attack_complexity  attack_requirements  \
35003  CVE-2023-38049            1.0                0.0                  0.0   
76741  CVE-2022-21306            1.0                0.0                  0.0   
93101  CVE-2021-21596            3.0                0.0                  0.0   
92039  CVE-2021-23452            1.0                0.0                  0.0   
78586  CVE-2021-42309            1.0                0.0                  0.0   

       privileges_required  user_interaction  exploit_maturity     epss  \
35003                  0.0               0.0               3.0  0.00050   
76741                  2.0               0.0               3.0  0.00605   
93101                  2.0               0.0               3.0  0.00180   
92039                  2.0               0.0               1.0  0.00404   
78586                  2.0               0.0               3.0  0.03757   

       percentile  cvss_score  cvss_severity  days_since_reserved  \

In [18]:
# Write each split to its own CSV file, leaving out the DataFrame index.
split_outputs = (
    (train_dataset, "daan881_group4_dataset.train.csv"),
    (test_dataset, "daan881_group4_dataset.test.csv"),
    (val_dataset, "daan881_group4_dataset.val.csv"),
)
for split_frame, output_path in split_outputs:
    split_frame.to_csv(output_path, index=False)