# CVE scraping
Pipeline for scraping all CVEs published from 1 Jan 2019 to 2 June 2024 (note: the date generator below actually runs up to the date on which the notebook is executed).

In [9]:
import os 
# Enter the pipeline directory only when we are not already inside it, so that
# re-running this cell does not fail on a second relative chdir.
if os.path.basename(os.getcwd()) != "daniele_pipeline":
    os.chdir("daniele_pipeline")
os.getcwd()

'/home/edvinn/KTH/DA231X-ex/Thesis-Edvin/daniele_pipeline'

In [2]:
from datetime import datetime, timedelta

def date_generator(start_date, end_date=None, step_days=120):
    """Yield midnight timestamp strings that delimit consecutive query windows.

    Boundaries start at ``start_date`` and advance by ``step_days`` days until
    they pass the end of the range; the end of the range itself is then
    yielded as the closing boundary, unless it formats to the same string as
    the last stepped boundary (which would create an empty window).

    Args:
        start_date: ``datetime`` of the first boundary.
        end_date: ``datetime`` of the closing boundary. Defaults to
            ``datetime.now()`` evaluated when iteration starts.
        step_days: Width of each window in days. Defaults to 120.

    Yields:
        Strings formatted as ``YYYY-MM-DDT00:00:00.000``; the time of day of
        every boundary is dropped by the format.

    Raises:
        ValueError: If ``step_days`` is not positive (the loop would never end).
    """
    if step_days <= 0:
        raise ValueError("step_days must be a positive number of days")

    stamp_format = "%Y-%m-%dT00:00:00.000"
    final_date = datetime.now() if end_date is None else end_date
    final_stamp = final_date.strftime(stamp_format)

    current_date = start_date
    last_stamp = None
    while current_date <= final_date:
        last_stamp = current_date.strftime(stamp_format)
        yield last_stamp
        current_date += timedelta(days=step_days)

    # Skip the closing boundary when the loop already emitted the same day.
    if last_stamp != final_stamp:
        yield final_stamp

# Materialise every window boundary from 1 Jan 2019 onwards; consecutive
# entries are used as the start/end of one query window.
dates = [*date_generator(datetime(2019, 1, 1))]
dates

['2019-01-01T00:00:00.000',
 '2019-05-01T00:00:00.000',
 '2019-08-29T00:00:00.000',
 '2019-12-27T00:00:00.000',
 '2020-04-25T00:00:00.000',
 '2020-08-23T00:00:00.000',
 '2020-12-21T00:00:00.000',
 '2021-04-20T00:00:00.000',
 '2021-08-18T00:00:00.000',
 '2021-12-16T00:00:00.000',
 '2022-04-15T00:00:00.000',
 '2022-08-13T00:00:00.000',
 '2022-12-11T00:00:00.000',
 '2023-04-10T00:00:00.000',
 '2023-08-08T00:00:00.000',
 '2023-12-06T00:00:00.000',
 '2024-04-04T00:00:00.000',
 '2024-08-02T00:00:00.000',
 '2024-11-30T00:00:00.000',
 '2025-03-04T00:00:00.000']

In [None]:
from time import sleep
import requests
import os

page_limit = 2000
# Seconds to wait between consecutive API calls (pagination, retries, windows).
request_pause_s = 2
# Attempts per page before the page is given up; bounds the pagination loop so
# a page that keeps failing can no longer spin forever.
max_page_attempts = 3
# resultsPerPage is filled in from page_limit; the two date placeholders stay
# literal here and are substituted per window with str.format below.
template_url = (
    "https://services.nvd.nist.gov/rest/json/cves/2.0/"
    f"?resultsPerPage={page_limit}"
    "&pubStartDate={StartDate}&pubEndDate={EndDate}"
)

API_KEY = os.getenv("NVD_API_KEY")
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0',
}
# Only send the key header when a key is actually configured.
if API_KEY:
    headers["apiKey"] = API_KEY


def _fetch_page(page_url, window_start, window_end):
    """Download one result page, retrying up to ``max_page_attempts`` times.

    Args:
        page_url: Fully formatted request URL (including any startIndex).
        window_start: Start boundary of the window, used in messages only.
        window_end: End boundary of the window, used in messages only.

    Returns:
        ``(vulnerabilities, total_results)`` taken from the JSON body, or
        ``None`` when every attempt failed (each failure is printed).
    """
    for attempt in range(1, max_page_attempts + 1):
        # Reset per attempt so the handler never reads a stale or unbound response.
        response = None
        try:
            response = requests.get(page_url, headers=headers, timeout=100)
            response.raise_for_status()
            payload = response.json()
            return payload['vulnerabilities'], payload['totalResults']
        except Exception as exp:
            status = response.status_code if response is not None else None
            if status == 403:
                # NOTE(review): a 403 may indicate a rejected or rate-limited
                # request rather than an empty window — confirm against the
                # NVD API documentation before trusting this message.
                print("No published CVEs between", window_start, "and", window_end)
            else:
                print(f"Exception -> {exp}")
                print(f"Request status -> {status}")
        if attempt < max_page_attempts:
            sleep(request_pause_s)
    return None


vulnerabilities_list = []
error_urls = []

for i in range(len(dates) - 1):
    print(f"API from: {dates[i]}\t to: {dates[i+1]}")

    url = template_url.format(StartDate=dates[i], EndDate=dates[i+1])
    print(url)

    # Pause between windows as well, not only between pages of one window.
    if i:
        sleep(request_pause_s)

    first_page = _fetch_page(url, dates[i], dates[i + 1])
    if first_page is None:
        error_urls.append(url)
        continue

    page_items, n_results = first_page
    vulnerabilities_list += page_items
    counter = len(page_items)

    # Keep requesting pages until every result reported for the window is read.
    while counter < n_results:
        url_iter = url + f"&startIndex={counter}"
        print(f"\t{url_iter}")

        sleep(request_pause_s)
        next_page = _fetch_page(url_iter, dates[i], dates[i + 1])
        # A failed or empty page cannot advance the counter: record it and
        # abandon this window instead of looping forever.
        if next_page is None or not next_page[0]:
            error_urls.append(url_iter)
            break

        vulnerabilities_list += next_page[0]
        counter += len(next_page[0])

    print(f"Count -> {counter}")

print(f"Vulnerabilities: {len(vulnerabilities_list)}")

API from: 2019-01-01T00:00:00.000	 to: 2019-05-01T00:00:00.000
https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=2000&pubStartDate=2019-01-01T00:00:00.000&pubEndDate=2019-05-01T00:00:00.000
	https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=2000&pubStartDate=2019-01-01T00:00:00.000&pubEndDate=2019-05-01T00:00:00.000&startIndex=2000
	https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=2000&pubStartDate=2019-01-01T00:00:00.000&pubEndDate=2019-05-01T00:00:00.000&startIndex=4000
Count -> 5730
API from: 2019-05-01T00:00:00.000	 to: 2019-08-29T00:00:00.000
https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=2000&pubStartDate=2019-05-01T00:00:00.000&pubEndDate=2019-08-29T00:00:00.000
	https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=2000&pubStartDate=2019-05-01T00:00:00.000&pubEndDate=2019-08-29T00:00:00.000&startIndex=2000
	https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=2000&pubStartDate=2019-05-01T00:0

In [None]:
import pickle
import os

# Cache the raw API records on disk so later cells can reload them without
# repeating the scrape.
pickle_cve_raw_file = 'tmp/jcve_raw.pkl'
# open(..., 'wb') does not create missing parent directories, so make sure the
# cache directory exists first.
os.makedirs(os.path.dirname(pickle_cve_raw_file), exist_ok=True)
with open(pickle_cve_raw_file, 'wb') as file:
    pickle.dump(vulnerabilities_list, file)
print(f"Data successfully saved to '{pickle_cve_raw_file}' using pickle.")

Data successfully saved to 'resources/jcve_raw.pkl' using pickle.


In [1]:
import pickle
# Reload the cached raw records written by the dump cell.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
pickle_cve_raw_file = 'tmp/jcve_raw.pkl'
with open(pickle_cve_raw_file, mode='rb') as file:
    vulnerabilities_list = pickle.load(file)


In [2]:
import pandas as pd
# Preview the raw records as loaded, before any flattening.
display(pd.DataFrame(data=vulnerabilities_list))

Unnamed: 0,cve
0,"{'id': 'CVE-2019-3494', 'sourceIdentifier': 'c..."
1,"{'id': 'CVE-2018-20650', 'sourceIdentifier': '..."
2,"{'id': 'CVE-2018-20651', 'sourceIdentifier': '..."
3,"{'id': 'CVE-2018-20652', 'sourceIdentifier': '..."
4,"{'id': 'CVE-2019-3500', 'sourceIdentifier': 'c..."
...,...
166512,"{'id': 'CVE-2025-1832', 'sourceIdentifier': 'c..."
166513,"{'id': 'CVE-2025-1833', 'sourceIdentifier': 'c..."
166514,"{'id': 'CVE-2025-1834', 'sourceIdentifier': 'c..."
166515,"{'id': 'CVE-2025-1835', 'sourceIdentifier': 'c..."


In [2]:
from datasets import Dataset
import pandas as pd 
# Lift each record's 'cve' payload into columns; max_level=0 leaves nested
# objects unflattened.
df_nvd = pd.json_normalize(
    pd.DataFrame(vulnerabilities_list).loc[:, 'cve'],
    max_level=0,
)
display(df_nvd.head(5))
# Convert the frame to a Hugging Face dataset and upload it to the Hub repo.
ds = Dataset.from_pandas(df_nvd)
ds.push_to_hub('Eathus/nvd_raw_list')

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,sourceIdentifier,published,lastModified,vulnStatus,cveTags,descriptions,metrics,weaknesses,configurations,references,cisaExploitAdd,cisaActionDue,cisaRequiredAction,cisaVulnerabilityName,vendorComments,evaluatorComment
0,CVE-2019-3494,cve@mitre.org,2019-01-01T15:29:00.240,2024-11-21T04:42:08.157,Modified,[],"[{'lang': 'en', 'value': 'Simply-Blog through ...","{'cvssMetricV30': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/Paroxyste/Simply-...,,,,,,
1,CVE-2018-20650,cve@mitre.org,2019-01-01T16:29:00.233,2024-11-21T04:01:56.097,Modified,[],"[{'lang': 'en', 'value': 'A reachable Object::...","{'cvssMetricV31': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'http://www.securityfocus.com/bid/106...,,,,,,
2,CVE-2018-20651,cve@mitre.org,2019-01-01T16:29:00.343,2024-11-21T04:01:56.270,Modified,[],"[{'lang': 'en', 'value': 'A NULL pointer deref...","{'cvssMetricV30': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'http://lists.opensuse.org/opensuse-s...,,,,,,
3,CVE-2018-20652,cve@mitre.org,2019-01-01T16:29:00.403,2024-11-21T04:01:56.420,Modified,[],"[{'lang': 'en', 'value': 'An attempted excessi...","{'cvssMetricV30': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/syoyo/tinyexr/iss...,,,,,,
4,CVE-2019-3500,cve@mitre.org,2019-01-02T07:29:00.197,2024-11-21T04:42:08.880,Modified,[],"[{'lang': 'en', 'value': 'aria2c in aria2 1.33...","{'cvssMetricV31': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/aria2/aria2/issue...,,,,,,


Creating parquet from Arrow format: 100%|██████████| 167/167 [00:01<00:00, 133.01ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:11<00:00, 11.36s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Eathus/nvd_raw_list/commit/8d1160ee784068b97613267a2a2fc01402ec3388', commit_message='Upload dataset', commit_description='', oid='8d1160ee784068b97613267a2a2fc01402ec3388', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Eathus/nvd_raw_list', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Eathus/nvd_raw_list'), pr_revision=None, pr_num=None)

In [4]:
# Render df_nvd as rich output.
# NOTE(review): the saved output of this cell shows flattened `metrics.*`
# columns, whereas the cell that builds df_nvd uses max_level=0 and its own
# preview shows a single `metrics` column — this output may come from an
# earlier kernel state; re-run to confirm.
display(df_nvd)

Unnamed: 0,id,sourceIdentifier,published,lastModified,vulnStatus,cveTags,descriptions,weaknesses,configurations,references,metrics.cvssMetricV30,metrics.cvssMetricV2,metrics.cvssMetricV31,cisaExploitAdd,cisaActionDue,cisaRequiredAction,cisaVulnerabilityName,vendorComments,evaluatorComment,metrics.cvssMetricV40
0,CVE-2019-3494,cve@mitre.org,2019-01-01T15:29:00.240,2024-11-21T04:42:08.157,Modified,[],"[{'lang': 'en', 'value': 'Simply-Blog through ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/Paroxyste/Simply-...,"[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...",,,,,,,,
1,CVE-2018-20650,cve@mitre.org,2019-01-01T16:29:00.233,2024-11-21T04:01:56.097,Modified,[],"[{'lang': 'en', 'value': 'A reachable Object::...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'http://www.securityfocus.com/bid/106...,,"[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...",,,,,,,
2,CVE-2018-20651,cve@mitre.org,2019-01-01T16:29:00.343,2024-11-21T04:01:56.270,Modified,[],"[{'lang': 'en', 'value': 'A NULL pointer deref...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'http://lists.opensuse.org/opensuse-s...,"[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...",,,,,,,,
3,CVE-2018-20652,cve@mitre.org,2019-01-01T16:29:00.403,2024-11-21T04:01:56.420,Modified,[],"[{'lang': 'en', 'value': 'An attempted excessi...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/syoyo/tinyexr/iss...,"[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...",,,,,,,,
4,CVE-2019-3500,cve@mitre.org,2019-01-02T07:29:00.197,2024-11-21T04:42:08.880,Modified,[],"[{'lang': 'en', 'value': 'aria2c in aria2 1.33...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/aria2/aria2/issue...,,"[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166512,CVE-2025-1832,cna@vuldb.com,2025-03-02T21:15:10.110,2025-03-02T21:15:10.110,Received,[],"[{'lang': 'en', 'value': 'A vulnerability clas...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/caigo8/CVE-md/blo...,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,,,,,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar..."
166513,CVE-2025-1833,cna@vuldb.com,2025-03-02T22:15:34.820,2025-03-02T22:15:34.820,Received,[],"[{'lang': 'en', 'value': 'A vulnerability, whi...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/caigo8/CVE-md/blo...,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,,,,,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar..."
166514,CVE-2025-1834,cna@vuldb.com,2025-03-02T22:15:34.997,2025-03-02T22:15:34.997,Received,[],"[{'lang': 'en', 'value': 'A vulnerability, whi...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/caigo8/CVE-md/blo...,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,,,,,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar..."
166515,CVE-2025-1835,cna@vuldb.com,2025-03-02T23:15:10.477,2025-03-02T23:15:10.477,Received,[],"[{'lang': 'en', 'value': 'A vulnerability has ...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/sheratan4/cve/iss...,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,,,,,,"[{'source': 'cna@vuldb.com', 'type': 'Secondar..."


In [None]:
import pandas as pd 

df_cve = pd.json_normalize(pd.DataFrame(vulnerabilities_list)['cve'])


def _normalize_cells(cells):
    """Flatten an iterable of dict cells into a DataFrame on a fresh RangeIndex.

    Non-dict cells (e.g. the NaN that ``explode`` leaves for a missing or
    empty list) are replaced by ``{}`` so each still yields one row, keeping
    the result the same length as ``cells``. A plain list is passed to
    ``json_normalize`` so the result does not depend on how it treats the
    (duplicated) index of an exploded Series.
    """
    return pd.json_normalize([cell if isinstance(cell, dict) else {} for cell in cells])


# One row per (CVE, weakness entry).
df_cve_weaknesses = df_cve[['id', 'weaknesses']].copy()
df_cve_weaknesses_expanded = df_cve_weaknesses.explode('weaknesses')
df_weaknesses_normalized = _normalize_cells(df_cve_weaknesses_expanded['weaknesses'])
# The concat below pairs rows purely by position, so the lengths must agree.
assert len(df_weaknesses_normalized) == len(df_cve_weaknesses_expanded)

# Combine the normalized weaknesses with the original id column
df_cve_wd = pd.concat(
    [df_cve_weaknesses_expanded['id'].reset_index(drop=True), df_weaknesses_normalized],
    axis=1,
)

# One row per (CVE, weakness entry, description entry).
df_cve_wd_exp = df_cve_wd.explode('description')
df_cve_wd_norm = _normalize_cells(df_cve_wd_exp['description'])
assert len(df_cve_wd_norm) == len(df_cve_wd_exp)
df_cve_weak_final = pd.concat(
    [df_cve_wd_exp[['id', 'source', 'type']].reset_index(drop=True), df_cve_wd_norm],
    axis=1,
)

# Keep only rows whose description value mentions 'CWE'.
df_cve_weak_final = df_cve_weak_final[
    df_cve_weak_final['value'].astype(str).str.contains('CWE')
].reset_index(drop=True)

# Display the intermediate and the final DataFrame
display(df_cve_wd_exp)
display(df_cve_weak_final)

In [104]:
# Materialise the 'cve' payloads as a list: a bare map() is a one-shot
# iterator, so any cell consuming it a second time would see no records.
vulnerabilities_list_tmp = [entry['cve'] for entry in vulnerabilities_list]


In [105]:
import pandas as pd 
# Preview the extracted 'cve' payloads with one column per top-level field.
display(pd.DataFrame(data=vulnerabilities_list_tmp))

Unnamed: 0,id,sourceIdentifier,published,lastModified,vulnStatus,cveTags,descriptions,metrics,weaknesses,configurations,references,cisaExploitAdd,cisaActionDue,cisaRequiredAction,cisaVulnerabilityName,vendorComments,evaluatorComment
0,CVE-2019-3494,cve@mitre.org,2019-01-01T15:29:00.240,2024-11-21T04:42:08.157,Modified,[],"[{'lang': 'en', 'value': 'Simply-Blog through ...","{'cvssMetricV30': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/Paroxyste/Simply-...,,,,,,
1,CVE-2018-20650,cve@mitre.org,2019-01-01T16:29:00.233,2024-11-21T04:01:56.097,Modified,[],"[{'lang': 'en', 'value': 'A reachable Object::...","{'cvssMetricV31': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'http://www.securityfocus.com/bid/106...,,,,,,
2,CVE-2018-20651,cve@mitre.org,2019-01-01T16:29:00.343,2024-11-21T04:01:56.270,Modified,[],"[{'lang': 'en', 'value': 'A NULL pointer deref...","{'cvssMetricV30': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'http://lists.opensuse.org/opensuse-s...,,,,,,
3,CVE-2018-20652,cve@mitre.org,2019-01-01T16:29:00.403,2024-11-21T04:01:56.420,Modified,[],"[{'lang': 'en', 'value': 'An attempted excessi...","{'cvssMetricV30': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/syoyo/tinyexr/iss...,,,,,,
4,CVE-2019-3500,cve@mitre.org,2019-01-02T07:29:00.197,2024-11-21T04:42:08.880,Modified,[],"[{'lang': 'en', 'value': 'aria2c in aria2 1.33...","{'cvssMetricV31': [{'source': 'nvd@nist.gov', ...","[{'source': 'nvd@nist.gov', 'type': 'Primary',...","[{'nodes': [{'operator': 'OR', 'negate': False...",[{'url': 'https://github.com/aria2/aria2/issue...,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166512,CVE-2025-1832,cna@vuldb.com,2025-03-02T21:15:10.110,2025-03-02T21:15:10.110,Received,[],"[{'lang': 'en', 'value': 'A vulnerability clas...","{'cvssMetricV40': [{'source': 'cna@vuldb.com',...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/caigo8/CVE-md/blo...,,,,,,
166513,CVE-2025-1833,cna@vuldb.com,2025-03-02T22:15:34.820,2025-03-02T22:15:34.820,Received,[],"[{'lang': 'en', 'value': 'A vulnerability, whi...","{'cvssMetricV40': [{'source': 'cna@vuldb.com',...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/caigo8/CVE-md/blo...,,,,,,
166514,CVE-2025-1834,cna@vuldb.com,2025-03-02T22:15:34.997,2025-03-02T22:15:34.997,Received,[],"[{'lang': 'en', 'value': 'A vulnerability, whi...","{'cvssMetricV40': [{'source': 'cna@vuldb.com',...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/caigo8/CVE-md/blo...,,,,,,
166515,CVE-2025-1835,cna@vuldb.com,2025-03-02T23:15:10.477,2025-03-02T23:15:10.477,Received,[],"[{'lang': 'en', 'value': 'A vulnerability has ...","{'cvssMetricV40': [{'source': 'cna@vuldb.com',...","[{'source': 'cna@vuldb.com', 'type': 'Primary'...",,[{'url': 'https://github.com/sheratan4/cve/iss...,,,,,,
