In [5]:
import pandas as pd
import re

# --- CVE PATTERN (same as before) ---
# Matches "CVE-<4 digits>-<4 to 7 digits>", ignoring letter case.
# NOTE(review): the pattern has no boundary anchors, so a digit run longer than
# 7 would be matched only up to its first 7 digits. Left unchanged because the
# "(same as before)" note suggests other cells rely on the same pattern --
# confirm before tightening it.
CVE_PATTERN = re.compile(r"CVE-\d{4}-\d{4,7}", re.IGNORECASE)

def extract_cves_from_codes(codes):
    """
    Extract CVE IDs from one value of the 'codes' column.

    Parameters
    ----------
    codes : str or missing value
        Raw cell content, e.g. 'CVE-2009-3699;OSVDB-58726'. Non-string
        values are converted with str() before matching.

    Returns
    -------
    list of str
        Upper-cased CVE IDs, de-duplicated and sorted as plain strings
        (lexicographic order). Empty list when the value is missing
        (None / NaN) or contains no CVE ID.

    Example
    -------
    'CVE-2009-3699;OSVDB-58726' -> ['CVE-2009-3699']
    """
    if pd.isna(codes):
        return []

    # A CVE ID cannot contain any of the separators used in 'codes'
    # (; , | or whitespace), so one scan over the whole string finds exactly
    # the IDs that splitting on the separators first would find.
    unique_ids = {hit.upper() for hit in CVE_PATTERN.findall(str(codes))}

    return sorted(unique_ids)


# === 1) LOAD EXPLOITDB DATA ===
# CSV of ExploitDB entries; change the filename here if the file moves.
exploitdb_path = "clean_files_exploitsdb.csv"   # <-- filename
exploitdb = pd.read_csv(exploitdb_path)

# Quick overview of what was loaded.
print("ExploitDB columns:", list(exploitdb.columns))
print("ExploitDB rows:", exploitdb.shape[0])

# Sanity check: the CVE extraction reads the 'codes' column, so stop here
# with a clear message when the CSV does not provide it.
codes_column_present = "codes" in exploitdb.columns
if not codes_column_present:
    raise ValueError("Column 'codes' not found in ExploitDB CSV. Please check the file.")


# === 2) EXTRACT CVEs FROM 'codes' ONLY ===
# One list of CVE IDs per exploit row (an empty list when 'codes' has none).
exploitdb["CVE_list"] = exploitdb["codes"].map(extract_cves_from_codes)

# Boolean mask: True for rows whose CVE list is non-empty.
has_cve = exploitdb["CVE_list"].str.len().gt(0)
exploitdb_with_cve = exploitdb.loc[has_cve]
print("ExploitDB rows with at least 1 CVE in 'codes':", exploitdb_with_cve.shape[0])


# === 3) EXPLODE SO EACH ROW = 1 CVE ===
# A row listing several CVEs becomes several rows, one per CVE; the list
# column is then renamed because it now holds a single ID per row.
exploitdb_exploded = (
    exploitdb_with_cve
    .explode("CVE_list")
    .rename(columns={"CVE_list": "CVE"})
)


# === 4) CLEAN & RENAME COLUMNS ===
# Raw ExploitDB column name -> name used from here on.
# 'description' in this dataset is basically the "title" of the exploit,
# hence Exploit_Title.
column_renames = {
    "description": "Exploit_Title",
    "id": "ExploitDB_ID",
    "date_published": "ExploitDB_Date_Published",
    "source_url": "ExploitDB_Source_URL",
}

# DataFrame.rename ignores mapping keys that are not present in the frame
# (errors="ignore" is the default), so no per-column existence check is needed.
exploitdb_exploded = exploitdb_exploded.rename(columns=column_renames)

# Keep only useful columns for now
cols_to_keep = [
    "CVE",
    "ExploitDB_ID",
    "Exploit_Title",  # renamed from 'description'
    "file",
    "ExploitDB_Date_Published",
    "type",
    "platform",
    "port",
    "verified",
    "tags",
    "codes",
    "ExploitDB_Source_URL",
]

# Only keep columns that actually exist
cols_to_keep = [c for c in cols_to_keep if c in exploitdb_exploded.columns]

# Add the constant 'Source' column with .assign() instead of assigning into the
# column subset: frame[cols]["new"] = ... style assignment on a selection can
# raise SettingWithCopyWarning, whereas .assign() returns a fresh DataFrame.
exploitdb_exploded = exploitdb_exploded[cols_to_keep].assign(Source="ExploitDB")

print("Final ExploitDB exploded shape:", exploitdb_exploded.shape)
exploitdb_exploded.head()


ExploitDB columns: ['id', 'file', 'description', 'date_published', 'author', 'type', 'platform', 'port', 'date_added', 'date_updated', 'verified', 'codes', 'tags', 'aliases', 'screenshot_url', 'application_url', 'source_url']
ExploitDB rows: 46922
ExploitDB rows with at least 1 CVE in 'codes': 27224
Final ExploitDB exploded shape: (30452, 13)


Unnamed: 0,CVE,ExploitDB_ID,Exploit_Title,file,ExploitDB_Date_Published,type,platform,port,verified,tags,codes,ExploitDB_Source_URL,Source
0,CVE-2009-3699,16929,AIX Calendar Manager Service Daemon (rpc.cmsd)...,exploits/aix/dos/16929.rb,2010-11-11,dos,aix,,1,Metasploit Framework (MSF),CVE-2009-3699;OSVDB-58726,http://aix.software.ibm.com/aix/efixes/securit...,ExploitDB
1,CVE-1999-1015,19046,AppleShare IP Mail Server 5.0.3 - Buffer Overflow,exploits/aix/dos/19046.txt,1999-10-15,dos,aix,,1,,CVE-1999-1015;OSVDB-5970,https://www.securityfocus.com/bid/61/info,ExploitDB
5,CVE-2003-0087,22249,IBM AIX 4.3.3/5.1/5.2 - 'libIM' Buffer Overflow,exploits/aix/dos/22249.txt,2003-02-12,dos,aix,,1,,CVE-2003-0087;OSVDB-7996,https://www.securityfocus.com/bid/6840/info,ExploitDB
9,CVE-2009-4265,16657,PointDev IDEAL Migration - Buffer Overflow (Me...,exploits/aix/dos/16657.rb,2010-09-25,dos,aix,,1,Metasploit Framework (MSF),CVE-2009-4265;OSVDB-60681,,ExploitDB
10,CVE-2014-9349,35342,RobotStats 1.0 - HTML Injection,exploits/aix/dos/35342.txt,2014-11-24,dos,aix,,0,,CVE-2014-9349;OSVDB-115021,,ExploitDB


In [8]:
# Write exploitdb_exploded to CSV; index=False leaves the DataFrame index out
# of the file. The filename is defined once so the log line cannot drift from
# the path actually written.
exploded_csv_name = "exploitdb_with_cve_exploded.csv"
exploitdb_exploded.to_csv(exploded_csv_name, index=False)
print(f"Saved: {exploded_csv_name}")

Saved: exploitdb_with_cve_exploded.csv
