In [None]:
import os
import json
import subprocess
from collections import Counter
import random
import requests

In [None]:
# Load the filtered PrimeVul records: one JSON object per non-blank line.
path = "primevul_filtered.jsonl"
records = []
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(json.loads(line))

# Tag every record with the year embedded in its CVE id ("CVE-<year>-..."),
# or None when the id is missing, not a string, or has a non-numeric year.
for r in records:
    cve = r.get("cve")
    year = None
    if isinstance(cve, str) and cve.startswith("CVE-"):
        try:
            year = int(cve.split("-")[1])
        except ValueError:
            # Only a malformed year field is expected here; anything else
            # (including a kernel interrupt) must propagate.
            year = None
    r["year"] = year


In [None]:
# Collect the CVE years present in the dataset and report their range.
years = []
for r in records:
    cve = r.get("cve")
    year = None
    if isinstance(cve, str) and cve.startswith("CVE-"):
        try:
            year = int(cve.split("-")[1])
        except ValueError:
            # Non-numeric year field: treat the record as undated instead of
            # hiding every possible error behind a bare except.
            year = None
    r["year"] = year
    if year:
        years.append(year)

# None when no record carries a usable CVE year.
min_year = min(years) if years else None
max_year = max(years) if years else None

min_year, max_year

In [None]:
# Gather the distinct CVE ids and CWE ids in PrimeVul, plus a CVE -> CWE-list lookup.
primevul_cve = set()
primevul_cwe = set()
cve2cwe = {}

for record in records:
    record_cve = record.get("cve")
    record_cwes = record.get("cwe", [])
    if record_cve:
        primevul_cve.add(record_cve)
        # A later record with the same CVE replaces the earlier list.
        cve2cwe[record_cve] = record_cwes
    primevul_cwe.update(record_cwes)

len(primevul_cve), len(primevul_cwe)

In [None]:
# Rebuild the set of CWE ids seen in any record, then show its size and members.
primevul_cwe = {
    cwe_id
    for record in records
    for cwe_id in record.get("cwe", [])
}

len(primevul_cwe), sorted(primevul_cwe)

In [None]:
# Fetch the CVE list repository once; an existing checkout is reused as-is.
if not os.path.exists("cvelistV5"):
    clone_cmd = ["git", "clone", "https://github.com/CVEProject/cvelistV5.git"]
    subprocess.run(clone_cmd, check=True)

root = "cvelistV5/cves"

# Numeric directory names strictly greater than 2022, string-sorted.
years = sorted(
    entry
    for entry in os.listdir(root)
    if entry.isdigit() and int(entry) > 2022
)

# Result accumulator filled by the scan further down in this cell.
out = []

def extract_problem_types(container):
    """Collect the ``cweId`` values found under ``problemTypes`` in a container.

    Args:
        container: A dict holding an optional ``problemTypes`` list, or a list
            of such dicts (handled recursively). Any other value yields ``[]``.

    Returns:
        list: The truthy ``cweId`` values in encounter order; duplicates are kept.

    Malformed pieces (a ``null`` or non-dict problem type or description, or
    ``"descriptions": null``) are skipped rather than raising, so one bad
    record cannot abort a scan over many files.
    """
    cwes = []
    if isinstance(container, list):
        for entry in container:
            cwes.extend(extract_problem_types(entry))
        return cwes
    if not isinstance(container, dict):
        return cwes

    problem_types = container.get("problemTypes")
    if not isinstance(problem_types, list):
        return cwes

    for problem_type in problem_types:
        if not isinstance(problem_type, dict):
            continue
        # `or []` also covers an explicit null value, which .get's default does not.
        descriptions = problem_type.get("descriptions") or []
        if not isinstance(descriptions, list):
            continue
        for desc in descriptions:
            if not isinstance(desc, dict):
                continue
            cwe = desc.get("cweId")
            if cwe:
                cwes.append(cwe)
    return cwes

def extract_github_commit_url(container):
    """Collect reference URLs that look like GitHub commit links.

    Args:
        container: A dict holding an optional ``references`` list of dicts with
            a ``url`` key, or a list of such dicts (handled recursively). Any
            other value yields ``[]``.

    Returns:
        list: Every string URL containing both ``"github.com"`` and
        ``"/commit/"``, in encounter order; duplicates are kept.

    Non-dict reference entries and a non-list ``references`` value are skipped
    rather than raising.
    """
    urls = []
    if isinstance(container, list):
        for entry in container:
            urls.extend(extract_github_commit_url(entry))
        return urls
    if not isinstance(container, dict):
        return urls

    refs = container.get("references") or []
    if not isinstance(refs, list):
        return urls

    for ref in refs:
        if not isinstance(ref, dict):
            continue
        u = ref.get("url")
        if isinstance(u, str) and "github.com" in u and "/commit/" in u:
            urls.append(u)
    return urls

# Walk <root>/<year>/<subdir>/*.json and keep every CVE record that carries at
# least one CWE id also present in primevul_cwe.
for y in years:
    year_dir = os.path.join(root, y)
    if not os.path.isdir(year_dir):
        continue

    # os.listdir order is arbitrary; sorting keeps the order of `out` (and so
    # any seeded sampling done on it later) reproducible across machines.
    for sub in sorted(os.listdir(year_dir)):
        subdir = os.path.join(year_dir, sub)
        if not os.path.isdir(subdir):
            continue

        for fname in sorted(os.listdir(subdir)):
            if not fname.endswith(".json"):
                continue

            path = os.path.join(subdir, fname)

            try:
                with open(path, "r", encoding="utf-8") as f:
                    rec = json.load(f)
            except (OSError, ValueError):
                # Unreadable file, invalid JSON or invalid UTF-8 (the last two
                # are ValueError subclasses): skip it. Anything else, including
                # a kernel interrupt, propagates instead of being swallowed.
                continue

            if not isinstance(rec, dict):
                continue

            metadata = rec.get("cveMetadata")
            cve_id = metadata.get("cveId") if isinstance(metadata, dict) else None
            if not cve_id:
                continue

            containers = rec.get("containers")
            if not isinstance(containers, dict):
                continue

            found_cwes = set()
            github_commits = []

            for cont in containers.values():
                found_cwes.update(
                    cwe for cwe in extract_problem_types(cont) if cwe in primevul_cwe
                )
                github_commits.extend(extract_github_commit_url(cont))

            if found_cwes:
                out.append({
                    "cve": cve_id,
                    "cwe": sorted(found_cwes),
                    "path": path,
                    # The same reference can appear in more than one container;
                    # keep each URL once, in first-seen order, so the link
                    # total below is not inflated.
                    "github_commit_url": list(dict.fromkeys(github_commits)),
                })

# Persist the matches as JSON Lines.
with open("filtered_cvelistV5_by_primevul_cwe.jsonl", "w", encoding="utf-8") as w:
    for e in out:
        w.write(json.dumps(e) + "\n")

total_github_links = sum(len(e.get("github_commit_url", [])) for e in out)
print("Total GitHub commit links found:", total_github_links)

In [None]:
# Tally CWE ids across the CVEs that have at least one GitHub commit link.
cwe_counter = Counter(
    cwe_id
    for entry in out
    if entry.get("github_commit_url")
    for cwe_id in entry.get("cwe", [])
)

print("CWE distribution (only CVEs with GitHub commits):")
for cwe_id, n_cves in cwe_counter.most_common():
    print(f"{cwe_id}: {n_cves}")

In [None]:
# Fixed seed so the shuffle in pick_example_for_cwe is repeatable.
random.seed(42)

# File-name suffixes preferred when choosing an example (C/C++ sources and headers).
suffixes_primary = (".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".h++")
# Second-choice suffixes: the primary set plus further source-file extensions
# (e.g. Objective-C, assembly, Pascal, Fortran).
suffixes_extended = (
    ".c", ".h", ".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".h++",
    ".m", ".mm", ".s", ".asm", ".pas", ".inc", ".dpr", ".f", ".f90", ".f95",
)


def _url_names_file_with_suffix(url, suffixes):
    """Return True when `url`, minus query string and fragment, ends with one of `suffixes`."""
    bare = url.split("#", 1)[0].split("?", 1)[0]
    return bare.lower().endswith(tuple(suffixes))


def pick_example_for_cwe(cwe, out):
    """Pick one entry (and one of its commit URLs) that illustrates `cwe`.

    Args:
        cwe: CWE id to look for in each entry's ``"cwe"`` list.
        out: Iterable of dicts with ``"cwe"`` and ``"github_commit_url"`` lists.

    Returns:
        tuple: ``(entry, url)``, or ``(None, None)`` when no entry has both the
        CWE and at least one commit URL.

    Candidates are shuffled with the global ``random`` state. A URL ending in a
    ``suffixes_primary`` suffix is preferred, then one ending in a
    ``suffixes_extended`` suffix; otherwise the first shuffled candidate's
    first URL is returned.

    The suffix test is an ``endswith`` on the URL, not a substring test: the
    substring ``".c"`` occurs inside ``"github.com"``, so a substring test
    accepts every GitHub URL and the preference tiers never take effect.
    """
    candidates = [
        e for e in out
        if cwe in e.get("cwe", []) and e.get("github_commit_url")
    ]
    random.shuffle(candidates)

    for suffixes in (suffixes_primary, suffixes_extended):
        for e in candidates:
            for u in e["github_commit_url"]:
                if _url_names_file_with_suffix(u, suffixes):
                    return e, u

    if candidates:
        # Plain commit URLs name no file; any candidate is as good as another.
        e = candidates[0]
        return e, e["github_commit_url"][0]

    return None, None

# One representative CVE per CWE id present in cwe_counter.
examples = {}

for cwe_id in cwe_counter:
    chosen, chosen_url = pick_example_for_cwe(cwe_id, out)
    if not chosen:
        continue
    examples[cwe_id] = dict(
        cve=chosen["cve"],
        cwe=chosen["cwe"],
        commit_url=chosen_url,
        path=chosen["path"],
    )

In [None]:
# Show the chosen example (CVE, CWE list, commit URL, record path) for each CWE.
examples

In [None]:
# Download the ".diff" form of each example's commit URL and save everything as JSON.
cwe_to_diff = {}

for cwe, info in examples.items():
    u = info.get("commit_url")
    if not u:
        # Same keys as the fetched case so consumers see one record shape.
        cwe_to_diff[cwe] = {"cve": info["cve"], "commit_url": None, "diff": None}
        continue

    # ".diff" must follow the commit path itself: drop any fragment or query,
    # a trailing slash, and an already-present .diff/.patch suffix first.
    base_url = u.split("#", 1)[0].split("?", 1)[0].rstrip("/")
    for known_suffix in (".diff", ".patch"):
        if base_url.endswith(known_suffix):
            base_url = base_url[: -len(known_suffix)]
    diff_url = base_url + ".diff"

    try:
        # Without a timeout a stalled connection would hang the cell indefinitely.
        resp = requests.get(diff_url, timeout=30)
        diff_text = resp.text if resp.status_code == 200 else None
    except requests.RequestException:
        # A network failure for one commit must not discard the others.
        diff_text = None

    cwe_to_diff[cwe] = {
        "cve": info["cve"],
        "commit_url": u,
        "diff": diff_text
    }

with open("cwe_commit_diff_examples.json", "w", encoding="utf-8") as f:
    json.dump(cwe_to_diff, f, indent=2)

len(cwe_to_diff)