In [72]:
import pandas as pd
import os
from collections import defaultdict

## General Dataset Stats

In [73]:
def parse_filename(filename):
    """
    Split a dataset file name into its five components.

    Names are expected to look like ``owner_repo_commit_folder_filepath``.
    Only the first four underscores act as separators, so the trailing
    file path may itself contain underscores.

    Returns an ``(owner, repo, commit, folder, filepath)`` tuple, or
    ``None`` when the name has fewer than four underscore-separated
    fields. ``filepath`` is an empty string when nothing follows the
    folder field.
    """
    fields = filename.split("_", 3)
    if len(fields) < 4:
        # Too few separators: not a name this notebook understands.
        return None

    owner, repo, commit, remainder = fields
    # partition() yields an empty tail when there is no further underscore.
    folder, _separator, filepath = remainder.partition("_")
    return owner, repo, commit, folder, filepath

In [74]:
def analyze_folder(folder_path):
    """
    Walk ``folder_path`` recursively and tally dataset statistics.

    Every file encountered counts toward ``files``. Only names that
    ``parse_filename`` can split contribute to ``projects`` (distinct
    ``owner/repo`` strings) and ``commits`` (distinct commit fields).

    Returns a dict with integer counts under the keys "projects",
    "commits" and "files".
    """
    seen_projects, seen_commits = set(), set()
    total_files = 0

    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        # Unparseable names still count as files.
        total_files += len(filenames)
        for components in map(parse_filename, filenames):
            if not components:
                continue
            owner, repo, commit = components[:3]
            seen_projects.add(f"{owner}/{repo}")
            seen_commits.add(commit)

    return dict(
        projects=len(seen_projects),
        commits=len(seen_commits),
        files=total_files,
    )

In [75]:
def compare_folders(folder1, folder2):
    """
    Print the project, commit and file counts of two folders.

    Both folders are analysed with ``analyze_folder`` before anything is
    printed. The two reports are separated by one blank line. Nothing is
    returned.
    """
    reports = [(folder, analyze_folder(folder)) for folder in (folder1, folder2)]
    final_index = len(reports) - 1

    for index, (folder, stats) in enumerate(reports):
        print(f"{folder} Analysis:")
        print(f" - Projects: {stats['projects']}")
        print(f" - Commits: {stats['commits']}")
        print(f" - Files: {stats['files']}")
        if index != final_index:
            # Blank separator between reports, none after the last one.
            print()

Summarize datasets

In [76]:
# Root directories of the two datasets being compared (relative paths).
original = "../data/run_all/original/"
patched = "../data/run_all/patched/"

# Print project / commit / file counts for each directory.
compare_folders(original, patched)

../data/run_all/original/ Analysis:
 - Projects: 34
 - Commits: 10522
 - Files: 32873

../data/run_all/patched/ Analysis:
 - Projects: 34
 - Commits: 10590
 - Files: 35000


## Vulnerabilities

In [77]:
# CodeQL findings: read from the "main" sheet of each Excel workbook.
cq_original = pd.read_excel('../data/run_all/original_extended_py.xlsx', sheet_name='main')
cq_patched = pd.read_excel('../data/run_all/patched_extended_py.xlsx', sheet_name='main')
# Bandit findings: read from CSV files.
b_original = pd.read_csv("../data/run_all/bandit_original.csv")
b_patched = pd.read_csv("../data/run_all/bandit_patched.csv")

In [78]:
def summarize_data(df, name, file_col, commit_col):
    """
    Print headline counts for one findings table.

    Reports the number of rows in ``df`` (one row per reported finding),
    the number of distinct values in ``commit_col`` and the number of
    distinct values in ``file_col``, under a heading built from ``name``.
    The report ends with a blank line. Nothing is returned.
    """
    # All counts are computed before anything is printed.
    report_lines = [
        f"{name} Summary:",
        f"  Total Vulnerabilities: {len(df)}",
        f"  Unique Vulnerable Commits: {df[commit_col].nunique()}",
        f"  Unique Vulnerable Files: {df[file_col].nunique()}\n",
    ]
    print("\n".join(report_lines))

In [79]:
def clean_dataframe(df, file_col, commit_col, vuln_col):
    """
    Normalise the file, commit and vulnerability columns of ``df``.

    Each of the three columns is stripped of surrounding whitespace and
    lower-cased. The file column additionally loses a leading
    ``original/`` or ``patched/`` prefix.

    The columns are overwritten on ``df`` itself, and that same object is
    returned.
    """
    # File paths: trim and lower-case first, then drop the dataset prefix.
    trimmed_paths = df[file_col].str.strip().str.lower()
    df[file_col] = trimmed_paths.str.replace(r"^(original/|patched/)", "", regex=True)

    for column in (commit_col, vuln_col):
        df[column] = df[column].str.strip().str.lower()

    return df

In [None]:
def classify_vulnerabilities(original_df, patched_df, name, file_col, commit_col, vuln_col, line_col=None):
    """
    Print how findings differ between an original and a patched dataset.

    Each row is reduced to a key tuple ``(file, commit, vuln)``, extended
    with the line value when ``line_col`` is given. Keys are compared as
    sets, so repeated rows with the same key count once.

    - Fixed: keys present only in ``original_df``.
    - Introduced: keys present only in ``patched_df``.
    - Persisting: keys present in both.

    For every category the number of keys, the number of distinct commits
    and the number of distinct files are printed under a heading built
    from ``name``. Nothing is returned.
    """
    key_columns = [file_col, commit_col, vuln_col]
    if line_col:
        key_columns.append(line_col)

    before = {tuple(row) for row in original_df[key_columns].values}
    after = {tuple(row) for row in patched_df[key_columns].values}

    categories = (
        ("Fixed", before - after),
        ("Introduced", after - before),
        ("Persisting", before & after),
    )

    # Key layout: position 0 is the file, position 1 is the commit.
    # All counts are computed before anything is printed.
    summary = [
        (
            label,
            len(keys),
            len({key[1] for key in keys}),
            len({key[0] for key in keys}),
        )
        for label, keys in categories
    ]

    print(f"{name} Vulnerability Classification:")
    for label, total, commit_total, file_total in summary:
        print(f"  {label} Vulnerabilities: {total}")
        print(f"    Attributed to {commit_total} unique commits")
        print(f"    In {file_total} unique files")
    # Trailing blank line after the whole report.
    print()

Summarize vulnerabilities

In [103]:
# Headline counts for each tool/dataset pair; every table uses the same
# "file" and "commit_link" column names.
summarize_data(cq_original, "CodeQL Original", file_col="file", commit_col="commit_link")
summarize_data(cq_patched, "CodeQL Patched", file_col="file", commit_col="commit_link")
summarize_data(b_original, "Bandit Original", file_col="file", commit_col="commit_link")
summarize_data(b_patched, "Bandit Patched", file_col="file", commit_col="commit_link")

CodeQL Original Summary:
  Total Vulnerabilities: 944
  Unique Vulnerable Commits: 464
  Unique Vulnerable Files: 157

CodeQL Patched Summary:
  Total Vulnerabilities: 989
  Unique Vulnerable Commits: 479
  Unique Vulnerable Files: 169

Bandit Original Summary:
  Total Vulnerabilities: 60276
  Unique Vulnerable Commits: 5980
  Unique Vulnerable Files: 2673

Bandit Patched Summary:
  Total Vulnerabilities: 61749
  Unique Vulnerable Commits: 6055
  Unique Vulnerable Files: 2882



Classify vulnerabilities

In [109]:
# CodeQL findings are matched on (file, commit_link, rule_id) only; Bandit
# findings additionally include line_number in the match key, so the two
# reports are not keyed at the same granularity.
classify_vulnerabilities(cq_original, cq_patched, "CodeQL", file_col="file", commit_col="commit_link", vuln_col="rule_id", line_col=None)
classify_vulnerabilities(b_original, b_patched, "Bandit", file_col="file", commit_col="commit_link", vuln_col="test_id", line_col="line_number")


CodeQL Vulnerability Classification:
  Fixed Vulnerabilities: 19
    Attributed to 15 unique commits
    In 18 unique files
  Introduced Vulnerabilities: 49
    Attributed to 34 unique commits
    In 43 unique files
  Persisting Vulnerabilities: 623
    Attributed to 456 unique commits
    In 150 unique files

Bandit Vulnerability Classification:
  Fixed Vulnerabilities: 27580
    Attributed to 3989 unique commits
    In 1962 unique files
  Introduced Vulnerabilities: 29050
    Attributed to 4095 unique commits
    In 2210 unique files
  Persisting Vulnerabilities: 32685
    Attributed to 4585 unique commits
    In 2034 unique files

