In [2]:
import os
import re
from collections import defaultdict

import pandas as pd

## General Dataset Stats

In [3]:
def parse_filename(filename):
    """
    Break a dataset filename into its underscore-separated fields.

    Expected layout: owner_repo_commit_folder_filepath

    Only the first four underscores act as separators, so any additional
    underscores remain part of the trailing ``filepath`` field.

    Returns:
        A tuple ``(owner, repo, commit, folder, filepath)``, or ``None`` when
        the name contains fewer than four fields. ``filepath`` is the empty
        string when nothing follows the folder field.
    """
    fields = filename.split("_", 4)
    if len(fields) < 4:
        # Not enough separators to identify owner, repo, commit and folder.
        return None
    if len(fields) == 4:
        # A folder field is present but no file path follows it.
        fields.append("")
    return tuple(fields)

In [4]:
def analyze_folder(folder_path):
    """
    Walk ``folder_path`` recursively and tally dataset statistics.

    Every file encountered counts towards ``files``. Only file names that
    ``parse_filename`` can parse contribute to the project and commit tallies.

    Returns:
        A dict with integer counts under the keys "projects" (distinct
        ``owner/repo`` pairs), "commits" (distinct commit fields) and "files".
    """
    seen_projects = set()
    seen_commits = set()
    total_files = 0

    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        total_files += len(filenames)
        for name in filenames:
            fields = parse_filename(name)
            if not fields:
                # Unparseable names are counted as files but carry no metadata.
                continue
            owner, repo, commit = fields[:3]
            seen_projects.add(f"{owner}/{repo}")
            seen_commits.add(commit)

    return {
        "projects": len(seen_projects),
        "commits": len(seen_commits),
        "files": total_files,
    }

In [5]:
def compare_folders(folder1, folder2):
    """
    Print the project, commit and file counts of two folders.

    Both folders are analyzed with ``analyze_folder`` first; the two reports
    are then printed in argument order, separated by one blank line.
    """
    reports = [(folder, analyze_folder(folder)) for folder in (folder1, folder2)]

    for position, (folder, stats) in enumerate(reports):
        if position:
            # Blank line between the first and the second report.
            print()
        print(f"{folder} Analysis:")
        print(f" - Projects: {stats['projects']}")
        print(f" - Commits: {stats['commits']}")
        print(f" - Files: {stats['files']}")

Summarize datasets

In [6]:
# Folders (relative to the notebook) that compare_folders walks recursively.
original = "../data/run_all/original/"
patched = "../data/run_all/patched/"

# Print project / commit / file counts for each folder.
compare_folders(original, patched)

../data/run_all/original/ Analysis:
 - Projects: 34
 - Commits: 10522
 - Files: 32873

../data/run_all/patched/ Analysis:
 - Projects: 34
 - Commits: 10590
 - Files: 35000


## Vulnerabilities

In [7]:
# Load the scanner result tables used below: two Excel workbooks (sheet 'main')
# into cq_original / cq_patched and two CSV files into b_original / b_patched.
# The later cells label the cq_* frames "CodeQL" and the b_* frames "Bandit".
cq_original = pd.read_excel('../data/run_all/original_extended_py.xlsx', sheet_name='main')
cq_patched = pd.read_excel('../data/run_all/patched_extended_py.xlsx', sheet_name='main')
b_original = pd.read_csv("../data/run_all/bandit_original.csv")
b_patched = pd.read_csv("../data/run_all/bandit_patched.csv")

In [8]:
def summarize_data(df, name, file_col, commit_col, vuln_col):
    """Print how many distinct findings, commits and files a dataset contains.

    A finding is one distinct (file, commit, vulnerability) combination; the
    commit and file counts are taken over those distinct findings.

    Args:
        df: DataFrame holding one row per reported finding.
        name: Label printed in the summary header.
        file_col: Name of the column identifying the file.
        commit_col: Name of the column identifying the commit.
        vuln_col: Name of the column identifying the rule / test.
    """
    key_columns = [file_col, commit_col, vuln_col]
    distinct = df[key_columns].drop_duplicates()

    report = (
        f"{name} Summary:",
        f"  Total Unique Vulnerabilities: {len(distinct)}",
        f"  Unique Vulnerable Commits: {distinct[commit_col].nunique()}",
        # The trailing newline leaves a blank line after each summary.
        f"  Unique Vulnerable Files: {distinct[file_col].nunique()}\n",
    )
    for line in report:
        print(line)

In [9]:
def clean_file_path(df, file_col):
    """Return a copy of ``df`` with a leading 'original/' or 'patched/' removed from ``file_col``.

    The input frame is not modified, so re-running earlier notebook cells on
    the raw frames keeps giving the same result.

    Args:
        df: DataFrame containing a string column ``file_col``.
        file_col: Name of the column holding file paths.

    Returns:
        A new DataFrame with the same columns, in the same order, as ``df``.
    """
    cleaned = df[file_col].str.replace(r'^(original/|patched/)', '', regex=True)
    # assign() builds a new frame; an existing column keeps its position.
    return df.assign(**{file_col: cleaned})


def _vulnerability_keys(df, file_col, commit_col, vuln_col):
    """Build one 'commit|file|vulnerability' matching key per row of ``df``."""
    # astype(str) lets non-string identifier columns take part in the key.
    return (
        df[commit_col].astype(str)
        + "|" + df[file_col].astype(str)
        + "|" + df[vuln_col].astype(str)
    )


def classify_vulnerabilities(original_df, patched_df, file_col, commit_col, vuln_col):
    """Classify vulnerabilities as fixed, new, or persisting between two datasets.

    Findings are matched on (commit, cleaned file path, vulnerability id):

    * 'persisting' - reported in both the original and the patched data,
    * 'new'        - reported only in the patched data,
    * 'fixed'      - reported only in the original data.

    Neither input frame is modified; all work happens on copies.

    Args:
        original_df: Findings reported for the original code.
        patched_df: Findings reported for the patched code.
        file_col: Name of the file path column (present in both frames).
        commit_col: Name of the commit identifier column.
        vuln_col: Name of the rule / test identifier column.

    Returns:
        A DataFrame holding every patched finding plus the 'fixed' original
        findings, with an added ``status`` column and duplicates on
        (commit, file, vulnerability, status) removed.
    """
    # Strip the dataset prefix so paths from both datasets are comparable.
    original = clean_file_path(original_df, file_col)
    patched = clean_file_path(patched_df, file_col)

    original_keys = _vulnerability_keys(original, file_col, commit_col, vuln_col)
    patched_keys = _vulnerability_keys(patched, file_col, commit_col, vuln_col)

    patched = patched.assign(
        status=patched_keys.isin(original_keys).map({True: 'persisting', False: 'new'})
    )
    original = original.assign(
        status=original_keys.isin(patched_keys).map({True: 'persisting', False: 'fixed'})
    )

    # Persisting findings are already represented by their patched rows, so
    # only the fixed ones are taken from the original data.
    fixed_only = original[original['status'] == 'fixed']

    combined_results = pd.concat([patched, fixed_only], ignore_index=True)
    combined_results = combined_results.drop_duplicates(
        subset=[commit_col, file_col, vuln_col, 'status']
    )

    return combined_results

In [10]:
# Matches CWE references such as 'external/cwe/cwe-020' and captures the
# number without its zero padding.
_CWE_TAG_RE = re.compile(r"cwe-0*(\d+)", re.IGNORECASE)
# Matches a single CWE value written as 327, '327', 327.0, '020' or 'CWE-327'.
_CWE_VALUE_RE = re.compile(r"(?:cwe-)?0*(\d+)(?:\.0+)?", re.IGNORECASE)


def _tag_cwe_ids(tags):
    """Return the set of CWE numbers referenced anywhere in ``tags``."""
    return {int(number) for number in _CWE_TAG_RE.findall(str(tags))}


def _as_cwe_id(value):
    """Return ``value`` as an integer CWE number, or None if it is not one (e.g. NaN)."""
    match = _CWE_VALUE_RE.fullmatch(str(value).strip())
    return int(match.group(1)) if match else None


def find_intersection(codeql_df, bandit_df, file_col, commit_col, cwe_col, tags_col):
    """Find vulnerabilities reported by both CodeQL and Bandit.

    Two findings are paired when they share the same commit and the same file
    base name (last path component) and the Bandit CWE number is one of the
    CWE numbers referenced in the CodeQL tags. CWE numbers are compared
    exactly, so CWE 20 matches 'cwe-020' but not 'cwe-200'.

    Neither input frame is modified.

    Args:
        codeql_df: CodeQL findings; must contain ``file_col``, ``commit_col``
            and ``tags_col``.
        bandit_df: Bandit findings; must contain ``file_col``, ``commit_col``
            and ``cwe_col``.
        file_col: Name of the file path column in both frames.
        commit_col: Name of the commit identifier column in both frames.
        cwe_col: Name of the Bandit CWE column as it appears after the merge.
        tags_col: Name of the CodeQL tags column as it appears after the merge.

    Returns:
        The merged rows that satisfy the CWE condition. Columns present in
        both inputs carry the suffixes '_codeql' / '_bandit'.
    """
    def base_name(path):
        return path.split('/')[-1]

    # Work on copies so the callers' frames do not gain a helper column.
    codeql = codeql_df.assign(file_base=codeql_df[file_col].map(base_name))
    bandit = bandit_df.assign(file_base=bandit_df[file_col].map(base_name))

    # Only the base name is compared because the two inputs may record
    # different directory prefixes for the same file.
    merged_df = pd.merge(codeql, bandit,
                         on=[commit_col, 'file_base'],
                         suffixes=('_codeql', '_bandit'))

    # An explicit boolean Series keeps the filter well-defined even when the
    # merge produced no rows.
    keep = pd.Series(
        [
            _as_cwe_id(cwe) in _tag_cwe_ids(tags)
            for cwe, tags in zip(merged_df[cwe_col], merged_df[tags_col])
        ],
        index=merged_df.index,
        dtype=bool,
    )

    return merged_df.loc[keep].drop(columns=['file_base'])

Summarize vulnerabilities

In [11]:
# Report distinct (file, commit, rule/test) findings for each loaded scanner table.
# NOTE(review): commits are keyed by "commit_link" here, whereas the later cells of
# this notebook pass "commit_id" - presumably the two map one-to-one; confirm.
summarize_data(cq_original, "CodeQL Original", file_col="file", commit_col="commit_link", vuln_col="rule_id")
summarize_data(cq_patched, "CodeQL Patched", file_col="file", commit_col="commit_link", vuln_col="rule_id")
summarize_data(b_original, "Bandit Original", file_col="file", commit_col="commit_link", vuln_col="test_id")
summarize_data(b_patched, "Bandit Patched", file_col="file", commit_col="commit_link", vuln_col="test_id")

CodeQL Original Summary:
  Total Unique Vulnerabilities: 642
  Unique Vulnerable Commits: 464
  Unique Vulnerable Files: 157

CodeQL Patched Summary:
  Total Unique Vulnerabilities: 672
  Unique Vulnerable Commits: 479
  Unique Vulnerable Files: 169

Bandit Original Summary:
  Total Unique Vulnerabilities: 19158
  Unique Vulnerable Commits: 5980
  Unique Vulnerable Files: 2673

Bandit Patched Summary:
  Total Unique Vulnerabilities: 19696
  Unique Vulnerable Commits: 6055
  Unique Vulnerable Files: 2882



Classify vulnerabilities

In [12]:
# Classify CodeQL vulnerabilities
# Classify CodeQL vulnerabilities: each finding gets a status of
# 'persisting', 'new' or 'fixed', matched on commit_id + file + rule_id.
codeql_results = classify_vulnerabilities(
    cq_original, cq_patched, file_col="file", commit_col="commit_id", vuln_col="rule_id"
)

# Classify Bandit vulnerabilities the same way, matched on commit_id + file + test_id.
bandit_results = classify_vulnerabilities(
    b_original, b_patched, file_col="file", commit_col="commit_id", vuln_col="test_id"
)

In [13]:
# Report distinct findings, commits and files in the classified (combined) tables.
summarize_data(codeql_results, "CodeQL Results", file_col="file", commit_col="commit_id", vuln_col="rule_id")
summarize_data(bandit_results, "Bandit Results", file_col="file", commit_col="commit_id", vuln_col="test_id")

CodeQL Results Summary:
  Total Unique Vulnerabilities: 691
  Unique Vulnerable Commits: 483
  Unique Vulnerable Files: 176

Bandit Results Summary:
  Total Unique Vulnerabilities: 20025
  Unique Vulnerable Commits: 6090
  Unique Vulnerable Files: 2955



In [14]:
# Preview the first rows of the classified CodeQL findings, including the new 'status' column.
codeql_results.head()

Unnamed: 0,project,commit_id,commit_link,file,rule_id,message,precision,name,kind,shortDescription,fullDescription,level,problemSeverity,securitySeverity,subSeverity,tags,status
0,pypa/pip,43ac83fb1b83e8f5ad773418491eae376bda210d,https://github.com/pypa/pip/commit/43ac83fb1b8...,pip//vendor/requests/packages/urllib3/connecti...,py/insecure-default-protocol,Call to deprecated method ssl.wrap_socket does...,high,Default version of SSL/TLS may be insecure,problem,Default version of SSL/TLS may be insecure,Leaving the SSL/TLS version unspecified may re...,warning,warning,7.5,,"['security', 'external/cwe/cwe-327']",persisting
1,pypa/pip,43ac83fb1b83e8f5ad773418491eae376bda210d,https://github.com/pypa/pip/commit/43ac83fb1b8...,pip//vendor/requests/packages/urllib3/util/ssl...,py/insecure-default-protocol,Call to deprecated method ssl.wrap_socket does...,high,Default version of SSL/TLS may be insecure,problem,Default version of SSL/TLS may be insecure,Leaving the SSL/TLS version unspecified may re...,warning,warning,7.5,,"['security', 'external/cwe/cwe-327']",persisting
3,pypa/pip,796320abac38410316067bbb9455007cc51079db,https://github.com/pypa/pip/commit/796320abac3...,pip//vendor/requests/packages/urllib3/connecti...,py/insecure-default-protocol,Call to deprecated method ssl.wrap_socket does...,high,Default version of SSL/TLS may be insecure,problem,Default version of SSL/TLS may be insecure,Leaving the SSL/TLS version unspecified may re...,warning,warning,7.5,,"['security', 'external/cwe/cwe-327']",persisting
4,pypa/pip,7b4be2bc3e669ec4ab83b1ef120dfa4559f48799,https://github.com/pypa/pip/commit/7b4be2bc3e6...,pip//vendor/requests/packages/urllib3/connecti...,py/insecure-default-protocol,Call to deprecated method ssl.wrap_socket does...,high,Default version of SSL/TLS may be insecure,problem,Default version of SSL/TLS may be insecure,Leaving the SSL/TLS version unspecified may re...,warning,warning,7.5,,"['security', 'external/cwe/cwe-327']",persisting
5,pypa/pip,7b4be2bc3e669ec4ab83b1ef120dfa4559f48799,https://github.com/pypa/pip/commit/7b4be2bc3e6...,pip//vendor/requests/packages/urllib3/util/ssl...,py/insecure-default-protocol,Call to deprecated method ssl.wrap_socket does...,high,Default version of SSL/TLS may be insecure,problem,Default version of SSL/TLS may be insecure,Leaving the SSL/TLS version unspecified may re...,warning,warning,7.5,,"['security', 'external/cwe/cwe-327']",new


In [15]:
# Preview the first rows of the classified Bandit findings, including the new 'status' column.
bandit_results.head()

Unnamed: 0,file,line_number,line_range,col_offset,end_col_offset,issue_severity,issue_confidence,issue_cwe,issue_cwe_link,issue_text,test_name,test_id,code,more_info,project,commit_id,commit_link,status
0,openstack/datadog/checks/openstack/openstack.py,198,"[198, 199, 200]",12,111,LOW,HIGH,703,https://cwe.mitre.org/data/definitions/703.html,Use of assert detected. The enclosed code will...,assert_used,B101,197 \n198 assert t_id and t_id not...,https://bandit.readthedocs.io/en/1.7.10/plugin...,DataDog/integrations-core,09dae880e153bf831e8ece51376bfe6701151a3d,https://github.com/DataDog/integrations-core/c...,persisting
1,openstack/datadog/checks/openstack/openstack.py,625,"[625, 626, 627]",12,20,LOW,HIGH,703,https://cwe.mitre.org/data/definitions/703.html,"Try, Except, Pass detected.",try_except_pass,B110,624 hyp_state = self.HYPER...,https://bandit.readthedocs.io/en/1.7.10/plugin...,DataDog/integrations-core,09dae880e153bf831e8ece51376bfe6701151a3d,https://github.com/DataDog/integrations-core/c...,persisting
5,consul/datadog/checks/consul/consul.py,95,"[95, 96]",27,77,MEDIUM,LOW,400,https://cwe.mitre.org/data/definitions/400.html,Call to requests without timeout,request_without_timeout,B113,94 if privatekeyfile:\n95 ...,https://bandit.readthedocs.io/en/1.7.10/plugin...,DataDog/integrations-core,6f7fc99c87524f518c36e6feef13f9fae00ce8e8,https://github.com/DataDog/integrations-core/c...,persisting
8,mongo/datadog/checks/mongo/mongo.py,773,"[773, 774]",24,36,LOW,HIGH,703,https://cwe.mitre.org/data/definitions/703.html,"Try, Except, Continue detected.",try_except_continue,B112,772 value = va...,https://bandit.readthedocs.io/en/1.7.10/plugin...,DataDog/integrations-core,74e71bad320547a9518dcdb3f6f4bea1ecfebc41,https://github.com/DataDog/integrations-core/c...,persisting
9,mysql/datadog/checks/mysql/mysql.py,456,[456],46,55,MEDIUM,MEDIUM,605,https://cwe.mitre.org/data/definitions/605.html,Possible binding to all interfaces.,hardcoded_bind_all_interfaces,B104,"455 # The server needs to run locally,...",https://bandit.readthedocs.io/en/1.7.10/plugin...,DataDog/integrations-core,7f23c18c75592e51f6cdce4837a221de8ba3cd51,https://github.com/DataDog/integrations-core/c...,persisting


In [16]:
# Number of classified CodeQL findings per status (persisting / new / fixed).
codeql_results.value_counts("status")

status
persisting    623
new            49
fixed          19
Name: count, dtype: int64

In [17]:
# Number of classified Bandit findings per status (persisting / new / fixed).
bandit_results.value_counts("status")

status
persisting    18829
new             867
fixed           329
Name: count, dtype: int64

In [18]:
# Pair CodeQL and Bandit findings that share a commit and a file base name and
# whose Bandit CWE ("issue_cwe") is found in the CodeQL "tags" value.
intersection_df = find_intersection(
    codeql_results, bandit_results, 
    file_col="file", commit_col="commit_id", cwe_col="issue_cwe", tags_col="tags"
)

In [19]:
# Inspect the merged columns; names present in both inputs carry a _codeql / _bandit suffix.
intersection_df.columns

Index(['project_codeql', 'commit_id', 'commit_link_codeql', 'file_codeql',
       'rule_id', 'message', 'precision', 'name', 'kind', 'shortDescription',
       'fullDescription', 'level', 'problemSeverity', 'securitySeverity',
       'subSeverity', 'tags', 'status_codeql', 'file_bandit', 'line_number',
       'line_range', 'col_offset', 'end_col_offset', 'issue_severity',
       'issue_confidence', 'issue_cwe', 'issue_cwe_link', 'issue_text',
       'test_name', 'test_id', 'code', 'more_info', 'project_bandit',
       'commit_link_bandit', 'status_bandit'],
      dtype='object')

In [20]:
# Number of overlapping CodeQL/Bandit pairs per CodeQL-side status.
intersection_df.value_counts('status_codeql')

status_codeql
persisting    183
new            16
fixed           6
Name: count, dtype: int64

In [21]:
# Report distinct findings in the overlap, keyed on the Bandit-side file path and test id.
summarize_data(intersection_df, "Intersection Results", file_col="file_bandit", commit_col="commit_id", vuln_col="test_id")

Intersection Results Summary:
  Total Unique Vulnerabilities: 176
  Unique Vulnerable Commits: 131
  Unique Vulnerable Files: 54



In [None]:
# Keep only the columns needed to inspect the overlap by hand; `temp` is reused by
# the following cells. Then show the pairs whose CodeQL-side status is 'new'.
temp = intersection_df[['commit_id', 'project_bandit', 'file_bandit', 'rule_id', 'name', 'shortDescription', 'tags', 'issue_cwe', 'issue_text', 'status_codeql']]
temp[temp['status_codeql']=='new']

Unnamed: 0,commit_id,project_bandit,file_bandit,rule_id,name,shortDescription,tags,issue_cwe,issue_text,status_codeql
7,7b4be2bc3e669ec4ab83b1ef120dfa4559f48799,pypa/pip,pip//vendor/requests/packages/urllib3/util/ssl...,py/insecure-default-protocol,Default version of SSL/TLS may be insecure,Default version of SSL/TLS may be insecure,"['security', 'external/cwe/cwe-327']",327,ssl.wrap_socket call with no SSL/TLS protocol ...,new
19,a9d56c7734fd465d01437d61f632749a293e7805,pypa/pip,src/pip//vendor/requests/packages/urllib3/util...,py/insecure-default-protocol,Default version of SSL/TLS may be insecure,Default version of SSL/TLS may be insecure,"['security', 'external/cwe/cwe-327']",327,ssl.wrap_socket call with no SSL/TLS protocol ...,new
68,a9d56c7734fd465d01437d61f632749a293e7805,pypa/pip,src/pip//vendor/requests/auth.py,py/weak-sensitive-data-hashing,Use of a broken or weak cryptographic hashing ...,Use of a broken or weak cryptographic hashing ...,"['security', 'external/cwe/cwe-327', 'external...",327,Use of weak MD5 hash for security. Consider us...,new
103,7b4be2bc3e669ec4ab83b1ef120dfa4559f48799,pypa/pip,pip//vendor/requests/packages/urllib3/util/ssl...,py/insecure-protocol,Use of insecure SSL/TLS version,Use of insecure SSL/TLS version,"['security', 'external/cwe/cwe-327']",327,ssl.wrap_socket call with no SSL/TLS protocol ...,new
118,a9d56c7734fd465d01437d61f632749a293e7805,pypa/pip,src/pip//vendor/requests/packages/urllib3/util...,py/insecure-protocol,Use of insecure SSL/TLS version,Use of insecure SSL/TLS version,"['security', 'external/cwe/cwe-327']",327,ssl.wrap_socket call with no SSL/TLS protocol ...,new
703,a9d56c7734fd465d01437d61f632749a293e7805,pypa/pip,src/pip//vendor/html5lib/filters/sanitizer.py,py/overly-large-range,Overly permissive regular expression range,Overly permissive regular expression range,"['correctness', 'security', 'external/cwe/cwe-...",20,Using escape to parse untrusted XML data is kn...,new
708,c8e8a99b7a6f9404536bc9d895a1a42a060f7f91,pypa/pip,pip//vendor/html5lib/filters/sanitizer.py,py/overly-large-range,Overly permissive regular expression range,Overly permissive regular expression range,"['correctness', 'security', 'external/cwe/cwe-...",20,Using escape to parse untrusted XML data is kn...,new
711,c8e8a99b7a6f9404536bc9d895a1a42a060f7f91,pypa/pip,pip//vendor/html5lib/sanitizer.py,py/overly-large-range,Overly permissive regular expression range,Overly permissive regular expression range,"['correctness', 'security', 'external/cwe/cwe-...",20,Using escape to parse untrusted XML data is kn...,new
770,fb2881e5bbdc9a5d2e45923a275781ef3d1c4f08,conan-io/conan,conans/util/runners.py,py/insecure-temporary-file,Insecure temporary file,Insecure temporary file,"['external/cwe/cwe-377', 'security']",377,Use of insecure and deprecated function (mktemp).,new
778,895d23dd3c5154b149bdc5f57b1c1e33b3afdd71,conda/conda,conda/install.py,py/insecure-temporary-file,Insecure temporary file,Insecure temporary file,"['external/cwe/cwe-377', 'security']",377,Use of insecure and deprecated function (mktemp).,new


In [46]:
# Overlapping pairs with CodeQL-side status 'fixed', counted per CodeQL rule name.
temp[temp['status_codeql']=='fixed'].value_counts('name')

name
Arbitrary file write during tarfile extraction    2
Overly permissive regular expression range        2
Insecure temporary file                           1
Overly permissive file permissions                1
Name: count, dtype: int64

In [45]:
# Overlapping pairs with CodeQL-side status 'new', counted per CodeQL rule name.
temp[temp['status_codeql']=='new'].value_counts('name')

name
Arbitrary file write during tarfile extraction                               4
Overly permissive regular expression range                                   3
Default version of SSL/TLS may be insecure                                   2
Insecure temporary file                                                      2
Overly permissive file permissions                                           2
Use of insecure SSL/TLS version                                              2
Use of a broken or weak cryptographic hashing algorithm on sensitive data    1
Name: count, dtype: int64

In [44]:
# Write the full CodeQL/Bandit overlap table to CSV, omitting the DataFrame index.
intersection_df.to_csv('../data/run_all/codeql_bandit_overlap.csv', index=False)