In [6]:
import csv
import os
import re
from urllib.parse import quote

import requests
from tqdm import tqdm

# === CONFIGURATION ===
# Directory containing the tool output files.
reports_dir = "/Users/hassnain/Desktop/oopsla/cargo-sherlock/evaluation/rq3/random1000/"

# Output CSV file to create.
output_csv = "filtered_crates.csv"

# Base URL for checking presence on lib.rs
lib_rs_base = "https://lib.rs/crate/"

# Seconds to wait for lib.rs per request; without a timeout a single stalled
# connection would block the whole run indefinitely.
request_timeout = 10

# Header line of a report, e.g. "Analysis Report for bellande_rust_import-0.0.1".
HEADER_RE = re.compile(r'Analysis Report for (.+)')
# Trailing "-<major>.<minor>.<patch>" with an optional pre-release/build suffix
# ("-alpha.1", "+build.5"). The name part is non-greedy so the version starts
# at the first "-<digits>.<digits>.<digits>" found in the string.
VERSION_RE = re.compile(r'^(.+?)-(\d+\.\d+\.\d+(?:[-+][0-9A-Za-z.+-]+)?)$')
# Severity line of a report, e.g. "Severity Label: CRITICAL".
SEVERITY_RE = re.compile(r'Severity Label:\s*(\S+)')


def strip_version(full_crate):
    """Return ``full_crate`` without its trailing version suffix.

    Strings that do not end in a recognisable version are returned unchanged.
    """
    match_version = VERSION_RE.match(full_crate)
    return match_version.group(1) if match_version else full_crate


def parse_report(content):
    """Extract the crate identification and severity from one report's text.

    Returns:
        ``(full_crate, crate_base, severity)`` where ``full_crate`` is the name
        as written in the header, ``crate_base`` is that name with the version
        suffix removed and ``severity`` is the label ("UNKNOWN" when the report
        has no severity line), or ``None`` when the report has no usable
        "Analysis Report for ..." header.
    """
    match = HEADER_RE.search(content)
    if not match:
        return None
    full_crate = match.group(1).strip()
    if not full_crate:
        # Header present but nothing except whitespace after it.
        return None

    match_label = SEVERITY_RE.search(content)
    severity = match_label.group(1).strip() if match_label else "UNKNOWN"
    return full_crate, strip_version(full_crate), severity


def check_lib_rs(crate_base, session=None):
    """Report whether ``crate_base`` has a page on lib.rs.

    Args:
        crate_base: Crate name without version.
        session: Optional object with a ``get(url, timeout=...)`` method (for
            example a ``requests.Session``) used to reuse connections; the
            ``requests`` module itself is used when omitted.

    Returns:
        "Yes" when lib.rs answers with HTTP 200, otherwise "No". Failed lookups
        and unexpected status codes are still recorded as "No", but they are
        logged so that unreliable rows can be identified and re-checked.
    """
    # Escape the name so stray characters from a malformed header cannot
    # change the path or query of the request.
    lib_rs_url = f"{lib_rs_base}{quote(crate_base, safe='')}"
    http = session if session is not None else requests
    try:
        response = http.get(lib_rs_url, timeout=request_timeout)
    except Exception as exc:
        # Best-effort lookup: one bad request must not abort the whole run,
        # but it must not pass unnoticed either.
        tqdm.write(f"lib.rs lookup failed for {crate_base}: {exc}")
        return "No"

    if response.status_code not in (200, 404):
        tqdm.write(
            f"lib.rs returned HTTP {response.status_code} for {crate_base}; recording as 'No'"
        )
    return "Yes" if response.status_code == 200 else "No"


def build_crate_table():
    """Write one CSV row (crate, severity, lib.rs presence) per report file.

    Raises:
        FileNotFoundError: if ``reports_dir`` does not exist. This is checked
            before the output file is opened so an existing CSV is not
            truncated by a run that cannot produce any rows.
    """
    if not os.path.isdir(reports_dir):
        raise FileNotFoundError(f"Reports directory not found: {reports_dir}")

    skipped = 0
    with requests.Session() as session, \
            open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Crate Name", "Severity Label", "On lib.rs"])

        # Sorted so that the row order is the same on every run.
        for filename in tqdm(sorted(os.listdir(reports_dir)), desc="Processing reports"):
            file_path = os.path.join(reports_dir, filename)
            # Skip if not a file.
            if not os.path.isfile(file_path):
                tqdm.write(f"Invalid file: {filename}")
                continue

            try:
                # Undecodable bytes are replaced rather than aborting the run.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
            except OSError as exc:
                tqdm.write(f"Error reading file {filename}: {exc}")
                skipped += 1
                continue

            parsed = parse_report(content)
            if parsed is None:
                skipped += 1
                continue
            full_crate, crate_base, severity = parsed

            # Write the result row immediately so partial progress survives
            # an interrupted run.
            writer.writerow([full_crate, severity, check_lib_rs(crate_base, session)])

    if skipped:
        print(f"Skipped {skipped} report(s) that were unreadable or lacked the expected header.")
    print(f"Output CSV created: {output_csv}")


# A Jupyter kernel runs cell code with __name__ == "__main__", so the table is
# built when the cell is executed but not when the helpers are imported.
if __name__ == "__main__":
    build_crate_table()


Processing reports: 100%|██████████| 960/960 [14:13<00:00,  1.12it/s]

Output CSV created: filtered_crates.csv





In [7]:
import os
import re
from tqdm import tqdm

# === CONFIGURATION ===
reports_dir = "/Users/hassnain/Desktop/oopsla/cargo-sherlock/evaluation/rq3/random1000/"
# Optional log file that receives the names of the flagged reports.
log_file = "invalid_reports.txt"

# Header that every valid report is expected to contain.
pattern = re.compile(r'Analysis Report for (.+)')


def lacks_expected_header(report_name):
    """Tell whether the entry ``report_name`` in ``reports_dir`` must be flagged.

    Entries that are not regular files are never flagged. A file is flagged
    when it cannot be read (the error is printed) or when its text does not
    contain the expected header.
    """
    report_path = os.path.join(reports_dir, report_name)
    if not os.path.isfile(report_path):
        return False
    try:
        with open(report_path, 'r') as handle:
            text = handle.read()
    except Exception as err:
        print(f"Error reading file {report_name}: {err}")
        return True
    return pattern.search(text) is None


# Names of the files that were flagged, in directory-listing order.
invalid_files = [
    report_name
    for report_name in tqdm(os.listdir(reports_dir), desc="Scanning reports")
    if lacks_expected_header(report_name)
]

# Report the outcome on stdout.
if not invalid_files:
    print("All files contain the expected header.")
else:
    print("The following files do not contain the expected header:")
    print(*invalid_files, sep="\n")

# Persist the list, one name per line (the file is created even when empty).
with open(log_file, 'w') as log_handle:
    log_handle.writelines(f"{entry}\n" for entry in invalid_files)
print(f"List of invalid files written to {log_file}.")


Scanning reports: 100%|██████████| 960/960 [00:00<00:00, 23094.14it/s]

The following files do not contain the expected header:
mdbook-trace-0.1.1
pkg-utils-0.1.0
bellhop-auth-header-0.2.1
mdbook-tera-backend-0.0.1
photon-indexer-0.50.0
neo3-0.1.9
iroh-docs-0.34.0
emu_core-0.1.1
mdbook-theme-0.1.6
mdbook-tocjs-0.1.4
bellhop-demo-0.2.1
arecibo-0.1.1
bellhop-hook-jenkins-0.2.1
air-interpreter-data-0.17.2
mdbook-toc-0.14.2
bellhop-auth-dummy-0.2.1
deathframe-0.5.1
rarity-0.3.0
iroh-blobs-0.34.0
solana-install-1.18.26
thebook-0.3.0
bevy_light_field-0.8.0
drt-tools-0.2.28
iroh-doctor-0.33.0
napi-package-template-0.1.0
iroh-cli-0.28.1
veryfi-1.0.0
daleth_lsp-0.1.1
bellhop-0.2.1
emu_driver-0.1.0
iroh-bitswap-0.2.0
mdbook-tailwindcss-0.1.1
dameng-helper-0.2.4
mdbook-templates-0.1.0
gwmp-mux-0.11.0
gwdiff-1.0.0
deb-rust-0.1.2
mdbook-tag-0.0.2
solana-include-idl-cli-0.1.0
solana-test-validator-2.2.3
neo4j-0.2.0
nu_plugin_xpath-0.44.0
nuance-0.3.2
mdbook-tailor-0.8.2
iroh-embed-0.2.0
mdbook-tera-0.5.1
iroh-0.34.0
mdbook-tagger-0.2.0
mdbook-tabs-0.2.1
solana-tokens-2.


