In [None]:
import os
import re
import csv
import requests

# === CONFIGURATION ===
# Path to the CSV file listing the crate names. Only the first column is read
# and the first (header) row is skipped.
# NOTE(review): this comment previously said "500 crate names" while the file
# name suggests 1000 -- confirm which is correct.
crates_csv = "random1000_crates.csv"

# Directory containing the tool output files (one report per file; only
# regular files directly inside this directory are scanned).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
reports_dir = "/Users/hassnain/Desktop/oopsla/cargo-sherlock/evaluation/rq3/"

# Output CSV file to create (overwritten on every run).
output_csv = "filtered_crates.csv"

# Base URL for checking presence on lib.rs; the crate name (without version)
# is appended directly to this prefix.
lib_rs_base = "https://lib.rs/crate/"

# === STEP 1: Load the crates from the CSV, skipping the header row ===
# Each non-empty first-column value (whitespace-stripped) becomes one entry.
crates_list = []
with open(crates_csv, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    # Skip the header row. The default argument keeps an empty file from
    # raising StopIteration; it simply yields an empty crate list instead.
    next(reader, None)
    crates_list = [
        row[0].strip()
        for row in reader
        if row and row[0].strip()  # ignore blank lines and empty first cells
    ]

print("Loaded crates from CSV:", crates_list)

# === STEP 2: Process each report file in the given directory ===
# For every regular file in `reports_dir` this extracts the crate name and
# severity label from the report text, checks whether the crate has a page on
# lib.rs, and appends [full crate name, severity, "Yes"/"No"] to `results`.

# Patterns are compiled once, outside the loop.
# Header line, e.g. "Analysis Report for bellande_rust_import-0.0.1".
report_header_re = re.compile(r'Analysis Report for (.+)')
# Trailing "-MAJOR.MINOR.PATCH" with an optional pre-release/build suffix
# (e.g. "-1.0.0-alpha.1", "-0.3.2+build5"). The name part is matched lazily:
# crate names cannot contain '.', so the first "-<n>.<n>.<n>" is where the
# version starts.
version_suffix_re = re.compile(
    r'^(.*?)-(\d+\.\d+\.\d+(?:[-+][0-9A-Za-z.+-]+)?)$'
)
# Severity line, e.g. "Severity Label: CRITICAL".
severity_re = re.compile(r'Severity Label:\s*(\S+)')

# Seconds to wait for lib.rs before giving up; without a timeout a single
# stalled connection would block the whole run indefinitely.
lib_rs_timeout = 10

results = []
# Sorted so that the output row order is reproducible between runs.
for filename in sorted(os.listdir(reports_dir)):
    file_path = os.path.join(reports_dir, filename)
    # Skip if not a file
    if not os.path.isfile(file_path):
        continue

    # errors='replace' keeps stray non-text files (e.g. .DS_Store) from
    # aborting the run; they simply fail the header match below.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Extract the full crate name from the report header; files without a
    # header are not reports and are skipped.
    match = report_header_re.search(content)
    if not match:
        continue
    full_crate = match.group(1).strip()

    # Remove the version suffix if present; otherwise use the name as-is.
    match_version = version_suffix_re.match(full_crate)
    crate_base = match_version.group(1) if match_version else full_crate

    # Do not filter any crates—process all reports regardless of CSV membership.
    # Extract the severity label, falling back to "UNKNOWN" when absent.
    match_label = severity_re.search(content)
    severity = match_label.group(1).strip() if match_label else "UNKNOWN"

    # === STEP 3: Check if the crate appears on lib.rs ===
    # Only an HTTP 200 counts as present. Any other status or a failed
    # request is recorded as "No" (best effort), but failures are reported
    # so they are not silently mistaken for a genuine absence.
    lib_rs_url = f"{lib_rs_base}{crate_base}"
    try:
        response = requests.get(lib_rs_url, timeout=lib_rs_timeout)
    except requests.RequestException as exc:
        print(f"Warning: lib.rs lookup failed for {crate_base}: {exc}")
        on_lib_rs = "No"
    else:
        on_lib_rs = "Yes" if response.status_code == 200 else "No"

    results.append([full_crate, severity, on_lib_rs])

# === STEP 4: Write the results to a new CSV file ===
# The header row and every collected result row are emitted in a single call.
header_row = ["Crate Name", "Severity Label", "On lib.rs"]
with open(output_csv, mode='w', newline='') as out_handle:
    csv.writer(out_handle).writerows([header_row, *results])

print(f"Output CSV created: {output_csv}")


Loaded crates from CSV: ['nanohtml2text', 'nanohttp', 'nanoid', 'nanoid-dictionary', 'nanoid-wasm', 'nanoid_cli', 'nanoimage', 'nanoir', 'nanokit', 'nanoleaf', 'nanoly', 'nanom', 'nanom-derive', 'nanomsg', 'nanomsg-sys', 'nanopass', 'nanopow-rs', 'nanopre', 'nanopub', 'pic2lcd', 'pic2txt', 'pic32-config-sector', 'pic32-hal', 'pic32mx2xx', 'pic32mx470', 'pic32mx567', 'pic8259', 'pic8259_simple', 'pic8259_x86', 'pica', 'picahq', 'picam', 'pican', 'picard', 'picard-core', 'picasso-core', 'picat', 'piccolo', 'piccolo-util', 'crate-pro', 'crate-publish-test', 'crate-publish-test-foo', 'crate-published', 'crate-race', 'crate-redeem-in-kind', 'crate-root', 'crate-sample-pdaj', 'crate-settings', 'crate-starter', 'crate-template', 'crate-test-20220118', 'crate-test-bolt', 'crate-test-xxx', 'crate-token', 'crate-version', 'crate-version-experiment', 'crate-web', 'crate2bib', 'lmdb-rs', 'lmdb-rs-m', 'lmdb-sys', 'lmdb-sys2', 'lmdb-zero', 'lmfu', 'lmgpt', 'lminc', 'lmk', 'lml', 'lmml', 'lmml-cli', 