# Notebook v4 – Registry Enrichment, Delta Features & Transition Model

**Input:**  
- `../data/meta/version_delta_features_live.csv` (output from v3)

Each row in `version_delta_features_live.csv` represents a version transition:

> (ecosystem, package_name, prev_version → version)

with an existing label:

- `y_malicious` – label for the **current** version.

This notebook:

1. Loads `version_delta_features_live.csv` as the base **delta table**.
2. Ensures labels (`y_malicious`, `prev_label_malicious`) are available.
3. Fetches **PyPI** / **npm** registry metadata for all versions.
4. Builds per-version **static size & density** features.
5. Converts those static features into **delta/ratio features** per transition.
6. Adds **registry-inspired derived features** (unified size, density proxies,
   log-magnitude, sign, “large jump” flags).
7. Performs **feature selection** with ANOVA F-test (`SelectKBest`).
8. Trains:
   - A **Random Forest** on all transitions.
   - A **transition model** only on rows where `prev_label_malicious == 0`
     (known-good → next version).
9. Saves selected feature names to:
   - `../data/meta/selected_delta_features_v4.csv`.

The original v3 table (`version_delta_features_live.csv`) is the **only input**.


In [1]:
# Cell 2 – Imports & configuration

from pathlib import Path
from typing import Dict, Any, Optional

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

pd.set_option("future.no_silent_downcasting", True)

META_DIR = Path("../data/meta")
DELTA_CSV = META_DIR / "version_delta_features_live.csv"
LABELS_VERSION_CSV = META_DIR / "labels_version.csv"


In [2]:
# Cell 3 – Load base delta table (v3 output) and normalize labels

delta_df = pd.read_csv(DELTA_CSV)
print(f"Loaded delta features: {delta_df.shape}")
print("Columns:", delta_df.columns.tolist())

# --- Normalize current-version label: y_malicious ----------------------
if "y_malicious" not in delta_df.columns:
    if "label_malicious" in delta_df.columns:
        print("[INFO] Using label_malicious as y_malicious")
        delta_df["y_malicious"] = delta_df["label_malicious"].astype(int)
    else:
        raise ValueError(
            "Expected column 'y_malicious' or 'label_malicious' not found in delta_df"
        )

print("\nLabel distribution (current version, y_malicious):")
print(delta_df["y_malicious"].value_counts(dropna=False))

# --- Ensure prev_label_malicious exists (for transition model) ---------
if "prev_label_malicious" not in delta_df.columns:
    if LABELS_VERSION_CSV.exists():
        print("\n[INFO] Reconstructing prev_label_malicious from labels_version.csv")
        labels_version = pd.read_csv(LABELS_VERSION_CSV)

        # Try to detect malicious label column in labels_version
        label_col_candidates = [c for c in labels_version.columns if "malicious" in c]
        if not label_col_candidates:
            raise ValueError(
                "labels_version.csv does not contain a malicious label column; "
                "expected something like 'label_malicious' or 'y_malicious'."
            )
        lv_label_col = label_col_candidates[0]

        expected = {"ecosystem", "package_name", "version"}
        missing_lv = expected - set(labels_version.columns)
        if missing_lv:
            raise ValueError(
                f"labels_version.csv is missing columns: {missing_lv}. "
                "Adjust the join logic accordingly."
            )

        labels_prev = labels_version.rename(
            columns={
                "version": "prev_version",
                lv_label_col: "prev_label_malicious",
            }
        )[["ecosystem", "package_name", "prev_version", "prev_label_malicious"]]

        before_rows = len(delta_df)
        delta_df = delta_df.merge(
            labels_prev,
            on=["ecosystem", "package_name", "prev_version"],
            how="left",
        )
        after_rows = len(delta_df)
        if before_rows != after_rows:
            print(
                f"[WARN] Row count changed when merging prev_label_malicious: "
                f"{before_rows} -> {after_rows}. Check for key duplication."
            )
    else:
        print(
            "\n[WARN] labels_version.csv not found; "
            "prev_label_malicious will remain unavailable."
        )

if "prev_label_malicious" in delta_df.columns:
    print("\nLabel distribution (previous version, prev_label_malicious):")
    print(delta_df["prev_label_malicious"].value_counts(dropna=False))
else:
    print("\n[WARN] prev_label_malicious is not present in delta_df.")

delta_df.head()


Loaded delta features: (717, 18)
Columns: ['ecosystem', 'package_name', 'prev_version', 'version', 'y_malicious', 'delta_num_requires_dist', 'delta_summary_len', 'delta_description_len', 'delta_num_classifiers', 'delta_has_author', 'delta_has_license', 'delta_num_dependencies', 'delta_num_dev_dependencies', 'delta_num_scripts', 'delta_num_keywords', 'delta_version_len', 'delta_version_num_dots', 'delta_version_has_prerelease']

Label distribution (current version, y_malicious):
y_malicious
0    453
1    264
Name: count, dtype: int64

[INFO] Reconstructing prev_label_malicious from labels_version.csv

Label distribution (previous version, prev_label_malicious):
prev_label_malicious
NaN     710
True      7
Name: count, dtype: int64


Unnamed: 0,ecosystem,package_name,prev_version,version,y_malicious,delta_num_requires_dist,delta_summary_len,delta_description_len,delta_num_classifiers,delta_has_author,delta_has_license,delta_num_dependencies,delta_num_dev_dependencies,delta_num_scripts,delta_num_keywords,delta_version_len,delta_version_num_dots,delta_version_has_prerelease,prev_label_malicious
0,npm,@accordproject/concerto-analysis,3.22.1-20250825143753,3.23.0,0,,,0.0,,,,0.0,0.0,0.0,0.0,-15.0,0.0,-1.0,
1,npm,@accordproject/concerto-analysis,3.23.0,3.23.1-20250827125749,0,,,0.0,,,,0.0,0.0,0.0,0.0,15.0,0.0,1.0,
2,npm,@accordproject/concerto-analysis,3.23.1-20250827125749,3.24.0,0,,,0.0,,,,0.0,0.0,0.0,0.0,-15.0,0.0,-1.0,
3,npm,@accordproject/concerto-analysis,3.24.0,3.24.1-20250827130031,1,,,0.0,,,,0.0,0.0,0.0,0.0,15.0,0.0,1.0,
4,npm,@accordproject/concerto-linter,3.22.1-20250825143753,3.23.0,0,,,0.0,,,,0.0,0.0,0.0,0.0,-15.0,0.0,-1.0,


In [3]:
# Cell 4 – HTTP session, caches, and key column names

SESSION = requests.Session()
HTTP_TIMEOUT = 10

pypi_cache: Dict[tuple, Optional[Dict[str, Any]]] = {}
npm_cache: Dict[tuple, Optional[Dict[str, Any]]] = {}

ECO_COL = "ecosystem"
PKG_COL = "package_name"
VER_COL = "version"

missing_keys = [c for c in [ECO_COL, PKG_COL, VER_COL, "prev_version"] if c not in delta_df.columns]
if missing_keys:
    raise ValueError(
        f"Expected columns {missing_keys} in delta_df but they are missing."
    )

print("\nEcosystems present:")
print(delta_df[ECO_COL].value_counts(dropna=False))



Ecosystems present:
ecosystem
npm     568
pypi    149
Name: count, dtype: int64


In [22]:
# Cell 5 – PyPI metadata fetch

def fetch_pypi_release(name: str, version: str) -> Optional[Dict[str, Any]]:
    """
    Fetch PyPI metadata for a given project/version from /pypi/<project>/<version>/json.
    Returns a dict with derived fields or None on failure.
    """
    key = (name, version)
    if key in pypi_cache:
        return pypi_cache[key]

    url = f"https://pypi.org/pypi/{name}/{version}/json"
    try:
        resp = SESSION.get(url, timeout=HTTP_TIMEOUT)
        if resp.status_code != 200:
            pypi_cache[key] = None
            return None
        data = resp.json()
    except Exception as e:
        print(f"[WARN] PyPI request failed for {name} {version}: {e}")
        pypi_cache[key] = None
        return None

    urls = data.get("urls") or []
    if not urls:
        pypi_cache[key] = None
        return None

    # Prefer wheel, fallback to sdist, else first file
    preferred = None
    for f in urls:
        if f.get("packagetype") == "bdist_wheel":
            preferred = f
            break
    if preferred is None:
        for f in urls:
            if f.get("packagetype") == "sdist":
                preferred = f
                break
    if preferred is None:
        preferred = urls[0]

    result = {
        "pypi_size_bytes": preferred.get("size"),
        "pypi_packagetype": preferred.get("packagetype"),
        "pypi_filename": preferred.get("filename"),
        "pypi_url": preferred.get("url"),
    }
    pypi_cache[key] = result
    return result


# Quick test on a few PyPI rows (if any)
sample_pypi = delta_df[delta_df[ECO_COL] == "pypi"]
print(f"[INFO] Sample PyPI rows found: {len(sample_pypi)}")
# for _, row in sample_pypi.iterrows():
#     info = fetch_pypi_release(row[PKG_COL], row[VER_COL])
#     print(row[PKG_COL], row[VER_COL], "->", info)


[INFO] Sample PyPI rows found: 149


In [5]:
# Cell 6 – npm metadata fetch

def fetch_npm_version(name: str, version: str) -> Optional[Dict[str, Any]]:
    """
    Fetch npm registry metadata for a given package version from
    https://registry.npmjs.org/<name>/<version>.
    Returns a dict with derived fields or None on failure.
    """
    key = (name, version)
    if key in npm_cache:
        return npm_cache[key]

    url = f"https://registry.npmjs.org/{name}/{version}"
    try:
        resp = SESSION.get(url, timeout=HTTP_TIMEOUT)
        if resp.status_code != 200:
            npm_cache[key] = None
            return None
        data = resp.json()
    except Exception as e:
        print(f"[WARN] npm request failed for {name}@{version}: {e}")
        npm_cache[key] = None
        return None

    dist = data.get("dist") or {}
    result = {
        "npm_tarball_url": dist.get("tarball"),
        "npm_shasum": dist.get("shasum"),
        "npm_integrity": dist.get("integrity"),
        "npm_unpacked_size_bytes": dist.get("unpackedSize"),
        "npm_file_count": dist.get("fileCount"),
    }
    npm_cache[key] = result
    return result


# Quick test on a few npm rows (if any)
sample_npm = delta_df[delta_df[ECO_COL] == "npm"]
print(f"[INFO] Sample npm rows found: {len(sample_npm)}")
# for _, row in sample_npm.iterrows(): 
#     info = fetch_npm_version(row[PKG_COL], row[VER_COL])
#     print(row[PKG_COL], row[VER_COL], "->", info)


[INFO] Sample npm rows found: 568


In [6]:
# Cell 7 – Build per-version registry metadata table

# Unique (ecosystem, package_name, version) for current versions
current_versions = delta_df[[ECO_COL, PKG_COL, VER_COL]].drop_duplicates()

# Unique combos for prev_version (rename to match VER_COL)
prev_versions = (
    delta_df[[ECO_COL, PKG_COL, "prev_version"]]
    .rename(columns={"prev_version": VER_COL})
    .dropna(subset=[VER_COL])
    .drop_duplicates()
)

all_versions = (
    pd.concat([current_versions, prev_versions], ignore_index=True)
    .drop_duplicates()
)

print(f"[INFO] Unique (ecosystem, package, version) to fetch: {len(all_versions)}")

records = []

for _, row in tqdm(all_versions.iterrows(), total=len(all_versions)):
    ecos = row[ECO_COL]
    name = row[PKG_COL]
    ver = row[VER_COL]

    rec: Dict[str, Any] = {
        ECO_COL: ecos,
        PKG_COL: name,
        VER_COL: ver,
    }

    meta: Optional[Dict[str, Any]] = None
    if ecos == "pypi":
        meta = fetch_pypi_release(name, ver)
    elif ecos == "npm":
        meta = fetch_npm_version(name, ver)
    else:
        meta = None

    if meta is not None:
        rec.update(meta)

    records.append(rec)

registry_versions = pd.DataFrame(records)
print("registry_versions shape:", registry_versions.shape)
print("registry_versions columns:", registry_versions.columns.tolist())
registry_versions.head()


[INFO] Unique (ecosystem, package, version) to fetch: 627


  0%|          | 0/627 [00:00<?, ?it/s]

registry_versions shape: (627, 12)
registry_versions columns: ['ecosystem', 'package_name', 'version', 'npm_tarball_url', 'npm_shasum', 'npm_integrity', 'npm_unpacked_size_bytes', 'npm_file_count', 'pypi_size_bytes', 'pypi_packagetype', 'pypi_filename', 'pypi_url']


Unnamed: 0,ecosystem,package_name,version,npm_tarball_url,npm_shasum,npm_integrity,npm_unpacked_size_bytes,npm_file_count,pypi_size_bytes,pypi_packagetype,pypi_filename,pypi_url
0,npm,@accordproject/concerto-analysis,3.23.0,https://registry.npmjs.org/@accordproject/conc...,31a9a9f3a76ed8c36f254e71a722375f331d909b,sha512-43JFp937RXuoOvXO5ynk+r8tofVCVivkUpv8Oon...,147757.0,61.0,,,,
1,npm,@accordproject/concerto-analysis,3.23.1-20250827125749,https://registry.npmjs.org/@accordproject/conc...,745caacfefb84ac50cefc4d9337188f5f71f025d,sha512-TuGa38FxgFJNbVT/kx/BtzeNzOD5/8lFGmw5nkp...,147802.0,61.0,,,,
2,npm,@accordproject/concerto-analysis,3.24.0,https://registry.npmjs.org/@accordproject/conc...,2e352eae9f6a0ac7c7f4abf1a708c2177c8889bf,sha512-9zYvF0vX4iRS9L7/QC4N4s91YzXAiRiUzX4UR4k...,147757.0,61.0,,,,
3,npm,@accordproject/concerto-analysis,3.24.1-20250827130031,https://registry.npmjs.org/@accordproject/conc...,4226c5234d18b36b61a40a78112454d63516770a,sha512-Tbkv+SiXyO2NXBRjpbPCGfFecr3Rm5gAOuM2qfe...,147802.0,61.0,,,,
4,npm,@accordproject/concerto-linter,3.23.0,https://registry.npmjs.org/@accordproject/conc...,014bc2c5a1a42045f3fd7f3635ddd5ea576a135b,sha512-tQ23JwR69JRH5X8XAfGRtpFhPLlHucGa3YsgDRh...,71080.0,39.0,,,,


In [None]:
# Cell 8 – Per-version static size & density features (with column aliases)

reg_cols = set(registry_versions.columns)

# --- Alias raw registry columns to the names v4 expects ----------------
# PyPI: some older code may have left this as just "size"
if "pypi_size_bytes" not in registry_versions.columns:
    if "size" in registry_versions.columns:
        print("[INFO] Using registry_versions['size'] as pypi_size_bytes")
        registry_versions["pypi_size_bytes"] = registry_versions["size"]
    else:
        print("[WARN] No pypi_size_bytes or size column found; PyPI size deltas will be NaN")

# npm: in case of raw names "unpackedSize" / "fileCount"
if "npm_unpacked_size_bytes" not in registry_versions.columns and "unpackedSize" in registry_versions.columns:
    print("[INFO] Using registry_versions['unpackedSize'] as npm_unpacked_size_bytes")
    registry_versions["npm_unpacked_size_bytes"] = registry_versions["unpackedSize"]

if "npm_file_count" not in registry_versions.columns and "fileCount" in registry_versions.columns:
    print("[INFO] Using registry_versions['fileCount'] as npm_file_count")
    registry_versions["npm_file_count"] = registry_versions["fileCount"]

# Refresh set of columns after aliasing
reg_cols = set(registry_versions.columns)

# --- Compute unified static sizes -------------------------------------
def choose_static_uncompressed(row: pd.Series) -> float:
    npm_size = row.get("npm_unpacked_size_bytes", np.nan)
    pypi_size = row.get("pypi_size_bytes", np.nan)
    if pd.notna(npm_size):
        return float(npm_size)
    if pd.notna(pypi_size):
        return float(pypi_size)
    return np.nan

registry_versions["static_size_uncompressed_bytes"] = registry_versions.apply(
    choose_static_uncompressed, axis=1
)

# "Compressed" size – only really for PyPI, if present
registry_versions["static_size_compressed_bytes"] = registry_versions.get(
    "pypi_size_bytes", np.nan
)

# Unified "package size" alias
registry_versions["static_pkg_size_bytes"] = registry_versions[
    "static_size_uncompressed_bytes"
]

# --- Entropy / density-like proxy: bytes per file ---------------------
def compute_entropy_ratio(row: pd.Series) -> float:
    size = row.get("static_pkg_size_bytes", np.nan)
    count = row.get("npm_file_count", np.nan)
    if pd.isna(size) or pd.isna(count) or count <= 0:
        return np.nan
    return float(size) / float(count)

registry_versions["entropy_ratio_size"] = registry_versions.apply(
    compute_entropy_ratio, axis=1
)

DENSITY_THRESHOLD = 5000.0
registry_versions["entropy_indicator"] = np.where(
    registry_versions["entropy_ratio_size"] >= DENSITY_THRESHOLD, 1.0, 0.0
)

# ---- Safe debug view: only show columns that actually exist -----------
candidate_cols = [
    ECO_COL,
    PKG_COL,
    VER_COL,
    "pypi_size_bytes",
    "npm_unpacked_size_bytes",
    "npm_file_count",
    "static_size_uncompressed_bytes",
    "static_size_compressed_bytes",
    "entropy_ratio_size",
    "entropy_indicator",
]

cols_to_show = [c for c in candidate_cols if c in registry_versions.columns]

print("[INFO] Sample of registry_versions with static features:")
registry_versions[cols_to_show].head()


[INFO] Sample of registry_versions with static features:


Unnamed: 0,ecosystem,package_name,version,pypi_size_bytes,npm_unpacked_size_bytes,npm_file_count,static_size_uncompressed_bytes,static_size_compressed_bytes,entropy_ratio_size,entropy_indicator
0,npm,@accordproject/concerto-analysis,3.23.0,,147757.0,61.0,147757.0,,2422.245902,0.0
1,npm,@accordproject/concerto-analysis,3.23.1-20250827125749,,147802.0,61.0,147802.0,,2422.983607,0.0
2,npm,@accordproject/concerto-analysis,3.24.0,,147757.0,61.0,147757.0,,2422.245902,0.0
3,npm,@accordproject/concerto-analysis,3.24.1-20250827130031,,147802.0,61.0,147802.0,,2422.983607,0.0
4,npm,@accordproject/concerto-linter,3.23.0,,71080.0,39.0,71080.0,,1822.564103,0.0


In [None]:
# Cell 9 – Build per-transition static deltas & ratios (robust to missing columns)

STATIC_BASE_COLS = [
    "static_size_uncompressed_bytes",
    "static_size_compressed_bytes",
    "pypi_size_bytes",             # may be missing if no PyPI data
    "npm_unpacked_size_bytes",     # may be missing if aliasing didn't happen
    "npm_file_count",
    "entropy_ratio_size",
    "entropy_indicator",
]

key_cols = [ECO_COL, PKG_COL, VER_COL]

# Only select the static columns that actually exist in registry_versions
available_base_cols = [c for c in STATIC_BASE_COLS if c in registry_versions.columns]
missing_base_cols = [c for c in STATIC_BASE_COLS if c not in registry_versions.columns]

if missing_base_cols:
    print(f"[INFO] Skipping missing static base cols in registry_versions: {missing_base_cols}")

static_map: Dict[tuple, Dict[str, Any]] = {}

for _, r in registry_versions[key_cols + available_base_cols].iterrows():
    key = (r[ECO_COL], r[PKG_COL], r[VER_COL])

    # Build a dict for *all* expected STATIC_BASE_COLS, filling missing ones with NaN
    stats = {}
    for col in STATIC_BASE_COLS:
        stats[col] = r.get(col, np.nan)
    static_map[key] = stats


def safe_ratio(num: float, den: float) -> float:
    if den is None or (isinstance(den, float) and (np.isnan(den) or den == 0.0)):
        return np.nan
    return num / den


delta_static_records = []

for _, r in tqdm(delta_df.iterrows(), total=len(delta_df)):
    ecos = r[ECO_COL]
    name = r[PKG_COL]
    ver_curr = r[VER_COL]
    ver_prev = r["prev_version"]

    curr_stats = static_map.get((ecos, name, ver_curr), {})
    prev_stats = static_map.get((ecos, name, ver_prev), {})

    rec: Dict[str, Any] = {}

    # ---- Unified uncompressed static size ---------------------------------
    curr_uncomp = float(curr_stats.get("static_size_uncompressed_bytes", np.nan))
    prev_uncomp = float(prev_stats.get("static_size_uncompressed_bytes", np.nan))

    rec["static_size_prev_uncompressed_bytes"] = prev_uncomp
    rec["static_size_curr_uncompressed_bytes"] = curr_uncomp

    if np.isnan(curr_uncomp) or np.isnan(prev_uncomp):
        rec["static_size_delta_vs_prev"] = np.nan
        rec["static_size_ratio_vs_prev"] = np.nan
        rec["ratio_static_size_uncompressed_bytes"] = np.nan
    else:
        rec["static_size_delta_vs_prev"] = curr_uncomp - prev_uncomp
        ratio = safe_ratio(curr_uncomp, prev_uncomp)
        rec["static_size_ratio_vs_prev"] = ratio
        rec["ratio_static_size_uncompressed_bytes"] = ratio

    # ---- PyPI size delta/ratio (will just be NaN if you have no PyPI) -----
    curr_pypi = float(curr_stats.get("pypi_size_bytes", np.nan))
    prev_pypi = float(prev_stats.get("pypi_size_bytes", np.nan))

    if np.isnan(curr_pypi) or np.isnan(prev_pypi):
        rec["delta_pypi_size_bytes"] = np.nan
        rec["ratio_pypi_size_bytes"] = np.nan
    else:
        rec["delta_pypi_size_bytes"] = curr_pypi - prev_pypi
        rec["ratio_pypi_size_bytes"] = safe_ratio(curr_pypi, prev_pypi)

    # ---- npm unpacked size delta/ratio ------------------------------------
    curr_npm_size = float(curr_stats.get("npm_unpacked_size_bytes", np.nan))
    prev_npm_size = float(prev_stats.get("npm_unpacked_size_bytes", np.nan))

    if np.isnan(curr_npm_size) or np.isnan(prev_npm_size):
        rec["delta_npm_unpacked_size_bytes"] = np.nan
        rec["ratio_npm_unpacked_size_bytes"] = np.nan
    else:
        rec["delta_npm_unpacked_size_bytes"] = curr_npm_size - prev_npm_size
        rec["ratio_npm_unpacked_size_bytes"] = safe_ratio(
            curr_npm_size, prev_npm_size
        )

    # ---- npm file count delta/ratio ---------------------------------------
    curr_npm_files = float(curr_stats.get("npm_file_count", np.nan))
    prev_npm_files = float(prev_stats.get("npm_file_count", np.nan))

    if np.isnan(curr_npm_files) or np.isnan(prev_npm_files):
        rec["delta_npm_file_count"] = np.nan
        rec["ratio_npm_file_count"] = np.nan
    else:
        rec["delta_npm_file_count"] = curr_npm_files - prev_npm_files
        rec["ratio_npm_file_count"] = safe_ratio(curr_npm_files, prev_npm_files)

    # ---- Entropy ratio delta/ratio ----------------------------------------
    curr_entropy = float(curr_stats.get("entropy_ratio_size", np.nan))
    prev_entropy = float(prev_stats.get("entropy_ratio_size", np.nan))

    if np.isnan(curr_entropy) or np.isnan(prev_entropy):
        rec["delta_entropy_ratio_size"] = np.nan
        rec["ratio_entropy_ratio_size"] = np.nan
    else:
        rec["delta_entropy_ratio_size"] = curr_entropy - prev_entropy
        rec["ratio_entropy_ratio_size"] = safe_ratio(curr_entropy, prev_entropy)

    # ---- Entropy indicator delta/ratio ------------------------------------
    curr_indicator = float(curr_stats.get("entropy_indicator", np.nan))
    prev_indicator = float(prev_stats.get("entropy_indicator", np.nan))

    if np.isnan(curr_indicator) or np.isnan(prev_indicator):
        rec["delta_entropy_indicator"] = np.nan
        rec["ratio_entropy_indicator"] = np.nan
    else:
        rec["delta_entropy_indicator"] = curr_indicator - prev_indicator
        rec["ratio_entropy_indicator"] = safe_ratio(curr_indicator, prev_indicator)

    delta_static_records.append(rec)

delta_static_df = pd.DataFrame(delta_static_records, index=delta_df.index)
print("delta_static_df shape:", delta_static_df.shape)
delta_static_df.head()


  0%|          | 0/717 [00:00<?, ?it/s]

delta_static_df shape: (717, 15)


Unnamed: 0,static_size_prev_uncompressed_bytes,static_size_curr_uncompressed_bytes,static_size_delta_vs_prev,static_size_ratio_vs_prev,ratio_static_size_uncompressed_bytes,delta_pypi_size_bytes,ratio_pypi_size_bytes,delta_npm_unpacked_size_bytes,ratio_npm_unpacked_size_bytes,delta_npm_file_count,ratio_npm_file_count,delta_entropy_ratio_size,ratio_entropy_ratio_size,delta_entropy_indicator,ratio_entropy_indicator
0,147802.0,147757.0,-45.0,0.999696,0.999696,,,-45.0,0.999696,0.0,1.0,-0.737705,0.999696,0.0,
1,147757.0,147802.0,45.0,1.000305,1.000305,,,45.0,1.000305,0.0,1.0,0.737705,1.000305,0.0,
2,147802.0,147757.0,-45.0,0.999696,0.999696,,,-45.0,0.999696,0.0,1.0,-0.737705,0.999696,0.0,
3,147757.0,147802.0,45.0,1.000305,1.000305,,,45.0,1.000305,0.0,1.0,0.737705,1.000305,0.0,
4,122945.0,71080.0,-51865.0,0.578145,0.578145,,,-51865.0,0.578145,-25.0,0.609375,-98.451522,0.94875,0.0,


In [9]:
# Cell 10 – Merge new static-delta features into delta_df

delta_df = pd.concat(
    [delta_df.reset_index(drop=True), delta_static_df.reset_index(drop=True)],
    axis=1,
)

print("Enriched delta_df shape:", delta_df.shape)
print("delta_df columns (first 40):", delta_df.columns.tolist()[:40])

# (Optional) save a v4-enriched version; keep v3 output intact
out_delta_v4_csv = META_DIR / "version_delta_features_v4.csv"
delta_df.to_csv(out_delta_v4_csv, index=False)
print(f"[INFO] Saved v4-enriched delta table to {out_delta_v4_csv}")


Enriched delta_df shape: (717, 34)
delta_df columns (first 40): ['ecosystem', 'package_name', 'prev_version', 'version', 'y_malicious', 'delta_num_requires_dist', 'delta_summary_len', 'delta_description_len', 'delta_num_classifiers', 'delta_has_author', 'delta_has_license', 'delta_num_dependencies', 'delta_num_dev_dependencies', 'delta_num_scripts', 'delta_num_keywords', 'delta_version_len', 'delta_version_num_dots', 'delta_version_has_prerelease', 'prev_label_malicious', 'static_size_prev_uncompressed_bytes', 'static_size_curr_uncompressed_bytes', 'static_size_delta_vs_prev', 'static_size_ratio_vs_prev', 'ratio_static_size_uncompressed_bytes', 'delta_pypi_size_bytes', 'ratio_pypi_size_bytes', 'delta_npm_unpacked_size_bytes', 'ratio_npm_unpacked_size_bytes', 'delta_npm_file_count', 'ratio_npm_file_count', 'delta_entropy_ratio_size', 'ratio_entropy_ratio_size', 'delta_entropy_indicator', 'ratio_entropy_indicator']
[INFO] Saved v4-enriched delta table to ..\data\meta\version_delta_featur

## Cell 11 – Registry-inspired derived features

In this step we derive **registry-style anomaly features** from existing delta / ratio columns:

- **Unified size delta / ratio**  
  - `delta_unified_size_bytes` picks the first available size delta per row  
    (e.g., offline static size delta, PyPI size delta, npm unpacked size delta).  
  - `ratio_unified_size` does the same for multiplicative size change.  
  - This gives the model a single “best” view of **how much the package size changed** between versions, independent of ecosystem or data source.

- **Density / “bytes per file” proxies**  
  - `ratio_npm_bytes_per_file_proxy`, `delta_npm_bytes_per_file_proxy`, and `delta_unified_density_proxy` approximate changes in **bytes per file**.  
  - These act as weak proxies for “packedness” or density: large size increases without similar file-count increases may indicate new blobs, embedded payloads, or obfuscated content.

All features are made **NaN-safe** and given neutral defaults so they can be used directly in feature selection and modeling.


In [10]:
# Cell 11 – Registry-inspired derived features (all delta / ratio based)

cols = set(delta_df.columns)

delta_cols = [c for c in delta_df.columns if c.startswith("delta_")]
ratio_cols = [c for c in delta_df.columns if c.startswith("ratio_")]

print(f"Found {len(delta_cols)} delta_* features and {len(ratio_cols)} ratio_* features")

# ---- Unified size delta / ratio --------------------------------------
unified_delta_sources = [
    "static_size_delta_vs_prev",        # unified static size delta (if present)
    "delta_pypi_size_bytes",           # PyPI size delta
    "delta_npm_unpacked_size_bytes",   # npm unpacked size delta
]

unified_ratio_sources = [
    "static_size_ratio_vs_prev",               # unified static size ratio (if present)
    "ratio_pypi_size_bytes",
    "ratio_static_size_uncompressed_bytes",
    "ratio_npm_unpacked_size_bytes",
]

if any(src in cols for src in unified_delta_sources):
    delta_df["delta_unified_size_bytes"] = np.nan
    for src in unified_delta_sources:
        if src in cols:
            delta_df["delta_unified_size_bytes"] = delta_df[
                "delta_unified_size_bytes"
            ].fillna(delta_df[src])
else:
    delta_df["delta_unified_size_bytes"] = 0.0

if any(src in cols for src in unified_ratio_sources):
    delta_df["ratio_unified_size"] = np.nan
    for src in unified_ratio_sources:
        if src in cols:
            delta_df["ratio_unified_size"] = delta_df["ratio_unified_size"].fillna(
                delta_df[src]
            )
else:
    delta_df["ratio_unified_size"] = 1.0  # neutral "no change" ratio

# ---- Density / entropy-like proxies: bytes per file-ish --------------

# npm side: ratio-based "bytes per file" proxy
if {"ratio_npm_unpacked_size_bytes", "ratio_npm_file_count"} <= cols:
    denom = delta_df["ratio_npm_file_count"].replace(0, np.nan)
    delta_df["ratio_npm_bytes_per_file_proxy"] = (
        delta_df["ratio_npm_unpacked_size_bytes"] / denom
    )
    delta_df["ratio_npm_bytes_per_file_proxy"] = (
        delta_df["ratio_npm_bytes_per_file_proxy"]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(1.0)
    )

# npm side: delta-based "bytes per file" proxy
if {"delta_npm_unpacked_size_bytes", "delta_npm_file_count"} <= cols:
    denom = delta_df["delta_npm_file_count"].replace(0, np.nan)
    delta_df["delta_npm_bytes_per_file_proxy"] = (
        delta_df["delta_npm_unpacked_size_bytes"] / denom
    )
    delta_df["delta_npm_bytes_per_file_proxy"] = (
        delta_df["delta_npm_bytes_per_file_proxy"]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )

# generic unified density proxy from static sizes + file count if available
if {"static_size_delta_vs_prev", "delta_npm_file_count"} <= cols:
    denom = delta_df["delta_npm_file_count"].replace(0, np.nan)
    delta_df["delta_unified_density_proxy"] = (
        delta_df["static_size_delta_vs_prev"] / denom
    )
    delta_df["delta_unified_density_proxy"] = (
        delta_df["delta_unified_density_proxy"]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )

density_cols = [
    c
    for c in [
        "ratio_npm_bytes_per_file_proxy",
        "delta_npm_bytes_per_file_proxy",
        "delta_unified_density_proxy",
    ]
    if c in delta_df.columns
]

print("Density / entropy-like columns now present:", density_cols)
delta_df[density_cols].head()


Found 18 delta_* features and 6 ratio_* features
Density / entropy-like columns now present: ['ratio_npm_bytes_per_file_proxy', 'delta_npm_bytes_per_file_proxy', 'delta_unified_density_proxy']


Unnamed: 0,ratio_npm_bytes_per_file_proxy,delta_npm_bytes_per_file_proxy,delta_unified_density_proxy
0,0.999696,0.0,0.0
1,1.000305,0.0,0.0
2,0.999696,0.0,0.0
3,1.000305,0.0,0.0
4,0.94875,2074.6,2074.6


## Cell 12 – Magnitude, log-magnitude, and sign of size-like deltas

Here we enrich all **size-like delta features** (columns containing `size`, `unpacked`, or `bytes`) with three derived views:

- **Absolute magnitude**: `{col}_abs`  
  - Captures how big the change is in bytes, ignoring direction.

- **Log-magnitude**: `{col}_log1p_abs`  
  - Uses `log(1 + |delta|)` to reduce skew from very large jumps and make values more model-friendly.

- **Direction flag**: `{col}_sign`  
  - Encodes whether the size **decreased (-1)**, **stayed similar (0)**, or **increased (+1)**.  
  - This separates the idea of “how big was the change?” from “did it grow or shrink?”

Together, these features let the model reason about **both direction and scale** of size changes across different size-related signals.


In [11]:
# Cell 12 – Magnitude, log-magnitude, and sign for size-like deltas (NaN-safe)

size_like_delta_cols = []

for c in delta_cols + ["static_size_delta_vs_prev", "delta_unified_size_bytes"]:
    if c in cols and ("size" in c or "unpacked" in c or "bytes" in c):
        size_like_delta_cols.append(c)

size_like_delta_cols = sorted(set(size_like_delta_cols))
print("Size-like delta columns:", size_like_delta_cols)

for c in size_like_delta_cols:
    # Absolute change
    abs_col = f"{c}_abs"
    delta_df[abs_col] = delta_df[c].abs()

    # Log(1+|delta|) magnitude (helps strong skew)
    log_col = f"{c}_log1p_abs"
    delta_df[log_col] = np.log1p(delta_df[abs_col])

    # Direction flag: -1, 0, +1
    sign_col = f"{c}_sign"
    # NaN-safe: treat missing values as 0 before taking sign
    sign_vals = np.sign(delta_df[c].fillna(0.0))
    delta_df[sign_col] = sign_vals.astype("int8")

example_cols = []
for c in size_like_delta_cols[:2]:
    example_cols.extend([c, f"{c}_abs", f"{c}_log1p_abs", f"{c}_sign"])

print("Example size/derived columns:", example_cols)
delta_df[example_cols].head()


Size-like delta columns: ['delta_entropy_ratio_size', 'delta_npm_unpacked_size_bytes', 'delta_pypi_size_bytes', 'static_size_delta_vs_prev']
Example size/derived columns: ['delta_entropy_ratio_size', 'delta_entropy_ratio_size_abs', 'delta_entropy_ratio_size_log1p_abs', 'delta_entropy_ratio_size_sign', 'delta_npm_unpacked_size_bytes', 'delta_npm_unpacked_size_bytes_abs', 'delta_npm_unpacked_size_bytes_log1p_abs', 'delta_npm_unpacked_size_bytes_sign']


Unnamed: 0,delta_entropy_ratio_size,delta_entropy_ratio_size_abs,delta_entropy_ratio_size_log1p_abs,delta_entropy_ratio_size_sign,delta_npm_unpacked_size_bytes,delta_npm_unpacked_size_bytes_abs,delta_npm_unpacked_size_bytes_log1p_abs,delta_npm_unpacked_size_bytes_sign
0,-0.737705,0.737705,0.552565,-1,-45.0,45.0,3.828641,-1
1,0.737705,0.737705,0.552565,1,45.0,45.0,3.828641,1
2,-0.737705,0.737705,0.552565,-1,-45.0,45.0,3.828641,-1
3,0.737705,0.737705,0.552565,1,45.0,45.0,3.828641,1
4,-98.451522,98.451522,4.59967,-1,-51865.0,51865.0,10.856419,-1


## Cell 13 – Large-jump flags for size deltas

Finally, we add **binary “large jump” flags** for each size-like delta:

- For each size delta column, we compute the 95th percentile of `|delta|`.
- We then create `{col}_large_jump` which is:
  - `1` if `|delta|` is in the top 5% (unusually large change),
  - `0` otherwise.

These flags provide a simple, registry-inspired anomaly signal:  
> “This version changed size much more than most other updates.”

They are scale-aware (based on the observed distribution) and help the model highlight **rare, extreme size changes** that may correlate with malicious transitions.


In [12]:
# Cell 13 – "Large jump" flags for size deltas (top 5% by |delta|)

for c in size_like_delta_cols:
    q = delta_df[c].abs().quantile(0.95)
    flag_col = f"{c}_large_jump"
    delta_df[flag_col] = (delta_df[c].abs() >= q).astype("int8")

large_jump_cols = [f"{c}_large_jump" for c in size_like_delta_cols]
print("Large-jump flag columns:", large_jump_cols[:10])
delta_df[large_jump_cols].head()


Large-jump flag columns: ['delta_entropy_ratio_size_large_jump', 'delta_npm_unpacked_size_bytes_large_jump', 'delta_pypi_size_bytes_large_jump', 'static_size_delta_vs_prev_large_jump']


Unnamed: 0,delta_entropy_ratio_size_large_jump,delta_npm_unpacked_size_bytes_large_jump,delta_pypi_size_bytes_large_jump,static_size_delta_vs_prev_large_jump
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [13]:
# Cell 14 – Feature matrix construction (with constant-feature removal)

if "y_malicious" not in delta_df.columns:
    raise ValueError("Expected 'y_malicious' column in delta_df")

y = delta_df["y_malicious"].astype(int)

exclude_cols = ["y_malicious"]
exclude_cols.extend([c for c in delta_df.columns if c.startswith("label_")])
exclude_cols.extend([c for c in delta_df.columns if c.startswith("prev_label_")])
exclude_cols.extend(
    [c for c in ["ecosystem", "package_name", "version", "prev_version"] if c in delta_df.columns]
)

# First pick numeric candidates
numeric_candidates = [
    c
    for c in delta_df.columns
    if c not in exclude_cols and delta_df[c].dtype.kind in "biufc"
]

# Drop constant columns (zero variance)
nunique = delta_df[numeric_candidates].nunique(dropna=False)
numeric_cols = nunique[nunique > 1].index.tolist()

print(f"Numeric feature candidates (non-constant): {len(numeric_cols)}")
print("Example feature names:", numeric_cols[:20])

X = delta_df[numeric_cols].fillna(0)
print("X shape:", X.shape, "| y shape:", y.shape)


Numeric feature candidates (non-constant): 49
Example feature names: ['delta_num_requires_dist', 'delta_summary_len', 'delta_description_len', 'delta_num_classifiers', 'delta_has_author', 'delta_has_license', 'delta_num_dependencies', 'delta_num_dev_dependencies', 'delta_num_scripts', 'delta_num_keywords', 'delta_version_len', 'delta_version_num_dots', 'delta_version_has_prerelease', 'static_size_prev_uncompressed_bytes', 'static_size_curr_uncompressed_bytes', 'static_size_delta_vs_prev', 'static_size_ratio_vs_prev', 'ratio_static_size_uncompressed_bytes', 'delta_pypi_size_bytes', 'ratio_pypi_size_bytes']
X shape: (717, 49) | y shape: (717,)


In [14]:
# Cell 15 – Feature selection via ANOVA F-test (uses X, y from Cell 14)

from sklearn.feature_selection import SelectKBest, f_classif

if "X" not in globals() or "y" not in globals():
    raise RuntimeError("Run Cell 14 first to define X and y before feature selection.")

k = min(30, X.shape[1])  # allow up to 30 features
selector = SelectKBest(f_classif, k=k)
selector.fit(X, y)

selected_features = [numeric_cols[i] for i in selector.get_support(indices=True)]

print(f"Selected {len(selected_features)} features:")
for f in selected_features:
    print(f"  - {f}")

X_selected = delta_df[selected_features].fillna(0)

# Save selected feature names for downstream notebooks
from pathlib import Path
import pandas as pd

META_DIR = Path("../data/meta")
out_path = META_DIR / "selected_delta_features_v4.csv"
pd.DataFrame({"selected_feature": selected_features}).to_csv(out_path, index=False)
print(f"\nSaved selected features to {out_path.name}")


Selected 30 features:
  - delta_description_len
  - delta_num_classifiers
  - delta_num_scripts
  - delta_num_keywords
  - delta_version_len
  - delta_version_num_dots
  - delta_version_has_prerelease
  - static_size_prev_uncompressed_bytes
  - static_size_curr_uncompressed_bytes
  - static_size_ratio_vs_prev
  - ratio_static_size_uncompressed_bytes
  - ratio_pypi_size_bytes
  - ratio_npm_unpacked_size_bytes
  - ratio_npm_file_count
  - ratio_entropy_ratio_size
  - ratio_entropy_indicator
  - ratio_unified_size
  - ratio_npm_bytes_per_file_proxy
  - delta_npm_bytes_per_file_proxy
  - delta_unified_density_proxy
  - delta_entropy_ratio_size_log1p_abs
  - delta_entropy_ratio_size_sign
  - delta_npm_unpacked_size_bytes_log1p_abs
  - delta_npm_unpacked_size_bytes_sign
  - delta_pypi_size_bytes_abs
  - delta_pypi_size_bytes_log1p_abs
  - delta_pypi_size_bytes_sign
  - static_size_delta_vs_prev_log1p_abs
  - static_size_delta_vs_prev_sign
  - delta_pypi_size_bytes_large_jump

Saved selected 

  f = msb / msw


In [15]:
# Cell 16 – Train/test split and Random Forest (all transitions)

X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

clf_all = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight="balanced",
)
clf_all.fit(X_train, y_train)

y_pred = clf_all.predict(X_test)
y_proba = clf_all.predict_proba(X_test)[:, 1]

print("\n=== Model Performance (all transitions) ===")
print(classification_report(y_test, y_pred, target_names=["Benign", "Malicious"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc = roc_auc_score(y_test, y_proba)
print(f"\nROC-AUC (all transitions): {roc:.3f}")



=== Model Performance (all transitions) ===
              precision    recall  f1-score   support

      Benign       0.86      0.84      0.85        91
   Malicious       0.73      0.77      0.75        53

    accuracy                           0.81       144
   macro avg       0.80      0.80      0.80       144
weighted avg       0.82      0.81      0.81       144


Confusion Matrix:
[[76 15]
 [12 41]]

ROC-AUC (all transitions): 0.884


In [16]:
# Cell 17 – Feature importances (all transitions)

feat_imp_all = (
    pd.DataFrame(
        {"feature": selected_features, "importance": clf_all.feature_importances_}
    )
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nTop Feature Importances (all transitions):")
feat_imp_all.head(20)



Top Feature Importances (all transitions):


Unnamed: 0,feature,importance
0,static_size_delta_vs_prev_log1p_abs,0.136944
1,static_size_curr_uncompressed_bytes,0.124999
2,static_size_prev_uncompressed_bytes,0.121882
3,ratio_unified_size,0.065803
4,static_size_ratio_vs_prev,0.058537
5,delta_npm_unpacked_size_bytes_log1p_abs,0.056962
6,delta_entropy_ratio_size_log1p_abs,0.04969
7,ratio_static_size_uncompressed_bytes,0.048484
8,ratio_entropy_ratio_size,0.048042
9,ratio_npm_bytes_per_file_proxy,0.045669


In [17]:
# Cell 18 – Subset to transitions where previous version is known benign

if "prev_label_malicious" not in delta_df.columns:
    print(
        "[WARN] prev_label_malicious not available; "
        "transition model will reuse clf_all."
    )
    X_known = None
    y_known = None
else:
    mask_prev_good = (
        (delta_df["prev_label_malicious"] == 0)
        & delta_df["prev_label_malicious"].notna()
    )

    print(
        f"Transitions with prev_label_malicious == 0: "
        f"{mask_prev_good.sum()} / {len(delta_df)}"
    )

    X_known = delta_df.loc[mask_prev_good, selected_features].fillna(0)
    y_known = delta_df.loc[mask_prev_good, "y_malicious"].astype(int)

    print("X_known shape:", X_known.shape, "| y_known shape:", y_known.shape)
    print("Label distribution in known-good transitions:")
    print(y_known.value_counts(dropna=False))


Transitions with prev_label_malicious == 0: 0 / 717
X_known shape: (0, 30) | y_known shape: (0,)
Label distribution in known-good transitions:
Series([], Name: count, dtype: int64)


In [18]:
# Cell 19 – Train transition model (prev benign -> next version)

clf_transition = None

if X_known is None or len(X_known) < 10 or y_known.nunique() < 2:
    print(
        "[WARN] Not enough transitions from known-good previous versions "
        "to train a separate transition model. Using clf_all as fallback."
    )
    clf_transition = clf_all
else:
    Xk_train, Xk_test, yk_train, yk_test = train_test_split(
        X_known,
        y_known,
        test_size=0.2,
        random_state=42,
        stratify=y_known,
    )

    clf_transition = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight="balanced",
    )
    clf_transition.fit(Xk_train, yk_train)

    yk_pred = clf_transition.predict(Xk_test)
    yk_proba = clf_transition.predict_proba(Xk_test)[:, 1]

    print("\n=== Transition Model Performance (prev benign -> next) ===")
    print(
        classification_report(
            yk_test, yk_pred, target_names=["Benign", "Malicious"]
        )
    )

    print("\nConfusion Matrix (transition model):")
    print(confusion_matrix(yk_test, yk_pred))

    roc_k = roc_auc_score(yk_test, yk_proba)
    print(f"\nROC-AUC (transition model): {roc_k:.3f}")


[WARN] Not enough transitions from known-good previous versions to train a separate transition model. Using clf_all as fallback.


In [19]:
# Cell 20 – Helper to score a specific transition from a known-good version

def score_transition_from_known_good(
    model: RandomForestClassifier,
    df_delta: pd.DataFrame,
    ecosystem: str,
    package_name: str,
    base_version: str,
    next_version: str,
    feature_cols: list[str],
) -> float:
    """
    Compute P(malicious) for the transition:
        ecosystem:package_name base_version -> next_version

    Uses the given model and the selected delta features.

    Assumes df_delta contains:
      - 'ecosystem', 'package_name', 'version', 'prev_version'
      - feature_cols
    """
    mask = (
        (df_delta[ECO_COL] == ecosystem)
        & (df_delta[PKG_COL] == package_name)
        & (df_delta[VER_COL] == next_version)
        & (df_delta["prev_version"] == base_version)
    )

    row = df_delta.loc[mask, feature_cols]

    if row.empty:
        raise ValueError(
            f"No matching row for transition {ecosystem}:{package_name} "
            f"{base_version} -> {next_version}"
        )

    X_row = row.fillna(0)
    proba = model.predict_proba(X_row)[:, 1][0]
    return float(proba)


In [20]:
# Cell 20b – Create synthetic dummy transition with dummy feature values

# Dummy identifiers for the example transition
demo_ecos = "npm"
demo_pkg = "dummy-pkg-v4-demo"
demo_prev = "0.9.0"   # known-good version
demo_curr = "1.0.0"   # new version to evaluate

# Check if this transition already exists
mask_demo = (
    (delta_df[ECO_COL] == demo_ecos)
    & (delta_df[PKG_COL] == demo_pkg)
    & (delta_df[VER_COL] == demo_curr)
    & (delta_df["prev_version"] == demo_prev)
)

if mask_demo.any():
    print("[INFO] Synthetic demo transition already present in delta_df.")
else:
    # Start with NaN for every column
    fake_row = {col: np.nan for col in delta_df.columns}

    # IDs
    fake_row[ECO_COL] = demo_ecos
    fake_row[PKG_COL] = demo_pkg
    fake_row[VER_COL] = demo_curr
    fake_row["prev_version"] = demo_prev

    # Labels: previous version benign, current version malicious (for illustration)
    if "prev_label_malicious" in delta_df.columns:
        fake_row["prev_label_malicious"] = 0  # known-good base version
    if "y_malicious" in delta_df.columns:
        fake_row["y_malicious"] = 1  # label for current version (dummy malicious)

    # Give some dummy but "suspicious-looking" values for key features
    dummy_values = {
        # Big size jump
        "static_size_delta_vs_prev": 50_000.0,
        "static_size_ratio_vs_prev": 2.5,
        "ratio_static_size_uncompressed_bytes": 2.5,
        "delta_unified_size_bytes": 50_000.0,
        "ratio_unified_size": 2.5,

        # More files and larger unpacked size
        "delta_npm_unpacked_size_bytes": 48_000.0,
        "ratio_npm_unpacked_size_bytes": 2.4,
        "delta_npm_file_count": 100.0,
        "ratio_npm_file_count": 2.0,

        # Entropy / density changes
        "delta_entropy_ratio_size": 3_000.0,
        "ratio_entropy_ratio_size": 1.6,
        "ratio_entropy_indicator": 1.0,  # flipped from 0 -> 1

        # Bytes-per-file proxies
        "ratio_npm_bytes_per_file_proxy": 1.8,
        "delta_npm_bytes_per_file_proxy": 1_000.0,
        "delta_unified_density_proxy": 800.0,

        # Magnitude / sign / large-jump flags (if those were created)
        "delta_entropy_ratio_size_abs": 3_000.0,
        "delta_entropy_ratio_size_log1p_abs": np.log1p(3_000.0),
        "delta_entropy_ratio_size_sign": 1,
        "delta_npm_unpacked_size_bytes_abs": 48_000.0,
        "delta_npm_unpacked_size_bytes_log1p_abs": np.log1p(48_000.0),
        "delta_npm_unpacked_size_bytes_sign": 1,
        "delta_entropy_ratio_size_large_jump": 1,
        "delta_npm_unpacked_size_bytes_large_jump": 1,
        "static_size_delta_vs_prev_large_jump": 1,
    }

    # Fill selected features with dummy values where possible, fallback to 0.0
    for f in selected_features:
        if f in fake_row:  # column exists in delta_df
            if f in dummy_values:
                fake_row[f] = dummy_values[f]
            else:
                fake_row[f] = 0.0  # neutral dummy value

    # Append to delta_df
    delta_df = pd.concat([delta_df, pd.DataFrame([fake_row])], ignore_index=True)
    print("[INFO] Added synthetic demo transition row:")
    print(
        delta_df.tail(1)[
            [ECO_COL, PKG_COL, "prev_version", VER_COL, "prev_label_malicious", "y_malicious"]
            if "prev_label_malicious" in delta_df.columns
            else [ECO_COL, PKG_COL, "prev_version", VER_COL, "y_malicious"]
        ]
    )


[INFO] Added synthetic demo transition row:
    ecosystem       package_name prev_version version prev_label_malicious  \
717       npm  dummy-pkg-v4-demo        0.9.0   1.0.0                    0   

     y_malicious  
717            1  


In [21]:
# Cell 21 – Example: scoring the synthetic dummy transition

demo_ecos = "npm"
demo_pkg = "dummy-pkg-v4-demo"
demo_prev = "0.9.0"
demo_curr = "1.0.0"

p_mal = score_transition_from_known_good(
    model=clf_transition,
    df_delta=delta_df,
    ecosystem=demo_ecos,
    package_name=demo_pkg,
    base_version=demo_prev,
    next_version=demo_curr,
    feature_cols=selected_features,
)

print(
    f"P(malicious) for transition {demo_ecos}:{demo_pkg} "
    f"{demo_prev} -> {demo_curr}: {p_mal:.3f}"
)


P(malicious) for transition npm:dummy-pkg-v4-demo 0.9.0 -> 1.0.0: 0.295
