In [13]:
import datetime
from collections import Counter, defaultdict
import json
import numpy as np

from common import *
from vulnerability_database import VulnerabilityDatabase

#### Resultset generated with
```
python analyze_bundle_dataset.py --total 100000 --worker $(nproc) --batch-per-file -s filter_source_map_sources $DATASETS/results-*
```

In [14]:
with open(os.path.join(DATASETS, "update-behavior-pnpm.json"), "r") as f:
    data = json.load(f)

vulndb = VulnerabilityDatabase(os.path.join(DATASETS, "vulndb.json"))

In [4]:
results = defaultdict(lambda: defaultdict(list))

stats_to_plot = {}

In [5]:
for n, day in enumerate(data):
    for d in day:
        domain = next(iter(d))
        urls = d[domain]
        libraries = set()
        for extracted in map(parse_full_pnpm_names, urls):
            libraries.update([tuple(e.rsplit("@", 1)) for e in extracted])
        histories = defaultdict(list)
        for lib, vers in libraries:
            histories[lib].append(vers)
        for lib, verss in histories.items():
            results[domain][lib].append(sorted(verss))
        for lib in results[domain]:
            if len(results[domain][lib]) < n+1:
                results[domain][lib].append([])  # Preserve one measurement per day


In [6]:
not_updated_libs = 0
not_updated_domains = 0
not_monotonous = 0

base_date = datetime.datetime.fromisoformat("2024-10-31T00:00:00Z")

for domain, domain_data in results.items():
    no = len(domain_data) > 0
    for library, history in domain_data.items():
        if all([n < 2 for n in set(map(len, history))]):  # we ignore multiple installed versions in parallel
            if len(set([v[0] for v in history if len(v) > 0])) > 1:
                no = False

                parsed_history = list(map(coerce_version, [v[0] for v in history if len(v) > 0]))
                non_empty_history = [(n, v[0]) for n, v in enumerate(history) if len(v) > 0]
                
                if all(v1 <= v2 for v1, v2 in zip(parsed_history, parsed_history[1:])):
                    updates = [(base_date + datetime.timedelta(days=n), v2) for (n, v1), (_, v2) in zip(non_empty_history, non_empty_history[1:]) if v1[0] != v2[0]]
                    if library in vulndb.releases:
                        for update_time, version in updates:
                            if version in vulndb.releases[library]:
                                update_time_diff = update_time - datetime.datetime.fromisoformat(vulndb.releases[library][version])
                                print(f"{domain=} {library=} {version=} {update_time_diff.days=}")
                            else:
                                print(f"WARNING: Missing version {version} for library {library}")
                    else:
                        # This is expected, as we have private packages in here
                        # print(f"WARNING: No release info for {library}")
                        pass
                else:
                    not_monotonous += 1
            else:
                not_updated_libs += 1
    if no:
        not_updated_domains += 1

print(f"{not_updated_domains=} {not_updated_libs=} {not_monotonous=}")

domain='dubclub.win' library='@sentry/browser' version='8.36.0' update_time_diff.days=5
domain='app.destinyitemmanager.com' library='react-router' version='7.0.1' update_time_diff.days=16
domain='community.spiceworks.com' library='nanoid' version='5.0.8' update_time_diff.days=4
not_updated_domains=377 not_updated_libs=4888 not_monotonous=8


## When a new library version gets released, what percentage of domains uses the updates within 1/4/16 weeks?

In [7]:
intervals = [datetime.timedelta(days=i) for i in [7, 4*7, 16*7]]
libraries = set(library for domain_data in results.values() for library in domain_data.keys())
libraries_not_indexed = set(library for library in libraries if library not in vulndb.releases)
libraries_with_recent_updates = set(library for library in libraries.difference(libraries_not_indexed) if len(vulndb.releases[library]) > 0 and base_date - datetime.datetime.fromisoformat(vulndb.releases[library]["modified"]) < max(intervals))

In [8]:
stats = [[], [], []]
prevalences = [[], [], []]

log = []

for library in libraries_with_recent_updates:
    release_dates = {v: datetime.datetime.fromisoformat(vulndb.releases[library][v]) for v in vulndb.releases[library]}
    release_order = list([k for k in vulndb.releases[library].keys() if k not in ["created", "modified"]])  # npm already has release order
    
    for i, interval in enumerate(intervals):
        count = [0, 0]
        prevalence = 0
        for domain, domain_data in results.items():
            domain_hit = False
            if library in domain_data:
                for version in release_order:
                    if base_date - release_dates[version] < interval:  # must have a chance of overlap
                        history = domain_data[library]
                        found_this_or_later_in_interval = False
                        for day, versions in enumerate(history):
                            date = base_date + datetime.timedelta(days=day)
                            if date < release_dates[version]:
                                continue
                            if date > release_dates[version] + interval:
                                break
                                
                            for v in versions:
                                if v in release_dates and release_order.index(v) >= release_order.index(version):
                                    found_this_or_later_in_interval = True
                                    break
                        count[0 if found_this_or_later_in_interval else 1] += 1
                        # if library == "@uppy/core": log.append(f"{library=} {interval.days=} {domain=} {found_this_or_later_in_interval=} {version=} {v=}")
                        domain_hit = True
                if domain_hit:
                    prevalence += 1
        if sum(count) > 0:
            stats[i].append(count[0] / sum(count))
            if count[0] > 0: log.append(f"{count=} {prevalence=} {library=} {interval.days=}")
            prevalences[i].append(prevalence)
                                    

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_normalized = [np.multiply(np.multiply(stat, prevalence), len(prevalence) / np.array(prevalence).sum()) for stat, prevalence in zip(stats, prevalences)]

print(f"1 week (normalized): {list(map(float, compute_statistics(stats_normalized[0])))}")
print(f"4 week (normalized): {list(map(float, compute_statistics(stats_normalized[1])))}")
print(f"16 week (normalized): {list(map(float, compute_statistics(stats_normalized[2])))}")

stats_to_plot["library_stats"] = stats
stats_to_plot["library_instances_stats"] = [list(s) for s in stats_normalized]

1 week: [0.0, 0.0, 0.007014852176067933, 0.75]
4 week: [0.0, 0.0, 0.02599053742848358, 1.0]
16 week: [0.0, 0.0, 0.10613958960513507, 1.0]
1 week (normalized): [0.0, 0.0, 0.006650501583260177, 0.43499999999999994]
4 week (normalized): [0.0, 0.0, 0.020830420088195, 0.8050271739130435]
16 week (normalized): [0.0, 0.0, 0.09697951765206159, 2.989038555925179]


## How many domains update their dependencies?

In [9]:
stats = [[], [], []]

for domain, domain_data in results.items():
    
    for i, interval in enumerate(intervals):
        count = [0, 0]
        prevalence = 0
        for library in libraries_with_recent_updates:
            if library in domain_data:
                release_dates = {v: datetime.datetime.fromisoformat(vulndb.releases[library][v]) for v in vulndb.releases[library]}
                release_order = list([k for k in vulndb.releases[library].keys() if k not in ["created", "modified"]])  # npm already has release order
                for version in release_order:
                    if base_date - release_dates[version] < interval:  # must have a chance of overlap
                        history = domain_data[library]
                        found_this_or_later_in_interval = False
                        for day, versions in enumerate(history):
                            date = base_date + datetime.timedelta(days=day)
                            if date < release_dates[version]:
                                continue
                            if date > release_dates[version] + interval:
                                break
                                
                            for v in versions:
                                if v in release_dates and release_order.index(v) >= release_order.index(version):
                                    found_this_or_later_in_interval = True
                                    break
                        count[0 if found_this_or_later_in_interval else 1] += 1
        if sum(count) > 0:
            stats[i].append(count[0] / sum(count))
            if count[0] > 0: log.append(f"{count=} {prevalence=} {library=} {interval.days=}")
                                    

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_to_plot["domain_stats"] = stats

1 week: [0.0, 0.0, 0.00212430874976802, 0.15]
4 week: [0.0, 0.0, 0.00574210958739292, 0.5130434782608696]
16 week: [0.0, 0.0, 0.077297906638259, 1.0]


## How many vulnerable libraries are included per domain?

In [15]:
what_libraries = Counter()
n_days = len(data)

stats = []

for day in range(n_days):
    stats.append([0, 0])
    for domain, domain_data in results.items():
        domainstats = [0, 0]
        for library, history in domain_data.items():
            if day >= len(history): continue
            
            versions = history[day]
            if len(versions) > 0:
                version = versions[0]  # todo ignore multiple versions
                if vulndb.is_vulnerable(library, str(coerce_version(version))):
                    domainstats[0] += 1
                    what_libraries.update({library: 1})
                else:
                    domainstats[1] += 1
        if sum(domainstats) > 0:
            stats[-1][0] += domainstats[0] / sum(domainstats)
            stats[-1][1] += domainstats[1] / sum(domainstats)

print("Most common vulnerable libraries:")
display(what_libraries.most_common(10))

print("Stats (vulnerable vs safe) for each day:")
print(f"mean: {np.average([s[0] / sum(s) for s in stats if sum(s) > 0])}")
display([s[0] / sum(s) for s in stats if sum(s) > 0])

Most common vulnerable libraries:


[('webpack', 1342),
 ('axios', 135),
 ('svelte', 131),
 ('nuxt', 130),
 ('cookie', 122),
 ('bootstrap', 106),
 ('vue', 86),
 ('url-parse', 76),
 ('dompurify', 37),
 ('next', 31)]

Stats (vulnerable vs safe) for each day:
mean: 0.04865334063537752


[0.03372007292800274,
 0.04495276322179056,
 0.03667578074134805,
 0.038449005571181166,
 0.038599260643951055,
 0.041748634453523795,
 0.037414443019384194,
 0.043237796897277934,
 0.038390670031622595,
 0.04099244599654449,
 0.04276463656497769,
 0.041207545854613004,
 0.04283819100588487,
 0.04296195387008352,
 0.04462435617429964,
 0.04650304537997382,
 0.04284842999156158,
 0.04516684681508887,
 0.040661025012956244,
 0.04868642502715165,
 0.0529732915225688,
 0.045275536423978996,
 0.04901033771471635,
 0.044895765117445106,
 0.0535136890893124,
 0.04543646751830908,
 0.0558324285644705,
 0.043985402144033146,
 0.05356995248716605,
 0.052946948257281495,
 0.050645478359376,
 0.054364765450295206,
 0.05138533749046605,
 0.05596423295106347,
 0.05963890252361591,
 0.06541718582928148,
 0.06382762223781394,
 0.06848110086596501,
 0.055195152521441686,
 0.06084889926864974,
 0.06626930576517696,
 0.05200497707286288,
 0.05816753894472569]

In [11]:
sum(what_libraries.values())

2685