In [23]:
import datetime
import urllib.parse
from collections import Counter, defaultdict
import json
import numpy as np

import semantic_version as sv

from common import *
from vulnerability_database import VulnerabilityDatabase

#### Resultset generated with
```
python analyze_bundle_dataset.py --total 100000 --worker $(nproc) --batch-per-file -s get_urls $DATASETS/bundles-daily/results-*
```

In [24]:
with open(os.path.join(DATASETS, "update-behavior-cdn.json"), "r") as f:
    data = json.load(f)

vulndb = VulnerabilityDatabase(os.path.join(DATASETS, "vulndb.json"))

In [3]:
results = defaultdict(lambda: defaultdict(list))
base_date = datetime.datetime.fromisoformat("2024-10-31T18:00:00Z")

stats_to_plot = {}

In [4]:
class CdnVersion:
    def __init__(self, version):
        self.__orig = version
        self.version = self.spec = None

        version = urllib.parse.unquote(version).strip()
        if version in ["latest", "git", "next"]:
            version = "*"
        try:
            self.version = sv.Version(version)
        except ValueError:
            try:
                self.spec = sv.NpmSpec(version)
            except ValueError:
                self.version = coerce_version(version)

    def match(self, release_list: dict):
        # reverse order to match latest the first
        for version in reversed(list(release_list.keys())):
            if version == "modified" or version == "created": continue

            try:
                release_version = coerce_version(version)
            except ValueError:
                continue

            if self.version is not None:
                if self.version >= release_version:
                    return version
            if self.spec is not None:
                if self.spec.match(release_version):
                    return version


        return None

    def match_date_interval(self, release_list: dict, date: datetime.datetime, interval: datetime.timedelta) -> bool:
        for version, release_date in reversed(list(release_list.items())):
            if version == "modified" or version == "created": continue

            if date - release_date < interval:
                # We have a release in the interval
                # Now check that it matches this version (spec)

                try:
                    release_version = coerce_version(version)
                except ValueError:
                    continue

                if self.version is not None and self.version >= release_version:
                    return True
                if self.spec is not None and self.spec.match(release_version):
                    return True

        return False

    def __eq__(self, other):
        return self.version == other.version and self.spec == other.spec

    def __lt__(self, other):
        if self.version is not None and other.version is not None:
            return self.version < other.version
        return False  # Cannot compare spec with version or spec with spec

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    def __hash__(self):
        return hash(self.__orig)

    def __str__(self):
        return self.__orig

    def __repr__(self):
        return f"<CdnVersion {self.__orig!r} {self.version!r} {self.spec!r}>"

In [5]:
for n, day in enumerate(data):
    for d in day:
        domain = d.get("domain")
        urls = d.get("urls")
        all_libraries = set(lib for lib in map(get_library_version_from_cdn_url, urls) if lib is not None)
        libraries = set((lib, vers) for lib, vers in all_libraries if vers == '*' or (lib, '*') not in all_libraries)
        histories = defaultdict(list)
        for lib, vers in libraries:
            histories[lib].append(vers)
        for lib, verss in histories.items():
            results[domain][lib].append(list(map(CdnVersion, sorted(verss))))
        for lib in results[domain]:
            if len(results[domain][lib]) < n+1:
                results[domain][lib].append([])  # Preserve one measurement per day


In [6]:
def check_all_sites_with_updates(only_fixed_versions=False):
    not_updated_libs = 0
    not_updated_domains = 0
    not_monotonous = 0

    log = []
    
    for domain, domain_data in results.items():
        no = len(domain_data) > 0
        
        for library, history in domain_data.items():
            if only_fixed_versions:
                if library == "@appmate/wishlist" or library.startswith("@sentry"):
                    continue

            if all([n < 2 for n in set(map(len, history))]):  # we ignore multiple installed versions in parallel
                if len(set([v[0] for v in history if len(v) > 0])) > 1:
                    no = False
    
                        
                    parsed_history = [v[0] for v in history if len(v) > 0]
                    non_empty_history = [(n, v[0]) for n, v in enumerate(history) if len(v) > 0]
                    
                    if all(v1 <= v2 for v1, v2 in zip(parsed_history, parsed_history[1:])):
                        updates = [(base_date + datetime.timedelta(days=n), v2) for (_, v1), (n, v2) in zip(non_empty_history, non_empty_history[1:]) if v1 != v2]
                        if library in vulndb.releases:
                            for update_time, version in updates:
                                if not only_fixed_versions or version.version is not None:
                                    matched_version = version.match(vulndb.releases[library])
                                    if matched_version is not None:
                                        update_time_diff = update_time - datetime.datetime.fromisoformat(vulndb.releases[library][matched_version])
                                        log.append((domain, library, version, matched_version, update_time_diff.days))
                                        print(f"{domain=} {library=} {version=} {matched_version=} {update_time_diff.days=}")
                                    else:
                                        print(f"WARNING: Missing version {version} for library {library}")
                        else:
                            print(f"WARNING: No release info for {library}")
                    else:
                        not_monotonous += 1
                else:
                    not_updated_libs += 1
        if no:
            not_updated_domains += 1
    
    print(f"{not_updated_domains=} {not_updated_libs=} {not_monotonous=}")
    return log

In [None]:
check_all_sites_with_updates()

## When a new library version gets released, what percentage of domains uses the updates within 1/4/16 weeks?

In [8]:
def library_update_stats(only_fixed_versions=False, skip_domains_with_specs=False):
    intervals = [datetime.timedelta(days=i) for i in [7, 4*7, 16*7]]
    libraries = set(library for domain_data in results.values() for library in domain_data.keys())
    libraries_not_indexed = set(library for library in libraries if library not in vulndb.releases)
    libraries_with_recent_updates = set(library for library in libraries.difference(libraries_not_indexed) if len(vulndb.releases[library]) > 0 and base_date - datetime.datetime.fromisoformat(vulndb.releases[library]["modified"]) < max(intervals))

    stats = [[], [], []]
    prevalences = [[], [], []]

    if skip_domains_with_specs:
        domain_has_specs = {domain: any(any(any(v.spec is not None for v in versions) for versions in history) for history in domain_data.values()) for domain, domain_data in results.items()}
    
    for library in libraries_with_recent_updates:
        release_dates = {v: datetime.datetime.fromisoformat(vulndb.releases[library][v]) for v in vulndb.releases[library]}
        release_order = list([k for k in vulndb.releases[library].keys() if k not in ["created", "modified"]])  # npm already has release order

        if only_fixed_versions:
            if library == "@appmate/wishlist" or library.startswith("@sentry"):
                continue
        
        for i, interval in enumerate(intervals):
            count = [0, 0]
            prevalence = 0
            for domain, domain_data in results.items():
                if skip_domains_with_specs and domain_has_specs[domain]:
                    continue
                
                if library in domain_data:
                    history = domain_data[library]
                    found_this_or_later_in_interval = False
                    for day, versions in enumerate(history):
                        for v in versions:
                            if (not only_fixed_versions or v.version is not None) and v.match_date_interval(release_dates, base_date + datetime.timedelta(days=day), interval):
                                found_this_or_later_in_interval = True
                                break
                        if found_this_or_later_in_interval:
                            break
                    count[0 if found_this_or_later_in_interval else 1] += 1
                    if found_this_or_later_in_interval:
                        # don't print, too many occurences due to spec matching
                        # print(f"hit {domain=} {library=} {interval.days=}")
                        pass
                    prevalence += 1
            if sum(count) > 0:
                stats[i].append(count[0] / sum(count))
                prevalences[i].append(prevalence)
    return stats, prevalences

In [9]:
stats, prevalences = library_update_stats()

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_normalized = [np.multiply(np.multiply(stat, prevalence), len(prevalence) / np.array(prevalence).sum()) for stat, prevalence in zip(stats, prevalences)]

print(f"1 week (normalized): {list(map(float, compute_statistics(stats_normalized[0])))}")
print(f"4 week (normalized): {list(map(float, compute_statistics(stats_normalized[1])))}")
print(f"16 week (normalized): {list(map(float, compute_statistics(stats_normalized[2])))}")

stats_to_plot["library_stats"] = stats
stats_to_plot["library_instances_stats"] = [list(s) for s in stats_normalized]

1 week: [0.0, 0.0, 0.15430718156741474, 1.0]
4 week: [0.0, 0.0, 0.17362227616245174, 1.0]
16 week: [0.0, 0.0, 0.2554643573589052, 1.0]
1 week (normalized): [0.0, 0.0, 0.06040997777228946, 5.7569770313657695]
4 week (normalized): [0.0, 0.0, 0.06831316374413436, 5.830081501605335]
16 week (normalized): [0.0, 0.0, 0.07503087182020253, 5.921462089404791]


## How many domains update their dependencies?

In [10]:
def domain_update_stats(only_fixed_versions=False):
    intervals = [datetime.timedelta(days=i) for i in [7, 4*7, 16*7]]
    libraries = set(library for domain_data in results.values() for library in domain_data.keys())
    libraries_not_indexed = set(library for library in libraries if library not in vulndb.releases)
    libraries_with_recent_updates = set(library for library in libraries.difference(libraries_not_indexed) if len(vulndb.releases[library]) > 0 and base_date - datetime.datetime.fromisoformat(vulndb.releases[library]["modified"]) < max(intervals))

    stats = [[], [], []]
    
    for domain, domain_data in results.items():
        
        for i, interval in enumerate(intervals):
            count = [0, 0]
            for library in libraries_with_recent_updates:
                if library in domain_data:
                    release_dates = {v: datetime.datetime.fromisoformat(vulndb.releases[library][v]) for v in vulndb.releases[library]}
                    release_order = list([k for k in vulndb.releases[library].keys() if k not in ["created", "modified"]])  # npm already has release order
                    history = domain_data[library]
                    found_this_or_later_in_interval = False
                    for day, versions in enumerate(history):
                        for v in versions:
                            if (not only_fixed_versions or v.version is not None) and v.match_date_interval(release_dates, base_date + datetime.timedelta(days=day), interval):
                                found_this_or_later_in_interval = True
                                break
                        if found_this_or_later_in_interval:
                            break
                    count[0 if found_this_or_later_in_interval else 1] += 1
                    if found_this_or_later_in_interval:
                        # don't print, too many occurences due to spec matching
                        # print(f"hit {domain=} {library=} {interval.days=}")
                        pass
            if sum(count) > 0:
                stats[i].append(count[0] / sum(count))
    return stats

In [11]:
stats = domain_update_stats()

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_to_plot["domain_stats"] = stats

1 week: [0.0, 0.0, 0.04952893231794966, 1.0]
4 week: [0.0, 0.0, 0.056788619751047494, 1.0]
16 week: [0.0, 0.0, 0.062313132703306115, 1.0]


## How many vulnerable libraries are included per domain?

In [25]:
def vulnerable_libs_per_domain():
    what_libraries = Counter()
    n_days = len(data)
    
    stats = []
    
    for day in range(n_days):
        stats.append([0, 0])
        for domain, domain_data in results.items():
            domainstats = [0, 0]
            for library, history in domain_data.items():
                if day >= len(history): continue
                
                versions = history[day]
                if len(versions) == 1:
                    version = versions[0]
                    if version.version is not None:
                        if vulndb.is_vulnerable(library, str(version.version)):
                            domainstats[0] += 1
                            what_libraries.update({library: 1})
                        else:
                            domainstats[1] += 1
            if sum(domainstats) > 0:
                stats[-1][0] += domainstats[0] / sum(domainstats)
                stats[-1][1] += domainstats[1] / sum(domainstats)
    
    print("Most common vulnerable libraries:")
    display(what_libraries.most_common(10))
    
    print("Stats (vulnerable vs safe) for each day:")
    print(f"mean: {np.average([s[0] / sum(s) for s in stats if sum(s) > 0])}")
    display([s[0] / sum(s) for s in stats if sum(s) > 0])

In [26]:
vulnerable_libs_per_domain()

Most common vulnerable libraries:


[('jquery', 199004),
 ('jquery-ui', 13671),
 ('bootstrap', 13055),
 ('swiper', 10153),
 ('vue', 7329),
 ('crypto-js', 6628),
 ('mathjax', 4840),
 ('gsap', 3747),
 ('lazysizes', 3065),
 ('select2', 2642)]

Stats (vulnerable vs safe) for each day:
mean: 0.3476487053016498


[0.3382109649843065,
 0.35646654096351704,
 0.3456226339178705,
 0.3513288444580905,
 0.34344805347306007,
 0.3504939167841727,
 0.3461361800113669,
 0.34761446417568476,
 0.34014748496435376,
 0.35025127824557384,
 0.34750132046802484,
 0.349422743884382,
 0.3461420655997501,
 0.3475015606362334,
 0.34711739813595816,
 0.346178442957963,
 0.3459121282900216,
 0.35013748405888045,
 0.3441371060714205,
 0.3483042520019862,
 0.3440559969518828,
 0.3447023142298081,
 0.34681206904853157,
 0.3465540195717475,
 0.3475586791303817,
 0.3502612263991048,
 0.3465077937026676,
 0.34698410457671447,
 0.3467703226089138,
 0.3498653101539083,
 0.34163019861116517,
 0.3481438884313484,
 0.3509577653931857,
 0.34611781397955216,
 0.3498428445706178,
 0.3471268447131481,
 0.34952832603905204,
 0.3506034014248299,
 0.3508682585019543,
 0.34847690240085105,
 0.35236563011400107,
 0.35113946609112007,
 0.34994628724384064]

## How does everything look if we only consider exact versions?

In [None]:
log_fixed_versions = check_all_sites_with_updates(only_fixed_versions=True)

In [15]:
fixed_libs = Counter([l[1] if not l[1].startswith("@sentry") else "@sentry" for l in log_fixed_versions])
print(fixed_libs.most_common())
print(len(log_fixed_versions) - 55 - 45 - 25 - 14)

[('jquery', 55), ('@letscooee/web-sdk', 45), ('@lottiefiles/dotlottie-web', 7), ('htmx.org', 5), ('search-insights', 3), ('jquery-ui', 3), ('foundation', 2), ('bootstrap', 2), ('popper.js', 2), ('@rails/ujs', 2), ('@goodgamestudios/cxf-ia', 1), ('lit', 1), ('ionicons', 1), ('font-awesome', 1), ('twitter-bootstrap', 1), ('air-datepicker', 1), ('@uscreentv/video-player', 1), ('hls.js', 1), ('swiper', 1), ('instantsearch.js', 1), ('algoliasearch', 1), ('@gobistories/gobi-web-integration', 1), ('quill', 1), ('@yes-chef/yes-chef-sliders', 1), ('bootstrap-italia', 1), ('gsap', 1), ('lodash', 1), ('moment', 1), ('lottie-web', 1), ('jquery-migrate', 1), ('video.js', 1), ('summernote', 1)]
9


In [16]:
stats, prevalences = library_update_stats(only_fixed_versions=True)

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_normalized = [np.multiply(np.multiply(stat, prevalence), len(prevalence) / np.array(prevalence).sum()) for stat, prevalence in zip(stats, prevalences)]

print(f"1 week (normalized): {list(map(float, compute_statistics(stats_normalized[0])))}")
print(f"4 week (normalized): {list(map(float, compute_statistics(stats_normalized[1])))}")
print(f"16 week (normalized): {list(map(float, compute_statistics(stats_normalized[2])))}")

stats_to_plot["library_stats_fixed"] = stats
stats_to_plot["library_instances_stats_fixed"] = [list(s) for s in stats_normalized]

1 week: [0.0, 0.0, 0.04873893768151233, 1.0]
4 week: [0.0, 0.0, 0.0698260615953492, 1.0]
16 week: [0.0, 0.0, 0.13283360002348227, 1.0]
1 week (normalized): [0.0, 0.0, 0.008570720832301214, 0.8608372553876642]
4 week (normalized): [0.0, 0.0, 0.012534059945504088, 0.8608372553876642]
16 week (normalized): [0.0, 0.0, 0.017785484270497896, 0.8608372553876642]


In [17]:
stats = domain_update_stats(only_fixed_versions=True)

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_to_plot["domain_stats_fixed"] = stats

1 week: [0.0, 0.0, 0.00567766155916445, 1.0]
4 week: [0.0, 0.0, 0.008726259289843104, 1.0]
16 week: [0.0, 0.0, 0.012570189925681255, 1.0]


## What if we try to do not consider CDN induced updates with concrete versions?

In [21]:
stats, prevalences = library_update_stats(True, True)

print(f"1 week: {compute_statistics(stats[0])}")
print(f"4 week: {compute_statistics(stats[1])}")
print(f"16 week: {compute_statistics(stats[2])}")

stats_normalized = [np.multiply(np.multiply(stat, prevalence), len(prevalence) / np.array(prevalence).sum()) for stat, prevalence in zip(stats, prevalences)]

print(f"1 week (normalized): {list(map(float, compute_statistics(stats_normalized[0])))}")
print(f"4 week (normalized): {list(map(float, compute_statistics(stats_normalized[1])))}")
print(f"16 week (normalized): {list(map(float, compute_statistics(stats_normalized[2])))}")

stats_to_plot["library_stats_fixed"] = stats
stats_to_plot["library_instances_stats_fixed"] = [list(s) for s in stats_normalized]

1 week: [0.0, 0.0, 0.04626704022275734, 1.0]
4 week: [0.0, 0.0, 0.07203298003578928, 1.0]
16 week: [0.0, 0.0, 0.11202479175871154, 1.0]
1 week (normalized): [0.0, 0.0, 0.008565446354926717, 0.6804771270858448]
4 week (normalized): [0.0, 0.0, 0.011610938392233996, 0.6804771270858448]
16 week (normalized): [0.0, 0.0, 0.016052280946640442, 0.6804771270858448]
