In [2]:
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import os
from scipy import stats

# Zorg dat grafieken direct in het notebook verschijnen
%matplotlib inline

In [3]:

# Notebook configuration: ecosystem label used in printed headers and the
# location that the loading cell scans for *.json records.
ECOSYSTEM_NAME = "Python (PyPI)"
FOLDER_PATH = os.path.join('Databestanden', 'all.json')

for banner_line in (
    f"--- START ANALYSE VOOR: {ECOSYSTEM_NAME} ---",
    f"Map geselecteerd: {FOLDER_PATH}",
):
    print(banner_line)

--- START ANALYSE VOOR: Python (PyPI) ---
Map geselecteerd: Databestanden/all.json


In [4]:
def score_to_group(score):
    """
    Translate a numeric severity score into a severity group label.

    Thresholds (inclusive lower bounds): >= 9.0 'Critical', >= 7.0 'High',
    >= 4.0 'Medium'; anything else (including values that compare false
    against every threshold) is 'Low'.
    NOTE(review): the bands look like the CVSS qualitative rating scale - confirm.
    """
    bands = (
        (9.0, 'Critical'),
        (7.0, 'High'),
        (4.0, 'Medium'),
    )
    # Bands are ordered from highest to lowest, so the first match wins.
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Low'

def get_severity_group(entry, entry_id=""):
    """
    Determine the severity group of a single vulnerability record.

    Intended as a universal checker for both PyPI and npm data structures.

    Resolution order:
      1. An id starting with 'MAL-' (malware) is always 'Critical'.
      2. A textual severity in entry['database_specific']['severity'].
      3. A textual severity in entry['severity'].
      4. A numeric score inside a list at entry['severity'].
      5. A bare number at entry['severity'].

    Args:
        entry: Mapping with the record's fields (must support ``.get``).
        entry_id: Identifier of the record; non-string values are ignored.

    Returns:
        'Critical', 'High', 'Medium', 'Low', or None when no severity
        could be derived.
    """
    # 1. Malware is always Critical. Guard against ids that are not strings.
    if isinstance(entry_id, str) and entry_id.startswith('MAL-'):
        return 'Critical'

    raw_sev = entry.get('severity')

    # 2. Check 'database_specific'. The field may be missing, None, or not a
    #    mapping, and its 'severity' may not be text; ignore it in those cases
    #    instead of raising, so the fallbacks below still get a chance.
    severity_str = None
    db_specific = entry.get('database_specific')
    if isinstance(db_specific, dict):
        candidate = db_specific.get('severity')
        if isinstance(candidate, str):
            severity_str = candidate

    # 3. Check the root 'severity' when it is a string
    if not severity_str and isinstance(raw_sev, str):
        severity_str = raw_sev

    # 4. Categorise on text (substring match, case-insensitive)
    if severity_str:
        s = severity_str.upper()
        if 'CRITICAL' in s: return 'Critical'
        if 'HIGH' in s: return 'High'
        if 'MODERATE' in s or 'MEDIUM' in s: return 'Medium'
        if 'LOW' in s: return 'Low'

    # 5. Fallback: CVSS objects or numeric scores inside a list
    if isinstance(raw_sev, list):
        for item in raw_sev:
            if not isinstance(item, dict):
                continue
            score = item.get('score')
            if isinstance(score, (int, float)):
                return score_to_group(float(score))
            if item.get('type') in ('CVSS_V3', 'CVSS_V31'):
                # NOTE(review): if 'score' holds a CVSS vector string rather
                # than a number, it cannot be converted and the item is
                # skipped - confirm whether vector parsing is needed.
                try:
                    numeric_score = float(score)
                except (TypeError, ValueError):
                    continue
                return score_to_group(numeric_score)

    # 6. Fallback: the severity is directly a number
    if isinstance(raw_sev, (int, float)):
        return score_to_group(float(raw_sev))

    return None

def parse_date(date_str):
    """
    Parse a date string leniently.

    Everything from the first '.' onwards is dropped and every 'Z' is
    removed; the remainder is tried as '%Y-%m-%dT%H:%M:%S' and then as
    '%Y-%m-%d'. Returns a datetime, or None for empty input or when
    neither format matches.
    """
    if not date_str:
        return None

    head, _, _ = date_str.partition('.')
    normalized = head.replace('Z', '')

    for pattern in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
        try:
            parsed = datetime.strptime(normalized, pattern)
        except ValueError:
            continue
        return parsed
    return None

In [5]:
# Load every *.json record from FOLDER_PATH and build `records`, one dict per
# usable entry with its id, severity group and the number of days between the
# published and the modified timestamp.
#
# The result containers are initialised up front so that later cells can rely
# on these names even when the data folder is missing.
records = []
source_counter = {}   # number of entries per id prefix (text before the first '-')
error_counter = {}    # number of files skipped because of an exception, per exception type
counter = 0
skipped = 0

# isdir (rather than exists) so that a plain file at this path is reported
# instead of crashing in os.listdir.
if not os.path.isdir(FOLDER_PATH):
    print(f"FOUT: Map niet gevonden: {FOLDER_PATH}")
else:
    # Sorted so the order of `records` (and of anything exported from it)
    # does not depend on the file system's listing order.
    all_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith('.json'))
    total_files = len(all_files)
    print(f"Totaal aantal bestanden gevonden: {total_files}")

    for counter, filename in enumerate(all_files, start=1):
        if counter % 5000 == 0: print(f"Verwerkt: {counter}/{total_files}...")

        try:
            # Only keep the file open while reading it.
            with open(os.path.join(FOLDER_PATH, filename), 'r', encoding='utf-8') as f:
                entry = json.load(f)

            e_id = entry.get('id', filename)

            # Count the source (id prefix)
            source_type = e_id.split('-')[0]
            source_counter[source_type] = source_counter.get(source_type, 0) + 1

            # Determine severity; entries without one are not usable
            group = get_severity_group(entry, e_id)
            if not group:
                skipped += 1
                continue

            # Parse dates, accepting the alternative field names
            pub_str = entry.get('published') or entry.get('publishedAt')
            mod_str = entry.get('modified') or entry.get('updated') or entry.get('last_updated')

            pub_date = parse_date(pub_str)
            fix_date = parse_date(mod_str)

            if pub_date and fix_date:
                # Negative intervals are clamped to zero days.
                days = max((fix_date - pub_date).days, 0)

                records.append({
                    'id': e_id,
                    'severity_group': group,
                    'days_to_fix': days
                })
            else:
                skipped += 1

        except Exception as exc:
            # Best effort: one unreadable or malformed file must not stop the
            # run, but remember what went wrong so it can be reported.
            skipped += 1
            error_name = type(exc).__name__
            error_counter[error_name] = error_counter.get(error_name, 0) + 1
            continue

    print("-" * 30)
    print(f"KLAAR! Bruikbare records: {len(records)}")
    print(f"Overgeslagen records: {skipped}")
    if error_counter:
        print(f"Waarvan overgeslagen door fouten: {error_counter}")

Totaal aantal bestanden gevonden: 17715
Verwerkt: 5000/17715...
Verwerkt: 10000/17715...
Verwerkt: 15000/17715...
------------------------------
KLAAR! Bruikbare records: 14418
Overgeslagen records: 3297


In [6]:
# Build the analysis frame from the loaded records and print the mean number
# of days per severity group, ordered from Low to Critical.
if records:
    df = pd.DataFrame(records)
    order = ['Low', 'Medium', 'High', 'Critical']

    # 1. Means
    print(f"--- Gemiddelde dagen tot fix ({ECOSYSTEM_NAME}) ---")
    mean_days = df.groupby('severity_group')['days_to_fix'].mean()
    print(mean_days.reindex(order).round(0))
else:
    print("Geen data gevonden.")


--- Gemiddelde dagen tot fix (Python (PyPI)) ---
severity_group
Low         710.0
Medium      636.0
High        703.0
Critical    209.0
Name: days_to_fix, dtype: float64


In [7]:
# Descriptive statistics of days_to_fix per severity group: count, mean,
# median, first and third quartile, plus the interquartile range.
days_by_severity = df.groupby("severity_group")["days_to_fix"]
summary = days_by_severity.agg(
    n="count",
    mean="mean",
    median="median",
    q1=lambda days: days.quantile(0.25),
    q3=lambda days: days.quantile(0.75),
)
summary = summary.assign(iqr=summary["q3"] - summary["q1"])
print(summary.round(2))


                    n    mean  median      q1      q3      iqr
severity_group                                                
Critical        10841  208.81   179.0  126.00   181.0    55.00
High             1563  703.29   741.0  145.00  1007.0   862.00
Low               332  710.08   743.0   88.75  1259.0  1170.25
Medium           1682  635.51   604.0  128.25   933.0   804.75


In [8]:
# Kruskal-Wallis test on days_to_fix across the severity groups that
# actually occur in the data.
if records:
    df = pd.DataFrame(records)
    order = ['Low', 'Medium', 'High', 'Critical']

    # 2. Statistical test
    print("\n--- Statistische Toets (Kruskal-Wallis) ---")
    groups_data = []
    for group_name in order:
        group_rows = df[df['severity_group'] == group_name]
        if len(group_rows) > 0:
            groups_data.append(group_rows['days_to_fix'])

    # The test needs at least two non-empty groups.
    if len(groups_data) <= 1:
        print("Niet genoeg groepen voor statistiek.")
    else:
        stat, p_value = stats.kruskal(*groups_data)
        print(f"H-statistic: {stat:.4f}")
        print(f"P-value: {p_value:.4e}")
        conclusion = (
            ">> Conclusie: Er is een significant verschil tussen de severity groepen."
            if p_value < 0.05
            else ">> Conclusie: Geen significant verschil gevonden."
        )
        print(conclusion)
else:
    print("Geen data gevonden.")


--- Statistische Toets (Kruskal-Wallis) ---
H-statistic: 1699.3185
P-value: 0.0000e+00
>> Conclusie: Er is een significant verschil tussen de severity groepen.


In [12]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Draw a linear-scale boxplot of days_to_fix per severity group (outliers
# hidden) and save it as a PDF under figures/.

# --- checks ---
required = {"severity_group", "days_to_fix"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"df mist kolommen: {missing}")

order = ["Low", "Medium", "High", "Critical"]

# --- output folder + file name ---
out_dir = "figures"
os.makedirs(out_dir, exist_ok=True)

# The 'ecosystem' column is only added by the CSV export cell. When this cell
# runs first (for example on Restart & Run All in displayed order) the column
# is absent, so fall back to a file-name-safe slug of ECOSYSTEM_NAME instead
# of silently writing a file labelled "unknown".
if "ecosystem" in df.columns and len(df) > 0:
    ecosystem = df["ecosystem"].iloc[0]
else:
    fallback_name = str(globals().get("ECOSYSTEM_NAME", "unknown"))
    # Replace every non-alphanumeric run by a single underscore.
    ecosystem = "_".join(
        "".join(ch if ch.isalnum() else " " for ch in fallback_name).split()
    ) or "unknown"
title_name = globals().get("ECOSYSTEM_NAME", ecosystem)

# (optional) tidy style
sns.set_theme(style="whitegrid")

# -------------------------
# 1) LINEAR boxplot (without outliers)
# -------------------------

box_color ="#CFE2F3"    # light pastel blue
median_color = "#FF8C00" # dark orange
edge_color = "black"     # black lines for edges and whiskers

fig, ax = plt.subplots(figsize=(7, 4))

sns.boxplot(
    data=df,
    x="severity_group",
    y="days_to_fix",
    order=order,
    showfliers=False,  # outlier points are hidden; set to True to draw them
    ax=ax,
    # --- colour settings ---
    color=box_color,                        # fill colour of the box
    boxprops=dict(edgecolor=edge_color),    # colour of the box edge
    whiskerprops=dict(color=edge_color),    # colour of the whiskers
    capprops=dict(color=edge_color),        # colour of the whisker caps
    medianprops=dict(color=median_color, linewidth=1.5), # the orange median line
)

ax.set_ylim(0, 3000)
ax.set_xlabel("Severity group")
ax.set_ylabel("Exposure window (days)")
ax.set_title(f"{title_name}: exposure window by severity")

fig.tight_layout()
pdf_path = os.path.join(out_dir, f"sq1_boxplot_{ecosystem}_linear_colored.pdf")
fig.savefig(pdf_path, format="pdf", bbox_inches="tight")
# Closed after saving: the figure is written to disk only.
plt.close(fig)
print("Saved:", pdf_path)

Saved: figures/sq1_boxplot_PyPI_linear_colored.pdf


In [10]:
# Tag every row with its ecosystem and export the frame to CSV.
# The column is added to `df` itself; the boxplot cell reads it from there.
csv_path = "sq1_pypi_exposure.csv"
df["ecosystem"] = "PyPI"
df.to_csv(csv_path, index=False)
print("Saved:", csv_path, df.shape, list(df.columns))


Saved: sq1_pypi_exposure.csv (14418, 4) ['id', 'severity_group', 'days_to_fix', 'ecosystem']
