#Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import chi2, SelectKBest, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    silhouette_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import matplotlib.pyplot as plt
import logging
import re
from urllib.parse import urlparse

# Configure logging
logging.basicConfig(level=logging.ERROR)

try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive is gekoppeld!")
except ModuleNotFoundError:
    print("Niet in Google Colab, slaan Google Drive-mount over.")



Niet in Google Colab, slaan Google Drive-mount over.


#Controle

In [None]:
import os
import numpy as np

def count_html_files(pad):
    return sum(f.lower().endswith(('.html', '.htm')) for f in os.listdir(pad))

dataset_paden = {
    "Benign 2018": "C:/Users/cihat/Downloads/Phishing_dataset1/benign/2018/2018",
    "Benign 2019": "C:/Users/cihat/Downloads/Phishing_dataset1/benign/2019/2019",
    "Benign 2020": "C:/Users/cihat/Downloads/Phishing_dataset1/benign/2020/2020",
    "Malicious 2018": "C:/Users/cihat/Downloads/Phishing_dataset1/malicious/2018/2018",
    "Malicious 2019": "C:/Users/cihat/Downloads/Phishing_dataset1/malicious/2019/2019",
    "Malicious 2020": "C:/Users/cihat/Downloads/Phishing_dataset1/malicious/2020/2020",
}

print("Aantal HTML-bestanden per jaar:")
for naam, pad in dataset_paden.items():
    if os.path.exists(pad):
        print(f"{naam}: {count_html_files(pad)} bestanden")
    else:
        print(f"{naam}: Pad niet gevonden")

data_path = "C:/Users/cihat/Downloads/Phishing_dataset1/"
npy_files = []

# Enkel zoeken naar .npy-bestanden
for root, _, files in os.walk(data_path):
    for file in files:
        if file.endswith(".npy"):
            npy_files.append(os.path.join(root, file))

print("\nGevonden .npy bestanden:")
print("\n".join(npy_files) if npy_files else "Geen .npy bestanden gevonden.")

if npy_files:
    print("\nLaden van .npy bestanden...")
    for file in npy_files:
        data = np.load(file)
        print(f"Bestand: {file}, Vorm: {data.shape}")


Aantal HTML-bestanden per jaar:
Benign 2018: 17462 bestanden
Benign 2019: 18521 bestanden
Benign 2020: 92068 bestanden
Malicious 2018: 16063 bestanden
Malicious 2019: 65075 bestanden
Malicious 2020: 158370 bestanden

Gevonden .npy bestanden:
C:/Users/cihat/Downloads/Phishing_dataset1/output\features_benign_2018_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\features_benign_2019_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\features_benign_2020_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\features_malicious_2018_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\features_malicious_2019_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\features_malicious_2020_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\labels_benign_2018_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\labels_benign_2019_balanced.npy
C:/Users/cihat/Downloads/Phishing_dataset1/output\labels_benign_2020_balanced.npy

#Feature Extractie & Data balancering

In [None]:
from gensim.models import KeyedVectors
import gensim.downloader as api

print("Model downloaden en inladen...")
fasttext_model = api.load('fasttext-wiki-news-subwords-300')  # Engels model
print("Model geladen!")

# Test het model
word_vector = fasttext_model['example']  # Vervang "example" door een woord
print("Vector voor 'example':", word_vector[:10])  # Eerste 10 dimensies

Model downloaden...
Model geladen!
Vector voor 'example': [ 0.0073217 -0.018045   0.038147  -0.0031285 -0.058175  -0.0020996
 -0.0076067 -0.12056   -0.081749  -0.039819 ]


##2018

In [None]:
import os
import numpy as np
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
import gensim.downloader as api
import chardet
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Voor voortgangsindicator

# Voorgetraind FastText-model laden
print("Voorgetraind FastText-model downloaden en laden...")
fasttext_model = api.load('fasttext-wiki-news-subwords-300')
print("FastText-model succesvol geladen!")

# Basisdatasetpaden
benign_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\benign\2018\2018")
malicious_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\malicious\2018\2018")
output_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\output")
output_path.mkdir(parents=True, exist_ok=True)

# Logbestand voor errors
log_file = output_path / "error_log_2018.txt"

def log_error(message):
    with open(log_file, 'a') as log:
        log.write(f"{message}\n")

# Functie om tekst uit HTML-bestanden te extraheren
def extract_text_from_html(file_path):
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()
            detected_encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
        with open(file_path, 'r', encoding=detected_encoding, errors='ignore') as file:
            soup = BeautifulSoup(file, 'lxml')
            return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        log_error(f"Fout bij bestand {file_path}: {e}")
        return None

# Functie om FastText-vectoren te genereren
def generate_fasttext_vectors(text, model):
    words = text.split()
    vectors = [model[word] for word in words if word in model.key_to_index]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Functie om bestanden veilig te verwerken
def process_html_file(file_path, label):
    try:
        text = extract_text_from_html(file_path)
        if text:
            vector = generate_fasttext_vectors(text, fasttext_model)
            return vector, label
    except Exception as e:
        log_error(f"Fout bij verwerking van bestand {file_path}: {e}")
    return None, None

# Parallelle verwerking
def extract_features_parallel(file_paths, label):
    features = []
    labels = []

    print(f"{len(file_paths)} bestanden gevonden.")
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(tqdm(executor.map(lambda f: process_html_file(f, label), file_paths),
                            total=len(file_paths), desc="Bezig met verwerken"))

    for vector, lbl in results:
        if vector is not None:
            features.append(vector)
            labels.append(lbl)

    return np.array(features), np.array(labels)

# Selecteer bestanden
benign_files = list(benign_path.glob("*.htm*"))
malicious_files = list(malicious_path.glob("*.htm*"))

# Beperk malicious bestanden tot het aantal benign bestanden
min_size = min(len(benign_files), len(malicious_files))
benign_files = benign_files[:min_size]
malicious_files = malicious_files[:min_size]

print(f"Aantal benign bestanden: {len(benign_files)}")
print(f"Aantal malicious bestanden: {len(malicious_files)}")

# Verwerk benign bestanden
print("Feature-extractie voor benign bestanden starten...")
features_benign, labels_benign = extract_features_parallel(benign_files, label=0)

# Verwerk malicious bestanden
print("Feature-extractie voor malicious bestanden starten...")
features_malicious, labels_malicious = extract_features_parallel(malicious_files, label=1)

# Sla de gebalanceerde features en labels apart op
np.save(output_path / "features_benign_2018_balanced.npy", features_benign)
np.save(output_path / "labels_benign_2018_balanced.npy", labels_benign)
np.save(output_path / "features_malicious_2018_balanced.npy", features_malicious)
np.save(output_path / "labels_malicious_2018_balanced.npy", labels_malicious)

print("Balancering voltooid!")
print(f"Benign samples: {len(features_benign)}")
print(f"Malicious samples: {len(features_malicious)}")
print("Vorm van benign features:", features_benign.shape)
print("Vorm van malicious features:", features_malicious.shape)
print("Problemen gelogd in:", log_file)


Voorgetraind FastText-model downloaden en laden...
FastText-model succesvol geladen!
Aantal benign bestanden: 16063
Aantal malicious bestanden: 16063
Feature-extractie voor benign bestanden starten...
16063 bestanden gevonden.


Bezig met verwerken: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16063/16063 [1:29:45<00:00,  2.98it/s]


Feature-extractie voor malicious bestanden starten...
16063 bestanden gevonden.


Bezig met verwerken: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16063/16063 [22:35<00:00, 11.85it/s]

Balancering voltooid!
Benign samples: 15853
Malicious samples: 15781
Vorm van benign features: (15853, 300)
Vorm van malicious features: (15781, 300)
Problemen gelogd in: \\?\C:\Users\cihat\Downloads\Phishing_dataset1\output\error_log_2018.txt





##2019

In [None]:
import os
import numpy as np
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
import gensim.downloader as api
import chardet
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Voor voortgangsindicator

# Voorgetraind FastText-model laden
print("Voorgetraind FastText-model downloaden en laden...")
fasttext_model = api.load('fasttext-wiki-news-subwords-300')
print("FastText-model succesvol geladen!")

# Basisdatasetpaden
benign_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\benign\2019\2019")
malicious_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\malicious\2019\2019")
output_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\output")
output_path.mkdir(parents=True, exist_ok=True)

# Logbestand voor errors
log_file = output_path / "error_log_2019.txt"

def log_error(message):
    with open(log_file, 'a') as log:
        log.write(f"{message}\n")

# Functie om tekst uit HTML-bestanden te extraheren
def extract_text_from_html(file_path):
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()
            detected_encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
        with open(file_path, 'r', encoding=detected_encoding, errors='ignore') as file:
            soup = BeautifulSoup(file, 'lxml')
            return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        log_error(f"Fout bij bestand {file_path}: {e}")
        return None

# Functie om FastText-vectoren te genereren
def generate_fasttext_vectors(text, model):
    words = text.split()
    vectors = [model[word] for word in words if word in model.key_to_index]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Functie om bestanden veilig te verwerken
def process_html_file(file_path, label):
    try:
        text = extract_text_from_html(file_path)
        if text:
            vector = generate_fasttext_vectors(text, fasttext_model)
            return vector, label
    except Exception as e:
        log_error(f"Fout bij verwerking van bestand {file_path}: {e}")
    return None, None

# Parallelle verwerking
def extract_features_parallel(file_paths, label):
    features = []
    labels = []

    print(f"{len(file_paths)} bestanden gevonden.")
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(tqdm(executor.map(lambda f: process_html_file(f, label), file_paths),
                            total=len(file_paths), desc="Bezig met verwerken"))

    for vector, lbl in results:
        if vector is not None:
            features.append(vector)
            labels.append(lbl)

    return np.array(features), np.array(labels)

# Selecteer bestanden
benign_files = list(benign_path.glob("*.htm*"))
malicious_files = list(malicious_path.glob("*.htm*"))

# Beperk malicious bestanden tot het aantal benign bestanden
min_size = min(len(benign_files), len(malicious_files))
benign_files = benign_files[:min_size]
malicious_files = malicious_files[:min_size]

print(f"Aantal benign bestanden: {len(benign_files)}")
print(f"Aantal malicious bestanden: {len(malicious_files)}")

# Verwerk benign bestanden
print("Feature-extractie voor benign bestanden starten...")
#features_benign, labels_benign = extract_features_parallel(benign_files, label=0)

# Verwerk malicious bestanden
print("Feature-extractie voor malicious bestanden starten...")
features_malicious, labels_malicious = extract_features_parallel(malicious_files, label=1)

# Sla de gebalanceerde features en labels apart op
np.save(output_path / "features_benign_2019_balanced.npy", features_benign)
np.save(output_path / "labels_benign_2019_balanced.npy", labels_benign)
np.save(output_path / "features_malicious_2019_balanced.npy", features_malicious)
np.save(output_path / "labels_malicious_2019_balanced.npy", labels_malicious)

print("Balancering voltooid!")
print(f"Benign samples: {len(features_benign)}")
print(f"Malicious samples: {len(features_malicious)}")
print("Vorm van benign features:", features_benign.shape)
print("Vorm van malicious features:", features_malicious.shape)
print("Problemen gelogd in:", log_file)


Voorgetraind FastText-model downloaden en laden...
FastText-model succesvol geladen!
Aantal benign bestanden: 18521
Aantal malicious bestanden: 18521
Feature-extractie voor benign bestanden starten...
Feature-extractie voor malicious bestanden starten...
18521 bestanden gevonden.


Bezig met verwerken: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18521/18521 [51:55<00:00,  5.95it/s]


Balancering voltooid!
Benign samples: 18039
Malicious samples: 16067
Vorm van benign features: (18039, 300)
Vorm van malicious features: (16067, 300)
Problemen gelogd in: \\?\C:\Users\cihat\Downloads\Phishing_dataset1\output\error_log_2019.txt


##2020

In [None]:
import os
import numpy as np
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
import gensim.downloader as api
import chardet
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # Voor voortgangsindicator

# Voorgetraind FastText-model laden
print("Voorgetraind FastText-model downloaden en laden...")
fasttext_model = api.load('fasttext-wiki-news-subwords-300')
print("FastText-model succesvol geladen!")

# Basisdatasetpaden
benign_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\benign\2020\2020")
malicious_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\malicious\2020\2020")
output_path = Path(r"\\?\C:\Users\cihat\Downloads\Phishing_dataset1\output")
output_path.mkdir(parents=True, exist_ok=True)

# Logbestand voor errors
log_file = output_path / "error_log_2020.txt"

def log_error(message):
    with open(log_file, 'a') as log:
        log.write(f"{message}\n")

# Functie om tekst uit HTML-bestanden te extraheren
def extract_text_from_html(file_path):
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()
            detected_encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
        with open(file_path, 'r', encoding=detected_encoding, errors='ignore') as file:
            soup = BeautifulSoup(file, 'lxml')
            return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        log_error(f"Fout bij bestand {file_path}: {e}")
        return None

# Functie om FastText-vectoren te genereren
def generate_fasttext_vectors(text, model):
    words = text.split()
    vectors = [model[word] for word in words if word in model.key_to_index]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Functie om bestanden veilig te verwerken
def process_html_file(file_path, label):
    try:
        text = extract_text_from_html(file_path)
        if text:
            vector = generate_fasttext_vectors(text, fasttext_model)
            return vector, label
    except Exception as e:
        log_error(f"Fout bij verwerking van bestand {file_path}: {e}")
    return None, None

# Parallelle verwerking
def extract_features_parallel(file_paths, label):
    features = []
    labels = []

    print(f"{len(file_paths)} bestanden gevonden.")
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(tqdm(executor.map(lambda f: process_html_file(f, label), file_paths),
                            total=len(file_paths), desc="Bezig met verwerken"))

    for vector, lbl in results:
        if vector is not None:
            features.append(vector)
            labels.append(lbl)

    return np.array(features), np.array(labels)

# Selecteer bestanden
benign_files = list(benign_path.glob("*.htm*"))
malicious_files = list(malicious_path.glob("*.htm*"))

# Beperk malicious bestanden tot het aantal benign bestanden
min_size = min(len(benign_files), len(malicious_files))
benign_files = benign_files[:min_size]
malicious_files = malicious_files[:min_size]

print(f"Aantal benign bestanden: {len(benign_files)}")
print(f"Aantal malicious bestanden: {len(malicious_files)}")

# Verwerk benign bestanden
print("Feature-extractie voor benign bestanden starten...")
features_benign, labels_benign = extract_features_parallel(benign_files, label=0)

# Verwerk malicious bestanden
print("Feature-extractie voor malicious bestanden starten...")
features_malicious, labels_malicious = extract_features_parallel(malicious_files, label=1)

# Sla de gebalanceerde features en labels apart op
np.save(output_path / "features_benign_2020_balanced.npy", features_benign)
np.save(output_path / "labels_benign_2020_balanced.npy", labels_benign)
np.save(output_path / "features_malicious_2020_balanced.npy", features_malicious)
np.save(output_path / "labels_malicious_2020_balanced.npy", labels_malicious)

print("Balancering voltooid!")
print(f"Benign samples: {len(features_benign)}")
print(f"Malicious samples: {len(features_malicious)}")
print("Vorm van benign features:", features_benign.shape)
print("Vorm van malicious features:", features_malicious.shape)
print("Problemen gelogd in:", log_file)


Voorgetraind FastText-model downloaden en laden...
FastText-model succesvol geladen!
Aantal benign bestanden: 93115
Aantal malicious bestanden: 93115
Feature-extractie voor benign bestanden starten...
93115 bestanden gevonden.


Bezig met verwerken: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93115/93115 [5:04:48<00:00,  5.09it/s]


Feature-extractie voor malicious bestanden starten...
93115 bestanden gevonden.


Bezig met verwerken: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93115/93115 [2:59:09<00:00,  8.66it/s]


Balancering voltooid!
Benign samples: 87762
Malicious samples: 72785
Vorm van benign features: (87762, 300)
Vorm van malicious features: (72785, 300)
Problemen gelogd in: \\?\C:\Users\cihat\Downloads\Phishing_dataset1\output\error_log_2020.txt


#Normaliseren & Data splitsing

In [None]:
from tensorflow.keras import models, layers
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import os

# Controleer en laad de datasets
def check_and_load_data(path, file_name):
    file_path = os.path.join(path, file_name)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Bestand niet gevonden: {file_path}")
    return np.load(file_path)

output_path = r"/content/drive/MyDrive/Afstuderen/Deadline/output"
# Features laden
X_benign_2018 = check_and_load_data(output_path, "features_benign_2018_balanced.npy")
X_benign_2019 = check_and_load_data(output_path, "features_benign_2019_balanced.npy")
X_benign_2020 = check_and_load_data(output_path, "features_benign_2020_balanced.npy")
X_phish_2018 = check_and_load_data(output_path, "features_malicious_2018_balanced.npy")
X_phish_2019 = check_and_load_data(output_path, "features_malicious_2019_balanced.npy")
X_phish_2020 = check_and_load_data(output_path, "features_malicious_2020_balanced.npy")

# Labels laden
y_benign_2018 = check_and_load_data(output_path, "labels_benign_2018_balanced.npy")
y_benign_2019 = check_and_load_data(output_path, "labels_benign_2019_balanced.npy")
y_benign_2020 = check_and_load_data(output_path, "labels_benign_2020_balanced.npy")
y_phish_2018 = check_and_load_data(output_path, "labels_malicious_2018_balanced.npy")
y_phish_2019 = check_and_load_data(output_path, "labels_malicious_2019_balanced.npy")
y_phish_2020 = check_and_load_data(output_path, "labels_malicious_2020_balanced.npy")

# Combineer benign en malicious data en labels
X_2018 = np.vstack([X_benign_2018, X_phish_2018])
X_2019 = np.vstack([X_benign_2019, X_phish_2019])
X_2020 = np.vstack([X_benign_2020, X_phish_2020])

y_2018 = np.hstack([y_benign_2018, y_phish_2018])
y_2019 = np.hstack([y_benign_2019, y_phish_2019])
y_2020 = np.hstack([y_benign_2020, y_phish_2020])

# Combineer data van alle jaren om de schaal te bepalen
X_all = np.vstack([X_2018, X_2019, X_2020])  # Combineer 2018, 2019, en 2020

# Initialiseer de scaler en bereken de schaal over alle jaren
scaler = MinMaxScaler()
scaler.fit(X_all)  # Bereken X_min en X_max over de volledige dataset

# Pas de schaal consistent toe op elk jaar
X_2018_scaled = scaler.transform(X_2018)
X_2019_scaled = scaler.transform(X_2019)
X_2020_scaled = scaler.transform(X_2020)

# -------------------------
# Splits data in Train, Validation en Test sets met stratificatie
# -------------------------
def split_data(X, y, test_size=0.4, val_size=0.5, random_state=42):
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=val_size, stratify=y_temp, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train_2018, X_val_2018, X_test_2018, y_train_2018, y_val_2018, y_test_2018 = split_data(X_2018_scaled, y_2018)
X_train_2019, X_val_2019, X_test_2019, y_train_2019, y_val_2019, y_test_2019 = split_data(X_2019_scaled, y_2019)
X_train_2020, X_val_2020, X_test_2020, y_train_2020, y_val_2020, y_test_2020 = split_data(X_2020_scaled, y_2020)

# Controleer de verdeling per set en jaar
def print_class_distribution(y, year, dataset_name):
    unique, counts = np.unique(y, return_counts=True)
    distribution = dict(zip(unique, counts))
    print(f"{year} {dataset_name} verdeling: {distribution}")

for year, datasets in zip(
    ["2018", "2019", "2020"],
    [
        (y_train_2018, y_val_2018, y_test_2018),
        (y_train_2019, y_val_2019, y_test_2019),
        (y_train_2020, y_val_2020, y_test_2020),
    ],
):
    print_class_distribution(datasets[0], year, "Train")
    print_class_distribution(datasets[1], year, "Validation")
    print_class_distribution(datasets[2], year, "Test")



2018 Train verdeling: {0: 9512, 1: 9468}
2018 Validation verdeling: {0: 3170, 1: 3157}
2018 Test verdeling: {0: 3171, 1: 3156}
2019 Train verdeling: {0: 10823, 1: 9640}
2019 Validation verdeling: {0: 3608, 1: 3213}
2019 Test verdeling: {0: 3608, 1: 3214}
2020 Train verdeling: {0: 52657, 1: 43671}
2020 Validation verdeling: {0: 17552, 1: 14557}
2020 Test verdeling: {0: 17553, 1: 14557}
