In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook relies on (tokenizer models,
# stop-word lists and WordNet data). Newer NLTK releases read the Punkt
# tokenizer from the 'punkt_tab' resource rather than the pickled 'punkt'
# package, so both are requested to keep word_tokenize working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

INPUT_FILE = "Uitgebreide_VKM_dataset.csv"
OUTPUT_FILE = "Uitgebreide_VKM_dataset_cleaned.csv"

# Columns that are cleaned as free text
TEXT_COLS = [
    "shortdescription",
    "description",
    "content",
    "learningoutcomes"
]

# Stop words and lemmatizer
# Choose 'english' or 'dutch' depending on the dataset
STOP_LANG = "dutch"   # or "english"
stop_words = set(stopwords.words(STOP_LANG))
# NOTE(review): WordNetLemmatizer is backed by the English WordNet; confirm it
# is the intended lemmatizer when STOP_LANG is "dutch".
lemmatizer = WordNetLemmatizer()

def normalize_length(tokens, max_len=200):
    """Return at most the first ``max_len`` items of ``tokens``.

    Capping every text at the same maximum length reduces the skew caused by
    a few very long texts.
    """
    truncated = tokens[0:max_len]
    return truncated


def normalize_text(text: str) -> str:
    """Clean a raw text value and return it as a space-joined token string.

    Steps: lowercase, replace everything that is not a letter, digit or
    whitespace by a space, tokenize with ``word_tokenize``, drop stop words
    (``stop_words``) and tokens of two characters or fewer, lemmatize each
    token with ``lemmatizer`` and cap the result at 200 tokens via
    ``normalize_length``.

    Args:
        text: Raw cell value. Missing values (as detected by ``pd.isna``) are
            accepted; any other value is converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()

    # Keep all letters and digits, whatever the script or accent. ``\w`` is
    # Unicode-aware for str patterns, so characters such as "ã", "å" or "ß"
    # are no longer turned into spaces (which used to split words in two).
    # ``\w`` also matches "_", which is not a letter or digit, hence the
    # explicit alternative.
    lowered = re.sub(r"[^\w\s]|_", " ", lowered)

    tokens = word_tokenize(lowered)

    # Drop stop words and very short tokens (1-2 characters)
    tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 2]

    # Lemmatize
    # NOTE(review): the module-level lemmatizer is a WordNetLemmatizer, which
    # is based on the English WordNet; confirm this is intended for Dutch text.
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    tokens = normalize_length(tokens, max_len=200)

    return " ".join(tokens)



def clean_vkm_dataset(input_file: str, output_file: str) -> pd.DataFrame:
    """Load the VKM CSV, clean it, write the result to CSV and return it.

    The steps, each reported on stdout, are:

    1. Drop the colour columns ``Rood``, ``Groen``, ``Blauw`` and ``Geel``
       when present.
    2. Fill missing ``shortdescription`` values with the first 200 characters
       of ``description`` (rows without a description stay missing).
    3. Fill missing ``learningoutcomes`` with ``"Nog niet bepaald"``.
    4. Parse ``start_date`` to datetime; unparsable values become ``NaT``.
    5. Drop duplicate rows by ``id``.
    6. Add a ``<col>_clean`` column for every column listed in ``TEXT_COLS``
       by applying ``normalize_text``.
    7. Print a null-value summary and a three-row sample.

    Steps whose columns are absent are skipped with a message.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the cleaned CSV is written to (index not included).

    Returns:
        The cleaned DataFrame.
    """
    # Load the data
    df = pd.read_csv(input_file)

    print("=" * 60)
    print("DATA CLEANING PROCESS")
    print("=" * 60)
    print(f"Origineel: {df.shape[0]} rijen, {df.shape[1]} kolommen")

    # Work on a copy so the frame as loaded stays untouched
    df_cleaned = df.copy()

    # 1. Drop the colour columns (Rood, Groen, Blauw, Geel) if they exist
    kleur_kolommen = ["Rood", "Groen", "Blauw", "Geel"]
    bestaande_kleuren = [c for c in kleur_kolommen if c in df_cleaned.columns]
    if bestaande_kleuren:
        df_cleaned = df_cleaned.drop(columns=bestaande_kleuren)
        print(f"\n1. Kleur-kolommen verwijderd: {bestaande_kleuren}")
    else:
        print("\n1. Geen kleur-kolommen gevonden om te verwijderen")

    print(f"   -> {df_cleaned.shape[1]} kolommen resterend")

    # 2. Fill empty shortdescription values with the first 200 chars of description
    if "shortdescription" in df_cleaned.columns and "description" in df_cleaned.columns:
        before_nulls = df_cleaned["shortdescription"].isna().sum()
        description = df_cleaned["description"]
        # astype(str) turns a missing description into the literal text "nan".
        # Mask those rows back to NaN so "nan" is never stored as a short
        # description and the NULL count printed below stays truthful.
        fallback = description.astype(str).str[:200].where(description.notna())
        df_cleaned["shortdescription"] = df_cleaned["shortdescription"].fillna(fallback)
        after_nulls = df_cleaned["shortdescription"].isna().sum()
        print("\n2. shortdescription aangevuld met description (eerste 200 chars)")
        print(f"   Voor: {before_nulls} NULL, Na: {after_nulls} NULL")
    else:
        print("\n2. Kolommen shortdescription of description ontbreken, stap overgeslagen")

    # 3. Fill empty learningoutcomes values with a placeholder
    # NOTE(review): learningoutcomes is also in TEXT_COLS, so this placeholder
    # text is passed through normalize_text in step 6 like real content.
    if "learningoutcomes" in df_cleaned.columns:
        before_nulls = df_cleaned["learningoutcomes"].isna().sum()
        df_cleaned["learningoutcomes"] = df_cleaned["learningoutcomes"].fillna("Nog niet bepaald")
        after_nulls = df_cleaned["learningoutcomes"].isna().sum()
        print("\n3. learningoutcomes aangevuld met 'Nog niet bepaald'")
        print(f"   Voor: {before_nulls} NULL, Na: {after_nulls} NULL")
    else:
        print("\n3. Kolom learningoutcomes ontbreekt, stap overgeslagen")

    # 4. Convert start_date to a valid datetime
    if "start_date" in df_cleaned.columns:
        raw_dates = df_cleaned["start_date"]
        parsed_dates = pd.to_datetime(raw_dates, errors="coerce")
        # Count only values that were present but could not be parsed; rows
        # that were already empty are missing, not invalid.
        invalid_dates = (parsed_dates.isna() & raw_dates.notna()).sum()
        df_cleaned["start_date"] = parsed_dates
        print("\n4. start_date geconverteerd naar datetime")
        print(f"   Ongeldige datums naar NaT: {invalid_dates}")
    else:
        print("\n4. Kolom start_date ontbreekt, stap overgeslagen")

    # 5. Drop duplicates on id
    if "id" in df_cleaned.columns:
        before = df_cleaned.shape[0]
        duplicates = df_cleaned.duplicated(subset=["id"]).sum()
        df_cleaned = df_cleaned.drop_duplicates(subset=["id"])
        after = df_cleaned.shape[0]
        print("\n5. Duplicaten op 'id'")
        print(f"   Gevonden: {duplicates}, Rijen voor: {before}, na: {after}")
    else:
        print("\n5. Kolom 'id' ontbreekt, duplicaten-check overgeslagen")

    # 6. Clean and lemmatize the text fields into new <col>_clean columns
    print("\n6. Tekstvelden normaliseren en lemmatiseren")
    for col in TEXT_COLS:
        if col in df_cleaned.columns:
            clean_col = f"{col}_clean"
            print(f"   - Verwerken: {col} -> {clean_col}")
            df_cleaned[clean_col] = df_cleaned[col].apply(normalize_text)
        else:
            print(f"   - Kolom {col} niet gevonden, overgeslagen")

    # 7. Global missing-value check
    total_nulls = df_cleaned.isnull().sum().sum()
    print("\n" + "=" * 60)
    print("FINALE DATASET STATUS")
    print("=" * 60)
    print(f"Rijen: {df_cleaned.shape[0]}")
    print(f"Kolommen: {df_cleaned.shape[1]}")
    print(f"Totaal NULL waarden: {total_nulls}")

    # Per-column breakdown, only for columns that still contain nulls
    print("\nNULL waarden per kolom (alleen > 0):")
    nulls_per_col = df_cleaned.isnull().sum()
    print(nulls_per_col[nulls_per_col > 0])

    # Show a small sample of the columns that exist
    sample_cols = [
        c
        for c in [
            "id",
            "name",
            "shortdescription",
            "shortdescription_clean",
            "learningoutcomes",
            "learningoutcomes_clean",
        ]
        if c in df_cleaned.columns
    ]
    print("\nSample (eerste 3 rijen):")
    print(df_cleaned[sample_cols].head(3))

    # Save
    df_cleaned.to_csv(output_file, index=False)
    print(f"\nOPGESLAGEN: {output_file}")

    return df_cleaned


# Run the cleaning pipeline when this code is executed directly and keep the
# result in df_cleaned at module level.
if __name__ == "__main__":
    df_cleaned = clean_vkm_dataset(input_file=INPUT_FILE, output_file=OUTPUT_FILE)


DATA CLEANING PROCESS
Origineel: 211 rijen, 20 kolommen

1. Kleur-kolommen verwijderd: ['Rood', 'Groen', 'Blauw', 'Geel']
   -> 16 kolommen resterend

2. shortdescription aangevuld met description (eerste 200 chars)
   Voor: 20 NULL, Na: 0 NULL

3. learningoutcomes aangevuld met 'Nog niet bepaald'
   Voor: 5 NULL, Na: 0 NULL

4. start_date geconverteerd naar datetime
   Ongeldige datums naar NaT: 0

5. Duplicaten op 'id'
   Gevonden: 0, Rijen voor: 211, na: 211

6. Tekstvelden normaliseren en lemmatiseren
   - Verwerken: shortdescription -> shortdescription_clean
   - Verwerken: description -> description_clean
   - Verwerken: content -> content_clean
   - Verwerken: learningoutcomes -> learningoutcomes_clean

FINALE DATASET STATUS
Rijen: 211
Kolommen: 20
Totaal NULL waarden: 0

NULL waarden per kolom (alleen > 0):
Series([], dtype: int64)

Sample (eerste 3 rijen):
    id                          name  \
0  159  Kennismaking met Psychologie   
1  160   Learning and working abroad   
2 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kloos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kloos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kloos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kloos\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Build one combined, cleaned text field per row.
# Source columns, in the order their cleaned text is concatenated.
fulltext_sources = ["name", "shortdescription", "description", "content", "learningoutcomes"]

fulltext_parts = []
for source_col in fulltext_sources:
    clean_col = f"{source_col}_clean"
    if clean_col in df_cleaned.columns:
        # fillna guards against NaN (e.g. empty strings read back from CSV),
        # which would otherwise turn the whole concatenated row into NaN.
        fulltext_parts.append(df_cleaned[clean_col].fillna("").astype(str))
    elif source_col in df_cleaned.columns:
        # No precomputed <col>_clean column (this is the case for "name",
        # which is not in TEXT_COLS): clean the raw column on the fly so its
        # text is not silently left out of the full text.
        fulltext_parts.append(df_cleaned[source_col].apply(normalize_text))

if fulltext_parts:
    combined_text = fulltext_parts[0]
    for part in fulltext_parts[1:]:
        combined_text = combined_text + " " + part
    # Empty parts leave doubled spaces behind; collapse them before trimming.
    df_cleaned["fulltext_clean"] = (
        combined_text.str.replace(r"\s+", " ", regex=True).str.strip()
    )
else:
    # None of the source columns exist: keep the column present but empty.
    df_cleaned["fulltext_clean"] = ""
