In [2]:
import pandas as pd
import numpy as np

# Load the raw dataset
df = pd.read_csv('Uitgebreide_VKM_dataset.csv')

print("=" * 60)
print("DATA CLEANING PROCESS")
print("=" * 60)
print(f"\nORIGINEEL: {df.shape[0]} rijen, {df.shape[1]} kolommen")

# 1. Drop the colour columns (Rood, Groen, Blauw, Geel).
#    `drop` returns a new frame, so the raw `df` stays untouched.
df_cleaned = df.drop(columns=['Rood', 'Groen', 'Blauw', 'Geel'])
print(f"\n1. Kleur-kolommen verwijderd -> {df_cleaned.shape[1]} kolommen")

# 2. Fill missing shortdescription values with the first 200 characters of
#    description. Rows whose description is missing as well stay null, which
#    is why the remaining null count is reported.
df_cleaned['shortdescription'] = df_cleaned['shortdescription'].fillna(
    df_cleaned['description'].str[:200]
)
print(f"2. shortdescription: {df_cleaned['shortdescription'].isna().sum()} NULL waarden blijvend")

# 3. Fill missing learningoutcomes with the placeholder "Nog niet bepaald"
df_cleaned['learningoutcomes'] = df_cleaned['learningoutcomes'].fillna('Nog niet bepaald')
print(f"3. learningoutcomes: {df_cleaned['learningoutcomes'].isna().sum()} NULL waarden blijvend")

# 4. Make sure start_date holds valid datetimes.
#    Remember which values were already missing BEFORE parsing, so that only
#    values that failed to parse are reported as invalid; counting every NaT
#    afterwards would also include the originally empty cells.
start_date_missing = df_cleaned['start_date'].isna()
df_cleaned['start_date'] = pd.to_datetime(df_cleaned['start_date'], errors='coerce')
invalid_dates = (df_cleaned['start_date'].isna() & ~start_date_missing).sum()
print(f"4. start_date: {invalid_dates} ongeldige datums geconverteerd naar NaT")

# 5. Check for duplicate ids (reported only, rows are not removed)
duplicates = df_cleaned.duplicated(subset=['id']).sum()
print(f"5. Duplicaten: {duplicates} gevonden")

print("\n" + "=" * 60)
print("FINALE DATASET STATUS")
print("=" * 60)
print(f"Rijen: {df_cleaned.shape[0]}")
print(f"Kolommen: {df_cleaned.shape[1]}")
print(f"\nKolommen in cleaned dataset:")
print(df_cleaned.columns.tolist())
print(f"\nMissing values in cleaned dataset:")
# Total number of null cells across all columns
print(df_cleaned.isnull().sum().sum())

# Save the cleaned frame without the index column
output_file = 'Uitgebreide_VKM_dataset_cleaned.csv'
df_cleaned.to_csv(output_file, index=False)
print(f"\nOPGESLAGEN: {output_file}")


ModuleNotFoundError: No module named 'pandas'

In [None]:

# Verification: reload the cleaned dataset from disk and inspect it.
# CSV stores dates as plain text, so start_date is parsed again on load;
# without parse_dates the dtype report below would list it as object.
df_cleaned = pd.read_csv(
    'Uitgebreide_VKM_dataset_cleaned.csv',
    parse_dates=['start_date'],
)

print("=" * 60)
print("VERIFICATIE VAN CLEANED DATASET")
print("=" * 60)
print(f"\nShape: {df_cleaned.shape}")
print(f"\nData types:")
print(df_cleaned.dtypes)
print(f"\n\nAantal NULL waarden per kolom:")
print(df_cleaned.isnull().sum())
print(f"\n\nTotaal NULL waarden: {df_cleaned.isnull().sum().sum()}")
# Duplicates are determined on the id column only
print(f"\nDuplicaten: {df_cleaned.duplicated(subset=['id']).sum()}")

# Show a sample of the data
print("\n" + "=" * 60)
print("SAMPLE VAN CLEANED DATA (eerste 3 rijen)")
print("=" * 60)
print(df_cleaned[['id', 'name', 'shortdescription', 'learningoutcomes']].head(3))
