# 1️⃣ Imports and configuration

In [None]:
from pathlib import Path
from typing import Tuple, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

pd.set_option("display.max_colwidth", 200)


# 2️⃣ Utility functions

In [None]:

def _normalize_token(x: Any) -> str:
    try:
        s = str(x).strip().lower()
    except Exception:
        s = ""
    return s.rstrip(".,")


# 3️⃣ Load dataset

In [None]:
# Location of the raw dataset; the reader is chosen from the file extension.
file_path = Path("../Data/Raw/Uitgebreide_VKM_dataset.csv")
suffix = file_path.suffix.lower()

if suffix == ".parquet":
    df = pd.read_parquet(file_path)
elif suffix == ".json":
    # Try a plain records-oriented document first; on any failure retry
    # the same file as line-delimited JSON.
    try:
        df = pd.read_json(file_path, orient="records", lines=False)
    except Exception:
        df = pd.read_json(file_path, lines=True)
elif suffix in {".xlsx", ".xls"}:
    df = pd.read_excel(file_path)
elif suffix in {".csv", ".tsv", ".txt"}:
    # Only .tsv is read as tab-delimited; .csv and .txt use a comma.
    sep = "," if suffix != ".tsv" else "\t"
    df = pd.read_csv(file_path, sep=sep, low_memory=False)
else:
    raise ValueError(f"Unsupported file type: {suffix}")

# Notebook display of the first rows.
df.head()


# 4️⃣ Inspect dataset

In [None]:
# Dataset overview: shape, dtypes, head, missingness, numeric summary,
# categorical value counts, start_date parsing and memory footprint.
print("Shape:", df.shape)
print("\nColumns and dtypes:")
print(df.dtypes)

print("\nFirst 5 rows:")
print(df.head())

# Missingness: percentage of nulls per column, unless a global 'miss'
# already exists, in which case that one is printed instead.
if 'miss' not in globals():
    miss_local = df.isna().mean() * 100
    print("\nMissing %:")
    print(miss_local.sort_values(ascending=False))
else:
    print("\nMissing % (existing variable 'miss'):")
    print(miss.sort_values(ascending=False))

# Numeric summary: a non-empty global 'num' takes precedence; otherwise
# the numeric columns are selected from df.
if 'num' not in globals() or num.empty:
    num_local = df.select_dtypes(include=[np.number])
    print("\nNumeric summary:")
    print(num_local.describe().transpose())
else:
    print("\nNumeric summary (existing variable 'num'):")
    print(num.describe().transpose())

# Categorical columns: honour a global 'cat_cols' (restricted to columns
# that are actually present in df) without overwriting it.
if 'cat_cols' not in globals():
    cats = df.select_dtypes(include=['object', 'category']).columns.tolist()
else:
    cats = [name for name in cat_cols if name in df.columns]

print("\nCategorical columns to inspect:", cats)
for col in cats:
    print(f"\n--- {col} ---")
    try:
        # Ten most frequent values, counting missing values as their own bucket.
        vc = df[col].value_counts(dropna=False).head(10)
        print("Top values:\n", vc)
        print("Unique (non-null):", df[col].nunique(dropna=True))
    except Exception as err:
        # Best effort: report the problem and move on to the next column.
        print("Could not compute value_counts:", err)

# start_date: values that fail to parse become NaT because of errors='coerce'.
if 'start_date' in df:
    dates = pd.to_datetime(df['start_date'], errors='coerce')
    print("\nstart_date parsing:")
    print("Nulls after parsing:", dates.isna().sum())
    print("Top parsed dates:")
    print(dates.value_counts().head())

# Names of the object/category columns.
text_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Memory usage in MB; deep=True includes the contents of object columns.
print("\nMemory usage (MB):", df.memory_usage(deep=True).sum() / (1024 * 1024))

# 5️⃣ Numeric histograms


In [None]:
# Histogram grid for the numeric columns (at most the first 100).
# A global 'num' is reused when an earlier cell defined one; otherwise the
# numeric columns are taken from df, so this cell does not raise NameError
# when the notebook is run top to bottom.
hist_source = num if 'num' in globals() else df.select_dtypes(include=[np.number])

if not hist_source.empty:
    max_cols = 100
    plot_cols = hist_source.columns[:max_cols]
    # Up to four plots per row; each subplot gets roughly 4x3 inches.
    ncols = min(4, len(plot_cols))
    nrows = math.ceil(len(plot_cols) / ncols)
    figsize = (4 * ncols, 3 * nrows)
    # Pass the layout explicitly: figsize is computed for this grid, and
    # DataFrame.hist would otherwise choose its own arrangement.
    hist_source[plot_cols].hist(bins=20, figsize=figsize, layout=(nrows, ncols))
    plt.suptitle(f"Histograms: {file_path.name}")
    # Leave the top 5% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
