In [5]:
import os
from pathlib import Path
import time
import json

import pandas as pd
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import (
    StringType,
    IntegerType,
    LongType,
    FloatType,
    DoubleType,
    ShortType,
    DecimalType,
)

# Switch from the notebook's own directory to its parent, which the rest of
# the notebook treats as BASE_DIR. A bare relative os.chdir("..") climbs one
# more level every time this cell is re-run, so remember the directory reached
# on the first run and return to it on later runs.
if "_PROJECT_ROOT" not in globals():
    os.chdir("..")
    _PROJECT_ROOT = Path.cwd()
else:
    os.chdir(_PROJECT_ROOT)
print("CWD:", os.getcwd())

# Input locations, all resolved against the working directory set above.
BASE_DIR = Path(".").resolve()
DATA_DIR = BASE_DIR / "data"
INDEX_PATH = DATA_DIR / "dataset_index.csv"

# Output directory for the per-dataset categorical profiles (JSON files).
CAT_PROFILE_DIR = BASE_DIR / "profiles" / "categorical"
CAT_PROFILE_DIR.mkdir(parents=True, exist_ok=True)

# getOrCreate() reuses an already-running session, so re-running is safe.
spark = (
    SparkSession.builder
    .appName("NYC_Categorical_Profiling")
    .getOrCreate()
)

# One record (dict) per row of the dataset index; later cells read the
# "download_status", "dataset_id" and "local_path" keys from each record.
index_df = pd.read_csv(INDEX_PATH)
index_records = index_df.to_dict(orient="records")
len(index_records)


CWD: /home/jovyan


40

In [6]:
# Spark SQL data types regarded as numeric when deciding whether a column
# should be profiled as categorical (checked with isinstance on the schema).
numeric_types = (
    IntegerType, LongType, FloatType,
    DoubleType, ShortType, DecimalType,
)

def get_categorical_columns(sdf, max_distinct_numeric=20):
    """Return the sorted names of the columns to profile as categorical.

    A column qualifies when it is either
      * a string column, or
      * a numeric column (any type listed in ``numeric_types``) holding at
        most ``max_distinct_numeric`` distinct values.

    Args:
        sdf: Spark DataFrame whose schema and data are inspected.
        max_distinct_numeric: Largest distinct-value count at which a numeric
            column is still treated as categorical.

    Returns:
        Alphabetically sorted list of unique column names.
    """
    selected = set()
    # Only "is it above the threshold?" matters, so at most threshold + 1
    # distinct rows are ever needed; clamp at 0 because limit() rejects
    # negative values.
    probe_rows = max(max_distinct_numeric + 1, 0)

    for field in sdf.schema.fields:
        name = field.name

        if isinstance(field.dataType, StringType):
            selected.add(name)
            continue
        if not isinstance(field.dataType, numeric_types):
            continue

        # Backtick-quote the name (doubling embedded backticks) so names
        # containing dots or spaces resolve as a single column rather than
        # being parsed as struct-field access.
        quoted = "`" + name.replace("`", "``") + "`"
        try:
            distinct_cnt = sdf.select(quoted).distinct().limit(probe_rows).count()
        except Exception as exc:
            # Best effort: a column that cannot be counted is left out, but
            # say so instead of dropping it silently.
            print(f"  -> skipping numeric column {name!r}: {exc}")
            continue

        if distinct_cnt <= max_distinct_numeric:
            selected.add(name)

    return sorted(selected)


In [None]:
# Cell 3: main categorical profiling loop + JSON output + timing

TOP_N_VALUES = 100  # most-frequent values kept per column in the JSON profile


def _group_features_by_prefix(columns):
    """Group column names by the text before their first underscore.

    A name without an underscore forms its own group, because ``str.split``
    then returns the whole name as its first piece.
    """
    groups = {}
    for col_name in columns:
        groups.setdefault(col_name.split("_")[0], []).append(col_name)
    return groups


def _profile_column(sdf, col, total_rows, top_n=TOP_N_VALUES):
    """Build the value distribution of one column.

    Values are cast to string, trimmed and lower-cased before being counted.

    Args:
        sdf: Spark DataFrame holding the column.
        col: Raw column name (may contain dots, spaces or backticks).
        total_rows: Row count of ``sdf``, used as the denominator of
            ``percent``; when falsy (None or 0) ``percent`` is None.
        top_n: Maximum number of most-frequent values returned.

    Returns:
        ``{"unique_values": int, "top_values": [...]}``, or None when the
        grouping produced no rows.
    """
    # Backtick-quote the name, doubling embedded backticks, so dots and
    # spaces in it are not interpreted by Spark's column-name parser.
    quoted = "`" + col.replace("`", "``") + "`"
    norm_expr = F.lower(F.trim(F.col(quoted).cast("string"))).alias("value_norm")

    dist = sdf.groupBy(norm_expr).agg(F.count("*").alias("count"))

    # Secondary sort on the value keeps the top-N selection reproducible
    # when several values share the same count.
    top_rows = (
        dist.orderBy(F.desc("count"), F.asc("value_norm"))
        .limit(top_n)
        .collect()
    )
    if not top_rows:
        return None

    try:
        unique_values = dist.count()
    except Exception:
        # Best effort: fall back to the number of collected values.
        unique_values = len(top_rows)

    top_values = []
    for r in top_rows:
        count = int(r["count"])
        top_values.append(
            {
                "value": r["value_norm"],
                "count": count,
                # NOTE: despite the key name this is a fraction in [0, 1],
                # not a value scaled to 0-100.
                "percent": (count / total_rows) if total_rows else None,
            }
        )

    return {"unique_values": int(unique_values), "top_values": top_values}


profiling_stats = []

for row in index_records:
    # Skip datasets that didn't download successfully
    if row.get("download_status") != "ok":
        continue

    dataset_id = row["dataset_id"]
    local_path = row["local_path"]
    full_path = str(BASE_DIR / local_path)

    print(f"Profiling categorical columns for {dataset_id} ...")

    start = time.perf_counter()
    status = "ok"
    error = None
    categorical_cols = []

    # 1) Read CSV with Spark
    try:
        sdf = (
            spark.read
            .option("header", True)
            .option("inferSchema", True)
            .csv(full_path)
        )
    except Exception as e:
        status = "read_error"
        error = str(e)
        print(f"  -> READ ERROR: {error}")

    if status == "ok":
        # A Spark failure while profiling one dataset is recorded in the
        # stats instead of aborting the run for all remaining datasets.
        try:
            # 2) Detect categorical columns
            categorical_cols = get_categorical_columns(sdf)
            categorical_profile = {}

            # Every row lands in exactly one group (null is a group of its
            # own), so the per-column sum of counts equals the row count;
            # compute it once per dataset instead of once per column.
            total_rows = sdf.count()

            # 3) Build distributions per categorical column
            for col in categorical_cols:
                column_profile = _profile_column(sdf, col, total_rows)
                if column_profile is not None:
                    categorical_profile[col] = column_profile

            # 4) Simple feature grouping by prefix
            feature_groups = _group_features_by_prefix(sdf.columns)

            # 5) Save JSON for this dataset
            out_obj = {
                "dataset_id": dataset_id,
                "categorical_profile": categorical_profile,
                "feature_groups": feature_groups,
            }

            out_path = CAT_PROFILE_DIR / f"{dataset_id}_categorical.json"
            with out_path.open("w", encoding="utf-8") as f:
                json.dump(out_obj, f, indent=2)
        except Exception as e:
            status = "profile_error"
            error = str(e)
            print(f"  -> PROFILE ERROR: {error}")

    elapsed = time.perf_counter() - start

    profiling_stats.append(
        {
            "dataset_id": dataset_id,
            "status": status,
            "error": error,
            "seconds": elapsed,
            "num_categorical_cols": len(categorical_cols),
        }
    )

print("Done. Profiling stats count:", len(profiling_stats))


Profiling categorical columns for f9bf-2cp4 ...


                                                                                

Profiling categorical columns for x3bb-kg5j ...


                                                                                

Profiling categorical columns for zt9s-n5aj ...
Profiling categorical columns for s3k6-pzi2 ...


In [None]:
# Persist the per-dataset timing/status records next to the profiles and
# preview the first rows.
stats_csv_path = BASE_DIR / "profiles" / "categorical_profiling_times.csv"
stats_df = pd.DataFrame(profiling_stats)
stats_df.to_csv(stats_csv_path, index=False)
stats_df.head()
