In [3]:
import polars as pl

DATA_FILE = "2024_fb_ads_president_scored_anon.csv"

# Read the CSV and keep a single copy of every column name, preserving the
# order in which each name first appears.
df = pl.read_csv(DATA_FILE)
df = df.select([pl.col(name) for name in dict.fromkeys(df.columns)])

# Split column names by dtype: 64-bit floats/ints are treated as numeric,
# string columns as categorical. Columns of any other dtype land in neither list.
numeric_cols = [
    name for name, dtype in df.schema.items() if dtype in (pl.Float64, pl.Int64)
]
non_numeric_cols = [name for name, dtype in df.schema.items() if dtype == pl.Utf8]

# === Summary Printer ===
def summarize(df_part: pl.DataFrame, label: str) -> None:
    """Print a numeric and categorical summary of ``df_part`` under a banner.

    Args:
        df_part: Frame to summarise. It must contain the columns listed in
            the module-level ``numeric_cols`` and ``non_numeric_cols``.
        label: Heading printed between two 60-character ``=`` rules.

    When ``df_part`` has no rows, only the banner and "No data available."
    are printed.
    """
    rule = "=" * 60
    print(f"\n{rule}\n{label}\n{rule}")

    if df_part.is_empty():
        print("No data available.\n")
        return

    # Numeric: cast everything to Float64 so describe() reports one dtype.
    if numeric_cols:
        print("\n📊 Numeric Summary:")
        numeric_data = df_part.select([pl.col(col).cast(pl.Float64) for col in numeric_cols])
        print(numeric_data.describe())

    # Non-numeric
    print("\n📝 Categorical Summary:")
    for col in non_numeric_cols:
        unique_vals = df_part.select(pl.col(col).n_unique()).item()
        # value_counts() returns rows in no particular order unless sort=True
        # is passed; without it row 0 is NOT the most frequent value.
        vc = df_part.select(pl.col(col).value_counts(sort=True)).unnest(col)
        if vc.height > 0:
            most_common = vc[0, col]
            most_count = vc[0, "count"]
            print(f"- {col}:")
            print(f"    Unique values : {unique_vals}")
            print(f"    Most frequent : '{most_common}' ({most_count} times)")
    print()

# === Overall Summary ===
summarize(df, "🌐 Overall Facebook Ads Summary")

# === Grouped by page_id (first 5)
# unique() gives no ordering guarantee by default, so "first 5" would be an
# arbitrary, run-dependent pick. maintain_order=True keeps the page_ids in the
# order they first appear in the data.
if "page_id" in df.columns:
    unique_ids = df.select("page_id").unique(maintain_order=True).head(5)["page_id"]
    for page_id in unique_ids:
        group_df = df.filter(pl.col("page_id") == page_id)
        summarize(group_df, f"📁 Grouped by page_id = {page_id}")

# === Grouped by page_id + ad_id (top 10 combos)
# Summarise the ten (page_id, ad_id) pairs with the most rows.
if {"page_id", "ad_id"}.issubset(df.columns):
    # group_by output order is unspecified and the sort is not stable unless
    # requested, so pairs with equal counts could be picked differently on
    # every run. maintain_order=True on both steps breaks ties by first
    # appearance in the data, making the top 10 reproducible.
    top_10 = (
        df.group_by(["page_id", "ad_id"], maintain_order=True)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True, maintain_order=True)
        .head(10)
    )

    for row in top_10.iter_rows():
        page_id, ad_id, _ = row
        group_df = df.filter((pl.col("page_id") == page_id) & (pl.col("ad_id") == ad_id))
        summarize(group_df, f"🔗 Grouped by page_id = {page_id}, ad_id = {ad_id}")



🌐 Overall Facebook Ads Summary

📊 Numeric Summary:
shape: (9, 32)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ estimated ┆ estimated ┆ estimated ┆ … ┆ womens_is ┆ incivilit ┆ freefair_ ┆ fraud_il │
│ ---       ┆ _audience ┆ _impressi ┆ _spend    ┆   ┆ sue_topic ┆ y_illumin ┆ illuminat ┆ luminati │
│ str       ┆ _size     ┆ ons       ┆ ---       ┆   ┆ _illumina ┆ ating     ┆ ing       ┆ ng       │
│           ┆ ---       ┆ ---       ┆ f64       ┆   ┆ tin…      ┆ ---       ┆ ---       ┆ ---      │
│           ┆ f64       ┆ f64       ┆           ┆   ┆ ---       ┆ f64       ┆ f64       ┆ f64      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 246745.0  ┆ 246745.0  ┆ 246745.0  ┆ … ┆ 246745.0  ┆ 246745.0  ┆ 246745.0  ┆ 246745.0 │
│ null_coun ┆ 0.0       