In [2]:
import csv
import math
from collections import defaultdict, Counter

FILE_PATH = "2024_fb_ads_president_scored_anon.csv"

def is_number(s):
    """Return True when ``s`` can be converted with ``float()``.

    Args:
        s: Value to probe, typically the text of a CSV cell.

    Returns:
        bool: True if ``float(s)`` succeeds, False otherwise.

    Note:
        ``float()`` also accepts spellings such as "nan", "inf" and
        "infinity", so those strings count as numbers here.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # ValueError: unparsable text; TypeError: unsupported type such as
        # None; OverflowError: an int too large to fit in a float.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return False
    return True

def mean(lst):
    """Return the arithmetic mean of ``lst``, or 0 when ``lst`` is empty."""
    # Guard first so the division below never sees a zero length.
    if not lst:
        return 0
    total = sum(lst)
    return total / len(lst)

def std_dev(lst):
    """Return the population standard deviation of ``lst``.

    The squared deviations are divided by ``len(lst)`` (not ``len(lst) - 1``).
    Sequences with fewer than two items yield 0.
    """
    count = len(lst)
    if count < 2:
        return 0

    center = mean(lst)
    squared_deviations = ((value - center) ** 2 for value in lst)
    return math.sqrt(sum(squared_deviations) / count)

def read_csv(file_path):
    """Load a UTF-8 CSV file into a list of row dictionaries.

    Args:
        file_path: Path of the CSV file; the first line is used as the header.

    Returns:
        list[dict]: One dict per data row, as produced by ``csv.DictReader``
        (header names mapped to the cell strings).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # newline="" lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields come through unmodified.
    with open(file_path, encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))

def summarize(data, label="Overall Summary"):
    """Print a per-column summary of ``data`` under a banner titled ``label``.

    Columns are taken from the keys of the first row. For every column the
    non-blank cell strings are collected (after stripping whitespace), then:

    * if at least one of them parses as a float, the column is reported as
      numeric (count, mean, min, max, std dev) using only the values that
      parse; values that do not parse are left out of those figures;
    * otherwise, if any values remain, it is reported as categorical
      (number of distinct values and the most frequent one).

    Columns with no usable values print nothing.

    Args:
        data: Sequence of row dicts, e.g. the result of ``read_csv``.
        label: Heading printed between the banner lines.

    Returns:
        None. All output goes to stdout.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"{label}")
    print(f"{banner}")

    if not data:
        print("No records available.\n")
        return

    for col in data[0].keys():
        values = []
        for row in data:
            cell = row.get(col)
            # csv.DictReader fills short rows with None and stores surplus
            # cells as a list under the None key; neither is cell text, so
            # treat them as missing instead of crashing on .strip().
            if not isinstance(cell, str):
                continue
            cell = cell.strip()
            if cell:
                values.append(cell)

        numeric_vals = [float(v) for v in values if is_number(v)]

        if numeric_vals:
            print(f"\n📊 {col} (Numeric)")
            print(f"    Count   : {len(numeric_vals)}")
            print(f"    Mean    : {mean(numeric_vals):.2f}")
            print(f"    Min     : {min(numeric_vals)}")
            print(f"    Max     : {max(numeric_vals)}")
            print(f"    Std Dev : {std_dev(numeric_vals):.2f}")
        elif values:
            value, occurrences = Counter(values).most_common(1)[0]
            print(f"\n📝 {col} (Categorical)")
            print(f"    Unique Values : {len(set(values))}")
            print(f"    Most Frequent : '{value}' ({occurrences} times)")
    print()

def group_by(data, keys):
    """Bucket rows by the values they hold for ``keys``.

    Args:
        data: Iterable of row mappings.
        keys: Column names whose values, in order, form the group key.

    Returns:
        defaultdict(list): Maps each key tuple to the rows sharing it, in
        their original order.

    Raises:
        KeyError: If a row lacks one of ``keys``.
    """
    buckets = defaultdict(list)
    for record in data:
        # Tuples are hashable, so the combined column values can key the dict.
        group_key = tuple(map(record.__getitem__, keys))
        buckets[group_key].append(record)
    return buckets

if __name__ == "__main__":
    # Script entry point: load the whole file, then print three levels of
    # summaries (entire dataset, per page, per page/ad pair).
    data = read_csv(FILE_PATH)

    # Whole-dataset statistics.
    summarize(data, label="🌐 Overall Dataset Summary")

    # Per-page statistics; only the first 5 groups (in first-seen order)
    # are printed to keep the output readable.
    page_groups = group_by(data, ["page_id"])
    for page_id in list(page_groups)[:5]:
        group = page_groups[page_id]
        summarize(group, label=f"📁 Group Summary: page_id = {page_id[0]}")

    # Per (page, ad) statistics, again limited to the first 5 combinations.
    page_ad_groups = group_by(data, ["page_id", "ad_id"])
    for key in list(page_ad_groups)[:5]:
        summarize(page_ad_groups[key], label=f"🔗 Group Summary: page_id = {key[0]}, ad_id = {key[1]}")



🌐 Overall Dataset Summary

📝 page_id (Categorical)
    Unique Values : 4475
    Most Frequent : '4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d' (55503 times)

📝 ad_id (Categorical)
    Unique Values : 246745
    Most Frequent : '0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc' (1 times)

📝 ad_creation_time (Categorical)
    Unique Values : 547
    Most Frequent : '2024-10-27' (8619 times)

📝 bylines (Categorical)
    Unique Values : 3786
    Most Frequent : 'HARRIS FOR PRESIDENT' (49788 times)

📝 currency (Categorical)
    Unique Values : 18
    Most Frequent : 'USD' (246599 times)

📝 delivery_by_region (Categorical)
    Unique Values : 141122
    Most Frequent : '{}' (30989 times)

📝 demographic_distribution (Categorical)
    Unique Values : 215622
    Most Frequent : '{}' (30989 times)

📊 estimated_audience_size (Numeric)
    Count   : 246745
    Mean    : 556462.86
    Min     : 0.0
    Max     : 1000001.0
    Std Dev : 409863.93

📊 estimated_imp