In [1]:
import pandas as pd

DATA_FILE = "2024_fb_ads_president_scored_anon.csv"

# Read the CSV named by DATA_FILE into a DataFrame.
df = pd.read_csv(DATA_FILE)

# Split the column names into numeric-dtype and non-numeric-dtype lists.
numeric_cols = list(df.select_dtypes(include="number").columns)
non_numeric_cols = list(df.select_dtypes(exclude="number").columns)

# === Function to print structured summary ===
def summarize(df_part, label, numeric_cols=None, non_numeric_cols=None):
    """Print a banner-titled plain-text summary of ``df_part``.

    Numeric columns are summarised with ``DataFrame.describe()`` (transposed,
    rounded to 2 decimals). Every other column is summarised by its number of
    distinct non-null values and its most frequent value.

    Parameters
    ----------
    df_part : pandas.DataFrame
        The frame (or slice / group of a frame) to summarise.
    label : str
        Title printed between two 60-character ``=`` rules.
    numeric_cols : list, optional
        Columns to pass to ``describe()``. When omitted, they are detected
        from ``df_part`` itself via ``select_dtypes(include="number")``.
    non_numeric_cols : list, optional
        Columns to summarise by frequency. When omitted, they are detected
        from ``df_part`` via ``select_dtypes(exclude="number")``.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"{label}")
    print(f"{banner}")

    if df_part.empty:
        print("No records to display.")
        return

    # Detect the column split from the frame being summarised instead of
    # reading module-level lists: the function then works on any frame and
    # does not depend on which cells happened to run before it.
    if numeric_cols is None:
        numeric_cols = df_part.select_dtypes(include="number").columns.tolist()
    if non_numeric_cols is None:
        non_numeric_cols = df_part.select_dtypes(exclude="number").columns.tolist()

    # Numeric columns
    if numeric_cols:
        print("\n📊 Numeric Columns:")
        desc = df_part[numeric_cols].describe().transpose()
        print(desc.round(2))

    # Non-numeric columns: skip the section header when there is nothing to list.
    if not non_numeric_cols:
        return

    print("\n📝 Non-Numeric Columns:")
    for col in non_numeric_cols:
        vc = df_part[col].value_counts(dropna=True)
        if vc.empty:
            # Column holds only missing values in this slice; nothing to report.
            continue
        # value_counts() is sorted by descending frequency, so the first entry
        # is the mode, and its length is the number of distinct non-null values.
        most_common = vc.index[0]
        most_count = vc.iloc[0]
        unique_count = len(vc)
        print(f"- {col}:")
        print(f"    Unique values  : {unique_count}")
        print(f"    Most frequent  : '{most_common}' ({most_count} times)")

# === Overall summary ===
summarize(df, "🌐 Overall Dataset Summary")

# === Grouped by page_id (first 5) ===
MAX_PAGE_GROUPS = 5  # number of page_id groups to print
# Iterating a groupby yields (key, sub-frame) pairs one at a time. Pairing it
# with range() inside zip() stops after MAX_PAGE_GROUPS groups instead of
# building a list with a sub-frame for every page_id and then slicing it.
for _, (pid, group) in zip(range(MAX_PAGE_GROUPS), df.groupby("page_id")):
    summarize(group, f"📁 Grouped by page_id = {pid}")

# === Grouped by page_id + ad_id (Top 10 most common) ===
TOP_COMBO_COUNT = 10  # number of (page_id, ad_id) pairs to print
combo_sizes = df.groupby(["page_id", "ad_id"]).size()
# nlargest() returns the biggest groups in descending order and, on ties,
# keeps the first occurrences, so the selection is reproducible run to run
# (a default sort_values() is not stable and may pick tied pairs arbitrarily).
top_combos = combo_sizes.nlargest(TOP_COMBO_COUNT).index
for pid, aid in top_combos:
    sub_df = df[(df["page_id"] == pid) & (df["ad_id"] == aid)]
    summarize(sub_df, f"🔗 Grouped by page_id = {pid}, ad_id = {aid}")


🌐 Overall Dataset Summary

📊 Numeric Columns:
                                              count       mean        std  \
estimated_audience_size                    246745.0  556462.86  409864.76   
estimated_impressions                      246745.0   45601.53  136790.77   
estimated_spend                            246745.0    1061.29    4992.56   
scam_illuminating                          246745.0       0.07       0.26   
election_integrity_Truth_illuminating      246745.0       0.05       0.22   
advocacy_msg_type_illuminating             246745.0       0.55       0.50   
issue_msg_type_illuminating                246745.0       0.38       0.49   
attack_msg_type_illuminating               246745.0       0.27       0.44   
image_msg_type_illuminating                246745.0       0.22       0.42   
cta_msg_type_illuminating                  246745.0       0.57       0.49   
engagement_cta_subtype_illuminating        246745.0       0.12       0.33   
fundraising_cta_subtype_illum