In [1]:
import polars as pl

DATA_FILE = "2024_fb_posts_president_scored_anon.csv"

# Read the CSV and keep a single copy of each column name; dict.fromkeys
# drops repeats while preserving the order of first appearance.
df = pl.read_csv(DATA_FILE)
df = df.select(list(dict.fromkeys(df.columns)))

# Split the column names by dtype: 64-bit floats/ints are treated as numeric,
# string columns as categorical. Columns of any other dtype land in neither list.
numeric_cols = [
    name
    for name, dtype in df.schema.items()
    if dtype == pl.Float64 or dtype == pl.Int64
]
non_numeric_cols = [name for name, dtype in df.schema.items() if dtype == pl.Utf8]

# === Summary Printer ===
def summarize(df_part: pl.DataFrame, label: str) -> None:
    """Print a labelled numeric and categorical summary of *df_part*.

    The columns reported are taken from the module-level ``numeric_cols`` and
    ``non_numeric_cols`` lists, so both must be defined before this is called
    and every listed column must exist in *df_part*.

    Args:
        df_part: The frame (or filtered subset) to summarize.
        label: Heading printed between two 60-character ``=`` rules.
    """
    print(f"\n{'=' * 60}\n{label}\n{'=' * 60}")

    if df_part.is_empty():
        print("No data available.\n")
        return

    if numeric_cols:
        print("\n📊 Numeric Summary:")
        # Cast everything to Float64 so integer and float columns are
        # described with a uniform dtype.
        numeric_data = df_part.select([pl.col(col).cast(pl.Float64) for col in numeric_cols])
        print(numeric_data.describe())

    print("\n📝 Categorical Summary:")
    for col in non_numeric_cols:
        unique_vals = df_part.select(pl.col(col).n_unique()).item()
        # sort=True orders the counts descending; without it the row order is
        # not defined, so row 0 would not reliably be the most frequent value.
        vc = df_part.select(pl.col(col).value_counts(sort=True)).unnest(col)
        if vc.height > 0:
            most_common = vc[0, col]
            most_count = vc[0, "count"]
            print(f"- {col}:")
            print(f"    Unique values : {unique_vals}")
            print(f"    Most frequent : '{most_common}' ({most_count} times)")
    print()

# === Overall Summary ===
summarize(df, "🌐 Overall Facebook Posts Summary")

# === Grouped by Facebook_Id (first 5 IDs in order of first appearance)
if "Facebook_Id" in df.columns:
    # maintain_order=True keeps the IDs in the order they first occur; a plain
    # unique() returns them in arbitrary order, so "first 5" would not be
    # reproducible between runs.
    unique_ids = df.select("Facebook_Id").unique(maintain_order=True).head(5)["Facebook_Id"]
    for fb_id in unique_ids:
        # eq_missing treats null as equal to null, so a null ID still selects
        # its own rows instead of producing an empty group.
        group_df = df.filter(pl.col("Facebook_Id").eq_missing(fb_id))
        summarize(group_df, f"📁 Grouped by Facebook_Id = {fb_id}")

# === Grouped by Facebook_Id + post_id (top 10 combos by row count)
if all(col in df.columns for col in ["Facebook_Id", "post_id"]):
    # maintain_order on both steps makes ties between equal counts resolve by
    # first appearance in the data, so the selected top 10 is reproducible.
    top_10 = (
        df.group_by(["Facebook_Id", "post_id"], maintain_order=True)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True, maintain_order=True)
        .head(10)
    )

    for fb_id, post_id, _ in top_10.iter_rows():
        # group_by keeps null keys as their own group; eq_missing lets the
        # filter select those rows too (a plain == never matches null).
        group_df = df.filter(
            pl.col("Facebook_Id").eq_missing(fb_id)
            & pl.col("post_id").eq_missing(post_id)
        )
        summarize(group_df, f"🔗 Grouped by Facebook_Id = {fb_id}, post_id = {post_id}")


🌐 Overall Facebook Posts Summary

📊 Numeric Summary:
shape: (9, 41)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ Likes     ┆ Comments  ┆ Shares    ┆ … ┆ incivilit ┆ scam_illu ┆ freefair_ ┆ fraud_il │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ y_illumin ┆ minating  ┆ illuminat ┆ luminati │
│ str       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ating     ┆ ---       ┆ ing       ┆ ng       │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ f64       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆ f64       ┆ f64      │
‚ïû‚ïê‚ïê‚ïê‚ïê‚ï