In [1]:
import polars as pl

DATA_FILE = "2024_tw_posts_president_scored_anon.csv"

# Read the CSV, then keep only the first occurrence of every column name.
# dict.fromkeys() drops repeated names while preserving their original order.
df = pl.read_csv(DATA_FILE)
df = df.select([pl.col(name) for name in dict.fromkeys(df.columns)])

# Partition the column names by dtype: Float64/Int64 columns are treated as
# numeric, Utf8 columns as categorical. Columns of any other dtype land in
# neither list.
numeric_cols = [
    name
    for name, dtype in zip(df.columns, df.dtypes)
    if dtype in (pl.Float64, pl.Int64)
]
non_numeric_cols = [
    name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Utf8
]

# === Summary Printer ===
def summarize(df_part: pl.DataFrame, label: str) -> None:
    """Print a labelled numeric and categorical summary of ``df_part``.

    The columns that get summarized are taken from the module-level
    ``numeric_cols`` and ``non_numeric_cols`` lists, so ``df_part`` is
    expected to contain those columns.

    Args:
        df_part: The frame (or filtered slice of a frame) to summarize.
        label: Heading printed between two rules of ``=`` characters.

    Returns:
        None. Everything is written to stdout.
    """
    rule = "=" * 60
    print(f"\n{rule}\n{label}\n{rule}")

    if df_part.is_empty():
        print("No data available.\n")
        return

    if numeric_cols:
        print("\n📊 Numeric Summary:")
        # Cast to Float64 so describe() reports every column in one dtype.
        numeric_data = df_part.select(
            [pl.col(col).cast(pl.Float64) for col in numeric_cols]
        )
        print(numeric_data.describe())

    print("\n📝 Categorical Summary:")
    for col in non_numeric_cols:
        unique_vals = df_part.select(pl.col(col).n_unique()).item()
        # sort=True orders the counts from most to least frequent. Without it
        # the row order is unspecified and row 0 is not the most common value.
        vc = df_part.select(pl.col(col).value_counts(sort=True)).unnest(col)
        if vc.height > 0:
            most_common = vc[0, col]
            most_count = vc[0, "count"]
            print(f"- {col}:")
            print(f"    Unique values : {unique_vals}")
            print(f"    Most frequent : '{most_common}' ({most_count} times)")
    print()

# === Overall Summary ===
summarize(df, "🌐 Overall Twitter Posts Summary")

# === Grouped by id (first 5 unique tweets)
if "id" in df.columns:
    # maintain_order=True keeps the ids in file order, so "first 5" is the
    # same on every run; plain unique() returns rows in an unspecified order.
    top_ids = df.select("id").unique(maintain_order=True).head(5)["id"]
    for tweet_id in top_ids:
        # eq_missing treats null == null as a match, so a null id selects its
        # own rows instead of producing an empty frame.
        group_df = df.filter(pl.col("id").eq_missing(tweet_id))
        summarize(group_df, f"📁 Grouped by id = {tweet_id}")

# === Grouped by id + source (top 10 tweet-source combos)
if {"id", "source"}.issubset(df.columns):
    # Count rows per (id, source) pair and keep the 10 largest groups.
    # maintain_order on both the grouping and the sort makes ties resolve in
    # file order, so the selected combos are reproducible across runs.
    top_10 = (
        df.group_by(["id", "source"], maintain_order=True)
        .agg(pl.len().alias("entry_count"))
        .sort("entry_count", descending=True, maintain_order=True)
        .head(10)
    )

    for tweet_id, source, _ in top_10.iter_rows():
        # group_by keeps null keys as their own group; eq_missing lets such a
        # group match its rows, whereas `==` against null matches nothing.
        group_df = df.filter(
            pl.col("id").eq_missing(tweet_id)
            & pl.col("source").eq_missing(source)
        )
        summarize(group_df, f"🔗 Grouped by id = {tweet_id}, source = {source}")


🌐 Overall Twitter Posts Summary

📊 Numeric Summary:
shape: (9, 37)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ retweetCo ┆ replyCoun ┆ likeCount ┆ … ┆ incivilit ┆ scam_illu ┆ freefair_ ┆ fraud_il │
│ ---       ┆ unt       ┆ t         ┆ ---       ┆   ┆ y_illumin ┆ minating  ┆ illuminat ┆ luminati │
│ str       ┆ ---       ┆ ---       ┆ f64       ┆   ┆ ating     ┆ ---       ┆ ing       ┆ ng       │
│           ┆ f64       ┆ f64       ┆           ┆   ┆ ---       ┆ f64       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 27304.0   ┆ 27304.0   ┆ 27304.0   ┆ … ┆ 26034.0   ┆ 26034.0   ┆ 27304.0   ┆ 27304.0  │
│ null_coun ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 1270.0    ┆ 1270.0    ┆ 0.0       ┆ 0.0      │
│ t         ┆          