In [1]:
import polars as pl

DATA_FILE = "2024_tw_posts_president_scored_anon.csv"

# Read the CSV, then keep only the first occurrence of each column name while
# preserving the original column order (dict keys are unique and ordered).
df = pl.read_csv(DATA_FILE)
df = df.select(list(dict.fromkeys(df.columns)))

# Partition columns by dtype: Float64/Int64 columns go to the numeric list,
# Utf8 columns to the non-numeric list; any other dtype lands in neither.
numeric_cols = [
    name for name, dtype in df.schema.items() if dtype in (pl.Float64, pl.Int64)
]
non_numeric_cols = [name for name, dtype in df.schema.items() if dtype == pl.Utf8]

# === Summary Printer ===
def summarize(df_part: pl.DataFrame, label: str) -> None:
    """Print a banner followed by numeric and categorical summaries.

    The columns to summarise are taken from the module-level ``numeric_cols``
    and ``non_numeric_cols`` lists, so ``df_part`` is expected to contain
    those columns.

    Args:
        df_part: The frame (or filtered subset) to describe.
        label: Title printed between the two banner rules.
    """
    banner = "=" * 60
    print(f"\n{banner}\n{label}\n{banner}")

    if df_part.is_empty():
        print("No data available.\n")
        return

    if numeric_cols:
        print("\n📊 Numeric Summary:")
        # Cast everything to Float64 so describe() reports one uniform dtype.
        numeric_data = df_part.select([pl.col(col).cast(pl.Float64) for col in numeric_cols])
        print(numeric_data.describe())

    print("\n📝 Categorical Summary:")
    for col in non_numeric_cols:
        unique_vals = df_part.select(pl.col(col).n_unique()).item()
        # sort=True orders the counts descending; without it the row order is
        # unspecified and row 0 would not be the most frequent value.
        vc = df_part.select(pl.col(col).value_counts(sort=True)).unnest(col)
        if vc.height > 0:
            # On ties, whichever tied value polars places first is reported.
            most_common = vc[0, col]
            most_count = vc[0, "count"]
            print(f"- {col}:")
            print(f"    Unique values : {unique_vals}")
            print(f"    Most frequent : '{most_common}' ({most_count} times)")
    print()

# === Overall Summary ===
summarize(df, "🌐 Overall Twitter Posts Summary")

# === Grouped by id (first 5 unique tweets)
if "id" in df.columns:
    # maintain_order=True keeps ids in order of first appearance, so these are
    # really the first five distinct ids and the selection is reproducible.
    top_ids = df.select("id").unique(maintain_order=True).head(5)["id"]
    for tweet_id in top_ids:
        group_df = df.filter(pl.col("id") == tweet_id)
        summarize(group_df, f"📁 Grouped by id = {tweet_id}")

# === Grouped by id + source (top 10 tweet-source combos)
if {"id", "source"}.issubset(df.columns):
    # Rank (id, source) pairs by how many rows they cover and keep the ten
    # largest; the order among pairs with equal counts is not specified.
    top_10 = (
        df.group_by(["id", "source"])
        .agg(pl.len().alias("entry_count"))
        .sort("entry_count", descending=True)
        .head(10)
    )

    for tweet_id, source, _ in top_10.iter_rows():
        group_df = df.filter((pl.col("id") == tweet_id) & (pl.col("source") == source))
        summarize(group_df, f"🔗 Grouped by id = {tweet_id}, source = {source}")


🌐 Overall Twitter Posts Summary

📊 Numeric Summary:
shape: (9, 37)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ retweetCo ┆ replyCoun ┆ likeCount ┆ … ┆ incivilit ┆ scam_illu ┆ freefair_ ┆ fraud_il │
│ ---       ┆ unt       ┆ t         ┆ ---       ┆   ┆ y_illumin ┆ minating  ┆ illuminat ┆ luminati │
│ str       ┆ ---       ┆ ---       ┆ f64       ┆   ┆ ating     ┆ ---       ┆ ing       ┆ ng       │
│           ┆ f64       ┆ f64       ┆           ┆   ┆ ---       ┆ f64       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆           ┆ f64       ┆ f64      │
╞═════