In [1]:
from itertools import islice

import pandas as pd

# Input CSV read by this cell; the path is relative to the working directory.
DATA_FILE = "2024_tw_posts_president_scored_anon.csv"

# Read the whole file with pandas' default dtype inference.
# NOTE(review): the captured output shows quoteId / inReplyToId inferred as
# floats (~1.76e18, with many missing rows). IDs of that magnitude may lose
# precision as float64 -- confirm whether they should be read as strings.
df = pd.read_csv(DATA_FILE)

# Split the column names by dtype: anything pandas classifies as "number"
# goes in one list, every remaining column in the other.
numeric_cols = list(df.select_dtypes(include="number").columns)
non_numeric_cols = list(df.select_dtypes(exclude="number").columns)

# === Structured Summary Printer ===
def summarize(df_part: pd.DataFrame, label: str = "Summary") -> None:
    """Print a structured summary of *df_part* under a banner titled *label*.

    The numeric / non-numeric column split is derived from *df_part* itself,
    so the function works on any DataFrame and does not depend on
    module-level column lists. Row subsets of a frame keep their dtypes, so
    slices of one frame are summarised with the same column split.

    Args:
        df_part: The frame (or slice of a frame) to summarise.
        label: Heading printed between the two banner rules.

    Returns:
        None. All output goes to stdout.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(label)
    print(banner)

    if df_part.empty:
        print("No data available.\n")
        return

    numeric = df_part.select_dtypes(include="number").columns.tolist()
    non_numeric = df_part.select_dtypes(exclude="number").columns.tolist()

    if numeric:
        print("\n📊 Numeric Columns:")
        # One row per column (count, mean, std, ...), rounded for display.
        print(df_part[numeric].describe().transpose().round(2))

    # Skip the section header entirely when there is nothing to list,
    # mirroring how the numeric section is guarded.
    if not non_numeric:
        return

    print("\n📝 Non-Numeric Columns:")
    for col in non_numeric:
        # value_counts sorts by descending frequency, so the first entry is
        # the most frequent non-null value.
        counts = df_part[col].value_counts(dropna=True)
        if counts.empty:
            # Column holds only missing values in this slice: nothing to report.
            continue
        print(f"- {col}:")
        print(f"    Unique values  : {df_part[col].nunique(dropna=True)}")
        print(f"    Most frequent  : '{counts.index[0]}' ({counts.iloc[0]} times)")

# === Overall summary ===
summarize(df, "🌐 Overall Twitter Posts Summary")

# === Grouped by 'id' (first 5 groups, in the groupby's sorted key order)
# Iterating a groupby yields (key, sub-frame) pairs lazily, so islice stops
# after five groups instead of building a DataFrame for every distinct id
# the way list(df.groupby("id"))[:5] did.
for tweet_id, group in islice(df.groupby("id"), 5):
    summarize(group, f"📁 Group: id = {tweet_id}")

# === Grouped by 'id + quoteId' (Top 10 frequent combos)
# groupby drops rows whose key is missing by default, so only rows with both
# an id and a quoteId are counted here.
top_combos = df.groupby(["id", "quoteId"]).size().sort_values(ascending=False).head(10).index
for tid, qid in top_combos:
    group = df[(df["id"] == tid) & (df["quoteId"] == qid)]
    summarize(group, f"🔗 Group: id = {tid}, quoteId = {qid}")



🌐 Overall Twitter Posts Summary

📊 Numeric Columns:
                                             count          mean  \
retweetCount                               27304.0  1.322060e+03   
replyCount                                 27304.0  1.063790e+03   
likeCount                                  27304.0  6.913690e+03   
quoteCount                                 27304.0  1.280800e+02   
viewCount                                  27304.0  5.070847e+05   
bookmarkCount                              27304.0  1.362100e+02   
quoteId                                     3287.0  1.764298e+18   
inReplyToId                                 3345.0  1.758286e+18   
election_integrity_Truth_illuminating      26034.0  4.000000e-02   
advocacy_msg_type_illuminating             26034.0  5.600000e-01   
issue_msg_type_illuminating                26034.0  5.100000e-01   
attack_msg_type_illuminating               26034.0  3.100000e-01   
image_msg_type_illuminating                26034.0  2.300000e-0