In [1]:
import csv
import math
from collections import defaultdict, Counter

FILE_PATH = "2024_tw_posts_president_scored_anon.csv"

def is_number(s):
    """Return True if *s* parses as a float once commas are removed.

    Args:
        s: Candidate value, normally a string such as ``"1,234.5"``.

    Returns:
        True when ``float()`` accepts the comma-stripped text; False when it
        does not, or when *s* is not a string-like object (e.g. ``None``).
    """
    try:
        float(s.replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: s has no .replace (None, int, ...).
        # TypeError / ValueError: the stripped value is not float-parsable.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return False
    return True

def to_number(s):
    """Convert *s* to a float, ignoring commas used as thousands separators.

    Args:
        s: Candidate value, normally a string such as ``"1,234.5"``.

    Returns:
        The parsed float, or ``None`` when *s* cannot be parsed or is not a
        string-like object.
    """
    try:
        return float(s.replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures mean "not a number"; interpreter-level
        # exceptions such as KeyboardInterrupt are allowed to propagate.
        return None

def mean(lst):
    """Return the arithmetic mean of *lst*, or 0 when *lst* is empty."""
    if not lst:
        # Empty input: report 0 rather than dividing by zero.
        return 0
    total = sum(lst)
    return total / len(lst)

def std_dev(lst):
    """Return the population standard deviation of *lst*.

    The squared deviations are divided by ``len(lst)`` (not ``len(lst) - 1``).
    Inputs with fewer than two values yield 0.
    """
    n = len(lst)
    if n < 2:
        return 0
    center = mean(lst)
    squared_dev_total = sum((value - center) ** 2 for value in lst)
    return math.sqrt(squared_dev_total / n)

def read_csv(file_path):
    """Load a UTF-8 CSV file into a list of row dictionaries.

    The first line of the file supplies the column names (``csv.DictReader``
    default). The whole file is read into memory.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, mapping column name to the raw
        string value.
    """
    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are preserved instead of being rewritten by
    # universal-newline translation.
    with open(file_path, mode="r", encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))

def summarize(data, label="Overall Summary"):
    """Print a per-column summary of a list of row dictionaries.

    The columns are taken from the keys of the first row. For each column the
    non-blank string values are collected (after stripping whitespace):

    * If at least one value parses as a number, the column is reported as
      numeric (count, mean, min, max, population standard deviation) using
      only the values that parsed; the rest are ignored.
    * Otherwise, if any values remain, the column is reported as categorical
      (number of distinct values and the most frequent one).
    * Columns with no non-blank values print nothing.

    Args:
        data: Sequence of dict rows, e.g. the output of ``csv.DictReader``.
        label: Heading printed above the summary.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n{'=' * 60}")
    print(f"{label}")
    print(f"{'=' * 60}")

    if not data:
        print("No data found.\n")
        return

    for col in data[0].keys():
        values = []
        for row in data:
            cell = row.get(col)
            # csv.DictReader stores None for fields missing from a short row
            # and a list (under the None key) for surplus fields; neither can
            # be stripped, so only real strings are considered.
            if not isinstance(cell, str):
                continue
            cell = cell.strip()
            if cell:
                values.append(cell)

        # Parse each value once; to_number() returns None for non-numbers.
        parsed = (to_number(v) for v in values)
        numeric_vals = [num for num in parsed if num is not None]

        if numeric_vals:
            print(f"\n📊 {col} (Numeric)")
            print(f"    Count   : {len(numeric_vals)}")
            print(f"    Mean    : {mean(numeric_vals):.2f}")
            print(f"    Min     : {min(numeric_vals)}")
            print(f"    Max     : {max(numeric_vals)}")
            print(f"    Std Dev : {std_dev(numeric_vals):.2f}")
        elif values:
            counter = Counter(values)
            # values is non-empty here, so most_common(1) always has an entry.
            top_value, top_count = counter.most_common(1)[0]
            print(f"\n📝 {col} (Categorical)")
            print(f"    Unique Values : {len(counter)}")
            print(f"    Most Frequent : '{top_value}' ({top_count} times)")
    print()

def group_by(data, keys):
    """Bucket rows by the tuple of their values for the given columns.

    Args:
        data: Iterable of dict rows.
        keys: Column names whose values, in order, form the bucket key.

    Returns:
        A ``defaultdict(list)`` mapping each key tuple to the rows that share
        it, with buckets and rows kept in first-seen order.
    """
    buckets = defaultdict(list)
    for record in data:
        bucket_key = tuple([record[field] for field in keys])
        buckets[bucket_key].append(record)
    return buckets

if __name__ == "__main__":
    # Load every row of the dataset into memory.
    data = read_csv(FILE_PATH)

    # Statistics over the whole dataset.
    summarize(data, label="🌐 Overall Twitter Dataset Summary")

    # Statistics for the first five "id" groups, in file order.
    id_groups = group_by(data, ["id"])
    for key, rows in list(id_groups.items())[:5]:
        summarize(rows, label=f"📁 Group: id = {key[0]}")

    # Statistics for the first five ("id", "quoteId") groups, in file order.
    id_quote_groups = group_by(data, ["id", "quoteId"])
    for key, rows in list(id_quote_groups.items())[:5]:
        summarize(rows, label=f"🔗 Group: id = {key[0]}, quoteId = {key[1]}")



🌐 Overall Twitter Dataset Summary

📝 id (Categorical)
    Unique Values : 27304
    Most Frequent : 'cc46051622b8a9c1b883a3bbf12c640b12ac1cbdc7f48a773b6cc2a65f03aa2d' (1 times)

📝 url (Categorical)
    Unique Values : 27304
    Most Frequent : 'f70a206472e9deaf6e313297c1efb891729ced346a0aeb34e16935d78f74b937' (1 times)

📝 source (Categorical)
    Unique Values : 14
    Most Frequent : 'Twitter Web App' (14930 times)

📊 retweetCount (Numeric)
    Count   : 27304
    Mean    : 1322.06
    Min     : 0.0
    Max     : 144615.0
    Std Dev : 3404.94

📊 replyCount (Numeric)
    Count   : 27304
    Mean    : 1063.79
    Min     : 0.0
    Max     : 121270.0
    Std Dev : 3174.92

📊 likeCount (Numeric)
    Count   : 27304
    Mean    : 6913.69
    Min     : 0.0
    Max     : 915221.0
    Std Dev : 21589.91

📊 quoteCount (Numeric)
    Count   : 27304
    Mean    : 128.08
    Min     : 0.0
    Max     : 123320.0
    Std Dev : 1131.51

📊 viewCount (Numeric)
    Count   : 27304
    Mean    : 50708