In [2]:
import csv
import time
import math
from collections import defaultdict, Counter

def read_csv(file_path):
    with open(file_path, encoding='utf-8') as file:
        return list(csv.DictReader(file))

def is_numeric(value):
    try:
        float(value)
        return True
    except:
        return False

def identify_numeric_columns(data):
    return [
        col for col in data[0]
        if all(is_numeric(row[col]) or row[col] == '' for row in data[:100])
    ]

def get_numeric_stats(data, col):
    numbers = [float(row[col]) for row in data if is_numeric(row[col])]
    if not numbers:
        return {"count": 0, "mean": None, "min": None, "max": None, "std_dev": None}
    avg = sum(numbers) / len(numbers)
    std = math.sqrt(sum((x - avg) ** 2 for x in numbers) / len(numbers)) if len(numbers) > 1 else 0
    return {"count": len(numbers), "mean": avg, "min": min(numbers), "max": max(numbers), "std_dev": std}

def get_categorical_stats(data, col):
    values = [row[col] for row in data if row[col]]
    freq = Counter(values)
    return {"count": len(values), "unique": len(freq), "most_common": freq.most_common(1)[0] if freq else None}

def group_data(data, keys):
    result = defaultdict(list)
    for row in data:
        if all(k in row for k in keys):
            result[tuple(row[k] for k in keys)].append(row)
    return result

def analyze_file(file_path, label):
    print(f"\n🔍 Starting analysis for: {label}")
    data = read_csv(file_path)
    numeric_cols = identify_numeric_columns(data)

    print("\n📈 Overall Summary")
    for col in data[0]:
        print(f"\n{col}:")
        if col in numeric_cols:
            print(get_numeric_stats(data, col))
        else:
            print(get_categorical_stats(data, col))

    print("\n📂 Grouped by page_id:")
    for group_key, rows in list(group_data(data, ["page_id"]).items())[:3]:
        print(f"\nGroup: {group_key}")
        for col in numeric_cols:
            print(f"  {col}:", get_numeric_stats(rows, col))

    if "ad_id" in data[0]:
        print("\n📂 Grouped by page_id and ad_id:")
        for group_key, rows in list(group_data(data, ["page_id", "ad_id"]).items())[:3]:
            print(f"\nGroup: {group_key}")
            for col in numeric_cols:
                print(f"  {col}:", get_numeric_stats(rows, col))

# Time tracker
start = time.time()

# Process each dataset
datasets = [
    ("2024_fb_ads_president_scored_anon.csv", "Facebook Ads"),
    ("2024_fb_posts_president_scored_anon.csv", "Facebook Posts"),
    ("2024_tw_posts_president_scored_anon.csv", "Twitter Posts"),
]

for path, name in datasets:
    analyze_file(path, name)

# Optional: Extra dataset (Trump Truths)
extra_path = "Downloads/period_03/trump_truths_dataset.csv"
extra_data = read_csv(extra_path)
num_cols = identify_numeric_columns(extra_data)
summary = defaultdict(lambda: {"count": 0, "sum": 0, "min": float('inf'), "max": float('-inf'), "values": []})

for row in extra_data:
    for col, val in row.items():
        if val == '':
            continue
        summary[col]["count"] += 1
        if col in num_cols:
            val = float(val)
            summary[col]["sum"] += val
            summary[col]["min"] = min(summary[col]["min"], val)
            summary[col]["max"] = max(summary[col]["max"], val)
            summary[col]["values"].append(val)
        else:
            summary[col]["values"].append(val)

print("\n📘 Summary: Trump Truths Dataset")
for col, s in summary.items():
    print(f"\n{col}")
    print(f" - Count: {s['count']}")
    if col in num_cols:
        mean = s['sum'] / s['count']
        std = math.sqrt(sum((x - mean) ** 2 for x in s['values']) / s['count'])
        print(f" - Mean: {mean:.2f}, Min: {s['min']}, Max: {s['max']}, Std Dev: {std:.2f}")
    else:
        freq = Counter(s['values']).most_common(3)
        print(f" - Unique: {len(set(s['values']))}, Top 3: {freq}")

print(f"\n⏱️ Total Time: {time.time() - start:.2f} seconds")



🔍 Starting analysis for: Facebook Ads

📈 Overall Summary

page_id:
{'count': 246745, 'unique': 4475, 'most_common': ('4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d', 55503)}

ad_id:
{'count': 246745, 'unique': 246745, 'most_common': ('0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc', 1)}

ad_creation_time:
{'count': 246745, 'unique': 547, 'most_common': ('2024-10-27', 8619)}

bylines:
{'count': 245736, 'unique': 3790, 'most_common': ('HARRIS FOR PRESIDENT', 49788)}

currency:
{'count': 246745, 'unique': 18, 'most_common': ('USD', 246599)}

delivery_by_region:
{'count': 246745, 'unique': 141122, 'most_common': ('{}', 30989)}

demographic_distribution:
{'count': 246745, 'unique': 215622, 'most_common': ('{}', 30989)}

estimated_audience_size:
{'count': 246745, 'mean': 556462.8559687126, 'min': 0.0, 'max': 1000001.0, 'std_dev': 409863.9282788759}

estimated_impressions:
{'count': 246745, 'mean': 45601.52595189366, 'min': 499.0, 'max': 1000000.0, 'std_d

FileNotFoundError: [Errno 2] No such file or directory: 'Downloads/period_03/trump_truths_dataset.csv'