In [2]:
import pandas as pd
import time

# ------------------------------
# Function: Summarize Dataset
# ------------------------------
def summarize_dataset(df, label):
    print(f"\n📊 Summary Report for: {label}")

    # General numeric and non-numeric stats
    print("\n📈 Descriptive Statistics:")
    print(df.describe(include='all'))

    # Count of unique values per column
    print("\n🔍 Unique Values per Column:")
    print(df.nunique())

    # Most frequent values for object/text columns
    print("\n🏷️ Top 3 Most Frequent Values (Text Columns):")
    for column in df.select_dtypes(include='object').columns:
        print(f"\n{column}:\n{df[column].value_counts().head(3)}")


# ------------------------------
# Function: Analyze Selected Columns
# ------------------------------
def analyze_shared_columns(df, dataset_name, shared_fields):
    print(f"\n📘 Analyzing Shared Columns in: {dataset_name}")
    subset = df[shared_fields]

    # Numeric summary stats
    print("\n📈 Numeric Summary (mean, median, min, max, std):")
    print(subset.describe().T[['mean', '50%', 'min', 'max', 'std']])

    # Count of unique values
    print("\n🔍 Unique Value Counts:")
    print(subset.nunique())

    # Most frequent values
    print("\n🏷️ Top 3 Frequent Entries:")
    for field in shared_fields:
        print(f"• {field}:\n{subset[field].value_counts().head(3)}\n")


# ------------------------------
# Main Dataset Paths
# ------------------------------
data_sources = {
    "Facebook Ads": "2024_fb_ads_president_scored_anon.csv",
    "Facebook Posts": "2024_fb_posts_president_scored_anon.csv",
    "Twitter Posts": "2024_tw_posts_president_scored_anon.csv"
}

# Shared indicator columns across all datasets (Illuminating Project)
shared_indicators = [
    'advocacy_msg_type_illuminating', 'attack_msg_type_illuminating',
    'covid_topic_illuminating', 'cta_msg_type_illuminating',
    'economy_topic_illuminating', 'education_topic_illuminating',
    'engagement_cta_subtype_illuminating', 'environment_topic_illuminating',
    'foreign_policy_topic_illuminating', 'fraud_illuminating',
    'freefair_illuminating', 'fundraising_cta_subtype_illuminating',
    'governance_topic_illuminating', 'health_topic_illuminating',
    'image_msg_type_illuminating', 'immigration_topic_illuminating',
    'incivility_illuminating', 'issue_msg_type_illuminating',
    'lgbtq_issues_topic_illuminating', 'military_topic_illuminating',
    'race_and_ethnicity_topic_illuminating', 'safety_topic_illuminating',
    'scam_illuminating', 'social_and_cultural_topic_illuminating',
    'technology_and_privacy_topic_illuminating', 'voting_cta_subtype_illuminating',
    'womens_issue_topic_illuminating'
]

# ------------------------------
# Process All Main Datasets
# ------------------------------
for name, file_path in data_sources.items():
    df = pd.read_csv(file_path)
    summarize_dataset(df, name)
    analyze_shared_columns(df, name, shared_indicators)


# ------------------------------
# Bonus Dataset: Trump Truths
# ------------------------------
print("\n🧾 Bonus Analysis: Trump Truths Dataset")
start = time.time()

trump_df = pd.read_csv("Downloads/period_03/trump_truths_dataset.csv")
summarize_dataset(trump_df, "Trump Truths")

end = time.time()
print(f"\n⏱️ Total Time Taken: {end - start:.2f} seconds")



📊 Summary Report for: Facebook Ads

📈 Descriptive Statistics:
                                                  page_id  \
count                                              246745   
unique                                               4475   
top     4d66f5853f0365dba032a87704a634f023d15babde973b...   
freq                                                55503   
mean                                                  NaN   
std                                                   NaN   
min                                                   NaN   
25%                                                   NaN   
50%                                                   NaN   
75%                                                   NaN   
max                                                   NaN   

                                                    ad_id ad_creation_time  \
count                                              246745           246745   
unique                                          

FileNotFoundError: [Errno 2] No such file or directory: 'Downloads/period_03/trump_truths_dataset.csv'