In [5]:
import polars as pl
import time

# ------------------------------
# Function: Perform Basic Summary
# ------------------------------
def summarize_polars_dataframe(df: pl.DataFrame, title: str):
    print(f"\n📊 Dataset Summary: {title}")

    # General statistics for all numeric and boolean columns
    print("\n📈 Descriptive Overview:")
    print(df.describe())

    # Number of unique values in each column
    print("\n🔢 Count of Unique Values per Column:")
    print(df.n_unique())

    # Top 3 frequent values for string-type (categorical) columns
    print("\n📋 Most Frequent Values (Top 3) in Text Columns:")
    string_columns = [c for c, dtype in zip(df.columns, df.dtypes) if dtype == pl.Utf8]
    for col in string_columns:
        print(f"\n🔸 {col}")
        freq_df = df.group_by(col).len().sort("len", descending=True).head(3)
        print(freq_df)


# ------------------------------
# Function: Analyze Shared Columns
# ------------------------------
def analyze_shared_fields(df: pl.DataFrame, dataset_name: str, common_fields: list[str]):
    print(f"\n📘 Shared Columns Analysis for: {dataset_name}")

    # Summary statistics only on shared columns
    print("\n📈 Summary of Shared Columns:")
    print(df.select(common_fields).describe())

    print("\n🔢 Unique Counts (Shared Columns):")
    for field in common_fields:
        print(f" - {field}: {df[field].n_unique()}")

    print("\n📋 Top 3 Most Common Values per Shared Column:")
    for field in common_fields:
        if df[field].dtype in [pl.Utf8, pl.Int64, pl.Float64]:
            try:
                most_common = df.group_by(field).len().sort("len", descending=True).head(3)
                print(f"\n🔸 {field}:\n{most_common}")
            except Exception:
                print(f"⚠️ Skipped {field} due to error.")


# ------------------------------
# Dataset File Paths
# ------------------------------
files = {
    "Facebook Ads": "2024_fb_ads_president_scored_anon.csv",
    "Facebook Posts": "2024_fb_posts_president_scored_anon.csv",
    "Twitter Posts": "2024_tw_posts_president_scored_anon.csv"
}

# ------------------------------
# Step 1: Initial Summary for Each Dataset
# ------------------------------
for name, path in files.items():
    df = pl.read_csv(path)
    summarize_polars_dataframe(df, name)

# ------------------------------
# Step 2: Analyze Shared Columns
# ------------------------------
# Load all three datasets
df_ads = pl.read_csv(files["Facebook Ads"])
df_fb = pl.read_csv(files["Facebook Posts"])
df_tw = pl.read_csv(files["Twitter Posts"])

# Find common columns across all datasets
shared_cols = sorted(set(df_ads.columns) & set(df_fb.columns) & set(df_tw.columns))
print("\n📌 Common Fields across All Datasets:")
for col in shared_cols:
    print(" -", col)

# Run shared-column analysis
analyze_shared_fields(df_ads, "Facebook Ads", shared_cols)
analyze_shared_fields(df_fb, "Facebook Posts", shared_cols)
analyze_shared_fields(df_tw, "Twitter Posts", shared_cols)

# ------------------------------
# Bonus Dataset: Trump Truths
# ------------------------------
print("\n🧾 Extra Dataset Analysis: Trump Truths")
start = time.time()

df_trump = pl.read_csv("Downloads/period_03/trump_truths_dataset.csv")

# Optional: Normalize column names
df_trump = df_trump.rename({col: col.strip().lower().replace(" ", "_") for col in df_trump.columns})

# Summary output
summarize_polars_dataframe(df_trump, "Trump Truths Dataset")

end = time.time()
print(f"\n⏱️ Processing Completed in {end - start:.2f} seconds")



📊 Dataset Summary: Facebook Ads

📈 Descriptive Overview:
shape: (9, 42)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ page_id      ┆ ad_id        ┆ ad_creation_ ┆ … ┆ womens_issue ┆ incivility_ ┆ freefair_il ┆ fraud_illum │
│ ---        ┆ ---          ┆ ---          ┆ time         ┆   ┆ _topic_illum ┆ illuminatin ┆ luminating  ┆ inating     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ inatin…      ┆ g           ┆ ---         ┆ ---         │
│            ┆              ┆              ┆ str          ┆   ┆ ---          ┆ ---         ┆ f64         ┆ f64         │
│            ┆              ┆              ┆              ┆   ┆ f64          ┆ f64         ┆             ┆             │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 246745       ┆ 246745       ┆ 246745       ┆ … ┆ 246745.0     ┆ 2

FileNotFoundError: The system cannot find the path specified. (os error 3): Downloads/period_03/trump_truths_dataset.csv

In [4]:
!pip install polars
!pip install time

Collecting polars
  Downloading polars-1.31.0-cp39-abi3-win_amd64.whl.metadata (15 kB)
Downloading polars-1.31.0-cp39-abi3-win_amd64.whl (35.2 MB)
   ---------------------------------------- 0.0/35.2 MB ? eta -:--:--
   - -------------------------------------- 1.0/35.2 MB 10.0 MB/s eta 0:00:04
   --- ------------------------------------ 3.1/35.2 MB 10.2 MB/s eta 0:00:04
   ------ --------------------------------- 5.5/35.2 MB 9.9 MB/s eta 0:00:04
   ------- -------------------------------- 6.3/35.2 MB 8.4 MB/s eta 0:00:04
   --------- ------------------------------ 8.1/35.2 MB 8.8 MB/s eta 0:00:04
   ----------- ---------------------------- 10.5/35.2 MB 8.8 MB/s eta 0:00:03
   -------------- ------------------------- 12.3/35.2 MB 8.9 MB/s eta 0:00:03
   ---------------- ----------------------- 14.4/35.2 MB 9.1 MB/s eta 0:00:03
   ------------------ --------------------- 16.3/35.2 MB 9.1 MB/s eta 0:00:03
   --------------------- ------------------ 18.6/35.2 MB 9.2 MB/s eta 0:00:02
   ---


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement time (from versions: none)

[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for time
