# Install dependencies

In [None]:
%pip install -q pandas pyarrow numpy seaborn matplotlib scikit-learn

# 1) Setup: paths and helpers

In [None]:
# Imports and paths
import json
import math
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

# Plot appearance: seaborn white-grid style and a figure DPI of 120.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.dpi": 120})

# Locate the project root. When the working directory is named "notebooks"
# the root is its symlink-resolved parent; otherwise the working directory
# itself is used as the root.
if Path.cwd().name == "notebooks":
    PROJ = Path.cwd().resolve().parent
else:
    PROJ = Path.cwd()

# Directory layout used by the rest of the notebook.
DATA = PROJ / "data"
RAW = DATA / "raw"
PROC = DATA / "processed"
REPORTS = PROJ / "reports"
FIGS = REPORTS / "figures"

# Create the data and report directories up front (no-op when they exist).
for p in (RAW, PROC, REPORTS, FIGS):
    p.mkdir(parents=True, exist_ok=True)

def timestamp():
    """Return the current UTC time as a compact ``YYYYMMDD_HHMMSS`` string.

    The value is used as a run identifier embedded in output file names.
    """
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware
    # "now" in UTC formats to exactly the same string.
    return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

def latest_file(folder: Path, pattern="*.parquet"):
    """Return the most recently modified entry of *folder* matching *pattern*.

    Entries are ranked by their ``st_mtime``. Returns ``None`` when nothing
    in *folder* matches the glob *pattern*.
    """
    candidates = list(folder.glob(pattern))
    if not candidates:
        return None
    # Stable ascending sort by modification time; the newest ends up last.
    candidates.sort(key=lambda entry: entry.stat().st_mtime)
    return candidates[-1]

# Identifier for this run; it is embedded in the file names of the figures and
# reports written below. Left as the last expression so the notebook shows it.
RUN_ID = timestamp()
RUN_ID

# 2) Load latest processed

In [None]:
# Pick the most recently modified processed snapshot.
proc_path = latest_file(PROC, "*.parquet")
if proc_path is None:
    raise FileNotFoundError("No processed parquet found in data/processed. Run 02_data_cleaning first.")

print(f"Loading processed snapshot: {proc_path}")
df = pd.read_parquet(proc_path)

# Ensure expected columns exist
if "text_clean" not in df.columns:
    raise ValueError("Expected 'text_clean' column not found. Re-run 02_data_cleaning.")

# Types and convenience columns: unparseable timestamps become NaT.
if "created_at" in df.columns:
    df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")

# Reconstruct 'type' if missing (retweet takes precedence over quote).
if "type" not in df.columns:
    if "is_retweet" in df.columns and "is_quote" in df.columns:
        df["type"] = np.where(df["is_retweet"], "retweet",
                              np.where(df["is_quote"], "quote", "original"))
    else:
        df["type"] = "unknown"

# Ensure char_len/word_count exist.
# Missing text must count as empty: a plain astype(str) would turn NaN/None
# into the literal strings "nan"/"None" and report lengths of 3-4 characters.
_clean_text = df["text_clean"].astype(str).where(df["text_clean"].notna(), "")
if "char_len" not in df.columns:
    df["char_len"] = _clean_text.str.len()
if "word_count" not in df.columns:
    # Whitespace-delimited token count.
    df["word_count"] = _clean_text.str.split().str.len()

rows = len(df)
cols = len(df.columns)
rows, cols

# 3) Dataset overview

In [None]:
from IPython.display import display

# Quick look at the loaded frame: sample rows, shape, schema.
print("Head:")
display(df.head(3))

print("\nShape:", df.shape)
print("\nColumns:", df.columns.tolist())

print("\nDtypes:")
display(df.dtypes)

# Optional sections print their header only when the column exists, so the
# output never shows a heading with nothing under it.
if "lang" in df.columns:
    print("\nLanguage distribution (top 10):")
    display(df["lang"].value_counts().head(10))

# 'type' is always present: the loading cell reconstructs it when missing.
print("\nType distribution:")
display(df["type"].value_counts())

if "username" in df.columns:
    print("\nTop usernames:")
    display(df["username"].value_counts().head(10))

if {"like_count", "retweet_count", "comment_count"}.issubset(df.columns):
    print("\nEngagement summary:")
    display(df[["like_count","retweet_count","comment_count"]].describe())

# 4) Time trends

In [None]:
# Time-based views need at least one parseable 'created_at'; otherwise the
# whole section is skipped.
has_time = "created_at" in df.columns and df["created_at"].notna().any()
if has_time:
    # Rows with a usable timestamp, plus calendar fields derived from it.
    # Every plot below works from this frame so NaT rows are excluded once.
    tmp = df[df["created_at"].notna()].copy()
    tmp["hour"] = tmp["created_at"].dt.hour
    tmp["weekday"] = tmp["created_at"].dt.dayofweek  # 0=Mon
    tmp["weekday_name"] = tmp["created_at"].dt.day_name()

    # Daily volume
    daily = (tmp.set_index("created_at")
                .assign(count=1)
                .resample("D")["count"].sum()
                .reset_index())

    plt.figure(figsize=(8,3))
    sns.lineplot(data=daily, x="created_at", y="count", marker="o", linewidth=1)
    plt.title("Posts per day")
    plt.xlabel("date")
    plt.ylabel("count")
    plt.tight_layout()
    fig_path = FIGS / f"eda_posts_per_day_{RUN_ID}.png"
    plt.savefig(fig_path)
    plt.show()
    print(f"Saved: {fig_path}")

    # Hour-of-day pattern
    plt.figure(figsize=(7,3))
    sns.countplot(data=tmp, x="hour", color="#4C78A8")
    plt.title("Posts by hour of day")
    plt.tight_layout()
    fig_path = FIGS / f"eda_posts_by_hour_{RUN_ID}.png"
    plt.savefig(fig_path)
    plt.show()
    print(f"Saved: {fig_path}")

    # Weekday pattern, shown Monday-first rather than alphabetically
    plt.figure(figsize=(7,3))
    order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
    sns.countplot(data=tmp, x="weekday_name", order=order, color="#72B7B2")
    plt.title("Posts by weekday")
    plt.xticks(rotation=30, ha="right")
    plt.tight_layout()
    fig_path = FIGS / f"eda_posts_by_weekday_{RUN_ID}.png"
    plt.savefig(fig_path)
    plt.show()
    print(f"Saved: {fig_path}")

    # Heatmap (weekday x hour); weekday/hour combinations without posts are
    # shown as 0 rather than left blank.
    pivot = (tmp.groupby(["weekday_name","hour"]).size()
                .reset_index(name="count")
                .pivot(index="weekday_name", columns="hour", values="count")
                .reindex(index=order))
    plt.figure(figsize=(8,4))
    sns.heatmap(pivot.fillna(0), cmap="Blues")
    plt.title("Post volume: weekday x hour")
    plt.tight_layout()
    fig_path = FIGS / f"eda_heatmap_weekday_hour_{RUN_ID}.png"
    plt.savefig(fig_path)
    plt.show()
    print(f"Saved: {fig_path}")

    # Engagement over time (if present)
    if {"like_count","retweet_count"}.issubset(df.columns):
        # 'comment_count' is optional here: only the two columns checked above
        # are required, so select it only when the snapshot actually has it
        # (selecting a missing column would raise KeyError).
        eng_cols = [c for c in ("like_count", "retweet_count", "comment_count")
                    if c in tmp.columns]
        daily_eng = (tmp.set_index("created_at")
                        .resample("D")[eng_cols]
                        .mean())
        plt.figure(figsize=(8,3))
        sns.lineplot(data=daily_eng)
        plt.title("Average engagement per day")
        plt.xlabel("date")
        plt.tight_layout()
        fig_path = FIGS / f"eda_engagement_per_day_{RUN_ID}.png"
        plt.savefig(fig_path)
        plt.show()
        print(f"Saved: {fig_path}")
else:
    print("created_at not available; skipping time series.")

# 5) Length distributions

In [None]:
# Histogram of cleaned-text length in characters, saved under this run's id.
fig, ax = plt.subplots(figsize=(7, 3))
sns.histplot(df["char_len"], bins=50, color="#F58518", ax=ax)
ax.set_title("Cleaned text length (characters)")
fig.tight_layout()
fig_path = FIGS / f"eda_char_len_{RUN_ID}.png"
fig.savefig(fig_path)
plt.show()
print(f"Saved: {fig_path}")

In [None]:
# Histogram of cleaned-text length in words, saved under this run's id.
fig, ax = plt.subplots(figsize=(7, 3))
sns.histplot(df["word_count"], bins=50, color="#54A24B", ax=ax)
ax.set_title("Cleaned text length (words)")
fig.tight_layout()
fig_path = FIGS / f"eda_word_count_{RUN_ID}.png"
fig.savefig(fig_path)
plt.show()
print(f"Saved: {fig_path}")

# 6) Engagement distributions and top posts

In [None]:
if {"like_count","retweet_count","comment_count"}.issubset(df.columns):
    # Distributions (clip to reduce long tails in plots)
    for col, color in [("like_count","#4C78A8"), ("retweet_count","#72B7B2"), ("comment_count","#E45756")]:
        # Series.quantile ignores missing values. np.quantile would return NaN
        # as soon as the column contains one NaN, and a NaN threshold makes
        # clip() a silent no-op.
        cap = df[col].quantile(0.99)
        plt.figure(figsize=(7,3))
        sns.histplot(df[col].clip(upper=cap), bins=50, color=color)
        plt.title(f"{col} distribution (clipped 99th pct)")
        plt.tight_layout()
        fig_path = FIGS / f"eda_{col}_dist_{RUN_ID}.png"
        plt.savefig(fig_path)
        plt.show()
        print(f"Saved: {fig_path}")

    # Top posts by likes and retweets
    top_by_likes = df.sort_values("like_count", ascending=False).head(20)
    top_by_retweets = df.sort_values("retweet_count", ascending=False).head(20)

    # Save CSVs (serialize URLs if list-like)
    def _serialize_urls(s):
        """JSON-encode a list-like URL cell; return any other value unchanged."""
        # List-valued parquet columns are typically read back as numpy arrays
        # rather than Python lists, so normalise those first.
        if isinstance(s, np.ndarray):
            s = s.tolist()
        if isinstance(s, (list, tuple)):
            return json.dumps(s, ensure_ascii=False)
        return s

    out1 = REPORTS / f"top_posts_by_likes_{RUN_ID}.csv"
    out2 = REPORTS / f"top_posts_by_retweets_{RUN_ID}.csv"
    # Work on copies so the serialised 'urls' column never leaks back into df.
    tmp1, tmp2 = top_by_likes.copy(), top_by_retweets.copy()
    if "urls" in tmp1.columns:
        tmp1["urls"] = tmp1["urls"].map(_serialize_urls)
    if "urls" in tmp2.columns:
        tmp2["urls"] = tmp2["urls"].map(_serialize_urls)
    tmp1.to_csv(out1, index=False, encoding="utf-8")
    tmp2.to_csv(out2, index=False, encoding="utf-8")
    print(f"Saved:\n- {out1}\n- {out2}")
else:
    print("Engagement columns not available; skipping engagement plots and top posts.")

# 7) Top tokens and n-grams

In [None]:
def top_ngrams(texts, topk=30, ngram_range=(1,2), min_df=2, max_features=40000, stop_words="english"):
    """Return the *topk* most frequent n-grams in *texts*.

    Parameters
    ----------
    texts : iterable of str (missing values allowed)
        Documents to count terms in; missing entries are treated as "".
    topk : int
        Maximum number of rows to return.
    ngram_range, min_df, max_features, stop_words
        Passed straight through to ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``freq`` (total occurrences across all
        documents), sorted by descending frequency. The frame is empty when
        every document is blank or when no term survives stop-word removal
        and ``min_df`` pruning.
    """
    no_terms = pd.DataFrame(columns=["term","freq"])

    docs = pd.Series(texts).fillna("").astype(str).tolist()
    if not any(doc.strip() for doc in docs):
        return no_terms

    vec = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range,
                          min_df=min_df, max_features=max_features)
    try:
        X = vec.fit_transform(docs)
    except ValueError as err:
        # CountVectorizer signals "nothing left to count" with a ValueError
        # (documents contain only stop words, or min_df pruned every term).
        # Small sub-corpora hit this routinely, so report it as an empty
        # result; any other ValueError is a real problem and is re-raised.
        reason = str(err).lower()
        if "empty vocabulary" in reason or "no terms remain" in reason:
            return no_terms
        raise

    vocab = np.array(vec.get_feature_names_out())
    # Column sums of the document-term matrix = corpus-wide term frequency.
    freqs = np.asarray(X.sum(axis=0)).ravel()
    order = np.argsort(freqs)[::-1][:topk]
    return pd.DataFrame({"term": vocab[order], "freq": freqs[order]})

# Overall top terms
top_terms = top_ngrams(df["text_clean"], topk=30, ngram_range=(1,2), min_df=2)
display(top_terms)

# Only draw/save the chart when there is something to show; an empty term
# table would otherwise produce a blank (or failing) bar plot.
if top_terms.empty:
    print("No terms found; skipping top-terms plot.")
else:
    plt.figure(figsize=(8,6))
    sns.barplot(data=top_terms, y="term", x="freq", color="#4C78A8")
    plt.title("Top terms (unigrams + bigrams)")
    plt.xlabel("frequency")
    plt.ylabel("")
    plt.tight_layout()
    fig_path = FIGS / f"eda_top_terms_{RUN_ID}.png"
    plt.savefig(fig_path)
    plt.show()
    print(f"Saved: {fig_path}")

# Top terms by type (if multiple types present)
if df["type"].nunique() > 1:
    for t in df["type"].value_counts().index.tolist():
        sub = df[df["type"] == t]
        # Too few posts to give a meaningful term ranking.
        if len(sub) < 10:
            continue
        tt = top_ngrams(sub["text_clean"], topk=20, ngram_range=(1,2), min_df=2)
        # Same guard as above: skip types whose posts yield no terms.
        if tt.empty:
            print(f"No terms found for type '{t}'; skipping plot.")
            continue
        plt.figure(figsize=(7,5))
        sns.barplot(data=tt, y="term", x="freq", color="#72B7B2")
        plt.title(f"Top terms: {t}")
        plt.xlabel("frequency")
        plt.ylabel("")
        plt.tight_layout()
        fig_path = FIGS / f"eda_top_terms_{t}_{RUN_ID}.png"
        plt.savefig(fig_path)
        plt.show()
        print(f"Saved: {fig_path}")

# 8) Hashtags, mentions, and URLs/domains

In [None]:
# Hashtags and mentions are taken from the original (uncleaned) 'text' column
# when the snapshot has one.
HAS_TEXT = "text" in df.columns

# Hashtags
if not HAS_TEXT:
    print("Original 'text' column not available; skipping hashtag analysis.")
else:
    hashtag_re = re.compile(r"#(\w+)")
    # One list of captured tag names (without '#') per non-null post.
    all_tags = df["text"].dropna().astype(str).map(hashtag_re.findall)
    # Flatten, folding case so '#AI' and '#ai' count together.
    tags = []
    for found in all_tags:
        tags.extend(tag.lower() for tag in found)
    tags_s = pd.Series(tags)
    if tags_s.empty:
        print("No hashtags found.")
    else:
        top_hashtags = tags_s.value_counts().head(30).reset_index()
        top_hashtags.columns = ["hashtag","freq"]
        display(top_hashtags)

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(data=top_hashtags, y="hashtag", x="freq", color="#E45756", ax=ax)
        ax.set_title("Top hashtags")
        fig.tight_layout()
        fig_path = FIGS / f"eda_top_hashtags_{RUN_ID}.png"
        fig.savefig(fig_path)
        plt.show()
        print(f"Saved: {fig_path}")

# Mentions: '@' followed by 1-15 letters, digits or underscores.
if HAS_TEXT:
    mention_re = re.compile(r"@([A-Za-z0-9_]{1,15})")
    # One list of captured handles (without '@') per non-null post.
    all_mentions = df["text"].dropna().astype(str).map(mention_re.findall)
    # Flatten and lower-case so handles are counted case-insensitively.
    mentions = []
    for found in all_mentions:
        mentions.extend(handle.lower() for handle in found)
    mentions_s = pd.Series(mentions)
    if mentions_s.empty:
        print("No mentions found.")
    else:
        top_mentions = mentions_s.value_counts().head(30).reset_index()
        top_mentions.columns = ["mention","freq"]
        display(top_mentions)

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(data=top_mentions, y="mention", x="freq", color="#F58518", ax=ax)
        ax.set_title("Top mentions")
        fig.tight_layout()
        fig_path = FIGS / f"eda_top_mentions_{RUN_ID}.png"
        fig.savefig(fig_path)
        plt.show()
        print(f"Saved: {fig_path}")

# URLs and domains (from the list-valued 'urls' column)
if "urls" in df.columns:
    def extract_domains(urls):
        """Return the lower-cased host (minus a leading 'www.') of each URL.

        *urls* may be a list, tuple or numpy array of strings; any other value
        (including a missing cell) yields an empty list. Entries that are not
        strings, cannot be parsed, or have no host part are skipped.
        """
        # List-valued parquet columns are typically read back as numpy arrays
        # rather than Python lists, so accept those as well.
        if isinstance(urls, np.ndarray):
            urls = urls.tolist()
        if not isinstance(urls, (list, tuple)):
            return []
        ds = []
        for u in urls:
            if not isinstance(u, str):
                continue
            try:
                d = urlparse(u).netloc.lower()
            except ValueError:
                # urlparse rejects some malformed URLs (e.g. bad IPv6 hosts).
                continue
            if d.startswith("www."):
                d = d[4:]
            # Scheme-less or relative URLs have no host; don't count "".
            if d:
                ds.append(d)
        return ds

    # Missing cells are handled inside extract_domains, so no fillna is needed
    # (Series.fillna does not accept a list as the fill value anyway).
    domains = []
    for lst in df["urls"]:
        domains.extend(extract_domains(lst))

    dom_s = pd.Series(domains)
    if not dom_s.empty:
        top_domains = dom_s.value_counts().head(30).reset_index()
        top_domains.columns = ["domain","freq"]
        display(top_domains)

        plt.figure(figsize=(8,6))
        sns.barplot(data=top_domains, y="domain", x="freq", color="#54A24B")
        plt.title("Top linked domains")
        plt.tight_layout()
        fig_path = FIGS / f"eda_top_domains_{RUN_ID}.png"
        plt.savefig(fig_path)
        plt.show()
        print(f"Saved: {fig_path}")

        # Save CSV
        out_domains = REPORTS / f"top_domains_{RUN_ID}.csv"
        top_domains.to_csv(out_domains, index=False, encoding="utf-8")
        print(f"Saved: {out_domains}")
    else:
        print("No external URLs found.")
else:
    print("'urls' column not available; skipping URL/domain analysis.")

# 9) Save EDA summary artifact

In [None]:
# Earliest/latest timestamp, reported only when at least one parsed.
if "created_at" in df.columns and df["created_at"].notna().any():
    _ts_format = "%Y-%m-%d %H:%M:%S"
    time_range = {
        "min": df["created_at"].min().strftime(_ts_format),
        "max": df["created_at"].max().strftime(_ts_format),
    }
else:
    time_range = {"min": None, "max": None}


def _top10_counts(column):
    """Ten most frequent values of *column* as a dict, or {} if it is absent."""
    if column not in df.columns:
        return {}
    return df[column].value_counts().head(10).to_dict()


# Engagement statistics are included only when all three columns are present.
engagement_cols = ["like_count", "retweet_count", "comment_count"]
if set(engagement_cols).issubset(df.columns):
    engagement_stats = df[engagement_cols].describe().to_dict()
else:
    engagement_stats = {}

# Machine-readable description of this EDA run.
summary = {
    "run_id": RUN_ID,
    "input": str(proc_path.relative_to(PROJ)),
    "rows": int(len(df)),
    "columns": df.columns.tolist(),
    "time_range": time_range,
    "lang_counts_top10": _top10_counts("lang"),
    "type_counts": df["type"].value_counts().to_dict(),
    "username_top10": _top10_counts("username"),
    "length_stats": df[["char_len", "word_count"]].describe().to_dict(),
    "engagement_stats": engagement_stats,
}

summary_path = REPORTS / f"eda_summary_{RUN_ID}.json"
with open(summary_path, "w", encoding="utf-8") as fh:
    json.dump(summary, fh, indent=2)

print(f"Saved EDA summary: {summary_path}")