In [31]:
import gzip, json
from pathlib import Path

def load_preview_map(path_jsonl_gz: str | Path) -> dict:
    m = {}
    with gzip.open(path_jsonl_gz, "rt", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            m[obj["doc_id"]] = obj.get("preview", "")
    return m

prev = load_preview_map("manifest_all_with_preview.jsonl.gz")
len(prev)


26054

In [32]:
import pandas as pd
from glob import glob

paths = glob("outputs/semantic/*/docs_with_topic_clusters.csv.gz")
dfs = []
for p in paths:
    df = pd.read_csv(p, compression="gzip", low_memory=False)
    dfs.append(df)

all_docs = pd.concat(dfs, ignore_index=True)
all_docs["raw_preview"] = all_docs["doc_id"].map(prev).fillna("")
all_docs.shape


(2229, 11)

In [33]:
topic_cols = ["human_doc_type","topic_cluster","topic_size","topic_prob","doc_id"]
all_docs[topic_cols].head()

# quick: biggest clusters
(
    all_docs[all_docs["topic_cluster"] != -1]
    .groupby(["human_doc_type","topic_cluster"])
    .agg(n=("doc_id","count"), avg_prob=("topic_prob","mean"))
    .sort_values(["n"], ascending=False)
    .head(30)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,n,avg_prob
human_doc_type,topic_cluster,Unnamed: 2_level_1,Unnamed: 3_level_1
email_blackmail_social,6,153,0.896137
email_blackmail_social,5,145,0.863354
email_strategy_politics_investment,2,142,0.861849
email_strategy_politics_investment,4,109,0.946855
email_blackmail_social,3,87,0.838345
email_strategy_politics_investment,5,71,0.899506
email_blackmail_social,0,63,0.952875
email_strategy_politics_investment,3,63,0.99452
email_blackmail_social,1,62,0.948891
email_blackmail_social,2,57,0.955447


In [34]:
import re
from collections import Counter, defaultdict

def norm_line(s: str) -> str:
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    # normalize obvious dates/times to avoid treating each as unique
    s = re.sub(r"\b\d{1,2}/\d{1,2}/(?:\d{2}|\d{4})\b", "__DATE__", s)
    s = re.sub(r"\b\d{1,2}:\d{2}(?::\d{2})?\b", "__TIME__", s)
    s = re.sub(r"\b(19\d{2}|20\d{2})\b", "__YEAR__", s)
    return s

def extract_lines(text: str):
    return [norm_line(ln) for ln in text.splitlines() if ln.strip()]

def cluster_boilerplate(all_docs: pd.DataFrame, doc_type: str, cluster_id: int, topk=40, min_in_cluster=8):
    in_mask = (all_docs.human_doc_type == doc_type) & (all_docs.topic_cluster == cluster_id)
    out_mask = (all_docs.human_doc_type == doc_type) & (all_docs.topic_cluster != cluster_id) & (all_docs.topic_cluster != -1)

    in_texts  = all_docs.loc[in_mask, "raw_preview"].tolist()
    out_texts = all_docs.loc[out_mask, "raw_preview"].tolist()

    cin = Counter()
    cout = Counter()

    for t in in_texts:
        cin.update(set(extract_lines(t)))  # set() => count "docs containing line"
    for t in out_texts:
        cout.update(set(extract_lines(t)))

    rows = []
    for line, a in cin.items():
        if a < min_in_cluster:
            continue
        b = cout.get(line, 0)
        ratio = (a + 1) / (b + 1)
        rows.append((ratio, a, b, line))

    rows.sort(reverse=True)
    return rows[:topk]

# Example usage:
cluster_boilerplate(all_docs, "email_power_business_short", 4, topk=30, min_in_cluster=10)[:10]


[(0.09923664122137404, 12, 130, 'Importance: High \\r')]

In [35]:
def shingles(text: str, k=8):
    toks = re.sub(r"\s+", " ", text).strip().split()
    toks = [t.lower() for t in toks]
    out = set()
    for i in range(0, max(0, len(toks)-k+1)):
        out.add(" ".join(toks[i:i+k]))
    return out

def cluster_shingles(all_docs: pd.DataFrame, doc_type: str, cluster_id: int, k=8, topk=30, min_in_cluster=10):
    in_mask = (all_docs.human_doc_type == doc_type) & (all_docs.topic_cluster == cluster_id)
    out_mask = (all_docs.human_doc_type == doc_type) & (all_docs.topic_cluster != cluster_id) & (all_docs.topic_cluster != -1)

    in_texts  = all_docs.loc[in_mask, "raw_preview"].tolist()
    out_texts = all_docs.loc[out_mask, "raw_preview"].tolist()

    cin = Counter()
    cout = Counter()

    for t in in_texts:
        cin.update(shingles(t, k=k))
    for t in out_texts:
        cout.update(shingles(t, k=k))

    rows = []
    for sh, a in cin.items():
        if a < min_in_cluster:
            continue
        b = cout.get(sh, 0)
        ratio = (a + 1) / (b + 1)
        rows.append((ratio, a, b, sh))

    rows.sort(reverse=True)
    return rows[:topk]

cluster_shingles(all_docs, "email_power_business_short", 4, k=10, topk=20, min_in_cluster=12)[:10]


[]

In [37]:
import numpy as np

def sample_cluster(all_docs, doc_type, cluster_id, n=10, seed=0):
    g = all_docs[(all_docs.human_doc_type==doc_type) & (all_docs.topic_cluster==cluster_id)]
    if len(g) == 0:
        return g
    rng = np.random.default_rng(seed)
    idx = rng.choice(g.index.to_numpy(), size=min(n, len(g)), replace=False)
    s = g.loc[idx, ["doc_id","topic_prob"]].copy()
    s["preview"] = s["doc_id"].map(prev).fillna("")
    return s.sort_values("topic_prob", ascending=False)

sample_cluster(all_docs, "email_strategy_politics_investment", 4, n=8)


Unnamed: 0,doc_id,topic_prob,preview
622,b558ff19e65ce8368a5885d4,1.0,"﻿The Game: All Things Trump\nAndres Serrano\n""..."
35,008d3899a8e637c530fd58a6,1.0,From: Alain Forget \r\nSent: 11/18/2016 11:30:...
511,95734a28fcb25fdb6e51aa99,1.0,From: Richard Kahn ___________________________...
402,7065b296786cdd167ca39b17,0.94106,﻿ExP•sso vossis 27 DE SEPnExteslE DE 2013 \r\n...
243,42a452505627ad03bd5e2996,0.94106,From: Jeffrey Epstein [jeeyacation@gmail.com] ...
47,04bc1aeaf258ad0407c5140e,0.94106,﻿The Comical Conservative \r\nPresident Obama ...
194,31f7604886638f2605384738,0.887333,From: LHS________________________________ \r\n...
74,0e624bd11740e837cd8bdfcf,0.872743,﻿The case for naming a U.S. secretary of Cultu...


In [38]:
import re
import numpy as np
import pandas as pd

RE_EMAIL_HDR_ANYWHERE = re.compile(r"(?im)^\s*(from|to|cc|bcc|sent|date|subject|importance)\s*:", re.I)
RE_MANY_UNDERSCORES = re.compile(r"_{8,}")

def doc_family(clean_text: str, preview: str = "") -> str:
    t = (preview or "") + "\n" + (clean_text or "")
    t = t[:4000]  # keep it cheap

    if RE_EMAIL_HDR_ANYWHERE.search(t):
        return "email_like"

    # OCR-ish: lots of non-ascii or very low alpha fraction
    if t:
        chars = np.frombuffer(t.encode("utf-8", errors="ignore"), dtype=np.uint8)
        # crude alpha estimate on decoded string
        alpha = sum(ch.isalpha() for ch in t)
        frac_alpha = alpha / max(1, len(t))
        non_ascii = sum(ord(ch) > 127 for ch in t) / max(1, len(t))
        if frac_alpha < 0.55 and non_ascii > 0.08:
            return "ocr_noise"

    # article-ish: longer text without email headers + has sentence structure
    if (clean_text or "").count(" ") > 120 and not RE_MANY_UNDERSCORES.search(t):
        return "article_like"

    return "other"

df_docs["family"] = [doc_family(ct, pv) for ct, pv in zip(df_docs["clean_text"], df_docs["preview"])]
df_docs["family"].value_counts()


NameError: name 'df_docs' is not defined