In [None]:
#!pip install bertopic sentence-transformers umap-learn hdbscan gensim


Collecting bertopic
  Downloading bertopic-0.17.4-py3-none-any.whl.metadata (24 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.41-cp311-cp311-macosx_10_9_universal2.whl.metadata (15 kB)
Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting llvmlite>0.36.0 (from bertopic)
  Downloading llvmlite-0.46.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.0 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Downloading numba-0.63.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.9 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart_open>=1.8.1->ge

In [22]:
import os, re, glob, json
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel



In [11]:

DATA_DIRS = ["2014", "2015"]   # or one folder if you prefer
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

### Define EU list

In [9]:
EU_COUNTRIES = {
    "Austria","Belgium","Bulgaria","Croatia","Cyprus","Czech Republic","Denmark",
    "Estonia","Finland","France","Germany","Greece","Hungary","Ireland","Italy",
    "Latvia","Lithuania","Luxembourg","Malta","Netherlands","Poland","Portugal",
    "Romania","Spain","Sweden"
}


### Collect file paths

In [12]:
paths = []
for d in DATA_DIRS:
    paths.extend(glob.glob(os.path.join(d, "*.txt")))
paths = sorted(paths)

len(paths), paths[:3]


(390, ['2014/Afghanistan.txt', '2014/Albania.txt', '2014/Algeria.txt'])

### Cleaning + section split + chunking

In [13]:
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL)
TAG_RE = re.compile(r"<[^>]+>")
WS_RE = re.compile(r"\s+")

def strip_html(html: str) -> str:
    html = SCRIPT_STYLE_RE.sub(" ", html)
    text = TAG_RE.sub(" ", html)
    text = (text.replace("&amp;", "&")
                .replace("&nbsp;", " ")
                .replace("&quot;", '"')
                .replace("&lt;", "<")
                .replace("&gt;", ">"))
    return WS_RE.sub(" ", text).strip()

def detect_sections(text: str):
    headings = [
        "EXECUTIVE SUMMARY", "EXECUTIVE SUMMARY:",
        "Section 1.", "SECTION 1.", "Section 1:", "SECTION 1:",
        "Section 2.", "SECTION 2.", "Section 2:", "SECTION 2:",
        "Section 3.", "SECTION 3.", "Section 3:", "SECTION 3:",
        "Section 4.", "SECTION 4.", "Section 4:", "SECTION 4:",
        "Section 5.", "SECTION 5.", "Section 5:", "SECTION 5:",
        "Section 6.", "SECTION 6.", "Section 6:", "SECTION 6:",
        "Section 7.", "SECTION 7.", "Section 7:", "SECTION 7:",
    ]
    pattern = "(" + "|".join(re.escape(h) for h in headings) + ")"
    parts = re.split(pattern, text)

    if len(parts) <= 1:
        return [("FULL_TEXT", text)]

    sections = []
    pre = parts[0].strip()
    if pre:
        sections.append(("PREAMBLE", pre))

    i = 1
    while i < len(parts) - 1:
        title = parts[i].strip()
        body = parts[i+1].strip()
        if body:
            sections.append((title, body))
        i += 2
    return sections

def split_into_word_chunks(text: str, min_words=120, max_words=250):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        j = min(i + max_words, len(words))
        chunk = words[i:j]
        if len(chunk) >= min_words:
            chunks.append(" ".join(chunk))
        i = j
    return chunks


### Build the full dataset table (ALL countries)

In [14]:
rows = []
for p in paths:
    with open(p, "r", encoding="utf-8", errors="replace") as f:
        year = f.readline().strip()
        country = f.readline().strip()
        html = f.read()

    plain = strip_html(html)
    for sec_title, sec_text in detect_sections(plain):
        for k, ch in enumerate(split_into_word_chunks(sec_text, 120, 250)):
            rows.append({
                "year": year,
                "country": country,
                "is_eu": country in EU_COUNTRIES,
                "section": sec_title,
                "source_file": os.path.basename(p),
                "chunk_id": f"{os.path.basename(p)}::{sec_title}::{k}",
                "text": ch
            })

df = pd.DataFrame(rows)
df.shape, df.head()


((18052, 7),
    year      country  is_eu            section      source_file  \
 0  2014  Afghanistan  False  EXECUTIVE SUMMARY  Afghanistan.txt   
 1  2014  Afghanistan  False  EXECUTIVE SUMMARY  Afghanistan.txt   
 2  2014  Afghanistan  False         Section 1.  Afghanistan.txt   
 3  2014  Afghanistan  False         Section 1.  Afghanistan.txt   
 4  2014  Afghanistan  False         Section 1.  Afghanistan.txt   
 
                                 chunk_id  \
 0  Afghanistan.txt::EXECUTIVE SUMMARY::0   
 1  Afghanistan.txt::EXECUTIVE SUMMARY::1   
 2         Afghanistan.txt::Section 1.::0   
 3         Afghanistan.txt::Section 1.::1   
 4         Afghanistan.txt::Section 1.::2   
 
                                                 text  
 0  Share Afghanistan is an Islamic republic with ...  
 1  detention; judicial corruption and ineffective...  
 2  Respect for the Integrity of the Person, Inclu...  
 3  compared with the same period in 2013. The tot...  
 4  in Paktika Province k

### Verify counts (world vs EU)

In [15]:
df.groupby(["year"])["country"].nunique(), df["is_eu"].mean()


(year
 2014    195
 2015    195
 Name: country, dtype: int64,
 np.float64(0.1028140926213162))

In [16]:
# How many EU chunks?
df[df["is_eu"]].groupby("year")["country"].nunique()


year
2014    25
2015    25
Name: country, dtype: int64

### Fit BERTopic on the full corpus

In [20]:


docs_all = df["text"].tolist()

embedder = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(
    embedding_model=embedder,
    calculate_probabilities=False,
    verbose=True
)

topics, _ = topic_model.fit_transform(docs_all)
df["topic"] = topics

topic_info = topic_model.get_topic_info()
topic_info.head(10)


2026-01-01 15:04:13,948 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 565/565 [03:30<00:00,  2.69it/s]
2026-01-01 15:07:45,206 - BERTopic - Embedding - Completed ✓
2026-01-01 15:07:45,207 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-01 15:08:03,273 - BERTopic - Dimensionality - Completed ✓
2026-01-01 15:08:03,274 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5557,-1_the_of_to_and,"[the, of, to, and, in, for, were, on, or, that]",[90 days while the PIC continues its investiga...
1,0,2940,0_labor_workers_work_union,"[labor, workers, work, union, unions, employer...",[Worker Rights Share a. Freedom of Association...
2,1,1041,1_elections_political_election_parties,"[elections, political, election, parties, part...",[Respect for Political Rights: The Right of Ci...
3,2,816,2_corruption_officials_public_information,"[corruption, officials, public, information, d...",[Corruption and Lack of Transparency in Govern...
4,3,554,3_human_rights_international_ngos,"[human, rights, international, ngos, bodies, g...",[Governmental Attitude Regarding International...
5,4,384,4_disabilities_with_persons_mental,"[disabilities, with, persons, mental, ethnic, ...",[While the government effectively enforced the...
6,5,365,5_rape_violence_domestic_spousal,"[rape, violence, domestic, spousal, women, dis...","[Discrimination, Societal Abuses, and Traffick..."
7,6,282,6_police_security_forces_arrest,"[police, security, forces, arrest, responsible...",[observers. While the national Red Cross and c...
8,7,258,7_prisoners_prison_prisons_monitoring,"[prisoners, prison, prisons, monitoring, compl...",[Prison administrators did not maintain record...
9,8,207,8_internet_websites_freedom_content,"[internet, websites, freedom, content, blocked...",[National Forum for the Democratization of Com...


### Helper to view topic words

In [21]:
def get_topic_words(model, topic_id, topn=10):
    pairs = model.get_topic(topic_id) or []
    return [w for w, _ in pairs[:topn]]

valid_topic_ids = [t for t in topic_model.get_topics().keys() if t != -1]
for tid in valid_topic_ids[:8]:
    print(tid, get_topic_words(topic_model, tid, 10))



0 ['labor', 'workers', 'work', 'union', 'unions', 'employers', 'employment', 'sector', 'minimum', 'child']
1 ['elections', 'political', 'election', 'parties', 'participation', 'party', 'seats', 'fair', 'elected', 'free']
2 ['corruption', 'officials', 'public', 'information', 'disclosure', 'financial', 'transparency', 'government', 'assets', 'anticorruption']
3 ['human', 'rights', 'international', 'ngos', 'bodies', 'government', 'attitude', 'governmental', 'ombudsman', 'organizations']
4 ['disabilities', 'with', 'persons', 'mental', 'ethnic', 'buildings', 'education', 'schools', 'accessible', 'disability']
5 ['rape', 'violence', 'domestic', 'spousal', 'women', 'discrimination', 'societal', 'gender', 'sexual', 'race']
6 ['police', 'security', 'forces', 'arrest', 'responsible', 'apparatus', 'internal', 'role', 'force', 'ministry']
7 ['prisoners', 'prison', 'prisons', 'monitoring', 'complaints', 'detention', 'visits', 'permitted', 'conditions', 'independent']


## Evaluation (do it on ALL, then interpret EU)
### Coherence + diversity

In [23]:


def topic_diversity(topic_words, topk=10):
    all_words = []
    for wlist in topic_words:
        all_words.extend(wlist[:topk])
    return len(set(all_words)) / max(1, len(all_words))

def coherence_cv(tokenized_docs, topic_words):
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(toks) for toks in tokenized_docs]
    cm = CoherenceModel(
        topics=topic_words,
        texts=tokenized_docs,
        corpus=corpus,
        dictionary=dictionary,
        coherence="c_v"
    )
    return float(cm.get_coherence())

topic_words = [get_topic_words(topic_model, tid, 20) for tid in valid_topic_ids]
tokenized = [d.split() for d in docs_all]

metrics = {
    "coherence_c_v": coherence_cv(tokenized, topic_words),
    "topic_diversity_top10": topic_diversity(topic_words, topk=10),
    "n_topics_excluding_outliers": len(valid_topic_ids)
}
metrics


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'coherence_c_v': 0.7004116820920762,
 'topic_diversity_top10': 0.5082758620689655,
 'n_topics_excluding_outliers': 145}

### Stability (quick but defendable)

In [25]:
def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / max(1, len(a | b))

def run_model_with_seed(seed: int):
    np.random.seed(seed)
    idx = np.random.permutation(len(docs_all))
    docs_shuffled = [docs_all[i] for i in idx]
    tm = BERTopic(embedding_model=embedder, verbose=False)
    tm.fit_transform(docs_shuffled)
    valid = [tid for tid in tm.get_topics().keys() if tid != -1]
    return {tid: get_topic_words(tm, tid, 15) for tid in valid}

w1 = run_model_with_seed(1)
w2 = run_model_with_seed(2)

scores = []
for t1, words1 in list(w1.items())[:15]:
    best = 0.0
    for t2, words2 in w2.items():
        best = max(best, jaccard(words1, words2))
    scores.append(best)

{"stability_mean_best_jaccard": float(np.mean(scores)),
 "stability_median_best_jaccard": float(np.median(scores))}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'stability_mean_best_jaccard': 0.7828282828282828,
 'stability_median_best_jaccard': 0.8181818181818182}

## EU-focused interpretation (the “easy to interpret” part)
### Topic share inside EU vs non-EU

In [26]:
def topic_share(subdf: pd.DataFrame, label: str):
    t = (subdf[subdf["topic"] != -1]
         .groupby("topic").size().reset_index(name="n"))
    t["share"] = t["n"] / t["n"].sum()
    t["group"] = label
    return t

eu = topic_share(df[df["is_eu"]], "EU")
non_eu = topic_share(df[~df["is_eu"]], "Non-EU")

compare = pd.concat([eu, non_eu], ignore_index=True)
compare = compare.pivot_table(index="topic", columns="group", values="share", fill_value=0.0).reset_index()
compare["delta_EU_minus_NonEU"] = compare.get("EU", 0.0) - compare.get("Non-EU", 0.0)

compare["top_words"] = compare["topic"].apply(lambda t: ", ".join(get_topic_words(topic_model, int(t), 8)))
compare.sort_values("delta_EU_minus_NonEU", ascending=False).head(15)


group,topic,EU,Non-EU,delta_EU_minus_NonEU,top_words
14,14,0.059265,0.004032,0.055233,"roma, romani, housing, school, education, scho..."
21,21,0.045761,0.003673,0.042088,"anti, semitic, jewish, semitism, holocaust, je..."
46,46,0.030758,0.000717,0.030041,"asylum, safe, seekers, eu, transit, countries,..."
26,26,0.024756,0.004032,0.020725,"speech, press, freedom, expression, liberties,..."
43,43,0.016504,0.002598,0.013906,"echr, european, decisions, remedies, appeal, h..."
92,92,0.012753,0.000538,0.012216,"racism, racist, discrimination, racial, hate, ..."
34,34,0.015004,0.004032,0.010972,"stateless, citizenship, persons, unhcr, statel..."
76,76,0.010503,0.001254,0.009248,"trial, fair, right, defendants, judiciary, ind..."
7,7,0.027757,0.019799,0.007958,"prisoners, prison, prisons, monitoring, compla..."
4,4,0.037509,0.029923,0.007586,"disabilities, with, persons, mental, ethnic, b..."


### Change over time inside EU (2014 → 2015)

In [27]:
eu_df = df[df["is_eu"] & (df["topic"] != -1)].copy()

year_topic = (eu_df.groupby(["year", "topic"]).size().reset_index(name="n"))
year_topic["share_within_year"] = year_topic["n"] / year_topic.groupby("year")["n"].transform("sum")

pivot = year_topic.pivot_table(index="topic", columns="year", values="share_within_year", fill_value=0.0)
if len(pivot.columns) >= 2:
    years = sorted(pivot.columns.tolist())
    pivot["delta"] = pivot[years[-1]] - pivot[years[0]]
    out = pivot.sort_values("delta", ascending=False).head(15).copy()
    out["top_words"] = out.index.map(lambda t: ", ".join(get_topic_words(topic_model, int(t), 8)))
    out.reset_index()
else:
    print("Need both years present to compute change.")


### Zero-shot labels on EU chunks (sample)

In [None]:
from transformers import pipeline

zshot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

LABELS = [
    "corruption",
    "judicial independence",
    "police abuse",
    "prison conditions",
    "freedom of expression",
    "freedom of assembly",
    "religious freedom",
    "refugees and asylum",
    "human trafficking",
    "anti-Semitism",
    "LGBTQ+ rights",
    "women's rights",
    "labor rights"
]

zs = eu_df.sample(min(400, len(eu_df)), random_state=7).copy()
res = zshot(zs["text"].tolist(), candidate_labels=LABELS, multi_label=True)

top3 = []
for r in res:
    pairs = sorted(zip(r["labels"], r["scores"]), key=lambda x: x[1], reverse=True)[:3]
    top3.append([p[0] for p in pairs])

zs["top3_labels"] = top3
labels_by_topic = (zs[["topic","top3_labels"]].explode("top3_labels")
                   .groupby(["topic","top3_labels"]).size()
                   .reset_index(name="count")
                   .sort_values(["topic","count"], ascending=[True, False]))
labels_by_topic.head(20)


Device set to use mps:0
