This script uses OpenAI to identify paragraphs in a long text that aren't related to the topic, genre or style you are focussing on.
It was developed for collections of OCR'd newspaper articles in which some articles report on a wide range of topics (crime, weather, harvests, baking competitions, wars, floods, etc.), but we only want to collect reporting on one specific topic and need to identify, assess and remove paragraphs on irrelevant matters.
Use a sample of 15 to 30 paragraphs on the topic, or in the genre or style you are working with (fewer than 15 is probably not enough to get a good sample; 50 or more probably adds very little benefit). The first part of the script creates an embedding 'signature' from these example paragraphs. It then loops through all the paragraphs to be checked in a large text and compares their embeddings to the 'signature'. The script outputs any paragraph (truncated if very long) that doesn't match the signature. You can adjust the thresholds that control how different a paragraph must be before it is flagged. You then use the output list to find the flagged paragraphs and decide whether to remove them from your text.

Note:
Once the embedding signature has been created it is saved in reference_embeddings.json and reused on later runs (it is rebuilt automatically only if the embedding model setting changes). If you want to force the script to recreate the signature, e.g. because you changed the example paragraphs, delete this file, move it to a backup location, or rename it.

The 'unrelated' paragraphs should not simply be deleted without checking, as they are likely to include things like subheadings, or passages which in isolation might seem unrelated but are actually part of the whole narrative.

Requires:
!pip -q install openai numpy pandas

Scoring:
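Each paragraph is assigned a score showing how closely it matches the signature: the highest cosine similarity between the paragraph's embedding and any one of the example paragraphs' embeddings. Here's a general idea of how scores indicate closeness: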

1.00 → pointing the same way (near-identical meaning)
0.00 → orthogonal (no meaningful semantic overlap)
< 0.00 → opposed meanings (rare in practice for prose)

0.70 – 0.85 : Extremely close
Near paraphrase
Same event, same action, different wording
Reprints, summaries, or very tight thematic overlap

0.60 – 0.70 : Clearly same genre
Different incidents
Same type of activity (pursuit, violence, reprisal)
Different actors / places / dates

0.50 – 0.60 : Borderline / contextual
Indirect references
Aftermath, official correspondence, commentary
Mentions violence obliquely or as background

0.40 – 0.50 : Weakly related
Shares vocabulary or tone
But not really about the same thing
Might mention people or places without the genre action

< 0.40 : Unrelated
Different topic entirely
Shipping, weather, stock prices, sport, politics

In [None]:
import os, re, json, math
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
import pandas as pd
from openai import OpenAI

configuration

In [None]:
# Embedding model used for both the exemplar paragraphs and the scanned paragraphs.
EMBED_MODEL = "text-embedding-3-small"  # or "text-embedding-3-large"
# Number of texts sent per embeddings request.
BATCH_SIZE = 200  # 100–500 usually fine; depends on paragraph sizes

# Similarity thresholds (tune these after one test run)
T_LOW  = 0.40   # score below this => UNRELATED
T_HIGH = 0.55   # T_LOW <= score < T_HIGH => REVIEW; score >= T_HIGH => IN_GENRE

# Optional: cache reference embeddings so you don't re-embed exemplars every run
REFERENCE_CACHE_JSON = "reference_embeddings.json"

helpers (splitting + embedding + cosine)

In [None]:
def split_paragraphs(text: str) -> List[str]:
    """Break *text* into paragraphs wherever one or more blank lines occur.

    Each paragraph is returned with surrounding whitespace removed; pieces
    that are empty after stripping are dropped.
    """
    paragraphs = []
    for chunk in re.split(r"\n\s*\n+", text.strip()):
        cleaned = chunk.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

def truncate(text: str, max_chars: int = 500) -> str:
    """Collapse whitespace in *text* and shorten it for display.

    Every run of whitespace becomes a single space and the ends are trimmed.
    If the result is longer than ``max_chars``, it is cut to the first
    ``max_chars`` characters, backed up to the last space inside that cut
    (when there is one), and an ellipsis character is appended.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()
    if len(collapsed) > max_chars:
        # Drop the (possibly partial) final word so the preview ends on a word boundary.
        head, *_ = collapsed[:max_chars].rsplit(" ", 1)
        return head + "…"
    return collapsed
    
def first_nonempty_line(text: str) -> str:
    """Return the first line of *text* with non-whitespace content, stripped.

    Returns an empty string when every line is blank (or *text* is empty).
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return next((candidate for candidate in stripped_lines if candidate), "")

def split_articles_by_hashline(text: str) -> List[str]:
    """
    Cut a multi-article file into separate articles.

    A separator is any line made up solely of ``#`` characters, optionally
    surrounded by whitespace (e.g. ``#######``). Each article is returned
    stripped of surrounding whitespace; empty pieces are discarded.
    """
    # Convert CRLF and bare-CR line endings to LF before matching line boundaries
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    articles = []
    for segment in re.split(r"(?m)^\s*#+\s*$", unified):
        segment = segment.strip()
        if segment:
            articles.append(segment)
    return articles

def strip_citation_line(article_text: str) -> Tuple[str, str]:
    """
    Separate an article's citation line from its body.

    The first line with non-whitespace content is taken to be the citation.
    Returns ``(citation, body)`` where the citation is stripped and the body
    is everything after that line, also stripped. Returns ``("", "")`` when
    the article has no non-empty line.
    """
    all_lines = article_text.splitlines()
    for idx, candidate in enumerate(all_lines):
        stripped = candidate.strip()
        if stripped:
            remainder = "\n".join(all_lines[idx + 1:]).strip()
            return stripped, remainder
    return "", ""

def embed_texts(client: "OpenAI", texts: List[str], model: str, batch_size: int = 200) -> np.ndarray:
    """
    Embed a list of strings via ``client.embeddings.create``.

    The input is sent in consecutive batches of at most ``batch_size`` strings
    and one embedding per input string is collected. Row ``i`` of the result is
    taken to correspond to ``texts[i]`` (this relies on the response listing
    embeddings in input order).

    Empty strings are NOT skipped: every string is sent as-is, so filter out
    empties before calling.

    Args:
        client: OpenAI client used to make the embedding requests.
        texts: Strings to embed.
        model: Name of the embedding model.
        batch_size: Maximum number of strings per request; must be >= 1.

    Returns:
        float32 array of shape (n, d). For an empty ``texts`` the result has
        shape (0, 0) and no request is made.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        RuntimeError: If a response does not contain exactly one embedding per
            input string (the rows could no longer be matched to their texts).
    """
    # A zero step makes range() fail with an unhelpful message and a negative
    # step would silently embed nothing, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not texts:
        return np.empty((0, 0), dtype=np.float32)

    vecs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # The embeddings endpoint accepts a list of strings in one request.
        resp = client.embeddings.create(model=model, input=batch)
        if len(resp.data) != len(batch):
            raise RuntimeError(
                f"Expected {len(batch)} embeddings but received {len(resp.data)}"
            )
        vecs.extend(item.embedding for item in resp.data)
    return np.array(vecs, dtype=np.float32)

def cosine_sim_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """
    Compute cosine similarity between each row in A and each row in B.

    Rows are L2-normalised here even if they are already unit length, so the
    result does not depend on whether the embedding provider normalises its
    vectors. An all-zero row has no direction; its similarity to everything is
    reported as 0.0 instead of NaN.

    Args:
        A: 2-D array, one vector per row.
        B: 2-D array, one vector per row, same number of columns as A.

    Returns:
        Array of shape (A_rows, B_rows) where entry [i, j] is the cosine
        similarity between A[i] and B[j].
    """
    a_norms = np.linalg.norm(A, axis=1, keepdims=True)
    b_norms = np.linalg.norm(B, axis=1, keepdims=True)
    # Dividing a zero row by its zero norm would give NaN, and NaN compares
    # False against any threshold. Dividing by 1 leaves the row all-zero, so
    # its dot products (and hence similarities) are 0.
    a_norms[a_norms == 0] = 1.0
    b_norms[b_norms == 0] = 1.0
    return (A / a_norms) @ (B / b_norms).T

build exemplar “signature” or load it if it already exists

In [None]:
def build_or_load_reference(exemplar_path: str) -> Dict[str, Any]:
    """Return the reference ("signature") embeddings, from cache when possible.

    If the cache file named by ``REFERENCE_CACHE_JSON`` exists and was built
    with the current ``EMBED_MODEL``, its contents are returned unchanged and
    ``exemplar_path`` is not read. Otherwise the exemplar file is split into
    paragraphs, each paragraph is embedded, and the result is written to the
    cache file before being returned.

    Returns:
        Dict with keys ``"model"`` and ``"exemplars"``; each exemplar is a dict
        with ``"id"``, ``"text"`` and ``"embedding"`` (a list of floats).

    Raises:
        ValueError: If the exemplar file yields no paragraphs.
    """
    if os.path.exists(REFERENCE_CACHE_JSON):
        with open(REFERENCE_CACHE_JSON, "r", encoding="utf-8") as cache_file:
            cached = json.load(cache_file)
        if cached.get("model") != EMBED_MODEL:
            # Embeddings from a different model are not comparable; fall through and rebuild.
            print("Cache exists but model differs; rebuilding reference.")
        else:
            print(f"Loaded cached reference embeddings from {REFERENCE_CACHE_JSON} (model={EMBED_MODEL})")
            return cached

    with open(exemplar_path, "r", encoding="utf-8") as source_file:
        exemplars = split_paragraphs(source_file.read())
    if not exemplars:
        raise ValueError("No exemplar paragraphs found. Ensure blank lines separate paragraphs.")

    ex_vecs = embed_texts(OpenAI(), exemplars, EMBED_MODEL, batch_size=BATCH_SIZE)

    entries = []
    for idx, paragraph in enumerate(exemplars):
        # .tolist() converts the numpy row to plain floats so it can be written as JSON
        entries.append({"id": idx, "text": paragraph, "embedding": ex_vecs[idx].tolist()})
    reference = {"model": EMBED_MODEL, "exemplars": entries}

    with open(REFERENCE_CACHE_JSON, "w", encoding="utf-8") as cache_file:
        json.dump(reference, cache_file)
    print(f"Built and cached {len(exemplars)} exemplars → {REFERENCE_CACHE_JSON} (model={EMBED_MODEL})")
    return reference

Run to create the 'signature' for this topic/genre/style 

In [None]:
# Set this to your exemplar-paragraph file
EXEMPLARS_TXT = "SampleParagraphs.txt"

# Load the cached reference, or embed the exemplar paragraphs and cache them.
ref = build_or_load_reference(EXEMPLARS_TXT)
# Stack the exemplar embeddings into one float32 array, one row per exemplar.
# scan_articles_file reads this module-level array when scoring paragraphs.
exemplar_vecs = np.array([e["embedding"] for e in ref["exemplars"]], dtype=np.float32)

print("Exemplars:", len(ref["exemplars"]), " | Embedding dim:", exemplar_vecs.shape[1])

scan the multi-article file and classify paragraphs

In [None]:
def scan_articles_file(path: str) -> pd.DataFrame:
    """Score and label every paragraph of a multi-article text file.

    The file is split into articles at hash-only lines; the first non-empty
    line of each article is treated as its citation and the rest is split into
    paragraphs. Articles with no body are skipped. Each paragraph is embedded
    and scored by its highest cosine similarity to any row of the module-level
    ``exemplar_vecs``, then labelled UNRELATED (score < T_LOW), REVIEW
    (score < T_HIGH) or IN_GENRE (otherwise).

    Returns:
        DataFrame with columns article_id, citation, para_id, score_max, label
        and preview (paragraph text truncated to 500 characters), ordered by
        article then paragraph. Empty (columns only) when no paragraphs are found.
    """
    with open(path, "r", encoding="utf-8") as handle:
        file_text = handle.read()

    article_list = split_articles_by_hashline(file_text)
    print("Articles found:", len(article_list))

    paragraphs = []  # texts to embed, in file order
    records = []     # per-paragraph metadata, parallel to `paragraphs`
    for article_no, article in enumerate(article_list):
        citation, body = strip_citation_line(article)
        if body:
            for para_no, para in enumerate(split_paragraphs(body)):
                paragraphs.append(para)
                records.append((article_no, citation, para_no, truncate(para, 500)))

    if not paragraphs:
        return pd.DataFrame(columns=["article_id","citation","para_id","score_max","label","preview"])

    para_vecs = embed_texts(OpenAI(), paragraphs, EMBED_MODEL, batch_size=BATCH_SIZE)

    # Similarity matrix is (num_paras, num_exemplars); keep each paragraph's best match.
    best_scores = cosine_sim_matrix(para_vecs, exemplar_vecs).max(axis=1)

    rows = []
    for (article_no, citation, para_no, preview), score in zip(records, best_scores):
        label = "UNRELATED" if score < T_LOW else ("REVIEW" if score < T_HIGH else "IN_GENRE")
        rows.append({
            "article_id": article_no,
            "citation": citation,
            "para_id": para_no,
            "score_max": float(score),
            "label": label,
            "preview": preview,
        })

    table = pd.DataFrame(rows)
    table = table.sort_values(["article_id","para_id"])
    return table.reset_index(drop=True)

Run comparison check

In [None]:
ARTICLES_TXT = "CheckGomeroiTexts.txt"  # your file with many articles separated by ####### lines

# Embed and label every paragraph in the file, then preview the first 20 rows.
df = scan_articles_file(ARTICLES_TXT)
df.head(20)

output the “unrelated” and “review” paragraphs (previews truncated at 500 characters)

In [None]:
# Select the rows for each label, keeping only the columns needed for manual checking.
unrelated = df[df["label"] == "UNRELATED"][["article_id","para_id","score_max","preview","citation"]]
review = df[df["label"] == "REVIEW"][["article_id","para_id","score_max","preview","citation"]]
# Lowest scores first, so the least similar paragraphs appear at the top.
unrelated = unrelated.sort_values("score_max", ascending=True)
review = review.sort_values("score_max", ascending=True)

print("UNRELATED:", len(unrelated))
print("REVIEW:", len(review))

# Write the full lists to CSV files in the current working directory.
unrelated.to_csv("unrelated_paragraphs.csv", index=False)
review.to_csv("review_paragraphs.csv", index=False)

print("UNRELATED - see CSV for full list.")
# Show the 30 lowest-scoring UNRELATED rows.
unrelated.head(30)



In [None]:
print("REVIEW - see CSV for full list.")
# Show the 30 lowest-scoring REVIEW rows (`review` is sorted ascending by score_max).
review.head(30)