In [None]:
import os
import re
from collections import Counter
from dataclasses import dataclass
from typing import List, Optional, Tuple

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# ========= CONFIG =========

# Text files to scan. Each file may hold many articles, separated by lines
# made up of four or more '#' characters.
FILES = [
    "./Texts/NewsArticles.txt",
    "./Texts/MoreNewsArticles.txt",
    # "../Texts/...",   <- add further paths here, one per line
]

# Cosine-similarity cut-off on a 0-1 scale: a pair is reported when its score
# is >= this value. Raise it (e.g. 0.90) for stricter matching, lower it
# (e.g. 0.70) for fuzzier matching.
SIMILARITY_THRESHOLD = 0.80
MIN_CHARS = 400               # ignore bodies shorter than this many characters after normalisation
TOP_K = 200                   # max number of matches to display (None = all)

# =========================


In [None]:
# Article separator: a line consisting only of four or more '#' characters,
# optionally surrounded by whitespace. The inline (?m) flag makes ^ and $
# match at every line boundary rather than only at the ends of the text.
HASH_SPLIT_RE = re.compile(r"(?m)^\s*#{4,}\s*$")
# Article link of the form http(s)://nla.gov.au/nla.news-article<digits>,
# optionally followed by a "/" or "?" and any run of non-whitespace
# characters. Matching is case-insensitive.
TROVE_URL_RE = re.compile(
    r"https?://nla\.gov\.au/nla\.news-article\d+(?:[/?]\S*)?",
    re.IGNORECASE
)

@dataclass
class Article:
    """One article block parsed out of a source text file by load_articles."""

    # Base name (directory stripped) of the file the block came from.
    source_file: str
    # 1-based position of the block within that file.
    index_in_file: int
    # First TROVE_URL_RE match found anywhere in the block, or None if there is none.
    trove_url: Optional[str]
    # First non-blank line of the block, stripped; None if the block has no such line.
    citation_line: Optional[str]
    # Text after the citation line, passed through normalise_body.
    body_norm: str


In [None]:
def split_articles(text: str) -> List[str]:
    """Cut raw file text into article blocks.

    The text is divided at every separator matched by HASH_SPLIT_RE. Each
    piece is stripped of surrounding whitespace, and pieces that end up
    empty are discarded.

    Returns:
        The non-empty, stripped blocks in their original order.
    """
    blocks = []
    for chunk in HASH_SPLIT_RE.split(text):
        cleaned = chunk.strip()
        if cleaned:
            blocks.append(cleaned)
    return blocks


def extract_citation_and_body(block: str):
    """Separate an article block into its citation line and its body.

    The citation is the first line containing anything other than
    whitespace (returned stripped). The body is everything after that line,
    joined with newlines and stripped.

    Returns:
        A ``(citation, body)`` tuple. ``citation`` is None when the block
        has no non-blank line; the body is then the whole block, stripped.
    """
    rows = block.splitlines()

    # Position of the first non-blank line, or None when every line is blank.
    first = next((pos for pos, row in enumerate(rows) if row.strip()), None)

    if first is None:
        return None, "\n".join(rows).strip()

    remainder = rows[first + 1:]
    return rows[first].strip(), "\n".join(remainder).strip()


def normalise_body(body: str) -> str:
    """Reduce an article body to a canonical form for text comparison.

    Steps, in order: unify line endings to "\\n", rejoin words hyphenated
    across a line break, turn runs of newlines into a single space,
    lowercase, replace common punctuation with spaces, then collapse all
    whitespace runs and trim the ends.

    Returns:
        The normalised text, or "" when ``body`` is empty/falsy.
    """
    if not body:
        return ""

    text = body.replace("\r\n", "\n").replace("\r", "\n")

    # Line-structure clean-up runs before lowercasing:
    # first undo hyphenated wraps, then flatten newlines to spaces.
    line_rules = (
        (r"(\w)-\n(\w)", r"\1\2"),
        (r"\n+", " "),
    )
    for pattern, replacement in line_rules:
        text = re.sub(pattern, replacement, text)

    text = text.lower()

    # Blank out most punctuation so it does not influence the comparison.
    text = re.sub(r"[“”\".,;:!?()\[\]{}<>]", " ", text)

    return re.sub(r"\s+", " ", text).strip()


In [None]:
def load_articles(files: List[str]) -> List[Article]:
    """Read every file in ``files`` and parse it into Article records.

    Each file is read as UTF-8 (undecodable bytes are replaced), split into
    blocks with split_articles, and each block becomes one Article holding
    the file's base name, the block's 1-based position in the file, the
    first TROVE_URL_RE match in the block (or None), the citation line and
    the normalised body.

    Returns:
        All articles, in file order and then block order.
    """
    collected: List[Article] = []

    for path in files:
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            raw_text = handle.read()

        file_name = os.path.basename(path)

        for position, block in enumerate(split_articles(raw_text), start=1):
            citation, body = extract_citation_and_body(block)
            # The URL is searched for in the whole block, not just the body.
            found = TROVE_URL_RE.search(block)

            collected.append(
                Article(
                    source_file=file_name,
                    index_in_file=position,
                    trove_url=found.group(0) if found else None,
                    citation_line=citation,
                    body_norm=normalise_body(body),
                )
            )

    return collected



articles = load_articles(FILES)

# Overall totals, including how many bodies are long enough to be compared.
print(f"Loaded {len(articles)} article blocks.")
print(f"Using {sum(len(a.body_norm) >= MIN_CHARS for a in articles)} bodies ≥ {MIN_CHARS} chars.")

# Per-file breakdown, so an input file that parsed into too few (or zero)
# blocks is easy to spot. Counter is imported in the notebook's import cell.
counts = Counter(a.source_file for a in articles)
print("Articles per file:")
for fname, n in counts.items():
    print(f"  {fname}: {n}")
print(f"Total articles: {len(articles)}")


In [None]:
def find_similar_articles(
    articles: List[Article],
    threshold: float,
    min_chars: int
) -> List[Tuple[float, int, int]]:
    """Find pairs of articles whose normalised bodies are near-duplicates.

    Bodies of at least ``min_chars`` characters are vectorised with TF-IDF
    over character 5-grams, and every unordered pair is scored with cosine
    similarity.

    Args:
        articles: Articles to compare; only ``body_norm`` is read.
        threshold: Minimum cosine similarity for a pair to be reported.
        min_chars: Articles with a shorter ``body_norm`` are skipped.

    Returns:
        ``(score, i, j)`` tuples sorted by descending score, where ``i < j``
        are indices into ``articles``. Empty when fewer than two articles
        are long enough to compare.
    """
    # Positions (in `articles`) of the bodies that are long enough to compare.
    idx_map = [i for i, a in enumerate(articles) if len(a.body_norm) >= min_chars]

    # A pair needs two candidates. Returning early also avoids the ValueError
    # (empty vocabulary) that fit_transform raises when given no documents.
    if len(idx_map) < 2:
        return []

    texts = [articles[i].body_norm for i in idx_map]

    vectorizer = TfidfVectorizer(
        analyzer="char",
        ngram_range=(5, 5),
        min_df=1
    )

    X = vectorizer.fit_transform(texts)
    S = cosine_similarity(X)

    matches = []
    n = len(idx_map)

    # S is symmetric, so only the upper triangle (j > i) is inspected; this
    # also skips each article's similarity with itself.
    for i in range(n):
        for j in range(i + 1, n):
            score = float(S[i, j])
            if score >= threshold:
                # Translate matrix positions back to indices into `articles`.
                matches.append((score, idx_map[i], idx_map[j]))

    matches.sort(reverse=True, key=lambda x: x[0])
    return matches


matches = find_similar_articles(
    articles,
    SIMILARITY_THRESHOLD,
    MIN_CHARS
)

# Report the outcome exactly once: either the match count or a clear
# "nothing found" message, never both.
if matches:
    print(f"Found {len(matches)} matches ≥ {SIMILARITY_THRESHOLD}.")
else:
    print(f"No matches found ≥ {SIMILARITY_THRESHOLD}.")

In [None]:
def show_match(score, a: Article, b: Article, preview=300):
    """Print a side-by-side report for one matched pair of articles.

    Output is the similarity score (three decimals), then for each article
    its source file, position in that file, URL and citation line, followed
    by the first ``preview`` characters of each normalised body.
    """
    pair = (("A", a), ("B", b))

    print("=" * 100)
    print(f"SIMILARITY: {score:.3f}")

    # Identification details for both sides of the match.
    for label, art in pair:
        print(f"{label}: {art.source_file}  [#{art.index_in_file}]")
        print(f"   URL: {art.trove_url}")
        print(f"   Citation: {art.citation_line}")

    print("-" * 100)

    # Leading slice of each body, for a quick visual comparison.
    for label, art in pair:
        print(f"{label} preview:", art.body_norm[:preview], "…")

    print()


# `matches` is sorted by descending score, so the strongest pairs print first.
# `TOP_K or None` turns a TOP_K of None (or 0) into an open-ended slice, so
# every match is shown in that case.
for score, i, j in matches[:TOP_K or None]:
    show_match(score, articles[i], articles[j])
