# Sort texts by citation date

This notebook sorts texts by date. It assumes the texts are separated by a line consisting only of hashtags (`#`), and that each text begins with its citation. The assumed format of the citation is APA (Trove style).

It extracts the citation at the top of each text, parses the first date found in parentheses in the citation
(e.g., `(1843, April 1)`), sorts the texts by that date (earliest first), and writes them back out in the same
format.


In [None]:
# Configuration
import os

folder = "./Texts/"
filename = "NewsArticles.txt"   # <-- change this to your file

# Build the paths with os.path.join so they stay valid whether or not
# `folder` ends with a path separator (plain string concatenation would
# produce "./TextsNewsArticles.txt" for folder = "./Texts").
input_path = os.path.join(folder, filename)
output_path = os.path.join(folder, "Sorted_" + filename)
separator_out = "########"  # separator used when writing the output


In [None]:
import re
from datetime import datetime
from typing import List, Dict, Optional

def split_chunks(raw: str) -> List[str]:
    """Split the raw file contents into individual texts.

    A separator is a line made up of three or more ``#`` characters,
    optionally surrounded by whitespace, so separators of any length
    (``###``, ``########``, ...) are recognised. Pieces that contain only
    whitespace are discarded; the remaining pieces have leading and trailing
    newlines removed.
    """
    pieces = []
    for piece in re.split(r"(?m)^\s*#{3,}\s*$", raw):
        # Skip the empty fragments left before the first / after the last
        # separator, or between two adjacent separators.
        if not piece.strip():
            continue
        pieces.append(piece.strip("\n"))
    return pieces

def extract_citation_and_body(chunk: str) -> Dict[str, str]:
    """Split one text chunk into its citation line and its body.

    The citation is the first non-blank line of ``chunk``, with surrounding
    whitespace stripped. The body is everything that follows the citation,
    except that a single blank line directly after the citation is skipped
    and leading/trailing newlines are removed.

    Args:
        chunk: One text as produced by splitting the input on separators.

    Returns:
        A dict with the keys ``"citation"`` and ``"body"``. Both values are
        empty strings when ``chunk`` contains no non-blank line.
    """
    lines = chunk.splitlines()

    # Index of the first non-blank line, or len(lines) when there is none.
    first = next(
        (k for k, line in enumerate(lines) if line.strip()),
        len(lines),
    )
    if first == len(lines):
        return {"citation": "", "body": ""}
    citation = lines[first].strip()

    # The body starts immediately after the citation line. Do NOT scan ahead
    # to a later blank line: any lines sitting directly under the citation
    # (a wrapped citation, or a body with no blank line before it) would then
    # be dropped from the output entirely. Keeping them in the body preserves
    # all of the text.
    start = first + 1
    # Skip exactly one blank line between citation and body, if present.
    if start < len(lines) and not lines[start].strip():
        start += 1

    body = "\n".join(lines[start:]).strip("\n")
    return {"citation": citation, "body": body}

# Lower-cased English month names mapped to their month number (1-12).
MONTHS = {
    "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6,
    "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12
}

def parse_citation_date(citation: str) -> Optional[datetime]:
    """Parse the first ``(YYYY, Month D)`` date found in a citation.

    Publication-run ranges such as ``(NSW : 1843 - 1893)`` are not matched
    because they contain no ``, Month D`` part.

    Args:
        citation: The citation line of a text.

    Returns:
        The parsed date as a ``datetime`` (time set to midnight), or ``None``
        when no date pattern is found, the month name is not a full English
        month name, or the year/month/day combination is not a real calendar
        date.
    """
    m = re.search(r"\((\d{4})\s*,\s*([A-Za-z]+)\s+(\d{1,2})\)", citation)
    if m is None:
        return None

    year_text, month_name, day_text = m.groups()
    month = MONTHS.get(month_name.strip().lower())
    if month is None:
        return None

    try:
        return datetime(int(year_text), month, int(day_text))
    except ValueError:
        # Impossible dates such as "(1843, February 30)", day 0 or year 0000
        # are treated as "no date" so that one malformed citation does not
        # abort processing of the whole file.
        return None

def format_text(citation: str, body: str) -> str:
    """Render one text for output: citation, a blank line, then the body.

    Trailing whitespace is trimmed and the result always ends with exactly
    one newline.
    """
    joined = "{}\n\n{}".format(citation, body)
    trimmed = joined.rstrip()
    return trimmed + "\n"


In [None]:
# Load the input file and build one record per text
with open(input_path, "r", encoding="utf-8") as f:
    raw = f.read()

chunks = split_chunks(raw)

records = []
for position, text in enumerate(chunks, 1):
    parsed = extract_citation_and_body(text)
    when = parse_citation_date(parsed["citation"])
    # "index" remembers the original position so that undated texts can keep
    # their input order when sorting.
    record = {
        "index": position,
        "date": when,
        "citation": parsed["citation"],
        "body": parsed["body"],
        "has_date": when is not None,
    }
    records.append(record)

# Quick summary: (total texts, texts with a parsed date, texts without one)
n_total = len(records)
n_with = len([rec for rec in records if rec["has_date"]])
n_without = n_total - n_with
(n_total, n_with, n_without)


In [None]:
# Order the records by date and write them to the output file.
# Records whose date could not be parsed are placed after all dated ones;
# the original position breaks ties, so input order is kept among equals.
records_sorted = list(records)
records_sorted.sort(
    key=lambda rec: (
        rec["date"] is None,
        rec["date"] or datetime.max,
        rec["index"],
    )
)

# Each formatted text already ends with a newline, so the divider yields
# a blank line, the separator line, and another blank line between texts.
divider = f"\n{separator_out}\n\n"
formatted = [format_text(rec["citation"], rec["body"]) for rec in records_sorted]
output_text = divider.join(formatted).rstrip() + "\n"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(output_text)

output_path


In [None]:
# Optional: show the parsed date and citation of the first two sorted texts
for rec in records_sorted[:2]:
    # Undated records are shown with None in place of the date.
    shown_date = rec["date"].date() if rec["date"] else None
    print(shown_date, "—", rec["citation"])
    print("---")


## Notes
- The date parser looks for the first occurrence of the pattern `(YYYY, Month D)` in the citation line.
- It intentionally ignores the publication-run years like `(NSW : 1843 - 1893)` because those don’t match the month/day pattern.
- If a citation doesn’t match the expected pattern, that text is placed at the end of the output.
