# In [None]:
import re
import html
from pathlib import Path
from datetime import datetime
from typing import List, Tuple, Optional

# --- Config ---
# Plain-text source files to ingest. Each file is read in full by the main
# section of this script and split into articles at separator lines made of
# '#' characters.
INPUT_PATHS = [
    "./Texts/AICLEAN_NEWSARTICLES.txt",  # change/add files here
]
# Destination paths for the three generated renditions of the collection.
OUTPUT_HTML = "./Texts/AICLEAN_NEWSARTICLES.html"
OUTPUT_EPUB = "./Texts/AICLEAN_NEWSARTICLES.epub"
OUTPUT_PDF  = "./Texts/AICLEAN_NEWSARTICLES.pdf"

# Bibliographic metadata shared by the HTML, EPUB and PDF writers.
# All values are strings:
#   title, creator, publisher, date - printed on the front page; also written
#       as <meta name="dc.*"> tags in the HTML and as book metadata in the EPUB
#   role     - label printed in front of the creator's name
#   uid      - printed as "Identifier"; also used as the EPUB identifier
#   foreword - free text; blank lines separate paragraphs
FRONT_MATTER = {
    "title": "A Title",
    "creator": "A. Name",
    "role": "Editor and Compiler",
    "publisher": "An Organisation Inc.",
    "date": "2026-01-29",
    "uid": "aoi.pub.newsarticles@v1.0.0",
    "foreword": """
This collection includes newspaper reports about ...
""".strip()
}

# --- Parsing regex ---
# Article separator: a line holding nothing but six or more '#' characters,
# optionally surrounded by whitespace. MULTILINE lets ^/$ anchor at every line.
# NOTE: \s also matches newlines, so blank lines directly around a separator
# are consumed as part of the match.
HASHLINE_RE = re.compile(r"^\s*#{6,}\s*$", re.MULTILINE)
# Date inside a citation, written as "(YYYY, Month D)" or "(YYYY, Month DD)".
# Groups: 1 = four-digit year, 2 = month word (ASCII letters only), 3 = day.
DATE_IN_CITATION_RE = re.compile(r"\((\d{4}),\s*([A-Za-z]+)\s+(\d{1,2})\)")
# An http:// or https:// URL, running up to the next whitespace character.
# Punctuation written directly after a URL is therefore captured with it.
URL_RE = re.compile(r"(https?://\S+)", re.IGNORECASE)


def split_articles_by_hashline(text: str) -> List[str]:
    """Split *text* into raw article chunks at every separator line.

    Separator lines are whatever ``HASHLINE_RE`` matches. Chunks that contain
    only whitespace are dropped; the remaining chunks keep their inner
    formatting but lose leading and trailing newline characters.
    """
    # filter(str.strip, ...) keeps only the segments with non-whitespace content.
    non_empty_segments = filter(str.strip, HASHLINE_RE.split(text))
    return [segment.strip("\n") for segment in non_empty_segments]


def strip_citation_and_body(article_text: str) -> Tuple[str, str]:
    """Separate an article chunk into its citation line and its body text.

    The citation is the first line with non-whitespace content, stripped of
    surrounding whitespace. The body is everything after that line, with
    leading blank lines removed and the whole stripped.

    Returns:
        ``(citation, body)``; ``("", "")`` when the chunk has no content.
    """
    rows = article_text.splitlines()

    # Position of the first line that carries any text, if there is one.
    first = next((pos for pos, row in enumerate(rows) if row.strip()), None)
    if first is None:
        return "", ""

    remainder = rows[first + 1:]
    # Step over the blank lines that separate the citation from the body.
    start = 0
    while start < len(remainder) and not remainder[start].strip():
        start += 1

    return rows[first].strip(), "\n".join(remainder[start:]).strip()


def parse_date_from_citation(citation: str) -> Optional[datetime]:
    """Extract the publication date from a citation string.

    The date is located with ``DATE_IN_CITATION_RE``. The month word is tried
    first as a full month name (``%B``) and then as an abbreviated one (``%b``).

    Returns:
        The parsed ``datetime``, or ``None`` when the citation holds no date
        in the expected form or the date cannot be parsed.
    """
    found = DATE_IN_CITATION_RE.search(citation)
    if found is None:
        return None

    stamp = "{} {} {}".format(*found.group(1, 2, 3))
    for fmt in ("%Y %B %d", "%Y %b %d"):
        try:
            return datetime.strptime(stamp, fmt)
        except ValueError:
            # Month word not accepted by this format (or impossible date).
            continue
    return None


def extract_url(citation: str) -> Optional[str]:
    """Return the first URL that ``URL_RE`` finds in *citation*, or ``None``."""
    found = URL_RE.search(citation)
    if found is None:
        return None
    return found.group(1)


def citation_html(citation: str) -> str:
    """Return *citation* as escaped HTML with every URL turned into a link.

    The result is assembled from the match positions of ``URL_RE`` rather
    than by text replacement on the escaped string. Text replacement rewrites
    every occurrence of the first URL's text, which produces a wrong link
    target when a later URL merely starts with the first one
    (e.g. ``https://a.example`` followed by ``https://a.example/page``).

    Args:
        citation: Raw, unescaped citation line.

    Returns:
        HTML-escaped citation text. Each URL is wrapped in an ``<a>`` element
        that opens in a new tab. A citation without URLs is only escaped.
    """
    pieces = []
    cursor = 0
    for match in URL_RE.finditer(citation):
        # Plain text between the previous URL (or the start) and this one.
        pieces.append(html.escape(citation[cursor:match.start(1)]))
        url_esc = html.escape(match.group(1))
        pieces.append(
            f'<a href="{url_esc}" target="_blank" rel="noopener noreferrer">{url_esc}</a>'
        )
        cursor = match.end(1)
    # Trailing text after the last URL (the whole citation if none matched).
    pieces.append(html.escape(citation[cursor:]))
    return "".join(pieces)


def body_to_paragraphs_html(body: str) -> str:
    """Render plain article text as a sequence of escaped ``<p>`` elements.

    Paragraphs are separated by blank lines. Inside a paragraph, line breaks
    and runs of spaces/tabs are collapsed to single spaces. The resulting
    elements are joined with newlines; blank input yields an empty string.
    """
    trimmed = body.strip()
    if not trimmed:
        return ""

    # Unwrap hard line breaks first, then squeeze repeated spaces and tabs.
    flattened = (
        re.sub(r"[ \t]+", " ", re.sub(r"\s*\n\s*", " ", chunk.strip())).strip()
        for chunk in re.split(r"\n\s*\n+", trimmed)
    )
    return "\n".join(f"<p>{html.escape(text)}</p>" for text in flattened if text)


def make_article_id(dt: Optional[datetime], idx: int) -> str:
    """Build an element id of the form ``art-<YYYYMMDD|undated>-<idx>``.

    *idx* is zero-padded to at least four digits.
    """
    if dt:
        stamp = dt.strftime("%Y%m%d")
    else:
        stamp = "undated"
    return f"art-{stamp}-{idx:04d}"


def sort_articles_for_output(articles: List[Tuple[str, str, Optional[datetime]]]):
    """
    Number the articles (1-based, input order) and arrange them for output.

    Returns a list of (orig_index, citation, body, dt) tuples: dated articles
    first in ascending date order, then undated articles. Articles with equal
    dates, and all undated articles, keep their input order.
    """
    numbered = [
        (position, entry[0], entry[1], entry[2])
        for position, entry in enumerate(articles, start=1)
    ]
    # sorted() is stable, so ties stay in input order.
    dated = sorted((row for row in numbered if row[3]), key=lambda row: row[3])
    undated = [row for row in numbered if not row[3]]
    return dated + undated


def front_matter_html(front: dict) -> str:
    """Render the front-matter box (title, credits, foreword) as an HTML fragment.

    Args:
        front: Mapping with the string keys ``title``, ``role``, ``creator``,
            ``publisher``, ``date``, ``uid`` and ``foreword``. Blank lines in
            the foreword separate paragraphs.

    Returns:
        A ``<div class="front-matter">`` fragment with every value HTML-escaped.
    """
    esc = html.escape

    foreword_blocks = re.split(r"\n\s*\n+", front["foreword"].strip())
    foreword_markup = "\n".join(
        f"<p>{esc(block.strip())}</p>" for block in foreword_blocks if block.strip()
    )

    # The empty first entry and the indented last entry reproduce the leading
    # newline and trailing indentation of the fragment.
    lines = [
        "",
        '    <div class="front-matter">',
        f'      <h1>{esc(front["title"])}</h1>',
        f'      <p><strong>{esc(front["role"])}:</strong> {esc(front["creator"])}</p>',
        f'      <p><strong>Publisher:</strong> {esc(front["publisher"])}</p>',
        f'      <p><strong>Date:</strong> {esc(front["date"])}</p>',
        f'      <p><strong>Identifier:</strong> {esc(front["uid"])}</p>',
        "      <h2>Foreword</h2>",
        f"      {foreword_markup}",
        "    </div>",
        "    ",
    ]
    return "\n".join(lines)


def build_html(articles: List[Tuple[str, str, Optional[datetime]]]) -> str:
    """Build the complete single-page HTML document for the collection.

    The page consists of the front matter, a date index whose entries link to
    the articles, and one ``<div class="article">`` per article. Ordering
    comes from ``sort_articles_for_output`` (dated articles ascending, undated
    last), the same helper the EPUB and PDF writers use. Element ids are
    derived from each article's position in the *input* list.

    Args:
        articles: ``(citation, body, date_or_None)`` tuples in input order.

    Returns:
        The full HTML document as a string.
    """
    index_items = []
    article_divs = []

    for orig_i, cit, body, dt in sort_articles_for_output(articles):
        art_id = make_article_id(dt, orig_i)
        date_label = dt.strftime("%Y-%m-%d") if dt else "undated"
        index_items.append(f'<li><a href="#{art_id}">{html.escape(date_label)}</a></li>')

        article_divs.append(
            "\n".join(
                [
                    f'<div class="article" id="{art_id}">',
                    f'  <div class="citation">{citation_html(cit)}</div>',
                    f'  <div class="textitem">{body_to_paragraphs_html(body)}</div>',
                    "</div>",
                ]
            )
        )

    fm = front_matter_html(FRONT_MATTER)
    # Joined up front so the template below contains no nested string literals.
    index_html = "".join(index_items)
    articles_html = "".join(article_divs)

    html_doc = f"""<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>{html.escape(FRONT_MATTER["title"])}</title>

  <meta name="dc.title" content="{html.escape(FRONT_MATTER["title"])}" />
  <meta name="dc.creator" content="{html.escape(FRONT_MATTER["creator"])}" />
  <meta name="dc.publisher" content="{html.escape(FRONT_MATTER["publisher"])}" />
  <meta name="dc.date" content="{html.escape(FRONT_MATTER["date"])}" />
  <meta name="dc.identifier" content="{html.escape(FRONT_MATTER["uid"])}" />

  <style>
    body {{ font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif; line-height: 1.45; margin: 24px; }}
    .front-matter {{ margin-bottom: 28px; padding: 16px; border: 1px solid #ddd; border-radius: 10px; }}
    .front-matter h1 {{ margin: 0 0 8px 0; }}
    .index {{ margin-bottom: 28px; padding: 16px; border: 1px solid #ddd; border-radius: 10px; }}
    .index h2 {{ margin: 0 0 10px 0; font-size: 18px; }}
    .index ul {{ margin: 0; padding-left: 18px; columns: 2; }}
    .article {{ padding: 18px; margin: 18px 0; border: 1px solid #eee; border-radius: 10px; }}
    .citation {{ font-weight: 600; margin-bottom: 10px; }}
    .citation a {{ font-weight: 600; }}
    .textitem p {{ margin: 0 0 10px 0; }}
  </style>
</head>
<body>
  {fm}

  <div class="index">
    <h2>Index (by date)</h2>
    <ul>
      {index_html}
    </ul>
  </div>

  {articles_html}
</body>
</html>
"""
    return html_doc


def foreword_chapter(front: dict):
    """Create the EPUB front-matter chapter (title, credits and foreword).

    Args:
        front: Mapping with the string keys ``title``, ``role``, ``creator``,
            ``publisher``, ``date``, ``uid`` and ``foreword``. Blank lines in
            the foreword separate paragraphs.

    Returns:
        An ``epub.EpubHtml`` item titled "Front Matter" stored as
        ``front_matter.xhtml``; all values are HTML-escaped.
    """
    # Deferred import: the rest of the script stays usable without ebooklib
    # until an EPUB is actually requested.
    from ebooklib import epub
    from html import escape

    foreword_markup = "".join(
        f"<p>{escape(block.strip())}</p>"
        for block in re.split(r"\n\s*\n+", front["foreword"].strip())
        if block.strip()
    )
    field = {
        key: escape(front[key])
        for key in ("title", "role", "creator", "publisher", "date", "uid")
    }

    content = f"""
    <html><body>
      <h1>{field['title']}</h1>
      <p><strong>{field['role']}:</strong> {field['creator']}</p>
      <p><strong>Publisher:</strong> {field['publisher']}</p>
      <p><strong>Date:</strong> {field['date']}</p>
      <p><strong>Identifier:</strong> {field['uid']}</p>
      <h2>Foreword</h2>
      {foreword_markup}
    </body></html>
    """
    return epub.EpubHtml(title="Front Matter", file_name="front_matter.xhtml", content=content)


def make_epub(all_articles: List[Tuple[str, str, Optional[datetime]]],
              out_path: str,
              front: dict):
    """Write the collection as an EPUB with one chapter per article.

    Chapters follow the order given by ``sort_articles_for_output``. The
    front-matter chapter is placed ahead of the articles in both the table of
    contents and the spine; the spine starts with the generated nav document.

    Args:
        all_articles: ``(citation, body, date_or_None)`` tuples.
        out_path: Destination file path of the EPUB.
        front: Front-matter mapping (``uid``, ``title``, ``creator``,
            ``publisher``, ``date`` plus the keys ``foreword_chapter`` reads).

    Returns:
        *out_path*, unchanged.
    """
    # Deferred import so ebooklib is only needed when an EPUB is produced.
    from ebooklib import epub
    from html import escape

    book = epub.EpubBook()
    book.set_identifier(front["uid"])
    book.set_title(front["title"])
    book.add_author(front["creator"])
    for dc_name in ("publisher", "date"):
        book.add_metadata("DC", dc_name, front[dc_name])
    book.set_language("en")

    stylesheet = epub.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content="""
    body { font-family: serif; line-height: 1.4; }
    .citation { font-weight: bold; margin-bottom: 0.6em; }
    p { margin: 0 0 0.8em 0; }
    """,
    )
    book.add_item(stylesheet)

    # Articles (one chapter per article)
    article_chapters = []
    ordered = sort_articles_for_output(all_articles)

    for position, (_, citation, body, dt) in enumerate(ordered, start=1):
        if dt:
            date_label = dt.strftime("%Y-%m-%d")
        else:
            date_label = "undated"

        # Blank lines separate paragraphs; inner line breaks and repeated
        # spaces/tabs collapse to single spaces.
        raw_paragraphs = re.split(r"\n\s*\n+", body.strip()) if body.strip() else []
        flattened = (
            re.sub(r"[ \t]+", " ", re.sub(r"\s*\n\s*", " ", raw.strip())).strip()
            for raw in raw_paragraphs
        )
        paragraphs_markup = "".join(f"<p>{escape(text)}</p>" for text in flattened if text)

        # Escape the citation, then turn its first URL into a link.
        citation_markup = escape(citation)
        link = extract_url(citation)
        if link:
            link_markup = escape(link)
            citation_markup = citation_markup.replace(
                link_markup, f'<a href="{link_markup}">{link_markup}</a>'
            )

        content = f"""
        <html><body>
          <div class="citation">{citation_markup}</div>
          {paragraphs_markup}
        </body></html>
        """

        chapter = epub.EpubHtml(
            title=f"{date_label} — Article {position}",
            file_name=f"chap_{position:04d}.xhtml",
            content=content,
        )
        chapter.add_item(stylesheet)
        book.add_item(chapter)
        article_chapters.append(chapter)

    # Front matter is added to the book after the articles but listed first.
    front_chapter = foreword_chapter(front)
    front_chapter.add_item(stylesheet)
    book.add_item(front_chapter)

    reading_order = [front_chapter, *article_chapters]

    # Navigation
    book.toc = reading_order
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav", *reading_order]

    epub.write_epub(out_path, book, {})
    return out_path


def make_pdf(all_articles: List[Tuple[str, str, Optional[datetime]]],
             out_path: str,
             front: dict):
    """Write the collection as an A4 PDF using reportlab.

    Layout: a title page, a foreword page, then the articles in the order
    given by ``sort_articles_for_output``, each followed by a page break.

    Args:
        all_articles: ``(citation, body, date_or_None)`` tuples.
        out_path: Destination file path of the PDF.
        front: Front-matter mapping with the string keys ``title``, ``role``,
            ``creator``, ``publisher``, ``date``, ``uid`` and ``foreword``.

    Returns:
        *out_path*, unchanged.
    """
    # Deferred imports so reportlab is only needed when a PDF is produced.
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.lib.units import mm

    sheet = getSampleStyleSheet()
    heading_style = ParagraphStyle("Heading", parent=sheet["Heading2"], spaceAfter=8)
    citation_style = ParagraphStyle("Citation", parent=sheet["BodyText"], spaceAfter=6, leading=14)
    paragraph_style = ParagraphStyle("Body", parent=sheet["BodyText"], leading=14, spaceAfter=8)

    margin = 18 * mm
    document = SimpleDocTemplate(
        out_path,
        pagesize=A4,
        leftMargin=margin,
        rightMargin=margin,
        topMargin=margin,
        bottomMargin=margin,
    )

    esc = html.escape

    # Title page
    flowables = [
        Paragraph(esc(front["title"]), sheet["Title"]),
        Spacer(1, 12),
        Paragraph(
            "<br/>".join(
                (
                    f"{esc(front['role'])}: {esc(front['creator'])}",
                    f"Publisher: {esc(front['publisher'])}",
                    f"Date: {esc(front['date'])}",
                    f"Identifier: {esc(front['uid'])}",
                )
            ),
            sheet["BodyText"],
        ),
        PageBreak(),
    ]

    # Foreword page: blank lines in the foreword separate paragraphs.
    flowables.append(Paragraph("Foreword", sheet["Heading1"]))
    flowables.extend(
        Paragraph(esc(block.strip()), paragraph_style)
        for block in re.split(r"\n\s*\n+", front["foreword"].strip())
        if block.strip()
    )
    flowables.append(PageBreak())

    # Articles
    ordered = sort_articles_for_output(all_articles)
    for position, (_, citation, body, dt) in enumerate(ordered, start=1):
        if dt:
            date_label = dt.strftime("%Y-%m-%d")
        else:
            date_label = "undated"
        flowables.append(Paragraph(f"{date_label} — Article {position}", heading_style))

        # Escape the citation, then turn its first URL into a link.
        citation_markup = esc(citation)
        link = extract_url(citation)
        if link:
            link_markup = esc(link)
            citation_markup = citation_markup.replace(
                link_markup, f'<a href="{link_markup}">{link_markup}</a>'
            )
        flowables.append(Paragraph(citation_markup, citation_style))

        if body.strip():
            for raw in re.split(r"\n\s*\n+", body.strip()):
                # Collapse inner line breaks and repeated spaces/tabs.
                text = re.sub(r"[ \t]+", " ", re.sub(r"\s*\n\s*", " ", raw.strip())).strip()
                if text:
                    flowables.append(Paragraph(esc(text), paragraph_style))

        flowables.append(PageBreak())  # remove if you prefer continuous flow

    document.build(flowables)
    return out_path


# --- Main ---
# Pipeline: read every input file, split it into articles, parse each article
# into (citation, body, date), then write the HTML, EPUB and PDF renditions.
all_articles: List[Tuple[str, str, Optional[datetime]]] = []

for path in INPUT_PATHS:
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
    # mark. With plain "utf-8" a BOM stays in the text, so a separator on the
    # very first line would no longer match HASHLINE_RE and the first article
    # would be parsed with the separator as its citation.
    text = Path(path).read_text(encoding="utf-8-sig", errors="replace")
    raw_articles = split_articles_by_hashline(text)
    for art in raw_articles:
        citation, body = strip_citation_and_body(art)
        if not citation and not body:
            continue
        # dt is None when the citation carries no parseable date.
        dt = parse_date_from_citation(citation)
        all_articles.append((citation, body, dt))

# HTML
html_out = build_html(all_articles)
Path(OUTPUT_HTML).write_text(html_out, encoding="utf-8")
print(f"Wrote HTML: {OUTPUT_HTML}")
print(f"Articles processed: {len(all_articles)}")

# Optional preview (not required)
# Best-effort by design: outside a notebook IPython may be missing, and a
# failed preview must not stop the EPUB/PDF export below.
try:
    from IPython.display import HTML, display
    display(HTML(html_out[:20000] + "<p><em>…preview truncated…</em></p>"))
except Exception as e:
    print("Preview not available:", e)

# EPUB + PDF
# If ebooklib isn't installed yet, run: %pip install ebooklib
epub_path = make_epub(all_articles, OUTPUT_EPUB, front=FRONT_MATTER)
pdf_path  = make_pdf(all_articles, OUTPUT_PDF,  front=FRONT_MATTER)

print("Wrote EPUB:", epub_path)
print("Wrote PDF :", pdf_path)
