In [None]:

import json
import sys
from pathlib import Path
from urllib.parse import urlparse, unquote
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError

def load_urls(json_path: str):
    """Read a JSON file and return the HTTP(S) URLs it lists.

    The file must contain either a top-level array, or an object whose
    ``"urls"`` key holds the entries. Entries that are not strings, or that
    do not start with ``"http"`` once surrounding whitespace is trimmed, are
    skipped. Returned URLs are whitespace-trimmed.

    Raises:
        ValueError: if the JSON is neither a list nor an object with "urls".
    """
    payload = json.loads(Path(json_path).read_text(encoding="utf-8"))

    if isinstance(payload, dict) and "urls" in payload:
        entries = payload["urls"]
    elif isinstance(payload, list):
        entries = payload
    else:
        raise ValueError("JSON must be an array of URLs or an object with key 'urls'.")

    kept = []
    for entry in entries:
        if not isinstance(entry, str):
            continue
        trimmed = entry.strip()
        if trimmed.startswith("http"):
            kept.append(trimmed)
    return kept

def url_to_filepath(url: str, out_dir: Path) -> Path:
    """Map a URL to ``<out_dir>/<host>/<path segments>/index.html``.

    The URL path is percent-decoded first. Empty, ``.`` and ``..`` segments
    are then dropped (backslashes are treated as separators too), so a URL
    such as ``http://host/a/../../x`` cannot produce a destination that
    climbs out of ``out_dir``. Ordinary URLs map to the same location as
    before. The query string and fragment are ignored.

    Args:
        url: Absolute URL of the page.
        out_dir: Root directory for saved pages.

    Returns:
        Path of the ``index.html`` file for this URL (not created here).
    """
    parsed = urlparse(url)
    # Join host and decoded path, then normalise separators so that encoded
    # backslashes cannot smuggle in extra path components on Windows.
    raw = f"{parsed.netloc}/{unquote(parsed.path)}".replace("\\", "/")
    segments = [seg for seg in raw.split("/") if seg not in ("", ".", "..")]
    return out_dir.joinpath(*segments, "index.html")

def fetch(url: str, timeout: float = 15.0) -> bytes:
    """Download ``url`` and return the raw response body.

    The request is sent with a fixed ``User-Agent`` header and ``timeout``
    is handed to ``urlopen``. Errors raised by ``urlopen`` propagate to the
    caller.
    """
    request = Request(url, headers={"User-Agent": "SimpleDownloader/1.0"})
    response = urlopen(request, timeout=timeout)
    with response:
        body = response.read()
    return body

def main():
    """Command-line entry point: download every URL listed in a JSON file.

    Usage: ``python download_pages_simple.py <urls.json> [out_dir]``
    (``out_dir`` defaults to ``html_pages``).

    Each page is fetched and written to the path given by
    ``url_to_filepath``. A status line is printed per URL, followed by a
    summary with the number of saved and failed pages. ``download_index.txt``
    in the output directory lists only the files that were actually written,
    one path per line.
    """
    if len(sys.argv) < 2:
        print("Usage: python download_pages_simple.py <urls.json> [out_dir]")
        sys.exit(1)

    json_path = sys.argv[1]
    out_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("html_pages")
    out_dir.mkdir(parents=True, exist_ok=True)

    urls = load_urls(json_path)
    saved = []
    fail = 0

    for url in urls:
        dest = url_to_filepath(url, out_dir)
        try:
            html = fetch(url)
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(html)
        except Exception as e:
            # Best effort: one bad URL (network error, HTTP error, unwritable
            # path, ...) must not abort the rest of the batch.
            fail += 1
            print(f"[FAIL] {url} :: {e}")
        else:
            saved.append(dest)
            print(f"[OK] {url} -> {dest}")

    print(f"\nDone. Saved {len(saved)}/{len(urls)} pages to '{out_dir}'. Failed: {fail}.")
    # Index only what exists on disk; failed URLs have no file to point at.
    (out_dir / "download_index.txt").write_text(
        "\n".join(str(path) for path in saved),
        encoding="utf-8"
    )

# Script entry point: main() takes its arguments from sys.argv.
# NOTE(review): when this cell is executed inside a notebook kernel, sys.argv
# presumably holds the kernel's own launch arguments rather than
# "<urls.json> [out_dir]" -- confirm before running it from the notebook.
if __name__ == "__main__":
    main()


0


In [8]:
import sys
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse, unquote
from urllib.request import urlopen, Request
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup, NavigableString

In [10]:
# Tag names that remove_noise() deletes outright, whatever their attributes.
NOISE_TAGS = ["script", "style", "noscript", "iframe", "svg", "form"]
# Further tag names deleted by remove_noise() in the same pass as NOISE_TAGS.
NOISE_STRUCTURAL = ["header", "nav", "footer", "aside"]
# Substrings that remove_noise() looks for in an element's normalized class
# and id values; any hit deletes the element. Matching is by substring, so a
# keyword such as "header" also hits a class like "post-header".
NOISE_KEYWORDS = [
    "menu","header","footer","nav","sidebar","breadcrumbs","crumb",
    "cookie","consent","subscribe","newsletter","share","social",
    "ad "," ads","banner","pagination","pager","toc"
]
# Class names pick_main_node() turns into "div.<name>,section.<name>"
# selectors when the page has no <main> or <article> element.
CONTENT_HINT_CLASSES = [
    "content","main","article","post","entry","markdown-body",
    "post-content","post-title","post-header"  
]

def load_urls(csv_path: str):
    """Read the ``Link`` column of a CSV file and return its HTTP(S) URLs.

    Missing values are dropped, remaining values are converted to strings and
    whitespace-trimmed, and only those starting with ``"http"`` are returned.

    Raises:
        ValueError: if the CSV has no ``Link`` column.
    """
    table = pd.read_csv(Path(csv_path))
    if "Link" not in table.columns:
        raise ValueError("CSV must contain a 'Link' column.")

    kept = []
    for raw in table["Link"].dropna().astype(str).tolist():
        trimmed = raw.strip()
        if trimmed.startswith("http"):
            kept.append(trimmed)
    return kept

def url_to_output_path(url: str, out_dir: Path) -> Path:
    """Map a URL to ``<out_dir>/<host>/<path segments>/index.html``.

    The URL path is percent-decoded first. Empty, ``.`` and ``..`` segments
    are then dropped (backslashes are treated as separators too), so a URL
    such as ``http://host/a/../../x`` cannot produce a destination that
    climbs out of ``out_dir``. Ordinary URLs map to the same location as
    before. The query string and fragment are ignored.

    Args:
        url: Absolute URL of the page.
        out_dir: Root directory for cleaned pages.

    Returns:
        Path of the ``index.html`` file for this URL (not created here).
    """
    parsed = urlparse(url)
    # Join host and decoded path, then normalise separators so that encoded
    # backslashes cannot smuggle in extra path components on Windows.
    raw = f"{parsed.netloc}/{unquote(parsed.path)}".replace("\\", "/")
    segments = [seg for seg in raw.split("/") if seg not in ("", ".", "..")]
    return out_dir.joinpath(*segments, "index.html")


def fetch_html(url: str, timeout: float = 20.0) -> str:
    """Download ``url`` and return its body decoded to text.

    Decoding is attempted in this order, using the first that succeeds:

    1. strict UTF-8;
    2. the charset declared in the response's ``Content-Type`` header, if any;
    3. latin-1, which accepts every byte sequence and so never fails.

    Strict decoding is used on purpose: decoding with ``errors="ignore"``
    never raises, which would make the fallbacks unreachable and silently
    drop every non-UTF-8 byte.

    Args:
        url: Address to fetch.
        timeout: Timeout handed to ``urlopen``.

    Returns:
        The decoded response body.
    """
    req = Request(url, headers={"User-Agent": "SimpleFetcher/1.1"})
    with urlopen(req, timeout=timeout) as resp:
        data = resp.read()
        declared = resp.headers.get_content_charset()

    encodings = ["utf-8"]
    if declared:
        encodings.append(declared)
    for enc in encodings:
        try:
            return data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the server declared a charset Python does not know.
            continue
    return data.decode("latin-1")
    
def normalize_class_val(val):
    """Return an attribute value as one lowercase string.

    Args:
        val: An attribute value: ``None``, a string, or a list/tuple of
            strings (as used for multi-valued attributes such as ``class``).

    Returns:
        ``""`` for a missing or empty value; otherwise the value (list items
        joined with single spaces) converted to lowercase.
    """
    if not val:
        return ""
    if isinstance(val, (list, tuple)):
        # Lowercase the joined result as well; without explicit grouping the
        # .lower() call would bind only to the scalar branch.
        return " ".join(str(item) for item in val).lower()
    return str(val).lower()

def remove_noise(soup: BeautifulSoup) -> None:
    """Strip non-content elements from ``soup`` in place.

    Two passes are made:

    1. every element whose tag name is in ``NOISE_TAGS`` or
       ``NOISE_STRUCTURAL`` is removed;
    2. every remaining element whose normalized ``class`` or ``id`` value
       contains one of ``NOISE_KEYWORDS`` is removed.

    ``<html>`` and ``<body>`` are never removed by the keyword pass: a single
    keyword hit on one of them (for example ``<body class="has-sidebar">``)
    would otherwise delete the entire document.
    """
    protected = ("html", "body")

    for tag in NOISE_TAGS + NOISE_STRUCTURAL:
        for el in list(soup.find_all(tag)):
            try:
                el.decompose()
            except Exception:
                # Best effort; Exception (not a bare except) so Ctrl-C still
                # interrupts a long run.
                pass

    for el in list(soup.find_all(True)):
        try:
            if el.name in protected:
                continue
            cls = normalize_class_val(el.get("class"))
            idv = normalize_class_val(el.get("id"))
            if any(k in cls or k in idv for k in NOISE_KEYWORDS):
                el.decompose()
        except Exception:
            # The snapshot list still holds descendants of elements removed
            # earlier in this loop; attribute access on those can fail, so
            # they are skipped.
            continue

def pick_main_node(soup: BeautifulSoup):
    """Choose the element of ``soup`` that most likely holds the main content.

    Strategy, in order:

    1. If an element with class ``post-content`` exists, return a new
       ``<div>`` holding the first match of
       ``.post-title, .post-header h1, h1`` (if any) followed by that element.
    2. Otherwise consider all ``<main>``/``<article>`` elements; if there are
       none, ``div``/``section`` elements carrying one of
       ``CONTENT_HINT_CLASSES``; if there are none of those either, every
       ``div``/``section``. Return the candidate with the most text.
    3. Otherwise return ``soup.body``, or, when there is no body, a new
       ``<div>`` wrapping the document's text.

    Note: in case 1 the matched nodes are appended to the new container,
    which takes them out of their original position in ``soup``.
    """
    post_body = soup.select_one(".post-content")
    if post_body is not None:
        container = soup.new_tag("div")
        title = soup.select_one(".post-title, .post-header h1, h1")
        if title is not None:
            container.append(title)
        container.append(post_body)
        return container

    candidates = soup.select("main, article")
    if not candidates:
        sel = ",".join(f"div.{c},section.{c}" for c in CONTENT_HINT_CLASSES)
        candidates = soup.select(sel) or soup.select("div, section")

    best, best_len = None, 0
    for el in candidates:
        try:
            tlen = len(el.get_text(" ", strip=True))
        except Exception:
            # Skip a candidate whose text cannot be extracted; Exception (not
            # a bare except) so Ctrl-C is not swallowed.
            continue
        if tlen > best_len:
            best, best_len = el, tlen

    if best is not None:
        return best
    if soup.body:
        return soup.body
    wrap = soup.new_tag("div")
    wrap.append(NavigableString(soup.get_text(" ", strip=True)))
    return wrap

def build_min_html(url: str, title_text: str, content_node) -> str:
    """Wrap ``content_node`` in a minimal standalone HTML document.

    The document head receives a UTF-8 ``<meta charset>``, a ``<base>`` whose
    ``href`` is ``url``, and a ``<title>`` (the stripped ``title_text``, or
    ``url`` when that is empty or ``None``). The content is serialized and
    re-parsed, and its children (those of its ``<body>`` when the re-parsed
    fragment has one) are appended to the new body.

    Returns:
        The assembled document as a string.
    """
    shell = BeautifulSoup("<!doctype html><html><head></head><body></body></html>", "html.parser")
    head_tag = shell.head
    body_tag = shell.body

    title_tag = shell.new_tag("title")
    title_tag.string = (title_text or "").strip() or url

    head_children = (
        shell.new_tag("meta", charset="utf-8"),
        shell.new_tag("base", href=url),
        title_tag,
    )
    for head_child in head_children:
        head_tag.append(head_child)

    # Re-parse a serialized copy so the nodes appended below belong to a
    # fresh tree rather than to the caller's soup.
    reparsed = BeautifulSoup(str(content_node), "html.parser")
    source = reparsed.body if reparsed.body else reparsed
    for node in list(source.contents):
        body_tag.append(node)

    return str(shell)

def clean_html(html: str, url: str) -> str:
    """Reduce a raw HTML page to a minimal document with its main content.

    The page is parsed, passed through ``remove_noise``, its main node is
    chosen with ``pick_main_node``, and the result is assembled by
    ``build_min_html`` using the page's ``<title>`` string (empty when the
    page has no title element).
    """
    parsed = BeautifulSoup(html, "html.parser")
    remove_noise(parsed)
    content = pick_main_node(parsed)

    if parsed.title:
        page_title = parsed.title.string
    else:
        page_title = ""

    return build_min_html(url, page_title, content)

def run_cleaner(csv_path: str, out_dir: str = "clean_pages"):
    """Fetch, clean and save every URL listed in a CSV file.

    URLs come from ``load_urls(csv_path)``. Each page is downloaded with
    ``fetch_html``, reduced with ``clean_html`` and written as UTF-8 to the
    path given by ``url_to_output_path``. A status line is printed per URL,
    followed by a summary that includes the number of failures.

    Args:
        csv_path: CSV file containing the URLs.
        out_dir: Root directory for cleaned pages (created if missing).

    Returns:
        The output directory as a ``Path``.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    urls = load_urls(csv_path)
    ok, fail = 0, 0

    for url in urls:
        dest = url_to_output_path(url, out_dir)
        try:
            raw_html = fetch_html(url)
            cleaned = clean_html(raw_html, url)
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_text(cleaned, encoding="utf-8")
        except Exception as e:
            # Best effort: one bad page (network, HTTP, parsing or write
            # error) must not abort the rest of the batch.
            fail += 1
            print(f"[FAIL] {url} :: {e}")
        else:
            ok += 1
            print(f"[OK] {url} -> {dest}")

    print(f"\nDone. Cleaned {ok}/{len(urls)} pages into '{out_dir}'. Failed: {fail}.")
    return out_dir

In [11]:

# Fetch and clean every page listed in the CSV. `out_dir` stays a
# notebook-level variable after this cell runs.
out_dir = "clean_pages"
run_cleaner("SB_publication_PMC.csv", out_dir)


[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4136787/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC4136787\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3630201/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC3630201\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11988870/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC11988870\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7998608/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC7998608\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5587110/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC5587110\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8396460/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC8396460\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5666799/ -> clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC5666799\index.html
[OK] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5460236/ -> clean_pages\www.ncbi.nlm.n

WindowsPath('clean_pages')

In [16]:
from pathlib import Path
from markitdown import MarkItDown
import re
import pandas as pd

In [30]:

def strip_markdown_links(text: str) -> str:
    """Replace Markdown links with their bolded link text.

    Transformations, applied in order:

    * ``[![alt](img)](url)`` -> ``![alt](img)`` (a linked image keeps the
      image and loses only the surrounding link);
    * ``[text](url)``        -> ``**text**``;
    * ``[text][ref]``        -> ``**text**``;
    * reference definitions such as ``[ref]: http...`` are removed.

    Images (``![alt](url)``) are left untouched.

    Args:
        text: Markdown source.

    Returns:
        The Markdown with links rewritten as described above.
    """
    # Unwrap linked images first; otherwise the inline-link pattern below
    # would match across the image and emit broken markup ("**![alt**](url)").
    text = re.sub(r'\[(!\[[^\]]*\]\([^)]*\))\]\([^)]*\)', r'\1', text)

    # Inline links: [text](url) -> **text** (the lookbehind skips images).
    text = re.sub(r'(?<!\!)\[(.*?)\]\((.*?)\)', r'**\1**', text)

    # Reference-style: [text][ref] -> **text**
    text = re.sub(r'(?<!\!)\[(.*?)\]\[(.*?)\]', r'**\1**', text)
    # Remove reference definitions
    text = re.sub(r'^\s*\[[^\]]+\]:\s+\S+.*$', '', text, flags=re.MULTILINE)
    return text


def convert_html_to_md(in_dir: str, out_dir: str = "md_out"):
    """Convert every ``*.html`` file under ``in_dir`` to Markdown.

    Files are discovered recursively. Each result is written as UTF-8 under
    ``out_dir`` at the same relative path with the suffix changed to ``.md``.
    A status line is printed per file and a summary at the end; a file that
    fails to convert is counted and skipped.

    Link stripping via ``strip_markdown_links`` is currently not applied to
    the converted text.

    Returns:
        The output directory as a ``Path``.
    """
    src_root = Path(in_dir)
    dst_root = Path(out_dir)
    dst_root.mkdir(parents=True, exist_ok=True)

    converter = MarkItDown(enable_plugins=False)
    ok = 0
    fail = 0

    for src in src_root.rglob("*.html"):
        target = (dst_root / src.relative_to(src_root)).with_suffix(".md")
        try:
            markdown = converter.convert(str(src)).text_content
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(markdown, encoding="utf-8")
            ok += 1
            print(f"[OK] {src} -> {target}")
        except Exception as err:
            fail += 1
            print(f"[FAIL] {src} :: {err}")

    print(f"\nDone. Converted {ok} files to Markdown in '{dst_root}'. Failed: {fail}.")
    return dst_root

In [31]:
# Convert the cleaned pages to Markdown. `out_dir` is the notebook-level
# variable assigned in the cell that calls run_cleaner(), so that cell must
# have been run first.
convert_html_to_md(out_dir, "md_out")

[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10020673\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10020673\index.md
[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10025027\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10025027\index.md
[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10027818\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10027818\index.md
[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10030976\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10030976\index.md
[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10058394\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10058394\index.md
[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10063413\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10063413\index.md
[OK] clean_pages\www.ncbi.nlm.nih.gov\pmc\articles\PMC10138634\index.html -> md_out\www.ncbi.nlm.nih.gov\pmc\articles\PMC10138634\index.md
[OK] clean_pages\www.ncbi.n

WindowsPath('md_out')