<a href="https://colab.research.google.com/github/CUHK-DH-Lab/YCRG_Cross-Cultural_Analytics/blob/main/EEBO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code downloads the China related texts from EEBO-TCP, except restricted titles which are only available through ProQuest (see CSV file

 Practical implications for your Colab workflows
✔ You can download:

All Status = Free items
These include all of EEBO‑TCP Phase I & Phase II of the public release

❌ You cannot download:

Any Status = Restricted items
Their XML files will not be present in the GitHub repositories
Your script will automatically skip them because the HTTP request returns 404

In [1]:
# Minimal Colab-ready cell: download the first 2 EEBO-TCP texts and save as .txt

import os, re, requests
import pandas as pd
from lxml import etree

# --- Setup output folders ---
os.makedirs("xml", exist_ok=True)
os.makedirs("txt", exist_ok=True)

# --- Load master list of TCP IDs (Phase I & II) ---
CSV_URL = "https://raw.githubusercontent.com/textcreationpartnership/Texts/master/TCP.csv"
df = pd.read_csv(CSV_URL)

# Take the first 2 IDs (you can change [:2] to [:N] for more)
tcp_ids = df["TCP"].dropna().astype(str).tolist()[:2]

def tei_to_text(xml_bytes):
    """Simple TEI (EEBO-TCP) XML → plain text converter."""
    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(xml_bytes, parser)

    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    body = root.find(".//tei:text", namespaces=ns)
    if body is None:
        body = root.find(".//tei:body", namespaces=ns)
    if body is None:
        body = root  # fallback

    # Join all visible text; normalize whitespace
    text = " ".join(body.itertext())
    text = re.sub(r"\s+", " ", text)
    return text.strip()

for tid in tcp_ids:
    print(f"Processing {tid} …")
    xml_url = f"https://raw.githubusercontent.com/textcreationpartnership/{tid}/master/{tid}.xml"

    r = requests.get(xml_url, timeout=60)
    if r.status_code != 200:
        print(f"  → XML not found (HTTP {r.status_code}); skipping.")
        continue

    # Save XML
    xml_path = f"xml/{tid}.xml"
    with open(xml_path, "wb") as f:
        f.write(r.content)

    # Convert to TXT (fallback to rough tag strip if parsing fails)
    try:
        txt = tei_to_text(r.content)
    except Exception:
        print("  → parse error; using raw fallback.")
        txt = re.sub(rb"<[^>]+>", b"", r.content).decode("utf-8", "replace")

    with open(f"txt/{tid}.txt", "w", encoding="utf-8") as f:
        f.write(txt)

print("Done. Check /content/xml and /content/txt.")


Processing A00002 …
Processing A00005 …
Done. Check /content/xml and /content/txt.


Only China related texts

In [2]:
# Colab-ready: Select only EEBO-TCP texts related to China (Phases I & II) and convert XML -> TXT

import os, re, requests, pandas as pd
from lxml import etree
from tqdm import tqdm

# --------------------------
# 1) Settings
# --------------------------
CSV_URL = "https://raw.githubusercontent.com/textcreationpartnership/Texts/master/TCP.csv"  # master list of EEBO-TCP IDs (I & II)  (TCP repo)  [source]
# Each TCP ID's raw XML is at: https://raw.githubusercontent.com/textcreationpartnership/<ID>/master/<ID>.xml  [source]

OUT_XML_DIR = "xml_china"
OUT_TXT_DIR = "txt_china"
os.makedirs(OUT_XML_DIR, exist_ok=True)
os.makedirs(OUT_TXT_DIR, exist_ok=True)

# China-related keyword set (lowercase). Feel free to extend/tweak:
KEYWORDS = {
    # direct and common
    "china", "chinese", "pekin", "peking", "beijing", "nanjing", "nanquin", "nanking", "canton", "guangzhou",
    "fujian", "foochow", "hangzhou", "hangchow", "nanchang", "yangtze", "yang-tsze",
    # historical/latinized/variant forms seen in early modern texts
    "chyna", "chine", "sinae", "sina", "sinensis", "sinens.", "cathay", "cataya", "cathaia", "quinsay", "quinsai",
    # region/adjoining terms often used in China-focused travel or Jesuit materials
    "tartar", "tartary", "manchu", "mantchou", "mandarin", "konfucius", "confucius"
}

# --------------------------
# 2) Helpers
# --------------------------
def tei_to_text(xml_bytes: bytes) -> str:
    """
    Light TEI -> text extraction:
    - Prefer the <text>/<body> block
    - Flatten to readable paragraphs
    - Normalize whitespace
    """
    parser = etree.XMLParser(recover=True, resolve_entities=False)
    root = etree.fromstring(xml_bytes, parser=parser)
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}

    body = root.find(".//tei:text", namespaces=ns)
    if body is None:
        body = root.find(".//tei:body", namespaces=ns)
    if body is None:
        body = root

    # Drop notes/running headers/page/col breaks to de-clutter
    for xp in [".//tei:note", ".//tei:fw", ".//tei:pb", ".//tei:cb", ".//tei:lb", ".//tei:gap"]:
        for el in body.findall(xp, ns):
            parent = el.getparent()
            if parent is not None:
                parent.remove(el)

    chunks = []
    def walk(node):
        if not isinstance(node.tag, str):
            return
        if node.text:
            chunks.append(node.text)
        for child in node:
            walk(child)
            if child.tail:
                chunks.append(child.tail)

    walk(body)
    text = "".join(chunks)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def row_matches_china(row: pd.Series, keyword_set=KEYWORDS) -> bool:
    """
    Try to match on Title if available; otherwise search across all string cells in the row.
    Matching is case-insensitive and uses substring containment for any keyword.
    """
    # Prefer Title column if present:
    for col in row.index:
        if str(col).strip().lower() in {"title", "main title", "document title"}:
            val = str(row[col]).lower()
            return any(k in val for k in keyword_set)

    # Fallback: concatenate all strings in the row:
    vals = " | ".join([str(v) for v in row.values if isinstance(v, (str, int, float))]).lower()
    return any(k in vals for k in keyword_set)

# --------------------------
# 3) Load master list and filter
# --------------------------
print("Downloading TCP.csv (official master list of EEBO-TCP texts)…")
df = pd.read_csv(CSV_URL)

# Basic sanity: ensure TCP ID column exists
tcp_col = None
for c in df.columns:
    if str(c).strip().lower() == "tcp":
        tcp_col = c
        break
if tcp_col is None:
    raise RuntimeError("Could not find a 'TCP' column in TCP.csv. Inspect df.columns to adapt the code.")

# Filter to China-related
mask = df.apply(row_matches_china, axis=1)
df_china = df[mask].copy()
df_china.reset_index(drop=True, inplace=True)

print(f"Matched {len(df_china)} China-related EEBO-TCP records.")
# Save the selection metadata for reference
df_china.to_csv("china_selection.csv", index=False)

# --------------------------
# 4) Download only those XMLs and convert to TXT
# --------------------------
session = requests.Session()
session.headers.update({"User-Agent": "Colab-China-EEBO-TCP/1.0"})

def fetch_xml(tcp_id: str) -> bytes:
    url = f"https://raw.githubusercontent.com/textcreationpartnership/{tcp_id}/master/{tcp_id}.xml"
    r = session.get(url, timeout=60)
    if r.status_code == 200:
        return r.content
    else:
        return b""

# Iterate and convert
downloaded, converted = 0, 0
for _, row in tqdm(df_china.iterrows(), total=len(df_china), desc="China subset"):
    tcp_id = str(row[tcp_col]).strip()
    if not tcp_id:
        continue

    xml_path = os.path.join(OUT_XML_DIR, f"{tcp_id}.xml")
    txt_path = os.path.join(OUT_TXT_DIR, f"{tcp_id}.txt")

    # Skip if already done
    if os.path.exists(txt_path):
        continue

    # Download XML
    if not os.path.exists(xml_path):
        xml_bytes = fetch_xml(tcp_id)
        if not xml_bytes:
            # Not all IDs necessarily have public GitHub XML (rare), skip gracefully
            continue
        with open(xml_path, "wb") as f:
            f.write(xml_bytes)
        downloaded += 1
    else:
        with open(xml_path, "rb") as f:
            xml_bytes = f.read()

    # Convert TEI -> TXT
    try:
        text = tei_to_text(xml_bytes)
    except Exception:
        # Fallback: crude tag strip
        text = re.sub(rb"<[^>]+>", b"", xml_bytes).decode("utf-8", "replace")

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)
    converted += 1

print(f"\nDone. Downloaded: {downloaded} | Converted to TXT: {converted}")
print(f"XML saved in: {OUT_XML_DIR}")
print(f"TXT saved in: {OUT_TXT_DIR}")
print("Matched metadata saved to: china_selection.csv")

Downloading TCP.csv (official master list of EEBO-TCP texts)…
Matched 208 China-related EEBO-TCP records.


China subset: 100%|██████████| 208/208 [00:50<00:00,  4.08it/s]


Done. Downloaded: 208 | Converted to TXT: 208
XML saved in: xml_china
TXT saved in: txt_china
Matched metadata saved to: china_selection.csv





In [3]:
# Colab cell: Download ALL China-related EEBO-TCP XMLs from china_selection.csv, convert to TXT, and save.

import os, re, requests, pandas as pd
from lxml import etree
from tqdm import tqdm

# ========= Settings =========
SELECTION_CSV = "china_selection.csv"   # produced by the earlier filtering step
XML_DIR = "china_xml"
TXT_DIR = "china_txt"
TIMEOUT = 60

os.makedirs(XML_DIR, exist_ok=True)
os.makedirs(TXT_DIR, exist_ok=True)

# ========= Helpers =========
def tei_to_text(xml_bytes: bytes) -> str:
    """
    TEI (EEBO-TCP) XML -> plain text
    - Prefer <text>/<body>
    - Remove TEI non-content clutter (notes, page breaks, etc.)
    - Normalize whitespace
    """
    parser = etree.XMLParser(recover=True, resolve_entities=False)
    root = etree.fromstring(xml_bytes, parser=parser)
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}

    body = root.find(".//tei:text", namespaces=ns)
    if body is None:
        body = root.find(".//tei:body", namespaces=ns)
    if body is None:
        body = root

    # remove clutter
    for xp in [".//tei:note", ".//tei:fw", ".//tei:pb", ".//tei:cb", ".//tei:lb", ".//tei:gap"]:
        for el in body.findall(xp, ns):
            parent = el.getparent()
            if parent is not None:
                parent.remove(el)

    chunks = []
    def walk(node):
        if not isinstance(node.tag, str):
            return
        if node.text:
            chunks.append(node.text)
        for child in node:
            walk(child)
            if child.tail:
                chunks.append(child.tail)

    walk(body)
    text = "".join(chunks)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def fetch_xml_by_id(session: requests.Session, tcp_id: str) -> bytes:
    # Each EEBO-TCP ID has a public raw TEI XML at this predictable URL
    # e.g., https://raw.githubusercontent.com/textcreationpartnership/A12345/master/A12345.xml
    url = f"https://raw.githubusercontent.com/textcreationpartnership/{tcp_id}/master/{tcp_id}.xml"
    resp = session.get(url, timeout=TIMEOUT)
    return resp.content if resp.status_code == 200 else b""

# ========= Load the selection and find the TCP ID column =========
df = pd.read_csv(SELECTION_CSV)
tcp_col = None
for c in df.columns:
    if str(c).strip().lower() == "tcp":
        tcp_col = c
        break
if tcp_col is None:
    raise RuntimeError("Could not find a 'TCP' column in china_selection.csv. Inspect df.columns and update tcp_col accordingly.")

ids = df[tcp_col].dropna().astype(str).str.strip().unique().tolist()
print(f"{len(ids)} TCP IDs found in selection.")

# ========= Download + convert =========
session = requests.Session()
session.headers.update({"User-Agent": "Colab-EEBO-China/1.0"})

downloaded, converted, skipped = 0, 0, 0

for tid in tqdm(ids, desc="Downloading & Converting"):
    xml_path = os.path.join(XML_DIR, f"{tid}.xml")
    txt_path = os.path.join(TXT_DIR, f"{tid}.txt")

    if os.path.exists(txt_path):
        skipped += 1
        continue

    # get XML (from disk or network)
    if os.path.exists(xml_path):
        with open(xml_path, "rb") as f:
            xml_bytes = f.read()
    else:
        xml_bytes = fetch_xml_by_id(session, tid)
        if not xml_bytes:
            # could not fetch (rare: missing repo/file)
            continue
        with open(xml_path, "wb") as f:
            f.write(xml_bytes)
        downloaded += 1

    # convert to txt
    try:
        txt = tei_to_text(xml_bytes)
    except Exception:
        # fallback: crude tag strip
        txt = re.sub(rb"<[^>]+>", b"", xml_bytes).decode("utf-8", "replace")

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(txt)
    converted += 1

print(f"\nDone.")
print(f"Downloaded XML (this run): {downloaded}")
print(f"Converted to TXT (this run): {converted}")
print(f"Already existed (skipped): {skipped}")
print(f"XML folder: {os.path.abspath(XML_DIR)}")
print(f"TXT folder: {os.path.abspath(TXT_DIR)}")

208 TCP IDs found in selection.


Downloading & Converting: 100%|██████████| 208/208 [00:16<00:00, 12.93it/s]


Done.
Downloaded XML (this run): 208
Converted to TXT (this run): 208
Already existed (skipped): 0
XML folder: /content/china_xml
TXT folder: /content/china_txt





Fix long s and other abbrevation problem in EEBO

In [6]:
import os
import re

TXT_DIR = "china_txt"

# Mapping based on EEBO-TCP transcription standards
EXPANSIONS = {
    'ſ': 's',          # Long s
    # Precomposed Macrons (Commonly used for 'n')
    'ā': 'an', 'ē': 'en', 'ī': 'in', 'ō': 'on', 'ū': 'un',
    'Ā': 'An', 'Ē': 'En', 'Ī': 'In', 'Ō': 'On', 'Ū': 'Un',

    # Scribal Abbreviations / Brevigraphs
    'ꝓ': 'pro',        # p-with-flourish
    'ꝑ': 'per',        # p-with-stroke (can also be par/por)
    'q̄': 'que',        # q-bar (common for -que suffix)
    'ꝙ': 'quod',       # q-with-flourish
    'ꝿ': 'con',        # Reverse c (con/com)
    'ꝸ': 'us',         # 9-like symbol for -us
    'yͤ': 'the',        # Thorn with superscript e (the)
    'yᵗ': 'that',      # Thorn with superscript t (that)

    # Tildes (Often used for double letters)
    'm̃': 'mm', 'ñ': 'nn'
}

# Regex for combining macrons (e.g., any vowel followed by \u0304)
MACRON_REGEX = re.compile(r'([aeiouAEIOU])\u0304')

modified_count = 0
total_replacements = 0

print(f"Normalizing EEBO-TCP abbreviations in {TXT_DIR}...")

for filename in os.listdir(TXT_DIR):
    if filename.endswith(".txt"):
        file_path = os.path.join(TXT_DIR, filename)

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        new_content = content
        file_replacements = 0

        # 1. Expand standard mapping
        for char, expansion in EXPANSIONS.items():
            count = new_content.count(char)
            if count > 0:
                new_content = new_content.replace(char, expansion)
                file_replacements += count

        # 2. Expand combining macrons using regex (e.g., o + ̄ -> on)
        # We use \1n to refer to the vowel found and add 'n'
        new_content, count = MACRON_REGEX.subn(r'\1n', new_content)
        file_replacements += count

        if file_replacements > 0:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(new_content)
            total_replacements += file_replacements
            modified_count += 1

print(f"\nProcessing complete.")
print(f"Files updated: {modified_count}")
print(f"Total expansions performed: {total_replacements}")


Normalizing EEBO-TCP abbreviations in china_txt...

Processing complete.
Files updated: 53
Total expansions performed: 4161


In [7]:
from google.colab import files
import shutil

shutil.make_archive("china_txt", "zip", "china_txt")  # creates china_txt.zip
files.download("china_txt.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>