In [1]:
# Load the social-science CSV and print a plain-text preview of its first five rows
import pandas as pd
df = pd.read_csv('Social_Science.csv')
print(df.head(n=5))

   Profile.Astronaut Numbers.Overall  Profile.Astronaut Numbers.Nationwide  \
0                                  1                                     1   
1                                  2                                     2   
2                                  3                                     1   
3                                  3                                     1   
4                                  4                                     2   

          Profile.Name Profile.Gender  Profile.Birth Year Profile.Nationality  \
0        Gagarin, Yuri           male                1934      U.S.S.R/Russia   
1       Titov, Gherman           male                1935      U.S.S.R/Russia   
2  Glenn, John H., Jr.           male                1921                U.S.   
3  Glenn, John H., Jr.           male                1921                U.S.   
4  Carpenter, M. Scott           male                1925                U.S.   

   Profile.Military Profile.Selection.Group 

In [7]:
import os
from bs4 import BeautifulSoup

# ================================
# Config
# ================================
DIRECTORY = "wikipedia_pages"  # Directory containing astronaut folders (each has index.html)

# ================================
# Helpers
# ================================

def _strip_refs_and_edits(node):
    """Remove citation superscripts and edit links from a node in-place."""
    for sup in node.select("sup.reference, sup[id^='cite_ref']"):
        sup.decompose()
    for span in node.select("span.mw-editsection"):
        span.decompose()
    return node


def _text_without_links(el):
    """Render ``el`` as plain text with citations, edit links and anchors flattened.

    The element is re-parsed from its own markup first, so the caller's tree
    is never modified.
    """
    clone = BeautifulSoup(str(el), "html.parser")
    _strip_refs_and_edits(clone)

    # Swap every anchor for its visible label only.
    for anchor in clone.select("a"):
        label = anchor.get_text(" ", strip=True)
        anchor.replace_with(label)

    # Line breaks become single spaces.
    for line_break in clone.find_all("br"):
        line_break.replace_with(" ")

    return clone.get_text(" ", strip=True)


def _find_content_root(soup):
    return soup.select_one("#mw-content-text .mw-parser-output") or soup


def _find_infobox(soup):
    # Prefer the canonical desktop path
    ib = soup.select_one("#mw-content-text .mw-parser-output table.infobox")
    if ib:
        return ib
    # Fallback: any table with a class that includes 'infobox'
    for tbl in soup.find_all("table"):
        classes = tbl.get("class") or []
        if any("infobox" in c for c in classes):
            return tbl
    return None


def _infobox_to_bullets(table):
    bullets = []
    # rows can be under <tbody> or directly under <table>
    rows = table.select(":scope > tr, :scope > tbody > tr")
    for tr in rows:
        th = tr.find("th", recursive=False)
        td = tr.find("td", recursive=False)
        if th and td:
            key = th.get_text(" ", strip=True)
            val = _text_without_links(td)
            if key and val:
                bullets.append(f"- {key}: {val}")
    return bullets


def _wikitable_to_bullets(table):
    """Convert a wikitable into one ``- ...`` bullet per data row.

    Column labels are taken from the table's first non-empty row, and only
    when that row consists solely of ``<th>`` cells. Every other row holding
    at least one direct ``<td>`` becomes a bullet. Direct ``<th>`` cells in
    such a row (row headers) are kept as values, so each cell stays aligned
    with its own column label instead of shifting one column to the left.

    A row is rendered as ``label: value`` pairs when there are at least as
    many labels as cells, otherwise as a plain "; "-joined list of its cells.
    """
    bullets = []
    rows = table.select("tr")

    # Pick column labels from the leading header row only; collecting every
    # <th> in the table would mix row headers into the column labels.
    headers = []
    header_row = None
    for tr in rows:
        cells = tr.find_all(["th", "td"], recursive=False)
        if not cells:
            continue
        if all(cell.name == "th" for cell in cells):
            header_row = tr
            headers = [cell.get_text(" ", strip=True) for cell in cells]
        break

    for tr in rows:
        if tr is header_row:
            continue
        cells = tr.find_all(["th", "td"], recursive=False)
        # Rows without any data cell (e.g. further header rows) are skipped.
        if not any(cell.name == "td" for cell in cells):
            continue
        values = [_text_without_links(cell) for cell in cells]
        if headers and len(headers) >= len(values):
            pairs = "; ".join(f"{h}: {v}" for h, v in zip(headers, values))
            bullets.append(f"- {pairs}")
        else:
            bullets.append(f"- {'; '.join(values)}")
    return bullets


def _lead_paragraph_bullets(content_root):
    bullets = []
    for child in content_root.children:
        name = getattr(child, "name", None)
        if name == "h2":
            break
        if name == "p" and child.get_text(strip=True):
            bullets.append("- " + _text_without_links(child))
    return bullets


def _section_blocks(content_root):
    """Yield (heading_text, list_of_bullets) for each h2/h3 section."""
    sections = []
    for h in content_root.find_all(["h2", "h3"]):
        heading = h.get_text(" ", strip=True)
        if not heading:
            continue
        block = []
        # Collect siblings until next heading of same/higher level
        for sib in h.find_all_next():
            if sib == h:
                continue
            if getattr(sib, "name", None) in {"h2", "h3"}:
                break
            if getattr(sib, "name", None) == "p" and sib.get_text(strip=True):
                block.append("  - " + _text_without_links(sib))
            elif getattr(sib, "name", None) in {"ul", "ol"}:
                for li in sib.find_all("li", recursive=False):
                    block.append("  - " + _text_without_links(li))
            elif getattr(sib, "name", None) == "table" and "wikitable" in (sib.get("class") or []):
                for b in _wikitable_to_bullets(sib):
                    block.append("  - " + b.lstrip("- "))
        sections.append((heading, block))
    return sections


def _orphan_top_level_paragraphs(content_root):
    bullets = []
    for p in content_root.find_all("p", recursive=False):
        txt = p.get_text(strip=True)
        if txt:
            bullets.append("- " + _text_without_links(p))
    return bullets


# ================================
# Main per-file writer
# ================================

def write_biography_for_html(html_path, out_path):
    """Convert one saved article page into a bullet-style text biography.

    Args:
        html_path: Path of the HTML file to read (decoded as UTF-8).
        out_path: Path of the text file to create or overwrite (UTF-8).

    The output contains, in order: the first ``<h1>`` as a ``# `` title, the
    infobox rows (or a ``- (No infobox found)`` marker), the lead paragraphs,
    every h2/h3 section with its bullets, and finally any top-level paragraph
    that none of the earlier steps has already written.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    out_lines = []

    # Title (retain; it is core info)
    title = soup.find("h1")
    if title:
        out_lines.append(f"# {title.get_text(strip=True)}\n\n")

    content_root = _find_content_root(soup)

    # Infobox first (concise facts)
    infobox = _find_infobox(soup)
    if infobox:
        out_lines.extend(b + "\n" for b in _infobox_to_bullets(infobox))
        out_lines.append("\n")
    else:
        # Visible marker that helps when selectors need adjusting
        out_lines.append("- (No infobox found)\n\n")

    # Lead summary as bullets
    lead = _lead_paragraph_bullets(content_root)
    if lead:
        out_lines.extend(l + "\n" for l in lead)
        out_lines.append("\n")

    # Sections, kept compact
    for heading, block in _section_blocks(content_root):
        if heading:
            out_lines.append(f"- {heading}\n")
        out_lines.extend(line + "\n" for line in block)
        out_lines.append("\n")

    # Top-level paragraphs are normally already covered by the lead or by a
    # section; writing them again would duplicate the article body at the end
    # of the file. Compare on stripped text because section bullets are
    # indented while orphan bullets are not.
    already_written = {line.strip() for line in out_lines}
    orphans = [
        o
        for o in _orphan_top_level_paragraphs(content_root)
        if o.strip() not in already_written
    ]
    if orphans:
        out_lines.extend(o + "\n" for o in orphans)

    with open(out_path, "w", encoding="utf-8") as out:
        out.writelines(out_lines)


# ================================
# Entry point: walk folders
# ================================
if __name__ == "__main__":
    # Nothing can be converted without the input directory, so stop right away.
    if not os.path.isdir(DIRECTORY):
        raise SystemExit(f"Directory not found: {DIRECTORY}")

    for folder in os.listdir(DIRECTORY):
        folder_path = os.path.join(DIRECTORY, folder)
        html_file = os.path.join(folder_path, "index.html")
        # Convert only sub-directories that actually contain an index.html.
        if os.path.isdir(folder_path) and os.path.exists(html_file):
            output_file = os.path.join(folder_path, "biography.txt")
            try:
                write_biography_for_html(html_file, output_file)
                print(f"✔ Wrote {output_file}")
            except Exception as e:
                # Report the failure and carry on with the remaining folders.
                print(f"✖ Error processing {html_file}: {e}")


✔ Wrote wikipedia_pages/Dmitri Kondratyev/biography.txt
✔ Wrote wikipedia_pages/Joan E. Higginbotham/biography.txt
✔ Wrote wikipedia_pages/Claude Nicollier/biography.txt
✔ Wrote wikipedia_pages/Carl J. Meade/biography.txt
✔ Wrote wikipedia_pages/Aleksandr Serebrov/biography.txt
✔ Wrote wikipedia_pages/William F. Readdy/biography.txt
✔ Wrote wikipedia_pages/Loren J. Shriver/biography.txt
✔ Wrote wikipedia_pages/M. Achmed Faris/biography.txt
✔ Wrote wikipedia_pages/Ellen Ochoa/biography.txt
✔ Wrote wikipedia_pages/Michael E. Fossum/biography.txt
✔ Wrote wikipedia_pages/Vladimir Titov/biography.txt
✔ Wrote wikipedia_pages/Ellison S. Onizuka/biography.txt
✔ Wrote wikipedia_pages/Timothy L. Kopra/biography.txt
✔ Wrote wikipedia_pages/Charles F., Jr. Bolden/biography.txt
✔ Wrote wikipedia_pages/Boming Liu/biography.txt
✔ Wrote wikipedia_pages/Guion S., Jr. Bluford/biography.txt
✔ Wrote wikipedia_pages/Michael R. Clifford/biography.txt
✔ Wrote wikipedia_pages/Sharon Christa Corrigan McAuliffe

In [8]:
# remove the external links section from all text files in the directory
import os
import re
directory = 'wikipedia_pages'
for folder in os.listdir(directory):
    folder_path = os.path.join(directory, folder)
    if not os.path.isdir(folder_path):
        continue
    text_file = os.path.join(folder_path, 'biography.txt')
    if not os.path.exists(text_file):
        continue
    with open(text_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Remove the "External links" section and everything after it.
    # The pattern is anchored to the start of a line (re.MULTILINE) so that only
    # an unindented "- External links" heading line matches; an indented list
    # bullet such as "  - External links ..." no longer truncates the file.
    # maxsplit=1 stops scanning after the first heading.
    content = re.split(r'^- External links', content, maxsplit=1, flags=re.MULTILINE)[0].strip()
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Processed {text_file}")

Processed wikipedia_pages/Dmitri Kondratyev/biography.txt
Processed wikipedia_pages/Joan E. Higginbotham/biography.txt
Processed wikipedia_pages/Claude Nicollier/biography.txt
Processed wikipedia_pages/Carl J. Meade/biography.txt
Processed wikipedia_pages/Aleksandr Serebrov/biography.txt
Processed wikipedia_pages/William F. Readdy/biography.txt
Processed wikipedia_pages/Loren J. Shriver/biography.txt
Processed wikipedia_pages/M. Achmed Faris/biography.txt
Processed wikipedia_pages/Ellen Ochoa/biography.txt
Processed wikipedia_pages/Michael E. Fossum/biography.txt
Processed wikipedia_pages/Vladimir Titov/biography.txt
Processed wikipedia_pages/Ellison S. Onizuka/biography.txt
Processed wikipedia_pages/Timothy L. Kopra/biography.txt
Processed wikipedia_pages/Charles F., Jr. Bolden/biography.txt
Processed wikipedia_pages/Boming Liu/biography.txt
Processed wikipedia_pages/Guion S., Jr. Bluford/biography.txt
Processed wikipedia_pages/Michael R. Clifford/biography.txt
Processed wikipedia_pag

In [None]:
# remove the references section from all text files in the directory
import os
import re
directory = 'wikipedia_pages'
for folder in os.listdir(directory):
    folder_path = os.path.join(directory, folder)
    if not os.path.isdir(folder_path):
        continue
    text_file = os.path.join(folder_path, 'biography.txt')
    if not os.path.exists(text_file):
        continue
    with open(text_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Remove the "References" section and everything after it.
    # The pattern is anchored to the start of a line (re.MULTILINE) so that only
    # an unindented "- References" heading line matches; an indented list bullet
    # such as "  - References ..." no longer truncates the file.
    # maxsplit=1 stops scanning after the first heading.
    content = re.split(r'^- References', content, maxsplit=1, flags=re.MULTILINE)[0].strip()
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Processed {text_file}")
    