In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ---- Load CSV ----
# Relative path: it is resolved against the current working directory at run time.
src = Path("astronauts.csv")
# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(src)

# ---- Name Reformatting ----
# Generational suffixes recognised when they trail the given names without a
# separating comma (compared case-insensitively, with trailing dots removed).
_NAME_SUFFIXES = frozenset({"jr", "sr", "ii", "iii", "iv"})


def reformat_name(name):
    """Convert ``"Last, First Middle[, Suffix]"`` into ``"First Middle Last [Suffix]"``.

    Args:
        name: The raw name value. Missing values (anything ``pd.isna`` reports
            as missing) are returned unchanged; everything else is converted
            with ``str`` and stripped of surrounding whitespace.

    Returns:
        The reordered name. A value without a comma is returned stripped but
        otherwise untouched, and a value with nothing after its first comma
        (e.g. ``"Glenn,"``) is returned stripped as-is.

    Examples:
        >>> reformat_name("Glenn, John H., Jr.")
        'John H. Glenn Jr.'
        >>> reformat_name("Gagarin, Yuri")
        'Yuri Gagarin'
    """
    if pd.isna(name):
        return name
    s = str(name).strip()

    # Nothing to reorder when the value is not in "Last, Given" form.
    if "," not in s:
        return s

    last, rest = (piece.strip() for piece in s.split(",", 1))
    if not rest:
        return s

    # Everything after a second comma is a suffix ("Jr.", "II", ...).
    given, _, trailing = rest.partition(",")
    given_parts = given.split()
    suffix_parts = [piece.strip() for piece in trailing.split(",") if piece.strip()]

    # Also accept a suffix written without a comma: "Glenn, John H. Jr.".
    # Require at least one other token so a lone given name is never consumed.
    if (
        not suffix_parts
        and len(given_parts) > 1
        and given_parts[-1].rstrip(".").casefold() in _NAME_SUFFIXES
    ):
        suffix_parts.append(given_parts.pop())

    # Given names keep their original order, followed by surname and suffix.
    new_parts = [*given_parts, last, *suffix_parts]
    return " ".join(p for p in new_parts if p)

# Rewrite every entry of the name column through reformat_name
df["Profile.Name"] = df["Profile.Name"].apply(reformat_name)

# ---- Roles Parsing ----
# The roles column is the first one whose header mentions "role" (any case)
role_col_candidates = [
    column for column in df.columns if "role" in column.lower()
]
if len(role_col_candidates) == 0:
    raise KeyError("No column found that looks like it contains astronaut roles.")
role_col = role_col_candidates[0]

def parse_roles(val):
    """Split a raw roles cell into a list of trimmed role strings.

    A missing value yields an empty list. Otherwise the value is converted to
    a string and split on ``";"`` if one is present, else on ``","``; blank
    pieces are dropped. A value containing neither delimiter is returned as a
    single-element list holding the stripped string.
    """
    if pd.isna(val):
        return []

    text = str(val)
    # Semicolons take priority over commas as the separator.
    for delimiter in (";", ","):
        if delimiter in text:
            trimmed = (chunk.strip() for chunk in text.split(delimiter))
            return [role for role in trimmed if role]

    return [text.strip()]

# ---- Build dictionary ----
# Maps each reformatted name to the parsed roles of its row. Keys are plain
# dict keys, so a later row with the same name replaces the earlier entry.
astronaut_roles = {}
for _, row in df.iterrows():
    astronaut_roles[row["Profile.Name"]] = parse_roles(row[role_col])

# ---- Example usage ----
print("Preview (first 10 entries):")
for i, (k, v) in enumerate(astronaut_roles.items()):
    if i < 10:
        print(f"{k}: {v}")
    else:
        break


Yuri
Gherman
John H., Jr.
John H., Jr.
M. Scott
Andriyan
Andriyan
Pavel
Pavel
Walter M., Jr.
Walter M., Jr.
Walter M., Jr.
L. Gordon, Jr.
L. Gordon, Jr.
Valery
Valery
Valery
Valentina
Vladimir
Vladimir
Konstantin
Boris
Pavel
Aleksei
Aleksei
Virgil I.
John W.
John W.
John W.
John W.
John W.
John W.
James A.
James A.
Edward H., II
Charles, Jr.
Charles, Jr.
Charles, Jr.
Charles, Jr.
Frank
Frank
James A., Jr.
James A., Jr.
James A., Jr.
James A., Jr.
Thomas P.
Thomas P.
Thomas P.
Thomas P.
Neil A.
Neil A.
David R.
David R.
David R.
Eugene A.
Eugene A.
Eugene A.
Michael
Michael
Richard F., Jr.
Richard F., Jr.
Edwin Eugene, Jr.
Edwin Eugene, Jr.
Donn F.
Ronnie Walter
Viktor
Viktor
Robert L.
Robert L.
Robert L.
Robert L.
Joe H.
Joe H.
Richard H.
Richard H.
Charles G.
Charles G.
Anatoly
Jean-Loup
Jean-Loup
Jean-Loup
Henry W., Jr.
Henry W., Jr.
Henry W., Jr.
Aleksandr
Georgi
William Alison
Vladimir
Vladimir
Vladimir
Boris
Boris
Aleksei
Aleksei
Aleksei
Yevgeny
Russell L.
Georgi
Valeri
Valeri
Val