In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

# ---- Load CSV ----
src = Path("../Data/astronauts.csv")
df = pd.read_csv(src)

# ---- Name Reformatting ----
def reformat_name(name):
    if pd.isna(name):
        return name
    s = str(name).strip()
    if "," not in s:
        return s

    last, rest = s.split(",", 1)
    last = last.strip()
    rest = rest.strip()

    if not rest:
        return s

    parts = rest.split()
    if len(parts) == 1:
        first = parts[0]
        middle_parts = []
    else:
        first = parts[-1]
        middle_parts = parts[:-1]

    # Build {first middle last}
    new_parts = [first] + middle_parts + [last]

    # Remove anything containing a dot (like "Jr.", "M.", "Sr.")
    new_parts = [p for p in new_parts if "." not in p]

    return " ".join(new_parts)

# Apply name cleaning
df["Profile.Name"] = df["Profile.Name"].apply(reformat_name)

# ---- Roles Parsing ----
# Detect the roles column (contains "role" in its name)
role_col_candidates = [c for c in df.columns if "role" in c.lower()]
if not role_col_candidates:
    raise KeyError("No column found that looks like it contains astronaut roles.")
role_col = role_col_candidates[0]

def parse_roles(val):
    if pd.isna(val):
        return []
    s = str(val)
    if ";" in s:
        return [r.strip() for r in s.split(";") if r.strip()]
    if "," in s:
        return [r.strip() for r in s.split(",") if r.strip()]
    return [s.strip()]

# ---- Build dictionary ----
astronaut_roles = {}
for _, row in df.iterrows():
    name = row["Profile.Name"]
    roles = parse_roles(row[role_col])
    astronaut_roles[name] = roles

astronaut_roles_pd = pd.DataFrame.from_dict(astronaut_roles, orient="index", columns=["roles"])
astronaut_roles_pd["roles"] = astronaut_roles_pd["roles"].str.lower()
astronaut_roles_pd

Unnamed: 0,roles
Yuri Gagarin,pilot
Gherman Titov,pilot
John Glenn,psp
Scott Carpenter,pilot
Andriyan Nikolayev,pilot
...,...
Charlotte Anne McClain,flight engineer
Christina Koch,flight engineer
Andrew Morgan,flight engineer
Jessica Meir,flight engineer


In [5]:
def remove_dot_words(s):
    if not isinstance(s, str):
        return s
    return " ".join([w for w in s.split() if "." not in w])

test = pd.read_pickle("../Data/astronauts_embeddings.pkl")
test["name"] = test["name"].apply(remove_dot_words)

merged = test.merge(astronaut_roles_pd, how="left", left_on="name", right_index=True)
merged = merged.reset_index(drop=True)

In [6]:
# Find indices of rows with NaN roles and synthetic == 0
nan_role_indices = merged[(merged['roles'].isna()) & (merged['synthetic'] == 0)].index

# Prepare matrix of embeddings for rows with non-NaN roles
non_nan_mask = ~merged['roles'].isna()
embeddings_non_nan = np.vstack(merged.loc[non_nan_mask, 'embedding_concat'].values)
roles_non_nan = merged.loc[non_nan_mask, 'roles'].values

# For each row with NaN role and synthetic == 0, find the most similar row and assign its role
for idx in nan_role_indices:
    emb = np.array(merged.at[idx, 'embedding_concat']).reshape(1, -1)
    # Compute cosine similarity (dot product, since embeddings are assumed normalized)
    sims = emb @ embeddings_non_nan.T
    best_idx = np.argmax(sims)
    best_role = roles_non_nan[best_idx]
    merged.at[idx, 'roles'] = best_role

In [7]:
merged = merged.fillna({"roles": "non-astronaut"})
merged

Unnamed: 0,name,education_tokens,occupation_tokens,interest_tokens,nationality_tokens,embedding_concat,synthetic,roles
0,Abdul Ahad Momand,"[habibia_high_school, kabul_polytechnic_univer...","[fighter_pilot, cosmonaut, accountant]",[],[germany],"[-0.32491165, 0.3235269, 0.06065496, -0.162117...",0,msp
1,Akihiko Hoshide,"[united_world_college_of_south_east_asia, keio...","[engineer, jaxa_astronaut, former_commander_of...",[],[japanese],"[-0.15258501, -0.0836665, -0.14960064, 0.09225...",0,flight engineer
2,Alan Bartlett Shepard,"[united_states_naval_academy, naval_war_college]","[rear_admiral,_usn, astronaut, test_pilot, nav...","[skiing, sailing]",[american],"[-0.07574703, 0.005123809, 0.028343018, 0.3864...",0,commander
3,Alan Bean,[university_of_texas_at_austin],"[american_naval_officer, aviator, aeronautical...",[painting],[american],"[-0.17197394, 0.90371376, 0.69503736, 0.723994...",0,commander
4,Alan Poindexter,"[pensacola_junior_college, georgia_institute_o...","[american_naval_officer, astronaut, captain,_u...","[motorcycling, running, weight_lifting, water_...",[american],"[-0.045659542, 0.14472444, -0.08600358, 0.2123...",0,commander
...,...,...,...,...,...,...,...,...
1130,Sawyer Young,[massachusetts_institute_of_technology],[data_scientist],"[jazz, rock_climbing, sailing, classical_music]",[australia],"[-0.6731038, 0.70710593, -0.035009693, 0.39706...",1,non-astronaut
1131,Theo Chaudhry,"[technical_university_of_munich, university_of...",[real_estate_agent],"[baking, hiking]",[switzerland],"[-0.47105435, -0.3437538, 0.18649812, 0.39642,...",1,non-astronaut
1132,Priya de Vries,"[georgia_institute_of_technology, university_o...","[coo, environmental_scientist]",[community_organizing],[india],"[0.08976037, 0.11498475, -0.08512583, -0.94110...",1,non-astronaut
1133,Zara van Dijk,"[university_of_amsterdam, university_of_copenh...","[auto_mechanic, psychologist]","[chess, photography]",[colombia],"[-0.33594692, -0.06559164, 0.0075628236, 0.216...",1,non-astronaut


In [8]:
# merged.to_pickle("astronauts_with_roles.pkl")
merged.to_csv("../Data/astronauts_with_roles.csv")