The Nobel Prize has been among the most prestigious international awards since 1901. Each year, awards are bestowed in chemistry, literature, physics, physiology or medicine, economics, and peace. In addition to the honor, prestige, and substantial prize money, the recipient also gets a gold medal with an image of Alfred Nobel (1833 - 1896), who established the prize.

![](Nobel_Prize.png)

The Nobel Foundation has made available a dataset of all prize winners from the start of the awards in 1901 through 2023. The dataset used in this project comes from the Nobel Prize API and is available in the `nobel.csv` file in the `data` folder.

In this project, you'll get a chance to explore and answer several questions related to this prizewinning data. We then encourage you to explore further questions that interest you!

In [2]:
# Robust Nobel dataset analysis script
# Save as analyze_nobel.py or run in a notebook cell.
import os
import pandas as pd
import numpy as np
from collections import Counter

# Locations searched, in priority order, for the Nobel laureates CSV.
path_candidates = [
    "data/nobel.csv",
    "nobel.csv",
    "./data/nobel.csv",
    "/mnt/data/data/nobel.csv",
    "/mnt/data/nobel.csv",
]

# The first candidate that exists on disk wins; None means nothing was found.
csv_path = next(filter(os.path.exists, path_candidates), None)

if csv_path is None:
    raise FileNotFoundError(
        f"Could not find nobel.csv. Checked: {path_candidates}\n"
        "Place the file in one of those locations or update csv_path."
    )

# Load the data and report its shape and column names.
df = pd.read_csv(csv_path)
print(f"Loaded {csv_path} — shape: {df.shape}\nColumns: {list(df.columns)}\n")
# Lower-case every column name so the substring checks below are case-insensitive.
df.columns = [name.lower() for name in df.columns]

# --- Column detection ---
# Each list holds, in file order, the columns whose name looks like the field wanted.
gender_candidates = [col for col in df.columns if "sex" in col or "gender" in col]
birth_country_candidates = [
    col for col in df.columns
    if "country" in col and ("birth" in col or "born" in col)
]
# "category" itself contains "cat", so one substring test covers both cases.
category_candidates = [col for col in df.columns if "cat" in col]
year_candidates = [col for col in df.columns if col == "year" or "yr" in col]

print("gender_candidates:", gender_candidates)
print("birth_country_candidates:", birth_country_candidates[:5])
print("category_candidates:", category_candidates[:5])
print("year_candidates:", year_candidates[:5])

# Pick one column per field (None when nothing matched). An exact "category" /
# "year" column takes precedence over a merely similar name.
gender_col = next(iter(gender_candidates), None)
birth_col = next(iter(birth_country_candidates), None)

if "category" in df.columns:
    cat_col = "category"
else:
    cat_col = next(iter(category_candidates), None)

if "year" in df.columns:
    year_col = "year"
else:
    year_col = next(iter(year_candidates), None)

# A year column is mandatory: every later aggregation is keyed on the decade.
if not year_col:
    raise ValueError("No 'year' column found. Ensure the dataset contains a year field.")

# Unparseable years become NaN instead of raising.
df[year_col] = pd.to_numeric(df[year_col], errors="coerce")

# Decade start (e.g. 1913 -> 1910); nullable Int64 keeps rows whose year is missing.
df["decade"] = df[year_col].floordiv(10).mul(10).astype("Int64")

# ---- Most common gender and birth country ----
def _most_frequent(column):
    """Return the most common non-null value of df[column], or None if there is none."""
    values = df[column].dropna()
    return None if values.empty else values.mode().iat[0]


top_gender = None
top_country = None

if gender_col:
    top_gender = _most_frequent(gender_col)
else:
    print("Warning: no gender column detected; top_gender will be None.")

if birth_col:
    top_country = _most_frequent(birth_col)
else:
    print("Warning: no birth-country column detected; top_country will be None.")

# ---- Which decade had highest ratio of US-born winners ----
# Normalize birth-country strings for matching 'United States'
def normalize_country(x):
    """Return *x* as a trimmed, lower-cased string; missing values become ""."""
    return "" if pd.isna(x) else str(x).strip().lower()

if birth_col:
    # Normalise the raw values directly: casting with astype(str) first would turn
    # NaN into the literal string "nan" and bypass normalize_country's missing-value guard.
    bc_norm = df[birth_col].apply(normalize_country)

    def is_usa(s):
        """Return True when the normalised country string *s* denotes the United States."""
        s = s.lower()
        # Spelled-out form, possibly embedded in longer text
        # (this also covers "united states of america").
        if "united states" in s:
            return True
        # Drop dots so "u.s.", "u.s.a.", "u.s.a" and "usa." collapse to "us" / "usa".
        compact = s.replace(".", "")
        if compact == "us":
            return True
        # Accept "usa" only as a whole word, so names that merely contain those
        # letters (e.g. "jerusalem") are not counted as US-born. The length cap
        # is kept from the original heuristic.
        words = "".join(ch if ch.isalnum() else " " for ch in compact).split()
        return "usa" in words and len(s) < 30

    df["_is_usa"] = bc_norm.apply(is_usa)

    # Per decade: number of rows and number of US-born laureates.
    decade_stats = df.groupby("decade").agg(total=("decade","count"), usa_count=("_is_usa","sum")).reset_index()
    decade_stats["usa_ratio"] = decade_stats["usa_count"] / decade_stats["total"]
    # drop decades with zero total just in case
    decade_stats = decade_stats[decade_stats["total"]>0]

    if decade_stats.empty:
        # No decade could be formed (e.g. every year was unparseable);
        # idxmax() would raise ValueError on the empty column.
        max_decade_usa = None
    else:
        max_row = decade_stats.loc[decade_stats["usa_ratio"].idxmax()]
        max_decade_usa = int(max_row["decade"])
else:
    max_decade_usa = None
    print("Warning: cannot compute US-born ratio without a birth-country column.")

# ---- Which decade+category had highest proportion of female laureates ----
# Build female flag robustly
def female_flag(x):
    """Return True when the gender value *x* marks a woman; missing values give False."""
    if pd.isna(x):
        return False
    label = str(x).strip().lower()
    # Either the bare abbreviation "f" or any text mentioning "female" / "woman".
    return label == "f" or any(word in label for word in ("female", "woman"))

if gender_col:
    # Boolean flag per row; also reused by the "first woman" lookup further down.
    df["_is_female"] = df[gender_col].apply(female_flag)
    if cat_col is None:
        raise ValueError("No category column found; cannot compute female proportions by category.")
    # Per (decade, category): number of rows and number of female laureates.
    grp = df.groupby(["decade", cat_col]).agg(total=("decade","count"), female_count=("_is_female","sum")).reset_index()
    # keep only groups with at least 1 total
    grp = grp[grp["total"]>0].copy()
    grp["female_prop"] = grp["female_count"] / grp["total"]
    if grp.empty:
        # No (decade, category) group exists, e.g. every year was unparseable;
        # idxmax() would raise ValueError on the empty column.
        max_female_dict = {}
    else:
        # idxmax() returns the first row holding the maximum proportion.
        top = grp.loc[grp["female_prop"].idxmax()]
        max_female_dict = {int(top["decade"]): top[cat_col]}
else:
    max_female_dict = {}
    print("Warning: no gender column; can't compute female proportions.")

# ---- First woman to receive a Nobel Prize ----
first_woman_name = None
first_woman_category = None

# Columns that may hold the laureate's full name, in dataset order.
name_candidates = [c for c in df.columns if c in ("full_name","fullname","name","laureate","laureate_name")]
if not name_candidates:
    # No single name column: build one from firstname + surname when both exist.
    if "firstname" in df.columns and "surname" in df.columns:
        df["_full_name"] = df["firstname"].fillna("") + " " + df["surname"].fillna("")
        name_candidates = ["_full_name"]

if name_candidates and gender_col:
    # "_is_female" is created by the female-proportion step whenever gender_col is set.
    df_fem = df[df["_is_female"]].copy()
    if not df_fem.empty:
        # Earliest year first. A stable sort keeps same-year rows in their
        # original order, so the pick is deterministic when years tie.
        df_fem = df_fem.sort_values([year_col], kind="mergesort").reset_index(drop=True)
        r = df_fem.iloc[0]

        # First candidate column holding a non-blank value for this row.
        name_col = next(
            (c for c in name_candidates if pd.notna(r[c]) and str(r[c]).strip() != ""),
            None,
        )
        if name_col:
            first_woman_name = str(r[name_col])
        elif "firstname" in df.columns and "surname" in df.columns:
            # Fallback: assemble the name from its parts, skipping missing ones.
            # NaN is truthy, so `value or ""` would let NaN through and make the
            # string concatenation raise TypeError.
            parts = [str(r[k]).strip() for k in ("firstname", "surname") if pd.notna(r[k])]
            first_woman_name = " ".join(p for p in parts if p) or None

        first_woman_category = r[cat_col] if cat_col in r.index else None
else:
    print("Warning: couldn't determine first woman because name or gender columns are missing.")

# ---- Repeat winners (people or organisations appearing on more than one row) ----
# Use the first recognised name-like column, in this order of preference.
name_col = next(
    (
        c
        for c in ("full_name","fullname","name","laureate_name","laureate","org_name","organization","organisation")
        if c in df.columns
    ),
    None,
)

# Otherwise fall back to a name assembled from firstname + surname.
if name_col is None and "firstname" in df.columns and "surname" in df.columns:
    df["_fullname"] = df["firstname"].fillna("") + " " + df["surname"].fillna("")
    name_col = "_fullname"

if name_col:
    # Trim the names and ignore blank ones before counting occurrences.
    counts = df[name_col].fillna("").astype(str).str.strip()
    counts = counts.loc[counts.ne("")]
    vc = counts.value_counts()
    repeat_list = vc.loc[vc.gt(1)].index.tolist()
else:
    repeat_list = []
    print("Warning: no name-like column found; repeat_list will be empty.")

# Collect every answer in one dict and print a summary.
results = {
    "top_gender": top_gender,
    "top_country": top_country,
    "max_decade_usa": max_decade_usa,
    "max_female_dict": max_female_dict,
    "first_woman_name": first_woman_name,
    "first_woman_category": first_woman_category,
    "repeat_list": repeat_list
}

print("===== RESULTS =====")
for k, v in results.items():
    print(f"{k}: {v}")

# Optionally, show the supporting tables
print("\nTop decades by USA ratio (sample):")
# decade_stats is only built when a birth-country column was detected.
if birth_col:
    print(decade_stats.sort_values("usa_ratio", ascending=False).head(10).to_string(index=False))
print("\nTop decade+category by female proportion (sample):")
# grp is only assigned when a gender column was detected; test that first so a
# dataset without one does not hit a NameError here.
if gender_col and not grp.empty:
    print(grp.sort_values("female_prop", ascending=False).head(10).to_string(index=False))

# Bare expression: shown as the cell's output in a notebook, no effect in a plain script.
results


Loaded data/nobel.csv — shape: (1000, 18)
Columns: ['year', 'category', 'prize', 'motivation', 'prize_share', 'laureate_id', 'laureate_type', 'full_name', 'birth_date', 'birth_city', 'birth_country', 'sex', 'organization_name', 'organization_city', 'organization_country', 'death_date', 'death_city', 'death_country']

gender_candidates: ['sex']
birth_country_candidates: ['birth_country']
category_candidates: ['category']
year_candidates: ['year']
===== RESULTS =====
top_gender: Male
top_country: United States of America
max_decade_usa: 2000
max_female_dict: {2020: 'Literature'}
first_woman_name: Marie Curie, née Sklodowska
first_woman_category: Physics
repeat_list: ['Comité international de la Croix Rouge (International Committee of the Red Cross)', 'Linus Carl Pauling', 'John Bardeen', 'Frederick Sanger', 'Marie Curie, née Sklodowska', 'Office of the United Nations High Commissioner for Refugees (UNHCR)']

Top decades by USA ratio (sample):
 decade  total  usa_count  usa_ratio
   2000 

{'top_gender': 'Male',
 'top_country': 'United States of America',
 'max_decade_usa': 2000,
 'max_female_dict': {2020: 'Literature'},
 'first_woman_name': 'Marie Curie, née Sklodowska',
 'first_woman_category': 'Physics',
 'repeat_list': ['Comité international de la Croix Rouge (International Committee of the Red Cross)',
  'Linus Carl Pauling',
  'John Bardeen',
  'Frederick Sanger',
  'Marie Curie, née Sklodowska',
  'Office of the United Nations High Commissioner for Refugees (UNHCR)']}