In [None]:
# Cell 1 — Imports + load CSV
import re
import unicodedata
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
from matplotlib.ticker import PercentFormatter

# ── Load data ─────────────────────────────────────────────────────────────
# Candidate CSV locations, tried in order; the first one that exists is loaded.
CANDIDATE_PATHS = [
    Path("assets/candidates_clean.csv"),
    Path("assets/nepal_election_2082_candidates_education.csv"),
]

for candidate_path in CANDIDATE_PATHS:
    if candidate_path.exists():
        DATA_PATH = candidate_path
        break
else:
    # No `break` happened: none of the files exist. Fail here with a clear
    # message instead of a NameError on DATA_PATH further down (or, in a
    # reused kernel, silently loading a stale DATA_PATH from an earlier run).
    tried = ", ".join(str(path) for path in CANDIDATE_PATHS)
    raise FileNotFoundError(f"No candidate CSV found; tried: {tried}")

df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df):,} rows × {df.shape[1]} columns from {DATA_PATH}")

# ── Core QA ───────────────────────────────────────────────────────────────
# When an "error" column is present, keep only the rows where it is empty (NaN)
# and report how many rows were discarded.
if "error" in df.columns:
    n_rows_loaded = len(df)
    is_error_free = df["error"].isna()
    df = df.loc[is_error_free].copy()
    print(f"Dropped {n_rows_loaded - len(df):,} error rows → {len(df):,} clean rows")

# Zero-width char stripper
ZW_MAP = dict.fromkeys(map(ord, ["\u200b","\u200c","\u200d","\ufeff","\u2060"]), None)

def _clean(s):
    if pd.isna(s):
        return ""
    return str(s).translate(ZW_MAP).strip()

# Text columns used downstream: clean the ones that exist, create the rest as
# empty strings so later cells can rely on every column being present.
TEXT_COLUMNS = (
    "party_np", "gender", "education_raw", "education_bucket",
    "institution_raw", "province_np", "district_np",
)
for col in TEXT_COLUMNS:
    df[col] = df[col].apply(_clean) if col in df.columns else ""

# Replace sentinel placeholders with empty
SENTINEL_TO_EMPTY = dict.fromkeys(["-", "0", "N/A", "nan"], "")
for col in ("education_raw", "education_bucket", "institution_raw"):
    df[col] = df[col].replace(SENTINEL_TO_EMPTY)

# Age as numeric (unparseable values become NaN; a missing column yields all-NaN)
age_source = df["age"] if "age" in df.columns else pd.Series(dtype=float)
df["age"] = pd.to_numeric(age_source, errors="coerce")

df.shape

In [2]:
# Cell 2 — Filter to 4 major parties
# Full Nepali party name → short label. The insertion order of this dict is the
# party order used by every table and plot below.
PARTY_SHORT = {
    "राष्ट्रिय स्वतन्त्र पार्टी": "RSP",
    "नेपाली काँग्रेस": "NC",
    "नेपाल कम्युनिष्ट पार्टी (एकीकृत मार्क्सवादी लेनिनवादी)": "UML",
    "नेपाली कम्युनिष्ट पार्टी": "NCP",
}
PARTIES = list(PARTY_SHORT)                 # full names, in display order
PARTY_ORDER = list(PARTY_SHORT.values())    # short labels, same order

# Colour palette — consistent across all plots
PARTY_COLOR = dict(
    RSP="#E63946",   # red
    NC="#457B9D",    # blue
    UML="#E76F51",   # orange-red
    NCP="#2A9D8F",   # teal
)

# Keep only candidates of the four parties and attach the short label.
is_major_party = df["party_np"].isin(PARTIES)
df4 = df.loc[is_major_party].copy()
df4["party_short"] = df4["party_np"].map(PARTY_SHORT)

print("Candidates per party:")
candidates_per_party = df4.groupby("party_short")["candidate_id"].count()
candidates_per_party.reindex(PARTY_ORDER)

Candidates per party:


party_short
RSP    164
NC     165
UML    165
NCP    164
Name: candidate_id, dtype: int64

In [3]:
# Cell 3 — Normalise gender + map education_bucket → analysis group

# ── Gender ────────────────────────────────────────────────────────────────
# Nepali gender label → English; anything else (including blanks) is "Unknown".
GENDER_MAP = {"पुरुष": "Male", "महिला": "Female", "अन्य": "Other"}
df4["gender_norm"] = df4["gender"].map(GENDER_MAP).fillna("Unknown")

# ── Education group ───────────────────────────────────────────────────────
# Our education_bucket values from data_collection_fixed:
#   Doctorate / PhD | MPhil | Master | Bachelor | Professional Degree
#   Higher Secondary (Grade 11-12 / 10+2)
#   Secondary (Grade 10 / SEE/SLC)
#   Lower Secondary (Grade 6-8) | Primary (Grade 1-5)
#   Basic Literacy | Other / Unclear
# "School (Grade - Unclear)" is mapped below as well although it is not in the
# list above — presumably an additional bucket emitted upstream; TODO confirm.
#
# We collapse into 6 ordered education levels plus a catch-all
# "Other / Unclear" group (7 analysis groups in total):
EDU_GROUP_MAP = {
    "Doctorate / PhD":                        "Doctorate / PhD",
    "MPhil":                                  "Master+",
    "Master":                                 "Master+",
    "Bachelor":                               "Bachelor",
    "Professional Degree":                    "Bachelor",       # same level
    "Higher Secondary (Grade 11-12 / 10+2)": "Higher Secondary",
    "Secondary (Grade 10 / SEE/SLC)":        "Up to Secondary",
    "Lower Secondary (Grade 6-8)":           "Up to Secondary",
    "Primary (Grade 1-5)":                   "Up to Secondary",
    "Basic Literacy":                         "Basic Literacy",
    "Other / Unclear":                        "Other / Unclear",
    "School (Grade - Unclear)":               "Other / Unclear",
}

# Ordered for stacked-bar plots (lowest → highest, unclear last)
EDU_ORDER = [
    "Basic Literacy",
    "Up to Secondary",
    "Higher Secondary",
    "Bachelor",
    "Master+",
    "Doctorate / PhD",
    "Other / Unclear",
]

EDU_COLORS = {
    "Basic Literacy":   "#A8DADC",
    "Up to Secondary":  "#457B9D",
    "Higher Secondary": "#1D3557",
    "Bachelor":         "#E9C46A",
    "Master+":          "#F4A261",
    "Doctorate / PhD":  "#E76F51",
    "Other / Unclear":  "#CED4DA",
}

# Buckets missing from EDU_GROUP_MAP (and blank buckets) fall back to
# "Other / Unclear" via fillna below. Blank is expected; a non-blank bucket
# that is not in the map means the map is out of date, so report it instead of
# folding it in silently.
bucket_is_mapped = df4["education_bucket"].isin(list(EDU_GROUP_MAP))
bucket_is_blank = df4["education_bucket"].eq("")
unmapped_buckets = df4.loc[~bucket_is_mapped & ~bucket_is_blank, "education_bucket"].value_counts()
if not unmapped_buckets.empty:
    print("Unmapped education_bucket values folded into 'Other / Unclear':")
    print(unmapped_buckets.to_string())
    print()

df4["edu_group"] = df4["education_bucket"].map(EDU_GROUP_MAP).fillna("Other / Unclear")

print("Education group distribution (all 4 parties):")
# fill_value=0 keeps the counts integer and shows 0 (not NaN) for an absent group.
df4["edu_group"].value_counts().reindex(EDU_ORDER, fill_value=0)

Education group distribution (all 4 parties):


edu_group
Basic Literacy        7
Up to Secondary      47
Higher Secondary     36
Bachelor            140
Master+             108
Doctorate / PhD       4
Other / Unclear     316
Name: count, dtype: int64

In [4]:
# Cell 4 — Summary statistics table
# One row per party: candidate count, age statistics, gender split, share of
# candidates with a Bachelor's degree or higher, and geographic coverage.
rows = []
for p in PARTIES:
    sub = df4[df4["party_np"] == p]
    ps  = PARTY_SHORT[p]
    n   = len(sub)

    male   = (sub["gender_norm"] == "Male").sum()
    female = (sub["gender_norm"] == "Female").sum()
    denom  = male + female   # gender share is computed among Male + Female only

    bachelor_plus = sub["edu_group"].isin(["Bachelor","Master+","Doctorate / PhD"]).sum()

    rows.append({
        "Party":               ps,
        "Candidates (n)":      n,
        # Age
        "Age — mean":          round(sub["age"].mean(), 1),
        "Age — median":        sub["age"].median(),
        "Age — min":           sub["age"].min(),
        "Age — max":           sub["age"].max(),
        "Age missing":         sub["age"].isna().sum(),
        # Gender
        "Male":                male,
        "Female":              female,
        "Female %":            f"{female/denom:.1%}" if denom else "—",
        # "∞" only makes sense when there are men but no women; with neither,
        # the ratio is undefined and shown as "—" like the other empty cells.
        "M:F ratio":           f"{male/female:.1f}:1" if female else ("∞" if male else "—"),
        # Education
        # NOTE: the denominator is ALL candidates of the party, including those
        # whose education is missing/unclear (see "Edu missing/unclear").
        "Bachelor+":           bachelor_plus,
        # Guarded like "Female %": a party with no rows would otherwise
        # divide by zero and render as "nan%".
        "Bachelor+ %":         f"{bachelor_plus/n:.1%}" if n else "—",
        "Edu missing/unclear": (sub["edu_group"] == "Other / Unclear").sum(),
        # Geography (blank strings are treated as missing and not counted)
        "Provinces covered":   sub["province_np"].replace("", pd.NA).dropna().nunique(),
        "Districts covered":   sub["district_np"].replace("", pd.NA).dropna().nunique(),
    })

summary = pd.DataFrame(rows).set_index("Party")
summary

Unnamed: 0_level_0,Candidates (n),Age — mean,Age — median,Age — min,Age — max,Age missing,Male,Female,Female %,M:F ratio,Bachelor+,Bachelor+ %,Edu missing/unclear,Provinces covered,Districts covered
Party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
RSP,164,43.1,43.0,25,65,0,148,16,9.8%,9.2:1,62,37.8%,86,7,76
NC,165,55.6,57.0,32,77,0,154,11,6.7%,14.0:1,65,39.4%,80,7,77
UML,165,54.4,56.0,27,78,0,155,10,6.1%,15.5:1,67,40.6%,73,7,77
NCP,164,52.2,51.0,27,88,0,152,12,7.3%,12.7:1,58,35.4%,77,7,77


In [None]:
# Cell 5 — Plot: Age distribution — density histogram
# Overlaid, semi-transparent density histograms of candidate age, one per party.
fig, ax = plt.subplots(figsize=(11, 5))
bins = np.arange(20, 96, 5)   # 5-year bin edges: 20, 25, …, 95

for p in PARTIES:
    sub  = df4[df4["party_np"] == p]
    ps   = PARTY_SHORT[p]
    ages = sub["age"].dropna()
    miss = sub["age"].isna().sum()
    ax.hist(
        ages, bins=bins, density=True, alpha=0.40,
        color=PARTY_COLOR[ps],
        label=f"{ps}  (n={len(sub):,}; age missing={miss})",
    )

# Draw the median line BEFORE building the legend: ax.legend() only picks up
# labelled artists that already exist when it is called, so adding the line
# afterwards left "Overall median" out of the legend.
ax.axvline(df4["age"].median(), color="grey", ls="--", lw=1, label="Overall median")

ax.set_xlabel("Age", fontsize=12)
ax.set_ylabel("Density", fontsize=12)
ax.set_title("Age distribution by party — 2082 Election candidates", fontsize=13, fontweight="bold")
ax.legend(fontsize=9)
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout survives Restart & Run All.
Path("images").mkdir(parents=True, exist_ok=True)
plt.savefig("images/plot_age_histogram.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 6 — Plot: Age distribution — box plot (median / IQR / outliers per party)
fig, ax = plt.subplots(figsize=(8, 5))

# Non-missing ages per party, keyed by short label in display order.
ages_by_party = {
    PARTY_SHORT[p]: df4.loc[df4["party_np"] == p, "age"].dropna()
    for p in PARTIES
}

age_data = [party_ages.values for party_ages in ages_by_party.values()]
bp = ax.boxplot(
    age_data,
    patch_artist=True,
    medianprops=dict(color="black", lw=2),
    notch=False,
)
# Colour each box with its party colour.
for box, ps in zip(bp["boxes"], ages_by_party):
    box.set_facecolor(PARTY_COLOR[ps])
    box.set_alpha(0.75)

ax.set_xticks(range(1, len(PARTIES) + 1))
ax.set_xticklabels(PARTY_ORDER, fontsize=12)
ax.set_ylabel("Age", fontsize=12)
ax.set_title("Age distribution by party — box plot", fontsize=13, fontweight="bold")

# Annotate median + mean (median label just above the median, mean label below the mean)
for pos, party_ages in enumerate(ages_by_party.values(), start=1):
    med = party_ages.median()
    mu = party_ages.mean()
    ax.text(pos, med + 1.2, f"med={med:.0f}",
            ha="center", va="bottom", fontsize=8, color="black")
    ax.text(pos, mu - 2.5, f"μ={mu:.1f}",
            ha="center", va="top", fontsize=8, color="dimgrey")

plt.tight_layout()
plt.savefig("images/plot_age_boxplot.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 7 — Plot: Gender — female share + grouped M/F bars
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
ax, ax2 = axes

# Tally the normalised gender labels once per party; both panels read these lists.
males, females, unknowns = [], [], []
for p in PARTIES:
    genders = df4.loc[df4["party_np"] == p, "gender_norm"]
    males.append(genders.eq("Male").sum())
    females.append(genders.eq("Female").sum())
    unknowns.append(genders.eq("Unknown").sum())

# ── Left: female % bar ────────────────────────────────────────────────────
# Share is computed among Male + Female only; 0 when a party has neither.
female_pcts = [f / (m + f) if (m + f) else 0 for m, f in zip(males, females)]
bar_labels = [
    f"{ps}\n(M={m} / F={f}; miss={u})"
    for ps, m, f, u in zip(PARTY_ORDER, males, females, unknowns)
]

x = np.arange(len(PARTIES))
bar_colors = [PARTY_COLOR[ps] for ps in PARTY_ORDER]
bars = ax.bar(x, female_pcts, color=bar_colors, alpha=0.85)
ax.yaxis.set_major_formatter(PercentFormatter(1.0))
ax.set_xticks(x)
ax.set_xticklabels(bar_labels, fontsize=8)
ax.axhline(0.33, ls="--", color="grey", lw=1, label="33% threshold")
ax.set_ylabel("Female share (among M+F)")
ax.set_title("Female representation by party", fontweight="bold")
ax.legend(fontsize=8)
# Percentage label just above each bar.
for bar, pct in zip(bars, female_pcts):
    bar_centre = bar.get_x() + bar.get_width()/2
    ax.text(bar_centre, bar.get_height() + 0.003,
            f"{pct:.1%}", ha="center", va="bottom", fontsize=9, fontweight="bold")

# ── Right: grouped M/F count bars ─────────────────────────────────────────
width = 0.35
x2 = np.arange(len(PARTIES))

ax2.bar(x2 - width/2, males,   width, label="Male",   color="#457B9D", alpha=0.85)
ax2.bar(x2 + width/2, females, width, label="Female", color="#E63946", alpha=0.85)
ax2.set_xticks(x2)
ax2.set_xticklabels(PARTY_ORDER, fontsize=11)
ax2.set_ylabel("Candidate count")
ax2.set_title("Male vs Female candidate count by party", fontweight="bold")
ax2.legend()

plt.suptitle("Gender analysis — 2082 Election candidates", fontsize=13, y=1.01)
plt.tight_layout()
plt.savefig("images/plot_gender.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 8 — Plot: Education profile — 100% stacked bars
# Party × education-group counts, then each row normalised to sum to 1.
ct = (
    pd.crosstab(df4["party_short"], df4["edu_group"])
    .reindex(index=PARTY_ORDER, columns=EDU_ORDER, fill_value=0)
)
row_totals = ct.sum(axis=1)
pct = ct.div(row_totals, axis=0)

fig, ax = plt.subplots(figsize=(11, 5.5))
x      = np.arange(len(PARTY_ORDER))
bottom = np.zeros(len(PARTY_ORDER))   # running top of the stack per party

for cat in EDU_ORDER:
    vals = pct[cat].values
    bars = ax.bar(x, vals, bottom=bottom, label=cat,
                  color=EDU_COLORS[cat], edgecolor="white", linewidth=0.4)
    # Label segments ≥ 5%, centred vertically inside the segment
    for xi in np.flatnonzero(vals >= 0.05):
        ax.text(xi, bottom[xi] + vals[xi]/2, f"{vals[xi]:.0%}",
                ha="center", va="center", fontsize=8, color="white", fontweight="bold")
    bottom += vals

ax.yaxis.set_major_formatter(PercentFormatter(1.0))
ax.set_xticks(x)
ax.set_xticklabels(PARTY_ORDER, fontsize=12)
ax.set_ylabel("Share of candidates")
ax.set_title("Education profile by party (normalized 100%)", fontsize=13, fontweight="bold")

# Legend 1: education categories (re-added as an artist so the second
# ax.legend() call below does not replace it)
leg1 = ax.legend(title="Education group", bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=9)
ax.add_artist(leg1)

# Legend 2: per-party missing/unclear counts
def _edu_quality_label(party_name):
    """Build the data-quality legend entry for one party (full Nepali name)."""
    party_rows = df4[df4["party_np"] == party_name]
    raw_edu = party_rows["education_raw"]
    n_missing = raw_edu.eq("").sum() + raw_edu.isna().sum()
    n_unclear = (party_rows["edu_group"] == "Other / Unclear").sum()
    return (
        f"{PARTY_SHORT[party_name]} (n={len(party_rows)};  edu missing={n_missing};  unclear={n_unclear})"
    )

party_labels = [_edu_quality_label(p) for p in PARTIES]
handles2 = [mpatches.Patch(facecolor=PARTY_COLOR[ps], alpha=0.75) for ps in PARTY_ORDER]
ax.legend(handles2, party_labels, title="Party (data quality)",
          bbox_to_anchor=(1.02, 0.38), loc="upper left", fontsize=8)

plt.tight_layout()
plt.savefig("images/plot_education_stacked.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 9 — Plot: Age by education group — box plots
# Shows whether higher-educated candidates skew older or younger.

# Education groups that get a box, lowest → highest ("Other / Unclear" is left out).
EDU_PLOT_ORDER = [
    "Basic Literacy",
    "Up to Secondary",
    "Higher Secondary",
    "Bachelor",
    "Master+",
    "Doctorate / PhD",
]

# Non-missing ages for each plotted education group.
with_age = df4.dropna(subset=["age"])
age_by_edu = [
    with_age.loc[with_age["edu_group"] == g, "age"].values
    for g in EDU_PLOT_ORDER
]
overall_median = df4["age"].median()

fig, ax = plt.subplots(figsize=(12, 5))
bp = ax.boxplot(
    age_by_edu,
    patch_artist=True,
    medianprops=dict(color="black", lw=2),
)
for box, g in zip(bp["boxes"], EDU_PLOT_ORDER):
    box.set_facecolor(EDU_COLORS[g])
    box.set_alpha(0.80)

# Tick labels carry the number of candidates with a known age in each group.
tick_labels = [f"{g}\n(n={len(d)})" for g, d in zip(EDU_PLOT_ORDER, age_by_edu)]
ax.set_xticks(range(1, len(EDU_PLOT_ORDER) + 1))
ax.set_xticklabels(tick_labels, fontsize=9)
ax.set_ylabel("Age", fontsize=12)
ax.set_title("Age distribution by education level (all 4 parties combined)",
             fontsize=13, fontweight="bold")
ax.axhline(overall_median, color="grey", ls="--", lw=1,
           label=f"Overall median age = {overall_median:.0f}")
ax.legend(fontsize=9)

plt.tight_layout()
plt.savefig("images/plot_age_by_education.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 10 — Plot: Education × Gender — female share within each edu group (per party)
# Reveals whether female candidates are concentrated in lower-education tiers.

fig, axes = plt.subplots(1, len(PARTIES), figsize=(14, 5), sharey=True)

# One panel per party; bars are education groups in EDU_PLOT_ORDER.
for ax, p in zip(axes, PARTIES):
    ps  = PARTY_SHORT[p]
    sub = df4[df4["party_np"] == p]

    female_share, ns = [], []
    for g in EDU_PLOT_ORDER:
        genders = sub.loc[sub["edu_group"] == g, "gender_norm"]
        female  = genders.eq("Female").sum()
        denom   = genders.eq("Male").sum() + female
        # NaN (no bar) when the group has no Male/Female candidates.
        female_share.append(female / denom if denom else np.nan)
        ns.append(denom)

    x = np.arange(len(EDU_PLOT_ORDER))
    tick_labels = [f"{g}\n(n={n})" for g, n in zip(EDU_PLOT_ORDER, ns)]

    bars = ax.bar(x, female_share, color=PARTY_COLOR[ps], alpha=0.80)
    ax.axhline(0.33, ls="--", color="grey", lw=1)
    ax.yaxis.set_major_formatter(PercentFormatter(1.0))
    ax.set_ylim(0, 1)
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, fontsize=7, rotation=35, ha="right")
    ax.set_title(ps, fontsize=12, color=PARTY_COLOR[ps], fontweight="bold")

# Only the left-most panel carries the shared y-axis label.
axes[0].set_ylabel("Female share")

plt.suptitle("Female share by education group — per party (-- = 33% line)",
             fontsize=12, fontweight="bold", y=1.01)
plt.tight_layout()
plt.savefig("images/plot_gender_by_education.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 11 — Plot: Province distribution — where each party fields candidates
# Highlights geographic concentration / diversity.

# Nepali province name → English label, listed by geography (west → east).
# Sudurpashchim is the westernmost province, so it comes before Karnali; the
# two were previously swapped, which contradicted the stated ordering.
PROVINCE_SHORT = {
    "सुदूरपश्चिम प्रदेश":   "Sudurpaschim",
    "कर्णाली प्रदेश":       "Karnali",
    "लुम्बिनी प्रदेश":      "Lumbini",
    "गण्डकी प्रदेश":        "Gandaki",
    "बागमती प्रदेश":        "Bagmati",
    "मधेश प्रदेश":          "Madhesh",
    "कोशी प्रदेश":          "Koshi",
}
# Plot order follows the dict's insertion order, keeping both in sync.
PROVINCE_ORDER = list(PROVINCE_SHORT)

# Build province counts per party (rows: province, columns: party short label).
# Candidates whose province_np is not in PROVINCE_ORDER (e.g. blank) are not plotted.
prov_data = {}
for p in PARTIES:
    sub = df4[df4["party_np"] == p]
    vc  = sub["province_np"].value_counts()
    prov_data[PARTY_SHORT[p]] = [
        vc.get(prov, 0) for prov in PROVINCE_ORDER
    ]

prov_df = pd.DataFrame(
    prov_data,
    index=[PROVINCE_SHORT.get(prov, prov) for prov in PROVINCE_ORDER],
)

fig, ax = plt.subplots(figsize=(12, 5))
x      = np.arange(len(PROVINCE_ORDER))
width  = 0.20
# Shift so that the group of bars is centred on each province tick.
offset = -(len(PARTIES) - 1) / 2 * width

for i, ps in enumerate(PARTY_ORDER):
    ax.bar(x + offset + i*width, prov_df[ps].values,
           width, label=ps, color=PARTY_COLOR[ps], alpha=0.85)

ax.set_xticks(x)
ax.set_xticklabels(prov_df.index, fontsize=10)
ax.set_ylabel("Number of candidates")
ax.set_title("Candidate distribution by province and party",
             fontsize=13, fontweight="bold")
ax.legend(title="Party")

plt.tight_layout()
plt.savefig("images/plot_province.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Cell 12 — Plot: Mean age by education group per party
# A grouped bar chart: x = edu group, bars = party, y = mean age
# Reveals if each party's pattern of age-vs-education differs.

fig, ax = plt.subplots(figsize=(13, 5))

x      = np.arange(len(EDU_PLOT_ORDER))
n_bars = len(PARTIES)
width  = 0.18
offset = -(n_bars - 1) / 2 * width   # centres each group of bars on its tick

for i, ps in enumerate(PARTY_ORDER):
    party_rows = df4[df4["party_short"] == ps]
    positions  = x + offset + i*width

    # Mean age per education group; NaN when the party has nobody in a group.
    means = []
    for g in EDU_PLOT_ORDER:
        group_ages = party_rows.loc[party_rows["edu_group"] == g, "age"]
        means.append(group_ages.mean())

    bars = ax.bar(positions, means, width,
                  label=ps, color=PARTY_COLOR[ps], alpha=0.85)

    # Value label above every bar that actually has a mean.
    for pos, m in zip(positions, means):
        if pd.notna(m):
            ax.text(pos, m + 0.3, f"{m:.0f}",
                    ha="center", va="bottom", fontsize=7)

ax.set_xticks(x)
ax.set_xticklabels(EDU_PLOT_ORDER, fontsize=10)
ax.set_ylabel("Mean age", fontsize=12)
ax.set_title("Mean candidate age by education level and party",
             fontsize=13, fontweight="bold")
ax.legend(title="Party", fontsize=10)

plt.tight_layout()
plt.savefig("images/plot_mean_age_by_edu_party.png", dpi=150, bbox_inches="tight")
plt.show()

In [13]:
# Cell 13 — Debug: raw education values still landing in 'Other / Unclear'
# Use this to improve the education_bucket mapping in data_collection_fixed.ipynb.

unclear = df4.loc[df4["edu_group"].eq("Other / Unclear")].copy()

# Rows whose raw education text is absent (NaN or empty string).
raw_edu = unclear["education_raw"]
n_raw_missing = (raw_edu.isna() | raw_edu.eq("")).sum()

print(f"Total 'Other / Unclear' across 4 parties: {len(unclear):,}")
print(f"  of which education_raw is empty/missing: {n_raw_missing:,}")
print()
print("Top 30 raw values still unclassified:")
# Blank values are shown under an explicit <<MISSING>> label.
raw_value_counts = raw_edu.replace("", pd.NA).fillna("<<MISSING>>").value_counts()
raw_value_counts.head(30)

Total 'Other / Unclear' across 4 parties: 316
  of which education_raw is empty/missing: 287

Top 30 raw values still unclassified:


education_raw
<<MISSING>>                        287
+2                                  19
डिप्लोमा इन्जिनियरिङ्ग               2
I.A Pass +2                          1
ईन्जिनेरिङ ग्राजुएड                  1
स्तानक मेकानिकल इन्जिनियरिङ्ग        1
I.A.(+2)                             1
Bcahelor Of Medicine                 1
डिप्लोमा इन सिभिल इन्जिनियरिङ्ग      1
+2 pass                              1
इन्जिनियर                            1
Name: count, dtype: int64