In [6]:
import pandas as pd

# Read the synthetic resume CSV into `df` and preview the first rows.
# The path is relative to the notebook's working directory.
resumes_csv = '../data/synthetic_resumes.csv'  # adjust path if needed
df = pd.read_csv(resumes_csv)
df.head()


Unnamed: 0,candidate_id,gender,ethnicity,education,years_experience,skills,label
0,0,female,groupA,phd,10,investment;as;either;identify,1
1,1,male,groupB,masters,8,data;although;recent;thousand,1
2,2,male,groupB,masters,18,where;guess;agree;skin,1
3,3,female,groupB,bachelors,17,participant;direction;discuss;city,1
4,4,female,groupB,phd,15,serve;scientist;enter;character,0


In [7]:
# Cell 10 — selection rates by group (raw)
# For each grouping column that is present, print the mean of `label` per group
# (with labels coded 0/1 this is the proportion labeled 1).
if 'label' in df.columns:
    # (grouping column, heading printed verbatim above its table)
    group_headings = (
        ('gender', "Selection rate by gender:"),
        ('ethnicity', "\nSelection rate by ethnicity:"),
    )
    for group_col, heading in group_headings:
        if group_col not in df.columns:
            continue
        print(heading)
        print(df.groupby(group_col)['label'].mean())


Selection rate by gender:
gender
female        0.549107
male          0.585177
non-binary    0.592233
unknown       0.536082
Name: label, dtype: float64

Selection rate by ethnicity:
ethnicity
groupA     0.569884
groupB     0.536538
groupC     0.630769
unknown    0.566265
Name: label, dtype: float64


In [8]:
# Cell 9 — parse skills
# Lower-case the ';'-separated skills string, split it into a list, and count the entries.
if 'skills' in df.columns:
    def _split_skills(raw):
        """Split a ';'-separated string into stripped, non-empty tokens."""
        pieces = (piece.strip() for piece in raw.split(';'))
        return [piece for piece in pieces if piece]

    # missing values become '' so every row is a string before splitting
    df['skills'] = df['skills'].fillna('').astype(str).str.lower()
    df['skills_list'] = df['skills'].map(_split_skills)
    df['num_skills'] = df['skills_list'].map(len)
    display(df[['skills','skills_list','num_skills']].head(6))


Unnamed: 0,skills,skills_list,num_skills
0,investment;as;either;identify,"[investment, as, either, identify]",4
1,data;although;recent;thousand,"[data, although, recent, thousand]",4
2,where;guess;agree;skin,"[where, guess, agree, skin]",4
3,participant;direction;discuss;city,"[participant, direction, discuss, city]",4
4,serve;scientist;enter;character,"[serve, scientist, enter, character]",4
5,group;call;suggest;kid,"[group, call, suggest, kid]",4


In [9]:
# Print the (up to) ten most frequent values of each categorical column that is present.
present_cols = [name for name in ('gender', 'ethnicity', 'education') if name in df.columns]
for col in present_cols:
    top_values = df[col].value_counts().head(10)
    print(col, top_values)


gender gender
male          904
female        896
non-binary    103
unknown        97
Name: count, dtype: int64
ethnicity ethnicity
groupA     1202
groupB      520
groupC      195
unknown      83
Name: count, dtype: int64
education education
phd          674
masters      670
bachelors    656
Name: count, dtype: int64


In [10]:
# Cell 8 — normalize certain categorical columns (in-place)
def norm_col(col):
    """Normalize one categorical column of the global ``df`` in place.

    Values are cast to str, stripped and lower-cased. The string forms that
    missing values take after the cast ('nan' for NaN, 'none' for None,
    '<na>' for pd.NA) are all mapped to 'unknown'.

    Does nothing (and prints nothing) when ``col`` is not a column of ``df``.
    """
    if col in df.columns:
        cleaned = df[col].astype(str).str.strip().str.lower()
        df[col] = cleaned.replace({'nan': 'unknown', 'none': 'unknown', '<na>': 'unknown'})
        print(f"Normalized {col}")

for c in ['gender','ethnicity','education']:
    norm_col(c)

# unify obvious short forms; adapt to your dataset
short_forms = {
    'gender': {'m':'male','f':'female','man':'male','woman':'female'},
    'education': {'bsc':'bachelors','msc':'masters','ph.d':'phd'},
}
for c, mapping in short_forms.items():
    # same presence guard as norm_col, so a dataset without the column does not raise KeyError
    if c in df.columns:
        df[c] = df[c].replace(mapping)


Normalized gender
Normalized ethnicity
Normalized education


In [11]:
# Cell 7 — quick label balance (important for modeling)
# Show how many rows carry each label value, as raw counts and as proportions.
if 'label' in df.columns:
    labels = df['label']
    label_counts = labels.value_counts()
    label_shares = labels.value_counts(normalize=True)
    print("Label counts:\n", label_counts)
    print("Label distribution (proportion):\n", label_shares)


Label counts:
 label
1    1134
0     866
Name: count, dtype: int64
Label distribution (proportion):
 label
1    0.567
0    0.433
Name: proportion, dtype: float64


In [12]:
# Cell 6 — missing values & duplicates
# Per-column null counts, then the number of repeated candidate IDs (if that column exists).
null_counts = df.isnull().sum()
print("Missing values per column:")
print(null_counts)
if 'candidate_id' in df.columns:
    # duplicated() flags every occurrence after the first, so the sum counts the repeats
    n_duplicate_ids = df['candidate_id'].duplicated().sum()
    print("\nDuplicate candidate IDs:", n_duplicate_ids)


Missing values per column:
candidate_id        0
gender              0
ethnicity           0
education           0
years_experience    0
skills              0
label               0
skills_list         0
num_skills          0
dtype: int64

Duplicate candidate IDs: 0


In [13]:
# Cell 5 — columns, dtypes, sample values
print("Columns:", list(df.columns))
print("\nDtypes:\n", df.dtypes)
# frequency table (NaN included) for each categorical column and the label
for col in ('gender', 'ethnicity', 'education', 'label'):
    if col not in df.columns:
        continue
    print(f"\n--- {col} value_counts ---")
    frequencies = df[col].value_counts(dropna=False)
    print(frequencies)


Columns: ['candidate_id', 'gender', 'ethnicity', 'education', 'years_experience', 'skills', 'label', 'skills_list', 'num_skills']

Dtypes:
 candidate_id         int64
gender              object
ethnicity           object
education           object
years_experience     int64
skills              object
label                int64
skills_list         object
num_skills           int64
dtype: object

--- gender value_counts ---
gender
male          904
female        896
non-binary    103
unknown        97
Name: count, dtype: int64

--- ethnicity value_counts ---
ethnicity
groupa     1202
groupb      520
groupc      195
unknown      83
Name: count, dtype: int64

--- education value_counts ---
education
phd          674
masters      670
bachelors    656
Name: count, dtype: int64

--- label value_counts ---
label
1    1134
0     866
Name: count, dtype: int64


In [14]:
# Cell 4 — basic shape & info
print("Shape:", df.shape)
# DataFrame.info() prints its report (column dtypes and non-null counts) itself and
# returns None, so wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.head(8))


Shape: (2000, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   candidate_id      2000 non-null   int64 
 1   gender            2000 non-null   object
 2   ethnicity         2000 non-null   object
 3   education         2000 non-null   object
 4   years_experience  2000 non-null   int64 
 5   skills            2000 non-null   object
 6   label             2000 non-null   int64 
 7   skills_list       2000 non-null   object
 8   num_skills        2000 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 140.8+ KB


None

Unnamed: 0,candidate_id,gender,ethnicity,education,years_experience,skills,label,skills_list,num_skills
0,0,female,groupa,phd,10,investment;as;either;identify,1,"[investment, as, either, identify]",4
1,1,male,groupb,masters,8,data;although;recent;thousand,1,"[data, although, recent, thousand]",4
2,2,male,groupb,masters,18,where;guess;agree;skin,1,"[where, guess, agree, skin]",4
3,3,female,groupb,bachelors,17,participant;direction;discuss;city,1,"[participant, direction, discuss, city]",4
4,4,female,groupb,phd,15,serve;scientist;enter;character,0,"[serve, scientist, enter, character]",4
5,5,male,groupa,phd,7,group;call;suggest;kid,1,"[group, call, suggest, kid]",4
6,6,non-binary,groupb,masters,6,peace;college;paper;long,1,"[peace, college, paper, long]",4
7,7,non-binary,groupa,phd,8,space;if;may;mother,1,"[space, if, may, mother]",4


In [15]:
# Cell 3 — load dataset with safety checks
import os
import pandas as pd

# adjust relative path if your Jupyter server root differs.
path_candidates = ['data/synthetic_resumes.csv', '../data/synthetic_resumes.csv', 'data/raw_resumes.csv']

# candidates are listed in priority order; use the first one that exists on disk
found_paths = [candidate for candidate in path_candidates if os.path.exists(candidate)]
if not found_paths:
    raise FileNotFoundError(f"No CSV found. Searched: {path_candidates}")
path = found_paths[0]

print("Loading file from:", path)
# low_memory=False avoids dtype warnings
df = pd.read_csv(path, low_memory=False)
df.head(5)


Loading file from: ../data/synthetic_resumes.csv


Unnamed: 0,candidate_id,gender,ethnicity,education,years_experience,skills,label
0,0,female,groupA,phd,10,investment;as;either;identify,1
1,1,male,groupB,masters,8,data;although;recent;thousand,1
2,2,male,groupB,masters,18,where;guess;agree;skin,1
3,3,female,groupB,bachelors,17,participant;direction;discuss;city,1
4,4,female,groupB,phd,15,serve;scientist;enter;character,0


In [16]:
# Cell 2 — quick shell preview (works in notebook)
!ls -la data
!head -n 6 data/synthetic_resumes.csv


ls: data: No such file or directory
head: data/synthetic_resumes.csv: No such file or directory


In [17]:
# Cell 1 — environment & working-dir checks
import os, sys
# Report the kernel's working directory, interpreter, and first import-path entry.
environment_facts = (
    ("Current working dir:", os.getcwd()),      # where Jupyter is reading files from
    ("Python executable:", sys.executable),     # which Python is running
    ("sys.path[0]:", sys.path[0]),
)
for caption, value in environment_facts:
    print(caption, value)


Current working dir: /Users/chiragrgowda/Documents/resume-fairness/notebooks
Python executable: /Users/chiragrgowda/Documents/resume-fairness/venv/bin/python
sys.path[0]: /opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python313.zip


In [33]:
# Cell A — load dataset and inspect missingness
import pandas as pd
# reload the raw CSV so the cleaning cells below start from untouched data
df = pd.read_csv("../data/synthetic_resumes.csv", low_memory=False)   # adjust path if needed

# basic info
print("Shape:", df.shape)
display(df.head())

# per-column missingness, computed once and reported as counts and as fractions
# (both sorted so the worst columns come first)
is_missing = df.isna()
missing = is_missing.sum().sort_values(ascending=False)
print("Missing counts:\n", missing)
print("\nMissing fraction:\n", is_missing.mean().sort_values(ascending=False))


Shape: (2000, 7)


Unnamed: 0,candidate_id,gender,ethnicity,education,years_experience,skills,label
0,0,female,groupA,phd,10,investment;as;either;identify,1
1,1,male,groupB,masters,8,data;although;recent;thousand,1
2,2,male,groupB,masters,18,where;guess;agree;skin,1
3,3,female,groupB,bachelors,17,participant;direction;discuss;city,1
4,4,female,groupB,phd,15,serve;scientist;enter;character,0


Missing counts:
 candidate_id        0
gender              0
ethnicity           0
education           0
years_experience    0
skills              0
label               0
dtype: int64

Missing fraction:
 candidate_id        0.0
gender              0.0
ethnicity           0.0
education           0.0
years_experience    0.0
skills              0.0
label               0.0
dtype: float64


In [19]:
# Cell B — normalize string placeholders to actual NaN
import numpy as np

# Turn placeholder strings into real missing values (pd.NA) in every object-dtype column.
# Matching ignores case and surrounding whitespace, so variants such as 'NAN', ' none '
# or a run of several spaces are caught as well as the exact spellings
# '', ' ', 'None', 'none', 'NaN', 'nan'.
placeholder_tokens = {'', 'none', 'nan'}
for col in df.columns:
    if df[col].dtype == object:
        as_text = df[col].astype(str).str.strip().str.lower()
        # notna() keeps values that are already missing out of the mask, so they stay as they are
        is_placeholder = df[col].notna() & as_text.isin(placeholder_tokens)
        df[col] = df[col].mask(is_placeholder, pd.NA)

# re-check missing
display(df.isna().sum())


candidate_id        0
gender              0
ethnicity           0
education           0
years_experience    0
skills              0
label               0
dtype: int64

In [22]:
# Cell C — protected attributes: fill missing with 'unknown'
# For each protected attribute present: back up the raw values, normalize the text,
# fold common spellings into canonical categories, and record which rows were missing.
for col in ['gender', 'ethnicity']:
    if col in df.columns:
        raw_col = f"{col}_raw"
        # Back up the original values only once. On a re-run df[col] is already
        # normalized (no NaN left); overwriting the backup with it would make the
        # missing indicator below all False.
        if raw_col not in df.columns:
            df[raw_col] = df[col]
        df[col] = df[col].fillna('unknown').astype(str).str.strip().str.lower()
        # standardized categories examples (adjust to your context)
        df[col] = df[col].replace({
            'm':'male', 'f':'female', 'man':'male', 'woman':'female',
            'na':'unknown', 'n/a':'unknown', 'none':'unknown'
        })
        # missing indicator (True if the raw value was missing); literal placeholder
        # strings such as 'n/a' are mapped to 'unknown' above but are not flagged here
        df[f"{col}_missing"] = df[raw_col].isna()


In [23]:
# Cell D — canonicalize education & fill missing
if 'education' in df.columns:
    # canonical degree -> spellings that are folded into it
    degree_aliases = {
        'bachelors': ['b.sc', 'bsc', 'bs'],
        'masters': ['msc', 'm.sc', 'mtech'],
        'phd': ['ph.d', 'phd', 'doctorate'],
    }
    alias_to_degree = {
        alias: degree
        for degree, aliases in degree_aliases.items()
        for alias in aliases
    }
    # missing -> 'unknown', then trim and lower-case before looking up aliases
    education = df['education'].fillna('unknown').astype(str).str.strip().str.lower()
    df['education'] = education.replace(alias_to_degree)
    # flag rows whose education ended up as 'unknown'
    df['education_missing'] = df['education'].eq('unknown')


In [24]:
# Cell E — fix years_experience
if 'years_experience' in df.columns:
    # values that cannot be parsed as numbers become NaN
    years = pd.to_numeric(df['years_experience'], errors='coerce')
    # remember which rows had no usable value before imputing
    df['years_experience_missing'] = years.isna()
    # impute with the median of the observed values
    # (alternative: fill with 0 if a missing value should mean "no experience")
    median_yrs = years.median(skipna=True)
    df['years_experience'] = years.fillna(median_yrs)
    print("median years:", median_yrs)


median years: 10.0


In [25]:
# Cell F — skills handling
# Normalize the skills string, split it into a list, and derive count / missing features.
if 'skills' in df.columns:
    df['skills'] = df['skills'].fillna('').astype(str)
    # normalize separators and case (regex=False: ',' is a literal, not a pattern)
    df['skills'] = df['skills'].str.replace(',', ';', regex=False).str.lower()
    # split into list (empty string -> empty list)
    df['skills_list'] = df['skills'].apply(lambda s: [x.strip() for x in s.split(';') if x.strip()])
    # feature: number of skills
    df['num_skills'] = df['skills_list'].apply(len)
    # indicator for missing skills: derived from the parsed list so strings that hold
    # only whitespace or separators (' ', ';') are flagged too, not just ''
    df['skills_missing'] = df['num_skills'] == 0


In [26]:
# Cell G — resume_text clean or extract
import re
if 'resume_text' in df.columns:
    df['resume_text'] = df['resume_text'].fillna('').astype(str)
    # very simple text length features
    df['resume_len_chars'] = df['resume_text'].str.len()
    df['resume_len_words'] = df['resume_text'].str.split().apply(len)

    def extract_years(text):
        """Return the first 1-2 digit number followed by 'year'/'years' in ``text``, else None.

        The (?<!\\d) lookbehind stops the pattern from matching the tail of a longer
        number (e.g. the '15' in '2015 years').
        """
        m = re.search(r'(?<!\d)(\d{1,2})\s+years?', text.lower())
        return int(m.group(1)) if m else None

    # optional: back-fill years_experience from the text where it was missing.
    # Only possible when both columns exist (they are produced by the
    # years_experience cell); otherwise skip instead of raising KeyError.
    if {'years_experience', 'years_experience_missing'}.issubset(df.columns):
        mask = df['years_experience_missing'] & (df['resume_text'] != '')
        # to_numeric turns the None results into NaN and keeps the Series numeric
        extracted = pd.to_numeric(df.loc[mask, 'resume_text'].apply(extract_years), errors='coerce')
        # where nothing was found, keep the value that is already there
        df.loc[mask, 'years_experience'] = extracted.fillna(df.loc[mask, 'years_experience'])


In [27]:
# Cell H — label sanity
if 'label' in df.columns:
    # count rows that have no label
    label_is_missing = df['label'].isna()
    missing_labels = label_is_missing.sum()
    print("Missing labels:", missing_labels)
    # if any labels are missing and cannot be inferred, those rows may need to be
    # dropped for supervised training, e.g. df = df[df['label'].notna()].copy()


Missing labels: 0


In [28]:
# Cell I — display before/after summaries
# For every missing-indicator column that exists, print how many rows are flagged
# and what percentage of all rows that is.
summary_cols = ['gender_missing','ethnicity_missing','education_missing','years_experience_missing','skills_missing']
for flag_col in summary_cols:
    if flag_col not in df.columns:
        continue
    n_flagged = df[flag_col].sum()
    pct_flagged = round(100 * df[flag_col].mean(), 2)
    print(flag_col, n_flagged, "rows (", pct_flagged, "%)")
# quick group selection rates
if 'label' in df.columns and 'gender' in df.columns:
    print("\nSelection rate by gender (after imputation):")
    print(df.groupby('gender')['label'].mean())


gender_missing 0 rows ( 0.0 %)
ethnicity_missing 0 rows ( 0.0 %)
education_missing 0 rows ( 0.0 %)
years_experience_missing 0 rows ( 0.0 %)
skills_missing 0 rows ( 0.0 %)

Selection rate by gender (after imputation):
gender
female        0.549107
male          0.585177
non-binary    0.592233
unknown       0.536082
Name: label, dtype: float64


In [29]:
# Cell J — example of sklearn imputer usage for numeric and categorical (for future pipeline)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Keep only the columns that actually exist in df. 'resume_len_words', for example, is
# created only when the data has a 'resume_text' column, and selecting a missing column
# (df[num_cols + cat_cols] below) would raise KeyError.
num_cols = [c for c in ['years_experience','num_skills','resume_len_words'] if c in df.columns]
cat_cols = [c for c in ['gender','ethnicity','education'] if c in df.columns]

# numeric gaps -> column median; categorical gaps -> the literal 'unknown'
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')

# NOTE(review): OneHotEncoder and Pipeline are imported above but not used here; the
# 'cat' branch only imputes, so categorical columns are passed through as strings.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_imputer, num_cols),
    ('cat', cat_imputer, cat_cols)
], remainder='drop')

# Fit-transform example (do on training only)
# X = df[num_cols + cat_cols]
# X_processed = preprocessor.fit_transform(X)


In [36]:
# Cell K — save cleaned file (local)
# Write the current state of df to CSV, without the row index as a column.
out_path = "../data/cleaned_resumes.csv"
df.to_csv(path_or_buf=out_path, index=False)
print(f"Saved cleaned dataset to {out_path}")


Saved cleaned dataset to ../data/cleaned_resumes.csv
