In [None]:
!!git clone https://github.com/zykls/folktables.git


["fatal: destination path 'folktables' already exists and is not an empty directory."]

In [88]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import xgboost as xgb


In [89]:
df = pd.read_csv('adult_reconstruction.csv')
df.head()

Unnamed: 0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation
0,20,40,0,0,Private,Bachelors,13,Married-civ-spouse,Wife,White,Female,United-States,49100,Tech-support
1,40,21,0,0,Private,Some-college,10,Divorced,Own-child,White,Male,United-States,11500,Craft-repair
2,10,17,0,0,Private,11th,7,Never-married,Own-child,White,Male,United-States,2600,Other-service
3,50,51,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,Cambodia,38997,Sales
4,38,28,0,0,Private,Bachelors,13,Never-married,Not-in-family,White,Male,?,41400,Exec-managerial


# Data Pre-processing


Dropping missing values

In [90]:
df.replace('?', np.nan, inplace=True)
df = df.dropna()

Mapping categorical features into subgroups

In [91]:
mapping = {
    'Private': 'Private',
    'Self-emp-not-inc': 'Private',
    'Self-emp-inc': 'Private',
    'Federal-gov': 'Government',
    'Local-gov': 'Government',
    'State-gov': 'Government',
    'Without-pay': 'Unemployed',
    'Never-worked': 'Unemployed',
    '?': 'Unknown'
}
education_mapping = {
    'Doctorate': 'Graduate',
    'Masters': 'Graduate',
    'Prof-school': 'Graduate',
    'Bachelors': 'College',
    'Some-college': 'College',
    'Assoc-acdm': 'College',
    'Assoc-voc': 'College',
    'HS-grad': 'High School',
    '12th': 'High School',
    '11th': 'Less than High School',
    '10th': 'Less than High School',
    '9th': 'Less than High School',
    '7th-8th': 'Less than High School',
    '5th-6th': 'Less than High School',
    '1st-4th': 'Less than High School',
    'Preschool': 'Less than High School'
}
marital_mapping = {
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
    'Never-married': 'Never Married',
    'Divorced': 'Separated/Divorced',
    'Separated': 'Separated/Divorced',
    'Widowed': 'Widowed'
}
continent_mapping = {
    # North America
    'United-States': 'North America',
    'Puerto-Rico': 'North America',
    'Canada': 'North America',
    'Outlying-US(Guam-USVI-etc)': 'North America',
    'Mexico': 'North America',

    # Central & South America
    'Honduras': 'Central & South America',
    'Jamaica': 'Central & South America',
    'Dominican-Republic': 'Central & South America',
    'Ecuador': 'Central & South America',
    'Columbia': 'Central & South America',
    'Guatemala': 'Central & South America',
    'Nicaragua': 'Central & South America',
    'El-Salvador': 'Central & South America',
    'Trinadad&Tobago': 'Central & South America',
    'Peru': 'Central & South America',
    'Haiti': 'Central & South America',
    'Cuba': 'Central & South America',

    # Asia
    'Laos': 'Asia',
    'Cambodia': 'Asia',
    'India': 'Asia',
    'Japan': 'Asia',
    'China': 'Asia',
    'Iran': 'Asia',
    'Philippines': 'Asia',
    'Vietnam': 'Asia',
    'Taiwan': 'Asia',
    'Thailand': 'Asia',
    'Hong': 'Asia',
    'Laos': 'Asia',

    # Europe
    'England': 'Europe',
    'Germany': 'Europe',
    'Greece': 'Europe',
    'Italy': 'Europe',
    'Poland': 'Europe',
    'Portugal': 'Europe',
    'Ireland': 'Europe',
    'France': 'Europe',
    'Hungary': 'Europe',
    'Scotland': 'Europe',
    'Yugoslavia': 'Europe',
    'Holand-Netherlands': 'Europe',

    # Unknown/Other
    '?': 'Unknown',
    'South': 'Unknown'  # unclear value
}

relationship_mapping = {
    'Wife': 'Married',
    'Husband': 'Married',
    'Own-child': 'Child',
    'Other-relative': 'Other-relative',
    'Not-in-family': 'Not-in-family',
    'Unmarried': 'Unmarried'
}

# Workclass privilege
privilege_workclass = {
    'Private': 'Not Privileged Workclass',
    'Government': 'Privileged Workclass',
    'Unemployed': 'Not Privileged Workclass',
    'Unknown': 'Unknown Workclass'
}

# Education privilege
privilege_education = {
    'Graduate': 'Privileged Education',
    'College': 'Privileged Education',
    'High School': 'Not Privileged Education',
    'Less than High School': 'Not Privileged Education'
}

# Marital privilege
privilege_marital = {
    'Married': 'Privileged Marital',
    'Never Married': 'Not Privileged Marital',
    'Separated/Divorced': 'Not Privileged Marital',
    'Widowed': 'Not Privileged Marital'
}

# Relationship privilege
privilege_relationship = {
    'Married': 'Privileged Relationship',
    'Child': 'Not Privileged Relationship',
    'Not-in-family': 'Not Privileged Relationship',
    'Unmarried': 'Not Privileged Relationship',
    'Other-relative': 'Not Privileged Relationship'
}

# Gender privilege
privilege_gender = {
    'Male': 'Privileged Gender',
    'Female': 'Not Privileged Gender'
}

# Race privilege
privilege_race = {
    'White': 'Privileged Race'
    # All others will be filled as Not Privileged Race
}

# Country
privilege_country = {
    # Privileged (mostly Western nations)
    'United-States': 'Privileged Country',
    'Canada': 'Privileged Country',
    'England': 'Privileged Country',
    'Germany': 'Privileged Country',
    'France': 'Privileged Country',
    'Ireland': 'Privileged Country',
    'Scotland': 'Privileged Country',
    'Holand-Netherlands': 'Privileged Country',
    'Greece': 'Privileged Country',
    'Italy': 'Privileged Country',
    'Poland': 'Privileged Country',
    'Portugal': 'Privileged Country',
    'Hungary': 'Privileged Country',

    # Not Privileged (Global South and developing regions)
    'Mexico': 'Not Privileged Country',
    'Puerto-Rico': 'Not Privileged Country',
    'Outlying-US(Guam-USVI-etc)': 'Not Privileged Country',
    'Honduras': 'Not Privileged Country',
    'Jamaica': 'Not Privileged Country',
    'Dominican-Republic': 'Not Privileged Country',
    'Ecuador': 'Not Privileged Country',
    'Columbia': 'Not Privileged Country',
    'Guatemala': 'Not Privileged Country',
    'Nicaragua': 'Not Privileged Country',
    'El-Salvador': 'Not Privileged Country',
    'Trinadad&Tobago': 'Not Privileged Country',
    'Peru': 'Not Privileged Country',
    'Haiti': 'Not Privileged Country',
    'Cuba': 'Not Privileged Country',
    'Laos': 'Not Privileged Country',
    'Cambodia': 'Not Privileged Country',
    'India': 'Not Privileged Country',
    'Japan': 'Not Privileged Country',
    'China': 'Not Privileged Country',
    'Iran': 'Not Privileged Country',
    'Philippines': 'Not Privileged Country',
    'Vietnam': 'Not Privileged Country',
    'Taiwan': 'Not Privileged Country',
    'Thailand': 'Not Privileged Country',
    'Hong': 'Not Privileged Country',
    'Yugoslavia': 'Not Privileged Country',
    'South': 'Not Privileged Country'
}


In [92]:
df['workclass_mapped'] = df['workclass'].map(mapping)
df['education_mapped'] = df['education'].map(education_mapping)
df['marital_mapped'] = df['marital-status'].map(marital_mapping)
df['continent'] = df['native-country'].map(continent_mapping)
df['relationship_mapped'] = df['relationship'].map(relationship_mapping)
df['race_privilige'] = df['race'].map(privilege_race).fillna('Not Privileged Race')
df['workclass_privilige'] = df['workclass_mapped'].map(privilege_workclass)
df['education_privilige'] = df['education_mapped'].map(privilege_education)
df['marital_privilige'] = df['marital_mapped'].map(privilege_marital)
df['relationship_privilige'] = df['relationship_mapped'].map(privilege_relationship)
df['country_privilege'] = df['native-country'].map(privilege_country).fillna('Not Privileged Country')


In [93]:
def prettify_column(col):
    return col.replace('_', ' ').capitalize()

Mapping Country to GDP

sources:
- https://www.imf.org/external/datamapper/NGDPDPC@WEO/OEMDC/ADVEC/WEOWORLD
- https://en.wikipedia.org/wiki/Economy_of_the_Socialist_Federal_Republic_of_Yugoslavia#GDP_per_capita_of_republics_and_autonomous_provinces
- https://data.worldbank.org/indicator/NY.GDP.PCAP.CD?locations=CU


In [94]:
# we use the per country csv as GDP data for the mapping
gdp_per_country = pd.read_csv("GDP per Country.csv", na_values="no data", sep=';')
gdp_per_country.rename(columns={gdp_per_country.columns[0]: 'Country'}, inplace=True)
gdp_per_country['2025'] = pd.to_numeric(gdp_per_country['2025'], errors='coerce')

native_countries = df['native-country'].unique()
gdp_2025_by_country = {}

# We do a mapping to account for the differences in names btwn the datasets (IMF and adult)
country_name_mapping = {
    'United-States': 'United States',
    'England': 'United Kingdom',
    'Puerto-Rico': 'Puerto Rico',
    'Outlying-US(Guam-USVI-etc)': 'United States',
    'South': 'South Africa',
    'China': "China, People's Republic of",
    'Laos': 'Lao P.D.R.',
    'Columbia': 'Colombia',
    'Taiwan': 'Taiwan Province of China',
    'Dominican-Republic': 'Dominican Republic',
    'Scotland': 'United Kingdom',
    'Yugoslavia': None,
    'El-Salvador': 'El Salvador',
    'Trinadad&Tobago': 'Trinidad and Tobago',
    'Hong': 'Hong Kong SAR',
    'Holand-Netherlands': 'Netherlands'
}

gdp_2025_by_country = {}
gdp_per_country['Country'] = gdp_per_country['Country'].str.strip()

for native in native_countries:
    # get mapped name or fall back to native
    gdp_country = country_name_mapping.get(native, native)

    # Step 2: Try to match if we have a name
    if gdp_country:
        match = gdp_per_country[gdp_per_country['Country'].str.lower() == gdp_country.lower()]
        gdp_2025_by_country[native] = match['2025'].values[0] if not match.empty else np.nan
    else:
        gdp_2025_by_country[native] = np.nan

# Manual overrides, since they are not in IMF set
gdp_2025_by_country['Yugoslavia'] = np.float64(5464)
gdp_2025_by_country['Cuba'] = np.float64(9605)
gdp_2025_by_country['Columbia'] = np.float64(8054)

gdp_df = pd.DataFrame.from_dict(gdp_2025_by_country, orient='index', columns=['GDP_2025'])
gdp_df_nonan = gdp_df.dropna().copy()

# Create quantile-based classes (so 0 is poor, 4 is rich)
gdp_df_nonan.loc[:, 'gdp_class'] = pd.qcut(gdp_df_nonan['GDP_2025'], q=5, labels=[0, 1, 2, 3, 4]).astype(int)
gdp_df = gdp_df.join(gdp_df_nonan['gdp_class'])

# Merge the gdp_class column from gdp_df into the main df
# We need to reset the index of gdp_df to merge on the country name
if 'GDP_2025' not in df:
  df = df.merge(gdp_df.reset_index().rename(columns={'index': 'native-country'}),
                on='native-country',
                how='left')

## BELOW is optional, to split it into priviliged and unprivileged
gdp_df = pd.DataFrame.from_dict(gdp_2025_by_country, orient='index', columns=['GDP_2025'])
gdp_df_sorted = gdp_df.dropna().sort_values('GDP_2025')
midpoint = len(gdp_df_sorted) // 2
gdp_df_sorted['privilege'] = ['Not Privileged Country'] * midpoint + ['Privileged Country'] * (len(gdp_df_sorted) - midpoint)
privilege_country = gdp_df_sorted['privilege'].to_dict()
df['country_privilege'] = df['native-country'].map(privilege_country).fillna('Not Privileged Country')

# Train Model

Encode categorical features

In [95]:
cat_features = ['workclass', 'education', 'marital-status', 'relationship', 'race', 'gender', 'native-country', \
                'occupation', 'workclass_mapped', 'education_mapped', 'marital_mapped', 'continent', \
                'relationship_mapped', 'race_privilige', 'workclass_privilige', 'education_privilige', \
                'marital_privilige', 'relationship_privilige', 'country_privilege']


df = df.drop(['GDP_2025', 'gdp_class'], axis=1)

encoded_df = df.copy()  # Use .copy() to avoid the warning
label_encoder = LabelEncoder()

encoded_labes = {}

for i in cat_features:
  encoded_df.loc[:, i] = label_encoder.fit_transform(encoded_df[i])
  encoded_labes[i] = label_encoder.classes_

label_decoder = {}
for k,v in encoded_labes.items():
  label_decoder[k] = dict(zip(range(len(v)), v))

features = list(df.columns)
features.remove('income')

Split train and test sets

In [96]:
train_set, test_set = train_test_split(encoded_df, test_size=0.2)

X_train = np.array(train_set[features])
y_train = np.array(train_set['income'])

X_test = np.array(test_set[features])
y_test = np.array(test_set['income'])


print('Size train set:', train_set.shape[0])
print('Size test set:',test_set.shape[0])

Size train set: 36679
Size test set: 9170


In [97]:
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Test Model

In [98]:
pred_df = test_set.copy()
pred_df['pred_income'] = y_pred
pred_df.head()

Unnamed: 0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,...,marital_mapped,continent,relationship_mapped,race_privilige,workclass_privilige,education_privilige,marital_privilige,relationship_privilige,country_privilege,pred_income
34185,40,18,0,0,2,4,3,4,2,4,...,1,3,3,1,0,0,0,0,0,-2464.730715
26118,40,36,0,0,2,11,9,0,1,4,...,2,3,2,1,0,0,0,0,0,16296.672991
2740,20,18,0,0,2,11,9,4,3,4,...,1,3,0,1,0,0,0,0,0,4628.032806
31720,40,29,0,0,2,1,7,4,4,2,...,1,3,4,0,0,0,0,0,0,10366.340696
18540,50,35,0,0,2,9,13,4,1,4,...,1,3,2,1,0,1,0,0,0,35053.866414


In [99]:
def get_sp_reg_matrix(data, attribute):

  features = data[attribute].unique()

  matrix = pd.DataFrame(0.0, index=features, columns=features)

  for i in features:
    for j in features:
      matrix.loc[i, j] = data[data[attribute]==i]['pred_income'].mean() - data[data[attribute]==j]['pred_income'].mean()

  return matrix

race_sp = get_sp_reg_matrix(pred_df, 'race')
gender_sp = get_sp_reg_matrix(pred_df, 'gender')
country_sp = get_sp_reg_matrix(pred_df, 'country_privilege')

In [100]:
def decode_df(df, label):
  for i in df.columns:
    df = df.rename(columns={i: label_decoder[label][i]}, index={i: label_decoder[label][i]})
  return df

gender_sp = decode_df(gender_sp, 'gender')
race_sp = decode_df(race_sp, 'race')
country_sp = decode_df(country_sp, 'country_privilege')

In [101]:
gender_sp

Unnamed: 0,Male,Female
Male,0.0,14961.661877
Female,-14961.661877,0.0


In [102]:
race_sp

Unnamed: 0,White,Black,Other,Asian-Pac-Islander,Amer-Indian-Eskimo
White,0.0,10441.686637,10738.524063,20.755651,9411.385136
Black,-10441.686637,0.0,296.837425,-10420.930986,-1030.301501
Other,-10738.524063,-296.837425,0.0,-10717.768411,-1327.138926
Asian-Pac-Islander,-20.755651,10420.930986,10717.768411,0.0,9390.629485
Amer-Indian-Eskimo,-9411.385136,1030.301501,1327.138926,-9390.629485,0.0


In [103]:
country_sp

Unnamed: 0,Not Privileged Country,Privileged Country
Not Privileged Country,0.0,4495.711808
Privileged Country,-4495.711808,0.0
