In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# reading dataset using panda
data_path = os.path.dirname(os.getcwd())
data_df = pd.read_csv(os.path.join(data_path, "data\\data_income.csv"))

In [8]:
data_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Define the features and target
X = data_df.drop(["income"], axis=1)
y = data_df["income"]

# Define the categorical columns to one-hot encode
categorical_cols = [
    "occupation",
    "education",
    "workclass",
    "marital-status",
    "relationship",
    "race",
    "native-country",
    "gender",
]
numeric_cols = [
    "age",
    "fnlwgt",
    "educational-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [42]:
# fix the format for y
y_train = y_train.replace({">50K": 1, "<=50K": 0})
y_test = y_test.replace({">50K": 1, "<=50K": 0})


In [5]:
# Define preprocessing for categorical and numeric data
categorical_preprocessor = OneHotEncoder()
numeric_preprocessor = StandardScaler()

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_preprocessor, categorical_cols),
        ("num", numeric_preprocessor, numeric_cols),
    ],
    remainder="passthrough",  # You can use this to include any remaining columns
)

# Create a pipeline with preprocessing and the machine learning model
model = XGBClassifier(random_state=42)

pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])
# Fit the pipeline (including preprocessing) to the training data
pipeline.fit(X_train, y_train)
# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8775


Plot de l'accuracy par genre et couleur de peau

In [43]:
from sklearn.metrics import accuracy_score

# Calculer l'accuracy par genre
accuracy_par_genre = {}
genres = ['Male', 'Female']
for genre in genres:
    mask_genre = X_test['gender'] == genre
    accuracy_genre = accuracy_score(y_test[mask_genre], y_pred[mask_genre])
    accuracy_par_genre[genre] = accuracy_genre

# Calculer l'accuracy par race
accuracy_par_race = {}
races = ['White', 'Black']
for race in races:
    mask_race = X_test['race'] == race
    accuracy_race = accuracy_score(y_test[mask_race], y_pred[mask_race])
    accuracy_par_race[race] = accuracy_race

# Créer un tableau double entrée avec les accuracies
tableau_accuracies = pd.DataFrame({
    'Global': accuracy,
    'Male': accuracy_par_genre['Male'],
    'Female': accuracy_par_genre['Female'],
    'White': accuracy_par_race['White'],
    'Black': accuracy_par_race['Black']
}, index=['Accuracy'])

print(tableau_accuracies)

           Global      Male   Female     White     Black
Accuracy  0.87747  0.844724  0.94291  0.870053  0.935103


Statistical Parity Test Homme / Femme

In [8]:
# fix the format for y
data_df['income'] = data_df['income'].replace({">50K": 1, "<=50K": 0})

In [9]:
# Diviser le DataFrame en deux groupes en fonction du genre
group1 = data_df[data_df['gender'] == 'Male']
group2 = data_df[data_df['gender'] == 'Female']

In [13]:
# Parité statistique : Comparer les proportions de prédictions positives entre les groupes
prop_group1_positive = group1['income'].mean()
prop_group2_positive = group2['income'].mean()

# Afficher les résultats
print(f"Parité statistique - Proportion de prédictions positives pour homme : {prop_group1_positive}")
print(f"Parité statistique - Proportion de prédictions positives pour femme : {prop_group2_positive}")

Parité statistique - Proportion de prédictions positives pour homme : 0.3037672281776417
Parité statistique - Proportion de prédictions positives pour femme : 0.10925148221343874


In [26]:
from scipy.stats import chi2_contingency

# Données d'exemple (à remplacer par vos propres données)
group1_predictions = pipeline.predict(group1.drop(["income"], axis=1))
group2_predictions = pipeline.predict(group2.drop(["income"], axis=1))

print(group1_predictions)
print(group2_predictions)

# Ajuster la taille des groupes
min_length = min(len(group1_predictions), len(group2_predictions))
group1_predictions = group1_predictions[:min_length]
group2_predictions = np.random.choice(group2_predictions, min_length, replace=False)

# Convertir les données en float64
group1_predictions = group1_predictions.astype(np.float64)
group2_predictions = group2_predictions.astype(np.float64)

# Ajouter une petite constante aux données pour éviter les fréquences nulles
epsilon = 1e-10
group1_predictions += epsilon
group2_predictions += epsilon

# Créer un tableau de contingence
contingency_table = np.array([group1_predictions, group2_predictions])

# Effectuer le test du chi-carré
chi2, p, _, _ = chi2_contingency(contingency_table, correction=True)

# Interpréter le résultat
print(f"chi2 = {chi2}")
if p < 0.05:
    print("La parité statistique n'est pas respectée.")
else:
    print("Pas de preuve significative de violation de la parité statistique.")

[0 0 1 ... 0 0 0]
[0 0 0 ... 0 0 1]
chi2 = 4686.750440209035
Pas de preuve significative de violation de la parité statistique.


Conditional Statistical Parity Test

In [38]:
# Extraire les données nécessaires
group1_predictions = data_df[data_df['gender'] == 'Male']['income'].values
group2_predictions = data_df[data_df['gender'] == 'Female']['income'].values
group1_positives = data_df[(data_df['gender'] == 'Male') & (data_df['income'] == 1)]['income'].values
group2_positives = data_df[(data_df['gender'] == 'Female') & (data_df['income'] == 1)]['income'].values
protected_attribute = data_df['gender'].values

# Ajuster la taille des groupes
min_length = min(len(group1_predictions), len(group2_predictions))
group1_predictions = group1_predictions[:min_length]
group2_predictions = group2_predictions[:min_length]  # Ajuster ici
group1_positives = group1_positives[:min_length]
group2_positives = group2_positives[:min_length]  # Ajuster ici

# Ajuster la taille des groupes de prédictions positives
min_positives_length = min(len(group1_positives), len(group2_positives))
group1_positives = group1_positives[:min_positives_length]
group2_positives = group2_positives[:min_positives_length]

# Afficher les dimensions après ajustement
print("Dimensions de group1_predictions:", group1_predictions.shape)
print("Dimensions de group2_predictions:", group2_predictions.shape)
print("Dimensions de group1_positives:", group1_positives.shape)
print("Dimensions de group2_positives:", group2_positives.shape)

# Créer un DataFrame pour faciliter l'échantillonnage
df_group1 = pd.DataFrame({'predictions': group1_predictions, 'positives': group1_positives})
df_group2 = pd.DataFrame({'predictions': group2_predictions, 'positives': group2_positives})

# Afficher les dimensions après avoir créé les DataFrames
print("Dimensions de df_group1:", df_group1.shape)
print("Dimensions de df_group2:", df_group2.shape)

# Échantillonner aléatoirement sans remplacement pour égaliser la taille
df_group2 = df_group2.sample(min_length, replace=False, random_state=42)

# Extraire les données égalisées
group1_predictions = df_group1['predictions'].values
group2_predictions = df_group2['predictions'].values
group1_positives = df_group1['positives'].values
group2_positives = df_group2['positives'].values

Dimensions de group1_predictions: (16192,)
Dimensions de group2_predictions: (16192,)
Dimensions de group1_positives: (1769,)
Dimensions de group2_positives: (1769,)


ValueError: All arrays must be of the same length

In [31]:
# Séparer les données en groupes
group1_data = data_df[data_df['gender'] == 'Male']
group2_data = data_df[data_df['gender'] == 'Female']

# Trouver le groupe avec la taille la plus petite
min_group_size = min(len(group1_data), len(group2_data))

# Échantillonner aléatoirement le groupe plus grand pour égaler la taille
group1_data = group1_data.sample(min_group_size, replace=False, random_state=42)
group2_data = group2_data.sample(min_group_size, replace=False, random_state=42)

# Concaténer les groupes équilibrés
data_df = pd.concat([group1_data, group2_data])

# Extraire les données nécessaires
group1_predictions = data_df[data_df['gender'] == 'Male']['income'].values
group2_predictions = data_df[data_df['gender'] == 'Female']['income'].values
group1_positives = data_df[(data_df['gender'] == 'Male') & (data_df['income'] == 1)]['income'].values
group2_positives = data_df[(data_df['gender'] == 'Female') & (data_df['income'] == 1)]['income'].values
protected_attribute = data_df['gender'].values

# Ajuster la taille des groupes
min_length = min(len(group1_predictions), len(group2_predictions))
group1_predictions = group1_predictions[:min_length]
group2_predictions = np.random.choice(group2_predictions, min_length, replace=False)
group1_positives = group1_positives[:min_length]
group2_positives = np.random.choice(group2_positives, min_length, replace=False)

# Afficher les dimensions des données égalisées
print("Dimensions des données égalisées :")
print("Group 1 Predictions:", group1_predictions.shape)
print("Group 2 Predictions:", group2_predictions.shape)
print("Group 1 Positives:", group1_positives.shape)
print("Group 2 Positives:", group2_positives.shape)
print("Protected Attribute:", protected_attribute.shape)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [29]:
from scipy.stats import chi2_contingency, chi2

# Créer un tableau de contingence conditionnelle
contingency_table = np.array([group1_predictions, group2_predictions, group1_positives, group2_positives])

# Effectuer le test de Mantel-Haenszel
chi2, p, _, _ = chi2_contingency(contingency_table, lambda_="log-likelihood")

# Degré de liberté
df = (contingency_table.shape[0] - 1) * (contingency_table.shape[1] - 1)

# Calculer la valeur critique
critical_value = chi2.ppf(0.95, df)

# Interpréter le résultat
if chi2 > critical_value:
    print("La parité statistique conditionnelle n'est pas respectée.")
else:
    print("Pas de preuve significative de violation de la parité statistique conditionnelle.")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4,) + inhomogeneous part.