In [1]:
import pandas as pd
import numpy as np
import os
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, brier_score_loss, log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.linear_model import LogisticRegression

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning

In [3]:
os.chdir(r'C:\Users\decmg\OneDrive\Documentos\Material Disciplinas\MO436-IA Ética\Atividade 4')

In [4]:
import numpy as np
import pandas as pd

def statistical_parity_difference(data, sensitive_attr, predicted_label, positive_class=1):
    """
    Calcula a Statistical Parity Difference (SPD).
    """
    groups = data[sensitive_attr].unique()
    priv_group = groups[0]  # Grupo privilegiado
    unpriv_group = groups[1]  # Grupo não-privilegiado
    
    prob_priv = data[data[sensitive_attr] == priv_group][predicted_label].mean()
    prob_unpriv = data[data[sensitive_attr] == unpriv_group][predicted_label].mean()
    
    spd = prob_unpriv - prob_priv
    return spd

def disparate_impact(data, sensitive_attr, predicted_label, positive_class=1):
    """
    Calcula o Disparate Impact (DI).
    """
    groups = data[sensitive_attr].unique()
    priv_group = groups[0]  # Grupo privilegiado
    unpriv_group = groups[1]  # Grupo não-privilegiado
    
    prob_priv = data[data[sensitive_attr] == priv_group][predicted_label].mean()
    prob_unpriv = data[data[sensitive_attr] == unpriv_group][predicted_label].mean()
    
    di = prob_unpriv / prob_priv if prob_priv != 0 else 0
    return di

def average_odds_difference(data, sensitive_attr, actual_label, predicted_label, positive_class=1):
    """
    Calcula a Average Odds Difference (AOD).
    """
    groups = data[sensitive_attr].unique()
    priv_group = groups[0]
    unpriv_group = groups[1]
    
    # Verdadeiros positivos rate
    tpr_priv = data[(data[sensitive_attr] == priv_group) & (data[actual_label] == positive_class)][predicted_label].mean()
    tpr_unpriv = data[(data[sensitive_attr] == unpriv_group) & (data[actual_label] == positive_class)][predicted_label].mean()
    
    # Verdadeiros negativos rate
    fpr_priv = data[(data[sensitive_attr] == priv_group) & (data[actual_label] != positive_class)][predicted_label].mean()
    fpr_unpriv = data[(data[sensitive_attr] == unpriv_group) & (data[actual_label] != positive_class)][predicted_label].mean()
    
    aod = 0.5 * ((tpr_unpriv - tpr_priv) + (fpr_unpriv - fpr_priv))
    return aod

def equalized_odds_difference(data, sensitive_attr, actual_label, predicted_label, positive_class=1):
    """
    Calcula o Equalized Odds Difference (EOD).
    """
    groups = data[sensitive_attr].unique()
    priv_group = groups[0]
    unpriv_group = groups[1]
    
    # Verdadeiros positivos rate
    tpr_priv = data[(data[sensitive_attr] == priv_group) & (data[actual_label] == positive_class)][predicted_label].mean()
    tpr_unpriv = data[(data[sensitive_attr] == unpriv_group) & (data[actual_label] == positive_class)][predicted_label].mean()
    
    eod = tpr_unpriv - tpr_priv
    return eod


In [5]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score, log_loss, brier_score_loss, roc_auc_score
def calculate_metrics(X_train, X_val, y_train, y_val, models, threshold=0.5):
    results = []
    predictions = {}
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        

        y_pred_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None
        
        if y_pred_proba is not None:
            y_pred = (y_pred_proba >= threshold).astype(int)
        else:
            y_pred = model.predict(X_val)

        predictions[f"y_pred_{model_name}"] = y_pred

        metrics = {
            "Model": model_name,
            "Accuracy": accuracy_score(y_val, y_pred),
            "Balanced Accuracy": balanced_accuracy_score(y_val, y_pred),
            "Recall": recall_score(y_val, y_pred),
            "Precision": precision_score(y_val, y_pred),
            "F1 Score": f1_score(y_val, y_pred),
            "Risk Bayes (Log Loss)": log_loss(y_val, y_pred_proba) if y_pred_proba is not None else "N/A",
            "Brier Score": brier_score_loss(y_val, y_pred_proba) if y_pred_proba is not None else "N/A",
            "AUC": roc_auc_score(y_val, y_pred_proba) if y_pred_proba is not None else "N/A"
        }
        
        results.append(metrics)
    metrics_df = pd.DataFrame(results)
    display(metrics_df)
    return metrics_df, predictions


In [6]:
def calculate_fairness_metrics_multi(df, y_train, sensitive_columns, outcome_column_name, favorable_outcome):
    """
    Calcula as métricas de Statistical Parity, Disparate Impact e Equalized Odds Difference
    para múltiplos atributos sensíveis.

    Parâmetros:
    df (DataFrame): O dataframe contendo os dados (features).
    y_train (DataFrame ou Series): O dataframe ou série contendo os targets (rótulos).
    sensitive_columns (list): Lista com o nome das colunas sensíveis (ex: ['sex', 'race', 'native_country']).
    outcome_column_name (str): O nome que será dado à coluna de resultado binário após combinação com y_train.
    favorable_outcome: O valor considerado como resultado favorável (ex: 1 para aprovado).

    Retorna:
    DataFrame: Um dataframe contendo as métricas de Statistical Parity, Disparate Impact e EOD.
    """
    df_combined = df.copy()
    df_combined[outcome_column_name] = y_train  # Adiciona a coluna target ao DataFrame

    results = []
    
    for col in sensitive_columns:
        group_favorable_rate = df_combined.groupby(col).apply(
            lambda x: (x[outcome_column_name] == favorable_outcome).mean()
        ).reset_index(name='Favorable Rate')

        protected_rate = group_favorable_rate.loc[group_favorable_rate[col] == 1, 'Favorable Rate'].values[0]
        non_protected_rate = group_favorable_rate.loc[group_favorable_rate[col] == 0, 'Favorable Rate'].values[0]

        parity_diff = abs(protected_rate - non_protected_rate)

        disparate_impact = protected_rate / non_protected_rate if non_protected_rate > 0 else 0

        protected_true_positive_rate = df_combined[
            (df_combined[col] == 1) & (df_combined[outcome_column_name] == favorable_outcome)
        ].shape[0] / df_combined[df_combined[col] == 1].shape[0]

        non_protected_true_positive_rate = df_combined[
            (df_combined[col] == 0) & (df_combined[outcome_column_name] == favorable_outcome)
        ].shape[0] / df_combined[df_combined[col] == 0].shape[0]

        eod = abs(protected_true_positive_rate - non_protected_true_positive_rate)

        results.append({
            'Sensitive Attribute': col,
            'Protected Group': 1,
            'Non-Protected Group': 0,
            'Protected Favorable Rate': protected_rate,
            'Non-Protected Favorable Rate': non_protected_rate,
            'Statistical Parity': parity_diff,
            'Disparate Impact': disparate_impact,
            'Equalized Odds Difference (EOD)': eod
        })

    return pd.DataFrame(results)


In [7]:
x_train,x_val,y_train,y_val = pd.read_csv('X_train.csv'),pd.read_csv('X_val.csv'),pd.read_csv('y_train.csv'),pd.read_csv('y_val.csv')
y_train.drop(['Unnamed: 0'],axis=1,inplace=True),
y_val.drop(['Unnamed: 0'],axis=1,inplace=True)
x_train.drop(['Unnamed: 0'],axis=1,inplace=True)
x_val.drop(['Unnamed: 0'],axis=1,inplace=True)

### Imparcialidade antes de Treinar um Modelo (Desconsiderar o EOD nesse caso)

In [8]:
result_initial = calculate_fairness_metrics_multi(
    x_train, 
    y_train, 
    sensitive_columns=['sex', 'race', 'native_country'], 
    outcome_column_name='income', 
    favorable_outcome=0  # 0 agora é o resultado favorável (alta renda)
)
result_initial

  group_favorable_rate = df_combined.groupby(col).apply(
  group_favorable_rate = df_combined.groupby(col).apply(
  group_favorable_rate = df_combined.groupby(col).apply(


Unnamed: 0,Sensitive Attribute,Protected Group,Non-Protected Group,Protected Favorable Rate,Non-Protected Favorable Rate,Statistical Parity,Disparate Impact,Equalized Odds Difference (EOD)
0,sex,1,0,0.114186,0.31369,0.199504,0.364009,0.199504
1,race,1,0,0.15885,0.263659,0.104809,0.602482,0.104809
2,native_country,1,0,0.195528,0.254239,0.05871,0.769073,0.05871


### Carregar os modelos a serem testados

In [10]:
# Importar os modelos já treinados
import joblib
loaded_models = {}
for name in ['rf','nb','knn','SVC','lr']:
    loaded_models[name] = joblib.load(f'models//{name}_model.pkl')

In [11]:
lr_tun = loaded_models['lr']

In [12]:
lr_tun.fit(x_train, y_train)

# Avaliar o modelo no conjunto de validação
y_val_pred = lr_tun.predict(x_val)
y_val_prob = lr_tun.predict_proba(x_val)[:, 1]

  y = column_or_1d(y, warn=True)
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


In [13]:
y_val_pred,y_val_prob

(array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0.39608261, 0.58476045, 0.54731181, ..., 0.97724678, 0.87081925,
        0.99278732]))

## Técnica de Reghwiehtning - Pre processamento

In [28]:
y_train.nunique

<bound method DataFrame.nunique of        income
0           1
1           1
2           0
3           0
4           0
...       ...
25632       1
25633       0
25634       0
25635       1
25636       1

[25637 rows x 1 columns]>

In [14]:
# Garantir que y_train seja unidimensional
if isinstance(y_train, pd.DataFrame) or len(y_train.shape) > 1:
    y_train = y_train.values.ravel()

# Combinar os dados sensíveis e os rótulos em um DataFrame auxiliar
temp_df = pd.DataFrame({'sensitive': x_train['sensitive'], 'target': y_train})

# Calcular os tamanhos dos grupos (contagem por combinação de valores sensíveis e target)
group_counts = temp_df.groupby(['sensitive', 'target']).size()

# Calcular os pesos para cada grupo
total_count = len(temp_df)
group_weights = total_count / (group_counts + 1e-6)

# Mapear os pesos de volta para cada linha no DataFrame original
x_train['weights'] = temp_df.set_index(['sensitive', 'target']).index.map(group_weights)

# Confirmar os primeiros pesos calculados
print(x_train[['sensitive', 'weights']].head())


   sensitive   weights
0          0  2.156000
1          0  2.156000
2          0  4.717019
3          0  4.717019
4          0  4.717019


In [15]:

lr_tun.fit(x_train.drop(columns=['sensitive', 'weights']), y_train, sample_weight=x_train['weights'])

y_val_pred = lr_tun.predict(x_val)
accuracy = accuracy_score(y_val, y_val_pred)

print(f"Accuracy with reweighing: {accuracy:.4f}")


  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


Accuracy with reweighing: 0.7028


In [9]:
def calculate_fairness_metrics_multi_reweighted(df, y_train, sensitive_columns, outcome_column_name, favorable_outcome, weights_column):
    """
    Calcula as métricas de Statistical Parity, Disparate Impact e Equalized Odds Difference
    para múltiplos atributos sensíveis, considerando pesos reweighted.

    Parâmetros:
    df (DataFrame): O dataframe contendo os dados (features).
    y_train (DataFrame ou Series): O dataframe ou série contendo os targets (rótulos).
    sensitive_columns (list): Lista com o nome das colunas sensíveis (ex: ['sex', 'race', 'native_country']).
    outcome_column_name (str): O nome que será dado à coluna de resultado binário após combinação com y_train.
    favorable_outcome: O valor considerado como resultado favorável (ex: 1 para aprovado).
    weights_column (str): O nome da coluna que contém os pesos reweighted.

    Retorna:
    DataFrame: Um dataframe contendo as métricas de fairness considerando reweighting.
    """
    df_combined = df.copy()
    df_combined[outcome_column_name] = y_train  
    results = []
    
    for col in sensitive_columns:
        # Calcular a taxa de resultado favorável ponderada para cada grupo
        group_favorable_rate = df_combined.groupby(col).apply(
            lambda x: np.sum((x[outcome_column_name] == favorable_outcome) * x[weights_column]) /
                      np.sum(x[weights_column])
        ).reset_index(name='Favorable Rate')
        protected_rate = group_favorable_rate.loc[group_favorable_rate[col] == 1, 'Favorable Rate'].values[0]
        non_protected_rate = group_favorable_rate.loc[group_favorable_rate[col] == 0, 'Favorable Rate'].values[0]

        parity_diff = abs(protected_rate - non_protected_rate)

        disparate_impact = protected_rate / non_protected_rate if non_protected_rate > 0 else 0

        protected_true_positive_rate = np.sum(
            (df_combined[col] == 1) & (df_combined[outcome_column_name] == favorable_outcome) *
            df_combined[weights_column]
        ) / np.sum((df_combined[col] == 1) * df_combined[weights_column])

        non_protected_true_positive_rate = np.sum(
            (df_combined[col] == 0) & (df_combined[outcome_column_name] == favorable_outcome) *
            df_combined[weights_column]
        ) / np.sum((df_combined[col] == 0) * df_combined[weights_column])

        eod = abs(protected_true_positive_rate - non_protected_true_positive_rate)

        results.append({
            'Sensitive Attribute': col,
            'Protected Group': 1,
            'Non-Protected Group': 0,
            'Protected Favorable Rate': protected_rate,
            'Non-Protected Favorable Rate': non_protected_rate,
            'Statistical Parity': parity_diff,
            'Disparate Impact': disparate_impact,
            'Equalized Odds Difference (EOD)': eod
        })

    return pd.DataFrame(results)


In [20]:
x_train['weights'] = x_train['weights'] 

fairness_metrics_reweighted = calculate_fairness_metrics_multi_reweighted(
    df=x_train,
    y_train=y_train,
    sensitive_columns=['sensitive'],
    outcome_column_name='predicted',
    favorable_outcome=1,  
    weights_column='weights'
)

print(fairness_metrics_reweighted)


  Sensitive Attribute  Protected Group  Non-Protected Group  \
0           sensitive                1                    0   

   Protected Favorable Rate  Non-Protected Favorable Rate  Statistical Parity  \
0                       0.5                           0.5        2.045033e-10   

   Disparate Impact  Equalized Odds Difference (EOD)  
0               1.0                         0.088329  


  group_favorable_rate = df_combined.groupby(col).apply(


### Metodologia In-Processing Fairness


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

def fairness_penalty(predictions, sensitive_column, favorable_outcome):
    """
    Calcula a penalidade de fairness com base na Demographic Parity.
    Penalidade é proporcional à diferença entre as taxas de resultado favorável dos grupos.

    predictions: np.array
        Predições do modelo (valores binários).
    sensitive_column: np.array
        Coluna sensível (ex.: sexo, raça).
    favorable_outcome: int
        Valor considerado como resultado favorável (ex.: 1 para aprovado).
    """
    group_0 = predictions[sensitive_column == 0]
    group_1 = predictions[sensitive_column == 1]

    rate_0 = np.mean(group_0 == favorable_outcome)
    rate_1 = np.mean(group_1 == favorable_outcome)

    penalty = abs(rate_0 - rate_1)  
    return penalty

def train_model_with_fairness_penalty(X_train, y_train, sensitive_column, lr_model, lambda_fairness=1.0):
    """
    Treina um modelo ajustado para fairness, adicionando uma penalidade na loss.

    X_train: np.array
        Dados de entrada.
    y_train: np.array
        Rótulos.
    sensitive_column: np.array
        Coluna sensível (valores 0/1).
    lr_model: LogisticRegression
        Modelo de regressão logística.
    lambda_fairness: float
        Peso da penalidade de fairness na loss.
    """
    lr_model.fit(X_train, y_train)  

    predictions = lr_model.predict(X_train)


    fairness_loss = fairness_penalty(predictions, sensitive_column, favorable_outcome=1)


    combined_loss = -lr_model.score(X_train, y_train) + lambda_fairness * fairness_loss

    print(f"Fairness Loss: {fairness_loss:.4f}")
    print(f"Combined Loss: {combined_loss:.4f}")

    return lr_model


In [39]:

sensitive_column = x_train['native_country'].values  # Atributo sensível

# Modelo base
lr_model = LogisticRegression()


In [40]:
# Treinamento com regularização de fairness
trained_model = train_model_with_fairness_penalty(
    X_train=x_train,
    y_train=y_train,
    sensitive_column=sensitive_column,
    lr_model=lr_model,
    lambda_fairness=0.5  
)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fairness Loss: 0.0505
Combined Loss: -0.7859


In [43]:
# Predições no conjunto de validação

# Avaliar fairness
fairness_metrics = calculate_fairness_metrics_multi(
    df=x_val,
    y_train=y_val_pred,
    sensitive_columns=['native_country'],
    outcome_column_name='predicted',
    favorable_outcome=1
)

# Avaliar performance
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, trained_model.predict_proba(x_val)[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"\nFairness Metrics:\n{fairness_metrics}")


Accuracy: 0.8097
ROC-AUC: 0.8543

Fairness Metrics:
  Sensitive Attribute  Protected Group  Non-Protected Group  \
0      native_country                1                    0   

   Protected Favorable Rate  Non-Protected Favorable Rate  Statistical Parity  \
0                  0.742706                       0.63163            0.111076   

   Disparate Impact  Equalized Odds Difference (EOD)  
0          1.175856                         0.111076  


  group_favorable_rate = df_combined.groupby(col).apply(


## Metodologia Pos-Processing

In [34]:
def calculate_fairness_metrics_multi_reweighted(df, y_train, sensitive_columns, outcome_column_name, favorable_outcome, weights_column):
    """
    Calcula as métricas de Statistical Parity, Disparate Impact e Equal Opportunity
    para múltiplos atributos sensíveis, considerando pesos reweighted.

    Parâmetros:
    df (DataFrame): O dataframe contendo os dados (features).
    y_train (DataFrame ou Series): O dataframe ou série contendo os targets (rótulos).
    sensitive_columns (list): Lista com o nome das colunas sensíveis (ex: ['sex', 'race', 'native_country']).
    outcome_column_name (str): O nome que será dado à coluna de resultado binário após combinação com y_train.
    favorable_outcome: O valor considerado como resultado favorável (ex: 1 para aprovado).
    weights_column (str): O nome da coluna que contém os pesos reweighted.

    Retorna:
    DataFrame: Um dataframe contendo as métricas de fairness considerando reweighting.
    """
    df_combined = df.copy()
    df_combined[outcome_column_name] = y_train  # Adiciona a coluna target ao DataFrame

    results = []
    
    for col in sensitive_columns:
        # Calcular a taxa de resultado favorável ponderada para cada grupo
        group_favorable_rate = df_combined.groupby(col).apply(
            lambda x: np.sum((x[outcome_column_name] == favorable_outcome) * x[weights_column]) /
                      np.sum(x[weights_column])
        ).reset_index(name='Favorable Rate')

        # Considerar o grupo protegido como aquele cujo valor é 1
        protected_rate = group_favorable_rate.loc[group_favorable_rate[col] == 1, 'Favorable Rate'].values[0]
        non_protected_rate = group_favorable_rate.loc[group_favorable_rate[col] == 0, 'Favorable Rate'].values[0]

        # Calcular o Statistical Parity (diferença nas taxas de resultado favorável ponderadas)
        parity_diff = abs(protected_rate - non_protected_rate)

        # Calcular o Disparate Impact
        disparate_impact = protected_rate / non_protected_rate if non_protected_rate > 0 else 0

        # Calcular o Equal Opportunity (diferença nas taxas de verdadeiros positivos ponderadas)
        protected_true_positive_rate = np.sum(
            (df_combined[col] == 1) & (df_combined[outcome_column_name] == favorable_outcome) &
            (y_train == favorable_outcome) * df_combined[weights_column]
        ) / np.sum((df_combined[col] == 1) & (y_train == favorable_outcome) * df_combined[weights_column])

        non_protected_true_positive_rate = np.sum(
            (df_combined[col] == 0) & (df_combined[outcome_column_name] == favorable_outcome) &
            (y_train == favorable_outcome) * df_combined[weights_column]
        ) / np.sum((df_combined[col] == 0) & (y_train == favorable_outcome) * df_combined[weights_column])

        equal_opportunity = abs(protected_true_positive_rate - non_protected_true_positive_rate)

        results.append({
            'Sensitive Attribute': col,
            'Protected Group': 1,
            'Non-Protected Group': 0,
            'Protected Favorable Rate': protected_rate,
            'Non-Protected Favorable Rate': non_protected_rate,
            'Statistical Parity': parity_diff,
            'Disparate Impact': disparate_impact,
            'Equal Opportunity': equal_opportunity
        })

    return pd.DataFrame(results)


In [11]:
pip install fairlearn


Collecting fairlearn
  Downloading fairlearn-0.11.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.11.0-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/232.3 kB ? eta -:--:--
   --- ----------------------------------- 20.5/232.3 kB 682.7 kB/s eta 0:00:01
   -------------------------- ------------- 153.6/232.3 kB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 232.3/232.3 kB 2.8 MB/s eta 0:00:00
Installing collected packages: fairlearn
Successfully installed fairlearn-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [35]:
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler



# Modelo treinado (Logistic Regression)
model = LogisticRegression()
model.fit(x_train, y_train)

# Predições no conjunto de validação
y_val_pred = model.predict(x_val)
y_val_prob = model.predict_proba(x_val)[:, 1]  # Probabilidades para o rótulo positivo


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
from fairlearn.postprocessing import ThresholdOptimizer

# Garantir que os dados estão no formato correto
if isinstance(y_val, pd.DataFrame) or len(y_val.shape) > 1:
    y_val = y_val.values.ravel()

if isinstance(x_val['race'], pd.DataFrame) or len(x_val['race'].shape) > 1:
    sensitive_features = x_val['race'].values.ravel()
else:
    sensitive_features = x_val['race']

# Configurar e ajustar com Equalized Odds
threshold_optimizer = ThresholdOptimizer(
    estimator=model,
    constraints="equalized_odds",
    predict_method="predict_proba"
)

# Ajustar as predições no conjunto de validação
threshold_optimizer.fit(x_val, y_val, sensitive_features=sensitive_features)

# Fazer predições ajustadas
y_val_adjusted = threshold_optimizer.predict(x_val, sensitive_features=sensitive_features)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
X_val_scaled = x_val.copy()

In [37]:
from sklearn.metrics import accuracy_score, roc_auc_score
from fairlearn.postprocessing import ThresholdOptimizer

# Garantir que y_val seja 1D
if len(y_val.shape) > 1:
    y_val = y_val[:, 0]  # Selecionar apenas a primeira coluna, se necessário

# Garantir que sensitive_features esteja no formato correto
if isinstance(x_val['race'], pd.DataFrame) or len(x_val['race'].shape) > 1:
    sensitive_features = x_val['race'].values.ravel()
else:
    sensitive_features = x_val['race']

# Configurar o Equalized Odds Threshold Optimizer
threshold_optimizer = ThresholdOptimizer(
    estimator=model,  # Modelo original treinado
    constraints="equalized_odds",  # Garantir Equalized Odds
    predict_method="predict_proba"  # Usar probabilidades para ajuste
)

# Ajustar o modelo usando os dados de validação
threshold_optimizer.fit(X_val_scaled, y_val, sensitive_features=sensitive_features)

# Fazer predições ajustadas (classe ajustada)
y_val_adjusted = threshold_optimizer.predict(X_val_scaled, sensitive_features=sensitive_features)

# Fazer predições ajustadas de probabilidades
y_val_prob_adjusted = threshold_optimizer._pmf_predict(
    X_val_scaled, sensitive_features=sensitive_features
)

# Garantir que y_val_prob_adjusted seja um array 1D (somente as probabilidades da classe positiva)
if y_val_prob_adjusted.ndim > 1:
    y_val_prob_adjusted = y_val_prob_adjusted[:, 1]  # Selecionar a coluna da classe positiva

# Avaliar métricas de fairness usando as predições ajustadas
fairness_metrics_adjusted = calculate_fairness_metrics_multi(
    df=x_val,
    y_train=y_val_adjusted,  # Previsões ajustadas
    sensitive_columns=['race'],  # Atributo sensível
    outcome_column_name='adjusted_predicted',  # Nome para as predições ajustadas
    favorable_outcome=1
)

# Avaliar métricas de performance
accuracy_adjusted = accuracy_score(y_val, y_val_adjusted)
roc_auc_adjusted = roc_auc_score(y_val, y_val_prob_adjusted)

# Exibir os resultados
print(f"Adjusted Model Accuracy: {accuracy_adjusted:.4f}")
print(f"Adjusted Model ROC-AUC: {roc_auc_adjusted:.4f}")
print("\nFairness Metrics After Post-Processing:")
print(fairness_metrics_adjusted)


Adjusted Model Accuracy: 0.8309
Adjusted Model ROC-AUC: 0.6899

Fairness Metrics After Post-Processing:
  Sensitive Attribute  Protected Group  Non-Protected Group  \
0                race                1                    0   

   Protected Favorable Rate  Non-Protected Favorable Rate  Statistical Parity  \
0                  0.890263                      0.843218            0.047045   

   Disparate Impact  Equalized Odds Difference (EOD)  
0          1.055792                         0.047045  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  group_favorable_rate = df_combined.groupby(col).apply(
