## Setting Sex, Race, Race+Sex, Age+Education as protected attributes

In [11]:
import torch
import pandas as pd
import numpy as np
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from sklearn.preprocessing import StandardScaler
import torch.nn as nn

class FCNN(nn.Module):
    """Fully connected binary classifier with a single hidden layer.

    Architecture: ``Linear(input_dim, hidden_dim) -> ReLU ->
    Linear(hidden_dim, 1) -> Sigmoid``. The output has one value per
    sample, squashed by the sigmoid.
    """

    def __init__(self, input_dim, hidden_dim=64):
        """Create the two linear layers.

        Args:
            input_dim (int): Number of input features per sample.
            hidden_dim (int): Width of the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the sigmoid-activated output of the network for ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden).sigmoid()

def get_device():
    """
    Pick the torch device to run on: CUDA when torch reports it as
    available, otherwise the CPU.
    """
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

def load_model(model_path):
    """
    Load a PyTorch FCNN model with safe device handling

    The layer sizes are not passed in; they are read from the shape of the
    saved ``fc1.weight`` tensor, so the checkpoint must be a state dict that
    contains that key.

    Args:
        model_path (str): Path to a file holding an FCNN ``state_dict``.

    Returns:
        tuple: ``(model, device)`` - the model in eval mode and the
        ``torch.device`` it was placed on. If moving the model to CUDA
        fails, a warning is printed and the CPU is used instead.

    Raises:
        Exception: Any error raised while loading or building the model is
            printed and then re-raised unchanged.
    """
    try:
        # First load to CPU so a checkpoint saved on a GPU machine still opens.
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        state_dict = torch.load(model_path, map_location='cpu')

        # fc1.weight is (hidden_dim, input_dim) for nn.Linear(input_dim, hidden_dim)
        input_dim = state_dict['fc1.weight'].shape[1]
        hidden_dim = state_dict['fc1.weight'].shape[0]

        # Create model with matching dimensions and load the weights on CPU
        model = FCNN(input_dim=input_dim, hidden_dim=hidden_dim)
        model.load_state_dict(state_dict)
        model.eval()

        # Move to GPU if available, falling back to CPU on failure
        device = get_device()
        if device.type == 'cuda':
            try:
                model = model.to(device)
            except RuntimeError as e:
                # Report the fallback instead of swallowing it silently
                print(f"Warning: Could not move model to CUDA. Using CPU instead. Error: {e}")
                device = torch.device('cpu')
                # A failed transfer may have moved some parameters already;
                # bring everything back so the model sits on one device.
                model = model.cpu()

        return model, device

    except Exception as e:
        print(f"Error in load_model: {str(e)}")
        raise

def prepare_adult_dataset(expected_features=14, protected_attribute='sex',
                          data_path='../data/raw/adult.csv'):
    """
    Prepare Adult dataset with proper formatting for fairness analysis

    The returned frame holds the standardized numerical features, a binary
    protected-attribute column named ``protected_attribute``, enough one-hot
    encoded categorical columns to reach ``expected_features``, and the
    binary ``income`` label as the last column. All columns are float32.

    Column order: numerical features, protected attribute, one-hot columns.
    If the one-hot expansion overshoots ``expected_features`` the frame is
    truncated, and in that case the protected attribute is moved to the
    first column.

    NOTE(review): features are matched to the model by count only; confirm
    that the resulting columns and their order match what the model was
    trained on.

    Args:
        expected_features (int): Number of features the model expects
        protected_attribute (str): Which attribute to use as protected ('sex', 'race',
                                 'race+sex', 'age+education')
        data_path (str): Location of the header-less Adult CSV file.

    Returns:
        pandas.DataFrame: feature columns followed by ``income``.

    Raises:
        ValueError: If ``protected_attribute`` is not supported or
            ``expected_features`` is smaller than 1.
    """
    # Each supported option maps to the derived binary column that encodes it
    # (1 = privileged group).
    attribute_columns = {
        'sex': 'sex_binary',
        'race': 'race_binary',
        'race+sex': 'race_sex',
        'age+education': 'age_edu',
    }
    # Validate before touching the file so bad arguments fail fast.
    if protected_attribute not in attribute_columns:
        raise ValueError(f"Unsupported protected attribute: {protected_attribute}")
    if expected_features < 1:
        # A non-positive count would make the truncation slice below wrap around.
        raise ValueError(f"expected_features must be at least 1, got {expected_features}")

    # Define column names (the file has no header row)
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                   'marital-status', 'occupation', 'relationship', 'race',
                   'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                   'native-country', 'income']

    # Load data
    df = pd.read_csv(data_path,
                     header=None,
                     names=column_names,
                     skipinitialspace=True)

    # Clean the data: strip stray whitespace from every string column
    string_columns = df.select_dtypes(include=['object']).columns
    for col in string_columns:
        df[col] = df[col].str.strip()

    # Ensure income is properly encoded (1 for '>50K', else 0)
    df['income'] = (df['income'].str.contains('>50K')).astype(int)

    numerical_features = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                         'capital-loss', 'hours-per-week']

    # Group masks are computed on the raw values, i.e. before scaling below.
    is_male = df['sex'] == 'Male'
    is_white = df['race'] == 'White'
    older = df['age'] >= 40
    # NOTE(review): in the usual UCI Adult coding HS-grad is 9 and 12 is
    # Assoc-acdm, so this threshold reads as "associate degree or higher"
    # rather than "high school grad or higher" - confirm the intended cut-off.
    higher_edu = df['education-num'] >= 12

    # 1. Sex (binary)
    df['sex_binary'] = is_male.astype(int)
    # 2. Race (binary: White vs non-White)
    df['race_binary'] = is_white.astype(int)
    # 3. Race + Sex (intersectional): privileged = White and Male
    df['race_sex'] = (is_white & is_male).astype(int)
    # 4. Age + Education (intersectional): privileged = older with higher education
    df['age_edu'] = (older & higher_edu).astype(int)

    # Scale numerical features
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    # Expose the selected binary attribute under the requested name. For
    # 'sex' and 'race' this overwrites the original string column, which is
    # no longer needed at this point.
    df[protected_attribute] = df[attribute_columns[protected_attribute]]

    # Start with numerical features plus the selected protected attribute
    base_features = numerical_features + [protected_attribute]
    features = df[base_features]

    # Add one-hot encoded categorical features, one source column at a time,
    # until the expected width is reached. 'race' and 'sex' are deliberately
    # not in this list, so they are never encoded a second time.
    categorical_features = ['workclass', 'education', 'marital-status', 'occupation',
                           'relationship', 'native-country']
    for cat_feature in categorical_features:
        if features.shape[1] >= expected_features:
            break
        cat_encoded = pd.get_dummies(df[cat_feature], prefix=cat_feature)
        features = pd.concat([features, cat_encoded], axis=1)

    # If we have too many features, keep the protected attribute and fill the
    # remaining slots with the other columns in their current order.
    if features.shape[1] > expected_features:
        other_cols = [c for c in features.columns if c != protected_attribute]
        features = features[[protected_attribute] + other_cols[:expected_features - 1]]

    # Label goes last
    final_df = pd.concat([features, df['income']], axis=1)

    print(f"\nDataset shape for {protected_attribute}: {final_df.shape}")

    # Running out of categorical columns leaves the frame too narrow; say so
    # here rather than leaving it to a shape error inside the model.
    n_features = final_df.shape[1] - 1  # -1 for 'income'
    if n_features != expected_features:
        print(f"Warning: built {n_features} features but {expected_features} were expected")

    # Convert all columns to float32 for PyTorch compatibility
    final_df = final_df.astype('float32')

    return final_df

def compute_fairness_metrics(model, device, dataset, protected_attribute, privileged_groups=None):
    """
    Compute various fairness metrics using AIF360 and Fairlearn

    The model is run over every row of ``dataset`` (all columns except
    'income', in column order), its outputs are thresholded at 0.5, and the
    resulting predictions are compared with the true 'income' labels.

    Args:
        model: Callable torch module returning one score per input row.
        device (torch.device): Device the model lives on.
        dataset (pandas.DataFrame): Features plus an 'income' label column
            and a binary ``protected_attribute`` column (1 = privileged).
        protected_attribute (str): Name of the protected-attribute column.
        privileged_groups (list[dict] | None): AIF360 privileged-group spec;
            defaults to ``[{protected_attribute: 1}]``.

    Returns:
        dict: 'disparate_impact', 'statistical_parity_difference',
        'average_odds_difference', 'equal_opportunity_difference' and
        'theil_index' from AIF360's ClassificationMetric, plus
        'demographic_parity_difference' and 'equalized_odds_difference'
        from Fairlearn.
    """
    if privileged_groups is None:
        privileged_groups = [{protected_attribute: 1}]

    # Ensure all data is float32
    dataset = dataset.astype('float32')

    # Convert to AIF360 format (this dataset carries the true labels)
    aif_dataset = BinaryLabelDataset(
        df=dataset,
        label_names=['income'],
        protected_attribute_names=[protected_attribute],
        privileged_protected_attributes=[[1]]
    )

    # Prepare input features
    X = torch.FloatTensor(dataset.drop('income', axis=1).values)

    # Move to appropriate device; on failure run everything on the CPU
    if device.type == 'cuda':
        try:
            X = X.to(device)
        except RuntimeError as e:
            print(f"Warning: Could not move input to CUDA. Using CPU instead. Error: {e}")
            X = X.cpu()
            model = model.cpu()
            device = torch.device('cpu')

    # Process in batches
    batch_size = 1000
    predictions = []

    with torch.no_grad():
        for i in range(0, len(X), batch_size):
            batch = X[i:i + batch_size]
            # reshape(-1) instead of squeeze(): squeeze() collapses a one-row
            # batch to a 0-d array, which np.concatenate cannot join.
            pred = model(batch).reshape(-1).cpu().numpy()
            predictions.append(pred)

    y_pred = (np.concatenate(predictions) > 0.5).astype(float)
    y_true = dataset['income'].values

    # Calculate fairlearn metrics
    dem_parity = demographic_parity_difference(
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=dataset[protected_attribute]
    )

    eq_odds = equalized_odds_difference(
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=dataset[protected_attribute]
    )

    # Same rows as aif_dataset, but with the labels replaced by model predictions
    classified_dataset = aif_dataset.copy()
    classified_dataset.labels = y_pred.reshape(-1, 1)

    # Compare true labels (aif_dataset) with predictions (classified_dataset)
    classification_metric = ClassificationMetric(
        aif_dataset,
        classified_dataset,
        unprivileged_groups=[{protected_attribute: 0}],
        privileged_groups=privileged_groups
    )

    return {
        'disparate_impact': classification_metric.disparate_impact(),
        'statistical_parity_difference': classification_metric.statistical_parity_difference(),
        'demographic_parity_difference': dem_parity,
        'equalized_odds_difference': eq_odds,
        'average_odds_difference': classification_metric.average_odds_difference(),
        'equal_opportunity_difference': classification_metric.equal_opportunity_difference(),
        'theil_index': classification_metric.theil_index()
    }

def analyze_model_fairness(model_path, protected_attribute='sex'):
    """
    Load one model, build a matching Adult dataset and score its fairness.

    Args:
        model_path (str): Path to the model file
        protected_attribute (str): Which attribute to use as protected

    Returns:
        The metrics dictionary produced by ``compute_fairness_metrics``,
        with the group whose protected attribute equals 1 as privileged.
    """
    print(f"\nAnalyzing model for {protected_attribute} fairness: {model_path}")

    model, device = load_model(model_path)

    # The second dimension of fc1's weight is the number of input features.
    input_dim = model.fc1.weight.size(1)
    print(f"Model expects input dimension: {input_dim}")
    print(f"Using device: {device}")

    # Build a dataset whose feature count matches the model input.
    adult_df = prepare_adult_dataset(
        expected_features=input_dim,
        protected_attribute=protected_attribute,
    )

    privileged = [{protected_attribute: 1}]
    return compute_fairness_metrics(
        model,
        device,
        adult_df,
        protected_attribute=protected_attribute,
        privileged_groups=privileged,
    )

def print_fairness_report(metrics, model_name, protected_attribute):
    """
    Print a formatted report of fairness metrics

    Each metric is printed with its value (four decimals) and a verdict.
    Verdict rules: disparate impact is "Fair" inside [0.8, 1.25]; the
    difference metrics are "Fair" when their absolute value is at most 0.1;
    the Theil index is "Fair" when at most 0.2. Metrics without a rule are
    reported as "Unknown".

    Args:
        metrics (dict): Mapping of metric name to numeric value.
        model_name (str): Label shown in the report title.
        protected_attribute (str): Protected attribute shown in the title.
    """
    rule = "-" * 60
    print(f"\nFairness Report for {model_name} - Protected: {protected_attribute}")
    print(rule)
    print(f"{'Metric':<30} {'Value':<10} {'Interpretation'}")
    print(rule)

    # Predicates that decide whether a metric value counts as fair
    difference_metrics = (
        'statistical_parity_difference',
        'demographic_parity_difference',
        'equalized_odds_difference',
        'average_odds_difference',
        'equal_opportunity_difference',
    )
    fairness_checks = {name: (lambda x: abs(x) <= 0.1) for name in difference_metrics}
    fairness_checks['disparate_impact'] = lambda x: 0.8 <= x <= 1.25
    fairness_checks['theil_index'] = lambda x: x <= 0.2

    for metric, value in metrics.items():
        check = fairness_checks.get(metric)
        if check is None:
            interp = "Unknown"
        else:
            interp = "Fair" if check(value) else "Unfair"
        # Same field widths as the header so the columns line up, including
        # for negative values.
        print(f"{metric.replace('_', ' ').title():<30} {value:<10.4f} {interp}")

    print(rule)

# Create comparative analysis across all models and attributes
def run_comprehensive_analysis():
    """
    Analyze every model/attribute combination and print a report for each.

    Reads the module-level names ``model_paths`` (list of model file paths)
    and ``protected_attributes`` (list of attribute names), which must be
    defined before this function is called. A failure for one combination
    is printed with its traceback and does not stop the remaining ones.

    Returns:
        dict: ``{model_name: {attribute: metrics}}`` for every combination
        that succeeded, where ``model_name`` is the last path component.
    """
    import traceback

    results = {}

    for model_path in model_paths:
        model_name = model_path.split('/')[-1]
        results[model_name] = {}

        for attr in protected_attributes:
            try:
                print(f"\n===== Analyzing: {model_name} with {attr} =====")
                fairness_metrics = analyze_model_fairness(model_path, protected_attribute=attr)
                print_fairness_report(fairness_metrics, model_name, attr)
                results[model_name][attr] = fairness_metrics
            except Exception as e:
                # Keep going: one broken model or attribute should not abort the run
                print(f"Error analyzing {model_name} with {attr}: {str(e)}")
                print(traceback.format_exc())

    # Hand the collected metrics back so callers can build their own
    # cross-model comparison instead of losing them.
    return results


In [14]:

# Example usage for multiple models and protected attributes.
# run_comprehensive_analysis() reads these two module-level names, so they
# must be assigned before it is called.
model_paths = [
    '../models/baseline/fcnn_model_adult_income.pth',
    '../models/debiased/adv_fcnn_model_adult_sex.pth',
    '../models/debiased/fair_demographic_parity_fcnn_model_adult_sex.pth'
]

# Only 'sex' is evaluated in this run. To evaluate every attribute supported
# by prepare_adult_dataset, use instead:
# protected_attributes = ['sex', 'race', 'race+sex', 'age+education']
protected_attributes = ['sex']

# Run the comprehensive analysis (prints one fairness report per model/attribute)
run_comprehensive_analysis()


===== Analyzing: fcnn_model_adult_income.pth with sex =====

Analyzing model for sex fairness: ../models/baseline/fcnn_model_adult_income.pth
Model expects input dimension: 14
Using device: cpu

Dataset shape for sex: (32561, 15)

Fairness Report for fcnn_model_adult_income.pth - Protected: sex
------------------------------------------------------------
Metric                         Value      Interpretation
------------------------------------------------------------
Disparate Impact               0.4959    Unfair
Statistical Parity Difference  -0.0295    Fair
Demographic Parity Difference  0.0295    Fair
Equalized Odds Difference      0.0304    Fair
Average Odds Difference        0.0155    Fair
Equal Opportunity Difference   0.0304    Fair
Theil Index                    0.2221    Unfair
------------------------------------------------------------

===== Analyzing: adv_fcnn_model_adult_sex.pth with sex =====

Analyzing model for sex fairness: ../models/debiased/adv_fcnn_model_adult

In [13]:

# Models compared in this run; run_comprehensive_analysis() reads this
# module-level list together with protected_attributes below.
model_paths = [
    '../models/baseline/fcnn_model_adult_income.pth',
    '../models/distillation/fcnn_student_model_adult_income_4.pth',
    '../models/distillation/fcnn_student_model_adult_income_8.pth'
]

# Only 'sex' is evaluated in this run. To evaluate every attribute supported
# by prepare_adult_dataset, use instead:
# protected_attributes = ['sex', 'race', 'race+sex', 'age+education']
protected_attributes = ['sex']

# Run the comprehensive analysis (prints one fairness report per model/attribute)
run_comprehensive_analysis()


===== Analyzing: fcnn_model_adult_income.pth with sex =====

Analyzing model for sex fairness: ../models/baseline/fcnn_model_adult_income.pth
Model expects input dimension: 14
Using device: cpu

Dataset shape for sex: (32561, 15)

Fairness Report for fcnn_model_adult_income.pth - Protected: sex
------------------------------------------------------------
Metric                         Value      Interpretation
------------------------------------------------------------
Disparate Impact               0.4959    Unfair
Statistical Parity Difference  -0.0295    Fair
Demographic Parity Difference  0.0295    Fair
Equalized Odds Difference      0.0304    Fair
Average Odds Difference        0.0155    Fair
Equal Opportunity Difference   0.0304    Fair
Theil Index                    0.2221    Unfair
------------------------------------------------------------

===== Analyzing: fcnn_student_model_adult_income_4.pth with sex =====

Analyzing model for sex fairness: ../models/distillation/fcnn_st

In [9]:
# Inspect which parameter tensors the checkpoint stores. load_model reads the
# layer sizes from the 'fc1.weight' entry, so that key must be present.
# NOTE(security): torch.load unpickles the file; only open trusted checkpoints.
state_dict = torch.load('../models/baseline/teacher_model_adult.pth', map_location='cpu')
print(state_dict.keys())

odict_keys(['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias'])


In [12]:
# Diagnostic code: check whether the raw CSV has a header row.
# read_csv is called without header=None here, so pandas takes the first line
# of the file as the column names; if the printed names look like data values,
# the file has no header (prepare_adult_dataset passes header=None and
# explicit names for that reason).
df = pd.read_csv('../data/raw/adult.csv')
print("Available columns:")
print(df.columns.tolist())

Available columns:
['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married', ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0', ' 40', ' United-States', ' <=50K']
