In [1]:
from ebm import *

# Adult dataset

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

# Load Adult dataset from UCI
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

df = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)
df.dropna(inplace=True)

# Preprocess data
df['sex'] = LabelEncoder().fit_transform(df['sex'])  # Male=1, Female=0
df['income'] = (df['income'] == '>50K').astype(int)  # Binary classification

# Select key features
features = ['age', 'education-num', 'hours-per-week', 'sex', 'capital-gain']
X = df[features]
y = df['income']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create gender masks using sample weights
male_weights = (X_train['sex'] == 1).astype(float)
female_weights = (X_train['sex'] == 0).astype(float)

print(np.sum(male_weights))
print(np.sum(female_weights))

# Train gender-specific EBM models
male_model = EBMClassifier(n_cycles=1000, learning_rate=0.1, n_bins=256)
male_model.fit(X_train, y_train, sample_weight=male_weights)
male_model.set_feature_names(features)

female_model = EBMClassifier(n_cycles=1000, learning_rate=0.1, n_bins=256)
female_model.fit(X_train, y_train, sample_weight=female_weights)
female_model.set_feature_names(features)

normal_model = EBMClassifier(n_cycles=1000, learning_rate=0.1, n_bins=256)
normal_model.fit(X_train, y_train)
normal_model.set_feature_names(features)

# Evaluate performance
def evaluate_model(model, X, y, name):
    preds = model.predict(X)
    acc = accuracy_score(y, preds)
    print(f"\n{name} Model:")
    print(f"Overall Accuracy: {acc:.2f}")
    print("Gender Performance:")
    print(f"Male Accuracy: {accuracy_score(y[X['sex']==1], preds[X['sex']==1]):.2f}")
    print(f"Female Accuracy: {accuracy_score(y[X['sex']==0], preds[X['sex']==0]):.2f}")

print("=== Test Set Performance ===")
evaluate_model(male_model, X_test, y_test, "Male-Trained")
evaluate_model(female_model, X_test, y_test, "Female-Trained")
evaluate_model(normal_model, X_test, y_test, "Normal")

# Analyze sex feature contributions
print("\nSex Feature Contributions:")
print("Male Model:", male_model.feature_graphs[features.index('sex')][1])
print("Female Model:", female_model.feature_graphs[features.index('sex')][1])

# Visualize feature impacts
%matplotlib widget
vis = EBMVisualizer([male_model, female_model, normal_model], model_names=["Male-Trained", "Female-Trained", "Normal"])

17403.0
8645.0
=== Test Set Performance ===

Male-Trained Model:
Overall Accuracy: 0.82
Gender Performance:
Male Accuracy: 0.80
Female Accuracy: 0.86

Female-Trained Model:
Overall Accuracy: 0.80
Gender Performance:
Male Accuracy: 0.75
Female Accuracy: 0.90

Normal Model:
Overall Accuracy: 0.81
Gender Performance:
Male Accuracy: 0.76
Female Accuracy: 0.91

Sex Feature Contributions:
Male Model: [-0.07756571]
Female Model: [-0.12225092]


HBox(children=(VBox(children=(Dropdown(description='Feature:', options=(('age', 0), ('education-num', 1), ('ho…

# German Dataset

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load German Credit Dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
columns = [
    'checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount',
    'savings_account', 'employment', 'installment_rate', 'personal_status_sex',
    'other_debtors', 'present_residence', 'property', 'age', 'other_installment_plans',
    'housing', 'existing_credits', 'job', 'num_maintenance', 'telephone', 'foreign_worker', 'target'
]

df = pd.read_csv(url, sep=' ', names=columns, header=None)

# Preprocessing
# Create binary sex feature (Male=1, Female=0)
df['sex'] = df['personal_status_sex'].apply(lambda x: 1 if x in ['A91', 'A93', 'A94'] else 0)

# Convert target to binary (Good credit=1, Bad credit=0)
df['target'] = df['target'].replace({1: 1, 2: 0})

# Select and encode features
features = ['age', 'sex', 'credit_amount', 'duration', 'checking_status', 'savings_account']
categorical_features = ['checking_status', 'savings_account']

# Label encode categorical features
le = LabelEncoder()
for col in categorical_features:
    df[col] = le.fit_transform(df[col])

X = df[features]
y = df['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create gender-specific sample weights
male_weights = (X_train['sex'] == 1).astype(float)
female_weights = (X_train['sex'] == 0).astype(float)

# Train models
male_model = EBMClassifier(n_cycles=1000, learning_rate=0.1, n_bins=256)
male_model.fit(X_train, y_train, sample_weight=male_weights)
male_model.set_feature_names(features)

female_model = EBMClassifier(n_cycles=1000, learning_rate=0.1, n_bins=256)
female_model.fit(X_train, y_train, sample_weight=female_weights)
female_model.set_feature_names(features)

normal_model = EBMClassifier(n_cycles=1000, learning_rate=0.1, n_bins=256)
normal_model.fit(X_train, y_train)
normal_model.set_feature_names(features)

print("=== Test Set Performance ===")
evaluate_model(male_model, X_test, y_test, "Male")
evaluate_model(female_model, X_test, y_test, "Female")
evaluate_model(normal_model, X_test, y_test, "Normal")

# Analyze feature contributions
print("Sex Feature Contributions:")
print(f"Male Model: {male_model.feature_graphs[features.index('sex')][1]}")
print(f"Female Model: {female_model.feature_graphs[features.index('sex')][1]}")

# Visualize differences
vis = EBMVisualizer([male_model, female_model, normal_model], model_names=["Male-Trained", "Female-Trained", "Normal"])

=== Test Set Performance ===

Male Model:
Overall Accuracy: 0.70
Gender Performance:
Male Accuracy: 0.71
Female Accuracy: 0.70

Female Model:
Overall Accuracy: 0.73
Gender Performance:
Male Accuracy: 0.72
Female Accuracy: 0.79

Normal Model:
Overall Accuracy: 0.77
Gender Performance:
Male Accuracy: 0.78
Female Accuracy: 0.75
Sex Feature Contributions:
Male Model: [0.04577506]
Female Model: [0.03801522]


HBox(children=(VBox(children=(Dropdown(description='Feature:', options=(('age', 0), ('sex', 1), ('credit_amoun…