In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import seaborn as sns

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../src')


from fairsgod.losses import FairODLoss
from fairsgod.fairod import OutlierDetector
from fairsgod.fairod_sg import SGOutlierDetector
from fairsgod import evaluation
from sklearn.metrics import roc_auc_score

In [4]:
# load adult
adult_df = pd.read_csv('../datasets/proc/crafted_adult.csv')

y_pv = adult_df[['OUTLIER','sex']]
X = adult_df.drop(columns=['OUTLIER', 'sex'])

X_train, X_test, y_pv_train, y_pv_test = train_test_split(X, y_pv, test_size=0.3, shuffle=True)

pv_test = y_pv_test['sex']
pv_train = y_pv_train['sex']
y_train = y_pv_train['OUTLIER']
y_test = y_pv_test['OUTLIER']

## SG-AE Method

In [5]:
from fairsgod.losses import FairODLoss
from fairsgod.fairod import OutlierDetector
from fairsgod.fairod_sg import SGOutlierDetector
from fairsgod import evaluation

### Bayesian optimization 

In [6]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [7]:
class BayesWrapper():
    def __init__(self, *args, **kwargs):
        self. model = SGOutlierDetector(epsilon=1e-6)
        
    def fit(self, X, y, batch_size=256, epochs=5, val_X=None, val_pv=None, stopping_after=None):
        pv = X.loc[:, 'pv']
        X = X.drop('pv', axis=1)
        self.model.fit(X, pv, batch_size, epochs)
        
    def score(self, X, y):
        y_pred =  self.model.predict_scores(X.drop('pv', axis=1))
        return roc_auc_score(y, y_pred)
    
    def get_params(self, *args, **kwargs):
        return self.model.get_params(*args, **kwargs)

    def set_params(self, **params):
        self.model = self.model.set_params(**params)
        return self

In [8]:
# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
        BayesWrapper(),#SGOutlierDetector(epsilon=1e-6),#, alpha=0.5, gamma=0.5),
        {
            #'epsilon': Real(1e-6, 0.9, prior='log-uniform'), # 0.01, 
            'lambda_se': Real(.01, 100, prior='log-uniform'),
            'a': Real(5, 20, prior='log-uniform'),
            'lambda_a': Real(20.0, 100.0, prior='log-uniform'),
            'alpha': Real(.01, .6, prior='log-uniform'),
            'gamma': Real(.001, 1, prior='log-uniform')
        },
        #scoring=scoring_fn,
        n_iter=32,
        random_state=0,
        fit_params= dict(
              batch_size=512, epochs=3, val_X=None, val_pv=None
        ),
        verbose=1,
        cv=2
)

In [9]:
optim_train_x = X_train.copy()
optim_train_x['pv'] = pv_train

In [10]:
# executes bayesian optimization
_ = opt.fit(optim_train_x, y_train)
# model can be saved, used for predictions or scoring
#print(opt.score(X_test, y_test))

Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fi

RuntimeError: Cannot clone object <__main__.BayesWrapper object at 0x7fc544b62c50>, as the constructor either does not set or modifies parameter lambda_se

In [11]:
opt.best_params_

OrderedDict([('a', 19.109134674877264),
             ('alpha', 0.22144480473574865),
             ('gamma', 0.01908303557687294),
             ('lambda_a', 26.905164720107628),
             ('lambda_se', 0.08998819465893838)])

In [12]:
opt.best_score_

0.8153913393471426

In [13]:
model = SGOutlierDetector(epsilon=0.001, 
                          lambda_se=0.08, 
                          lambda_a=25.0,
                          a=20, 
                          alpha=0.22, 
                          gamma=0.01)


_, _ = model.fit(X_train, pv_train, batch_size=512, epochs=6, val_X=X_test, val_pv=pv_test)

In [14]:
# save predictions
X_pred = model.predict_scores(X_test).numpy()

In [15]:
# AUC score
print("AUC score:", roc_auc_score(y_test, X_pred))
# AP ratio
print("AP RATIO ", evaluation.compute_AP_ratio(y_test, X_pred, pv_test))
# Precision ratio
print("PRECISION RATIO ", evaluation.compute_precision_ratio(y_test, X_pred, pv_test))
# Fairness metric
print("FAIRNESS METRIC  ", evaluation.compute_Fairness_metric(y_test, X_pred, pv_test).numpy())
# Group Fidelity metric
print("GROUP FIDELITY METRIC  ", evaluation.compute_GF_metric(X_pred, pv_test).numpy())

AUC score: 0.8282596962009795
AP RATIO  1.0474507142451983
PRECISION RATIO  1.1163029356479435
FAIRNESS METRIC   0.31768081883157234
GROUP FIDELITY METRIC   0.00028300422


**FAIROD metrics**
```python
AUC score: ~0.55
AP RATIO  2.5573934221167383
PRECISION RATIO  2.6830466830466833
FAIRNESS METRIC   0.6403809904497662
GROUP FIDELITY METRIC   1.0610635
````
