In [61]:
# !pip install -r ../exp_requirements.txt

In [62]:
# !pip uninstall virny -y

In [63]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@development

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@development

In [64]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the local `source` / `configs` packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
import os
import warnings

# PYTHONWARNINGS is set in the process environment, which child processes inherit
# (presumably so that any worker processes started later stay quiet as well).
os.environ.update(PYTHONWARNINGS="ignore")
# Silence every warning category inside this kernel.
warnings.simplefilter("ignore")

# COMPAS Dataset With Different Proportions Generation

## Import dependencies

In [66]:
import os

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from virny.utils.custom_initializers import create_config_obj
from virny.datasets.data_loaders import CompasDataset

from source.generators.sampling import ProportionsGenerator
from source.preprocessing import get_null_imputer_preprocessor
from source.experiment_interface import run_exp_iteration
from source.custom_initializers import create_experiment_data_loader
from configs.constants import NUM_METRICS_COMPUTATION_RUNS, EXPERIMENT_SEEDS

## Initialize input variables for the experiment

In [67]:
# Name of this experiment; also used as the sub-directory for its results.
EXPERIMENT_NAME = 'diff_proportions'
# Two directory levels above the kernel's current working directory
# (assumed to be the repository root — TODO confirm).
ROOT_DIR = os.path.join(os.getcwd(), os.pardir, os.pardir)
# <ROOT_DIR>/results/<EXPERIMENT_NAME>
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME)

In [68]:
# Load the COMPAS dataset through Virny's CompasDataset loader
# and preview the first rows of its feature frame.
data_loader = CompasDataset()
data_loader.X_data.head()

Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,race,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,sex
0,25,0.0,-2.340451,1.0,-15.010999,African-American,1,0,0,0,1,1
1,26,0.0,0.0,0.0,0.0,Caucasian,1,0,0,1,0,0
2,21,0.0,0.0,0.0,0.0,Caucasian,0,0,1,1,0,1
3,29,0.0,0.0,0.0,6.0,African-American,1,0,0,0,1,1
4,40,0.0,0.0,0.0,7.513697,Caucasian,1,0,0,1,0,1


In [69]:
# Count missing values per column of the full frame (features plus the `recidivism` column).
data_loader.full_df.isna().sum()

age                        0
juv_fel_count              0
juv_misd_count             0
juv_other_count            0
priors_count               0
age_cat_25 - 45            0
age_cat_Greater than 45    0
age_cat_Less than 25       0
c_charge_degree_F          0
c_charge_degree_M          0
race                       0
sex                        0
recidivism                 0
dtype: int64

In [70]:
# Group sizes of the `race` column before any subsampling is applied.
data_loader.X_data.race.value_counts()

African-American    3175
Caucasian           2103
Name: race, dtype: int64

### Define a db writer and custom fields to insert into your database

In [71]:
import uuid

# Extra fields attached to the results of this notebook session.
# `new_proportions_dct` is a placeholder that each experiment iteration fills in.
custom_table_fields_dct = dict(
    session_uuid=str(uuid.uuid4()),
    new_proportions_dct=None,
)
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  4c5b84eb-f720-4314-97a1-8a8924432c51


### Create a metrics computation config object

In [72]:
# The config is written to a YAML file in the current working directory,
# and the same path is later passed to create_config_obj.
config_yaml_path = 'experiment_config.yaml'

# NOTE: n_estimators and num_runs are hard-coded to small values here; the larger
# settings are kept as YAML comments inside the template (the commented num_runs
# line still interpolates NUM_METRICS_COMPUTATION_RUNS because this is an f-string).
config_yaml_content = (
f"""
dataset_name: COMPAS
bootstrap_fraction: 0.8
# n_estimators: 50
n_estimators: 10
# num_runs: {NUM_METRICS_COMPUTATION_RUNS}
num_runs: 2
sensitive_attributes_dct: {{'sex': 0, 'race': 'Caucasian', 'sex&race': None}}
"""
)

with open(config_yaml_path, mode='w', encoding='utf-8') as config_file:
    config_file.write(config_yaml_content)

In [73]:
# Build the metrics computation config with Virny's create_config_obj
# from the YAML file located at config_yaml_path.
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

### Define tuning parameters for models

In [74]:
def get_models_params_for_tuning(models_tuning_seed):
    """
    Build the hyper-parameter search space for every model that is tuned.

    Parameters
    ----------
    models_tuning_seed
        Value passed as ``random_state`` to each estimator.

    Returns
    -------
    dict
        Maps a model name to a dict with two keys: 'model' (an unfitted
        estimator) and 'params' (candidate values for each hyper-parameter).
        Only DecisionTreeClassifier and LogisticRegression are enabled; the
        other grids are kept below as comments.
    """
    decision_tree_entry = dict(
        model=DecisionTreeClassifier(random_state=models_tuning_seed),
        params={
            "max_depth": [20, 30],
            "min_samples_split": [0.1],
            "max_features": ['sqrt'],
            "criterion": ["gini", "entropy"],
        },
    )
    logistic_regression_entry = dict(
        model=LogisticRegression(random_state=models_tuning_seed),
        params={
            'penalty': ['l2'],
            'C': [0.0001, 0.1, 1, 100],
            'solver': ['newton-cg', 'lbfgs'],
            'max_iter': [250],
        },
    )

    return {
        'DecisionTreeClassifier': decision_tree_entry,
        'LogisticRegression': logistic_regression_entry,
        # Disabled search spaces, kept for reference:
        # 'RandomForestClassifier': {
        #     'model': RandomForestClassifier(random_state=models_tuning_seed),
        #     'params': {
        #         "max_depth": [6, 10],
        #         "min_samples_leaf": [1],
        #         "n_estimators": [50, 100],
        #         "max_features": [0.6]
        #     }
        # },
        # 'XGBClassifier': {
        #     'model': XGBClassifier(random_state=models_tuning_seed, verbosity=0),
        #     'params': {
        #         'learning_rate': [0.1],
        #         'n_estimators': [200],
        #         'max_depth': [5, 7],
        #         'lambda':  [10, 100]
        #     }
        # },
        # 'KNeighborsClassifier': {
        #     'model': KNeighborsClassifier(),
        #     'params': {
        #         'n_neighbors' : [5, 7, 9, 11, 13, 15, 25],
        #         'weights' : ['uniform', 'distance'],
        #         'metric' : ['minkowski', 'euclidean', 'manhattan']
        #     }
        # },
        # 'MLPClassifier': {
        #     'model': MLPClassifier(random_state=models_tuning_seed),
        #     'params': {
        #         'hidden_layer_sizes':[(100,), (100,100,), (100,50,100,)],
        #         'activation': ['logistic', 'tanh', 'relu'],
        #         'solver': ['lbfgs', 'sgd', 'adam'],
        #         'learning_rate': ['constant', 'invscaling', 'adaptive']
        #     }
        # }
    }

## Run experiments

### Experiment iteration 1

In [75]:
# Settings for this experiment iteration
exp_iter_num = 1
column_for_subsampling = 'race'
new_proportions_dct = {'African-American': 0.5, 'Caucasian': 0.5}

# Iterations are numbered from 1 while EXPERIMENT_SEEDS is indexed from 0.
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
generator = ProportionsGenerator(experiment_seed, column_for_subsampling, new_proportions_dct)
models_params_for_tuning = get_models_params_for_tuning(experiment_seed)

# Record the iteration-specific values next to the session-level custom fields.
custom_table_fields_dct.update(
    new_proportions_dct=new_proportions_dct,
    experiment_iteration=f'Exp_iter_{exp_iter_num}',
)

# Data loader derived from the original one and the proportions generator.
exp_iter_data_loader = create_experiment_data_loader(data_loader, generator)
# preprocessor = get_null_imputer_preprocessor(exp_iter_data_loader)

In [76]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


def _make_dense_one_hot_encoder():
    """
    Create a OneHotEncoder that ignores unknown categories and returns dense arrays.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new keyword is tried first and the old
    one is used only when the installed version rejects the new keyword.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# One-hot encode the loader's categorical columns and standardise its numerical columns.
preprocessor = ColumnTransformer(transformers=[
    ('categorical_features', _make_dense_one_hot_encoder(), exp_iter_data_loader.categorical_columns),
    ('numerical_features', StandardScaler(), exp_iter_data_loader.numerical_columns),
])

African-American    3175
Caucasian           2103
Name: race, dtype: int64

In [77]:
# Group sizes of the subsampling column in the data loader produced by the proportions generator.
exp_iter_data_loader.X_data[column_for_subsampling].value_counts()

African-American    2103
Caucasian           2103
Name: race, dtype: int64

In [78]:
# NOTE(review): this cell repeats the previous value_counts() check, and the output saved
# with it is an experiment run log rather than value counts — the notebook appears to have
# been edited after it was last executed; re-run it top to bottom to refresh the outputs.
exp_iter_data_loader.X_data[column_for_subsampling].value_counts()

2023-04-08 14:26:29 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:
2023-04-08 14:26:30 experiment_interface.py INFO    : The dataset is preprocessed


{'dataset_split_seed': 100,
 'experiment_iteration': 'Exp_iter_1',
 'model_init_seed': 100,
 'new_proportions_dct': {'African-American': 0.5, 'Caucasian': 0.5},
 'session_uuid': '4c5b84eb-f720-4314-97a1-8a8924432c51'}


2023/04/08, 14:26:30: Tuning DecisionTreeClassifier...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
2023/04/08, 14:26:30: Tuning for DecisionTreeClassifier is finished [F1 score = 0.6494160498115268, Accuracy = 0.6591448931116389]

2023/04/08, 14:26:30: Tuning LogisticRegression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits


2023-04-08 14:26:30 experiment_interface.py INFO    : Models are tuned and saved to a file
2023-04-08 14:26:30 experiment_interface.py INFO    : Connected to MongoDB


2023/04/08, 14:26:30: Tuning for LogisticRegression is finished [F1 score = 0.6593012815786539, Accuracy = 0.668646080760095]



Multiple runs progress:   0%|          | 0/2 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

In [79]:
# Preview the first rows of the subsampled feature frame.
# NOTE(review): the output saved with this cell is a metrics table, not this frame —
# it looks stale; re-run the notebook top to bottom to refresh it.
exp_iter_data_loader.X_data.head()

Unnamed: 0,Metric,overall,sex_priv,sex_dis,race_priv,race_dis,sex&race_priv,sex&race_dis,Model_Seed,Model_Name,Model_Params,Run_Number,Dataset_Name,Num_Estimators
0,Mean,0.549036,0.611194,0.53129,0.598277,0.499325,0.650081,0.483727,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
1,Std,0.105445,0.118112,0.101828,0.099041,0.111909,0.109967,0.107856,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
2,IQR,0.136486,0.154499,0.131344,0.128274,0.144777,0.146269,0.139828,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
3,Entropy,0.0,0.312711,0.0,0.209328,0.0,0.227392,0.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
4,Jitter,0.210583,0.229115,0.205293,0.154242,0.267462,0.167094,0.257738,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
5,Per_Sample_Accuracy,0.6462,0.638503,0.648397,0.669504,0.622673,0.630769,0.616369,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
6,Label_Stability,0.707601,0.691979,0.712061,0.793853,0.620525,0.776923,0.629167,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
7,TPR,0.497268,0.388889,0.52381,0.352201,0.608696,0.205128,0.609195,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
8,TNR,0.806723,0.869565,0.786704,0.863636,0.735849,0.907692,0.709877,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10
9,PPV,0.664234,0.651163,0.666667,0.608696,0.692308,0.571429,0.69281,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,COMPAS,10


In [80]:
# Run one experiment iteration on the subsampled data loader.
# with_tuning=True is combined with tuned_params_df_path=None, i.e. no file of
# previously tuned parameters is supplied (presumably models are tuned from
# scratch using models_params_for_tuning — confirm in run_exp_iteration).
multiple_run_metrics_dct = run_exp_iteration(
    data_loader=exp_iter_data_loader,
    experiment_seed=experiment_seed,
    db_collection_name='diff_proportions_results',
    preprocessor=preprocessor,
    models_params_for_tuning=models_params_for_tuning,
    metrics_computation_config=metrics_computation_config,
    custom_table_fields_dct=custom_table_fields_dct,
    with_tuning=True,
    save_results_dir_path=SAVE_RESULTS_DIR_PATH,
    tuned_params_df_path=None,
)

In [80]:
# Take the metrics stored under the first model name in models_params_for_tuning
# and show its first 20 rows.
# (The assignment used to be spelled `vsample_model_metrics_df`, so the next line
# raised NameError on a fresh kernel.)
sample_model_metrics_df = multiple_run_metrics_dct[list(models_params_for_tuning.keys())[0]]
sample_model_metrics_df.head(20)

### Experiment iteration 2