In [54]:
# !pip install -r ../exp_requirements.txt

In [55]:
# !pip uninstall virny -y

In [56]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@development

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@development

In [57]:
# Show matplotlib figures inline in the cell output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to the local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
import os
import warnings

# Keep the notebook output free of warning noise.
# PYTHONWARNINGS is exported as well; the interpreter reads it at start-up, so
# Python processes spawned from this kernel presumably inherit the same setting.
os.environ["PYTHONWARNINGS"] = "ignore"
# Ignore every warning raised inside this kernel.
warnings.filterwarnings('ignore')

# Credit Dataset With Null Imputer

## Import dependencies

In [59]:
import os
import copy

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from virny.utils.custom_initializers import create_config_obj
from virny.datasets.data_loaders import CreditDataset

from source.preprocessing import get_null_imputer_preprocessor
from source.experiment_interface import run_exp_iteration
from configs.constants import NUM_METRICS_COMPUTATION_RUNS, EXPERIMENT_SEEDS

## Initialize input variables for the experiment

In [60]:
# Name of the experiment; also used as the sub-folder that receives its results.
EXPERIMENT_NAME = 'preprocessing'
# Two levels above the current working directory -- presumably the repository
# root when the notebook is started from its own folder (TODO confirm).
ROOT_DIR = os.path.join(os.getcwd(), "..", "..")
# Resolves to <ROOT_DIR>/results/<EXPERIMENT_NAME>.
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME)

In [61]:
# Load the Credit dataset through virny's loader.
# NOTE(review): subsample_size presumably caps the number of sampled rows -- confirm in CreditDataset.
data_loader = CreditDataset(subsample_size=50_000)
# Preview the first rows of the feature frame.
data_loader.X_data.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines
1,0.766127,45,2,0.802982,9120.0,13,0,6
2,0.957151,40,0,0.121876,2600.0,4,0,0
3,0.65818,38,1,0.085113,3042.0,2,1,0
4,0.23381,30,0,0.03605,3300.0,5,0,0
5,0.907239,49,1,0.024926,63588.0,7,0,1


In [62]:
# Count missing values per column of the full dataframe to see which columns contain nulls.
data_loader.full_df.isna().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

### Define custom fields to insert into your database

In [63]:
import uuid

# Extra fields handed to the experiment runner together with the metrics.
# 'session_uuid' is a random UUID4 generated once per execution of this cell;
# 'preprocessing_technique' is a placeholder filled in by each experiment iteration.
custom_table_fields_dct = dict(
    session_uuid=str(uuid.uuid4()),
    preprocessing_technique=None,
)
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  091fd7d4-98f1-4512-bebd-5652f739aa3d


### Create a metrics computation config object

In [64]:
# The metrics computation settings are written to a YAML file in the current
# working directory so that they can be loaded from that path afterwards.
config_yaml_path = 'experiment_config.yaml'

# Ages 0..30 (inclusive) are the values listed for the sensitive attribute 'age'.
age_range = list(range(31))

# NOTE: the "# num_runs: {...}" line sits inside the f-string, so the constant is
# still interpolated and ends up in the file as a YAML comment; the effective
# value is the hard-coded `num_runs: 2` on the following line.
config_yaml_content = f"""
dataset_name: Credit
bootstrap_fraction: 0.8
n_estimators: 50  # Better to input the higher number of estimators than 100; this is only for this use case example
# num_runs: {NUM_METRICS_COMPUTATION_RUNS}
num_runs: 2
sensitive_attributes_dct: {{'age': {age_range}}}
"""

with open(config_yaml_path, 'w', encoding='utf-8') as config_file:
    config_file.write(config_yaml_content)

In [65]:
# Build the config object from the YAML file at config_yaml_path.
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

### Define tuning parameters for models

In [66]:
def get_models_params_for_tuning(models_tuning_seed):
    """
    Assemble the models to tune together with their hyper-parameter grids.

    Parameters
    ----------
    models_tuning_seed
        Value passed as ``random_state`` to every estimator created here.

    Returns
    -------
    dict
        Maps a model name to a dict with two entries: ``'model'`` (a new,
        unfitted estimator) and ``'params'`` (candidate values for each
        hyper-parameter). Keys are inserted in the order
        DecisionTreeClassifier, LogisticRegression.
    """
    decision_tree_grid = {
        "max_depth": [20, 30],
        "min_samples_split": [0.1],
        "max_features": ['sqrt'],
        "criterion": ["gini", "entropy"],
    }
    logistic_regression_grid = {
        'penalty': ['l2'],
        'C': [0.0001, 0.1, 1, 100],
        'solver': ['newton-cg', 'lbfgs'],
        'max_iter': [250],
    }

    models_config = {}
    models_config['DecisionTreeClassifier'] = {
        'model': DecisionTreeClassifier(random_state=models_tuning_seed),
        'params': decision_tree_grid,
    }
    models_config['LogisticRegression'] = {
        'model': LogisticRegression(random_state=models_tuning_seed),
        'params': logistic_regression_grid,
    }

    # Models below are currently disabled; uncomment an entry to include it in tuning.
    # models_config['RandomForestClassifier'] = {
    #     'model': RandomForestClassifier(random_state=models_tuning_seed),
    #     'params': {
    #         "max_depth": [6, 10],
    #         "min_samples_leaf": [1],
    #         "n_estimators": [50, 100],
    #         "max_features": [0.6]
    #     }
    # }
    # models_config['XGBClassifier'] = {
    #     'model': XGBClassifier(random_state=models_tuning_seed, verbosity=0),
    #     'params': {
    #         'learning_rate': [0.1],
    #         'n_estimators': [200],
    #         'max_depth': [5, 7],
    #         'lambda':  [10, 100]
    #     }
    # }
    # models_config['KNeighborsClassifier'] = {
    #     'model': KNeighborsClassifier(),
    #     'params': {
    #         'n_neighbors' : [5, 7, 9, 11, 13, 15, 25],
    #         'weights' : ['uniform', 'distance'],
    #         'metric' : ['minkowski', 'euclidean', 'manhattan']
    #     }
    # }
    # models_config['MLPClassifier'] = {
    #     'model': MLPClassifier(random_state=models_tuning_seed),
    #     'params': {
    #         'hidden_layer_sizes':[(100,), (100,100,), (100,50,100,)],
    #         'activation': ['logistic', 'tanh', 'relu'],
    #         'solver': ['lbfgs', 'sgd', 'adam'],
    #         'learning_rate': ['constant', 'invscaling', 'adaptive']
    #     }
    # }

    return models_config

## Run experiments

### Experiment iteration 1

In [67]:
# Settings for this experiment iteration
exp_iter_num = 1
# Iterations are numbered from 1, while EXPERIMENT_SEEDS is indexed from 0.
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
# Label the run with its preprocessing technique and iteration name.
custom_table_fields_dct.update({
    'preprocessing_technique': 'cat: mode, num: median',
    'experiment_iteration': f'Exp_iter_{exp_iter_num}',
})
# Each iteration works on its own deep copy of the loader to avoid data leakage.
exp_iter_data_loader = copy.deepcopy(data_loader)
preprocessor = get_null_imputer_preprocessor(exp_iter_data_loader)
models_params_for_tuning = get_models_params_for_tuning(experiment_seed)

In [68]:
# Run the full experiment iteration. with_tuning=True requests model tuning in
# this run, so no file with previously tuned parameters is passed
# (tuned_params_df_path=None). The result is indexed by model name in the next cell.
multiple_run_metrics_dct = run_exp_iteration(
    data_loader=exp_iter_data_loader,
    experiment_seed=experiment_seed,
    preprocessor=preprocessor,
    models_params_for_tuning=models_params_for_tuning,
    metrics_computation_config=metrics_computation_config,
    custom_table_fields_dct=custom_table_fields_dct,
    with_tuning=True,
    save_results_dir_path=SAVE_RESULTS_DIR_PATH,
    tuned_params_df_path=None,
)

2023-03-28 17:30:06 experiment_interface.py INFO    : Start an experiment iteration for the following custom params: None

2023-03-28 17:30:06 experiment_interface.py INFO    : The dataset is preprocessed


{'experiment_iteration': 'Exp_iter_1',
 'preprocessing_technique': 'cat: mode, num: median',
 'session_uuid': '091fd7d4-98f1-4512-bebd-5652f739aa3d'}
2023/03/28, 17:30:06: Tuning DecisionTreeClassifier...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
2023/03/28, 17:30:08: Tuning for DecisionTreeClassifier is finished [F1 score = 0.4827554309900644, Accuracy = 0.9333222222222223]

2023/03/28, 17:30:08: Tuning LogisticRegression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits


2023-03-28 17:30:10 experiment_interface.py INFO    : Models are tuned and saved to a file
2023-03-28 17:30:10 experiment_interface.py INFO    : Connected to MongoDB


2023/03/28, 17:30:10: Tuning for LogisticRegression is finished [F1 score = 0.49653573582902033, Accuracy = 0.9334444444444444]



Multiple runs progress:   0%|          | 0/2 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

##############################  [Model 1 / 2] Analyze DecisionTreeClassifier  ##############################
Model seed:  101




2023-03-28 17:30:10 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/50 [00:00<?, ?it/s]





2023-03-28 17:30:13 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-03-28 17:30:38 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics






##############################  [Model 2 / 2] Analyze LogisticRegression  ##############################
Model seed:  101




2023-03-28 17:31:04 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/50 [00:00<?, ?it/s]





2023-03-28 17:31:23 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-03-28 17:31:46 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics








Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

##############################  [Model 1 / 2] Analyze DecisionTreeClassifier  ##############################
Model seed:  102




2023-03-28 17:32:09 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/50 [00:00<?, ?it/s]





2023-03-28 17:32:13 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-03-28 17:32:37 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics






##############################  [Model 2 / 2] Analyze LogisticRegression  ##############################
Model seed:  102




2023-03-28 17:33:01 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/50 [00:00<?, ?it/s]





2023-03-28 17:33:21 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-03-28 17:33:46 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics
2023-03-28 17:34:11 experiment_interface.py INFO    : Metrics are computed








In [69]:
# Show the metrics of the first configured model as a quick sanity check.
first_model_name = list(models_params_for_tuning)[0]
sample_model_metrics_df = multiple_run_metrics_dct[first_model_name]
sample_model_metrics_df.head(20)

Unnamed: 0,Metric,overall,age_priv,age_dis,Model_Seed,Model_Name,Model_Params,Run_Number,Dataset_Name,Num_Estimators
0,Mean,0.932808,0.895988,0.93562,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
1,Std,0.024318,0.032874,0.023665,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
2,IQR,0.035657,0.051088,0.034479,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
3,Entropy,0.0,0.0,0.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
4,Jitter,0.008712,0.015042,0.008229,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
5,Per_Sample_Accuracy,0.933358,0.892058,0.936511,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
6,Label_Stability,0.989676,0.982049,0.990258,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
7,TPR,0.0,0.0,0.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
8,TNR,1.0,1.0,1.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50
9,PPV,,,,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,50


### Experiment iteration 2