In [1]:
# !pip install -r ../exp_requirements.txt

In [2]:
# !pip uninstall virny -y

In [3]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@development

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@development

In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
import os
import warnings
# Silence every warning raised in this kernel so the experiment logs stay readable.
warnings.filterwarnings('ignore')
# Export the same policy through the environment; presumably this is meant to keep
# worker subprocesses started later quiet as well -- TODO confirm.
os.environ["PYTHONWARNINGS"] = "ignore"

# Credit Dataset With Null Imputer

## Import dependencies

In [6]:
import os
import copy

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from virny.utils.custom_initializers import create_config_obj
from virny.datasets.data_loaders import CreditDataset

from configs.constants import NUM_METRICS_COMPUTATION_RUNS, EXPERIMENT_SEEDS, TEST_SET_FRACTION
from source.error_injectors.measuring import RandomNullsGenerator
from source.user_interfaces.experiment_interface import run_exp_iteration
from source.utils.custom_initializers import create_experiment_data_loader
from source.preprocessing.basic_preprocessing import get_null_imputer_preprocessor

## Initialize input variables for the experiment

In [7]:
# Experiment name (used in the results path) and the MongoDB collection name
EXPERIMENT_NAME = 'preprocessing'
DB_COLLECTION_NAME = 'preprocessing_results'

# Two levels above the current working directory (the path is left unnormalised)
ROOT_DIR = os.path.join(os.getcwd(), os.pardir, os.pardir)
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME)

In [8]:
# Load the Credit dataset; subsample_size presumably caps the number of rows -- TODO confirm
data_loader = CreditDataset(subsample_size=50_000)
# Preview the first rows of the feature frame
data_loader.X_data.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines
27099,0.343984,46,0,94.0,,3,0,0
133696,0.818239,35,1,0.177587,4898.0,5,1,0
49189,0.687057,41,1,0.462124,9200.0,7,0,2
147872,0.342368,47,2,0.427277,6400.0,9,2,2
40834,0.27571,61,0,1867.0,,16,0,1


In [9]:
# Count the missing values per column of the loaded data, before any nulls are injected
data_loader.full_df.isna().sum()

SeriousDlqin2yrs                           0
RevolvingUtilizationOfUnsecuredLines       0
age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                           9964
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      1306
dtype: int64

### Define a db writer and custom fields to insert into your database

In [None]:
from source.db_functions import connect_to_mongodb

# connect_to_mongodb returns three values: the client (closed at the end of the notebook),
# a second value that is not needed here, and the writer callback passed to run_exp_iteration.
client, _, db_writer_func = connect_to_mongodb(DB_COLLECTION_NAME)

In [10]:
import uuid

# Custom fields inserted into the database with the results: a fresh id for this
# session and the preprocessing technique, which each experiment iteration fills in.
custom_table_fields_dct = dict(
    session_uuid=str(uuid.uuid4()),
    preprocessing_technique=None,
)
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  22ffa685-b892-474d-91f5-091cde823d4e


### Create a metrics computation config object

In [11]:
config_yaml_path = 'experiment_config.yaml'
age_range = list(range(31))  # 0..30 inclusive, rendered into the sensitive attribute below

# The YAML lines starting with '#' keep alternative settings for reference; note that
# the f-string still interpolates NUM_METRICS_COMPUTATION_RUNS inside that YAML comment.
config_yaml_content = f"""
dataset_name: Credit
bootstrap_fraction: 0.8
# n_estimators: 50
n_estimators: 10
# num_runs: {NUM_METRICS_COMPUTATION_RUNS}
num_runs: 2
sensitive_attributes_dct: {{'age': {age_range}}}
"""

# Persist the config so it can be loaded from config_yaml_path
with open(config_yaml_path, mode='w', encoding='utf-8') as yaml_file:
    yaml_file.write(config_yaml_content)

In [12]:
# Build the metrics computation config object from the YAML file at config_yaml_path
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

### Define tuning parameters for models

In [13]:
def get_models_params_for_tuning(models_tuning_seed):
    """
    Build the hyper-parameter search space for the models to tune.

    Parameters
    ----------
    models_tuning_seed
        Value passed as ``random_state`` to every estimator created here.

    Returns
    -------
    dict
        Maps a model name to a dict holding a fresh estimator under ``'model'``
        and its parameter grid under ``'params'``. Insertion order is
        DecisionTreeClassifier first, then LogisticRegression.
    """
    decision_tree_entry = {
        'model': DecisionTreeClassifier(random_state=models_tuning_seed),
        'params': {
            "max_depth": [20, 30],
            "min_samples_split": [0.1],
            "max_features": ['sqrt'],
            "criterion": ["gini", "entropy"],
        },
    }
    logistic_regression_entry = {
        'model': LogisticRegression(random_state=models_tuning_seed),
        'params': {
            'penalty': ['l2'],
            'C': [0.0001, 0.1, 1, 100],
            'solver': ['newton-cg', 'lbfgs'],
            'max_iter': [250],
        },
    }

    models_params = {
        'DecisionTreeClassifier': decision_tree_entry,
        'LogisticRegression': logistic_regression_entry,
    }

    # Currently disabled models; uncomment an assignment to add it to the search space.
    # models_params['RandomForestClassifier'] = {
    #     'model': RandomForestClassifier(random_state=models_tuning_seed),
    #     'params': {
    #         "max_depth": [6, 10],
    #         "min_samples_leaf": [1],
    #         "n_estimators": [50, 100],
    #         "max_features": [0.6]
    #     }
    # }
    # models_params['XGBClassifier'] = {
    #     'model': XGBClassifier(random_state=models_tuning_seed, verbosity=0),
    #     'params': {
    #         'learning_rate': [0.1],
    #         'n_estimators': [200],
    #         'max_depth': [5, 7],
    #         'lambda':  [10, 100]
    #     }
    # }
    # models_params['KNeighborsClassifier'] = {
    #     'model': KNeighborsClassifier(),
    #     'params': {
    #         'n_neighbors' : [5, 7, 9, 11, 13, 15, 25],
    #         'weights' : ['uniform', 'distance'],
    #         'metric' : ['minkowski', 'euclidean', 'manhattan']
    #     }
    # }
    # models_params['MLPClassifier'] = {
    #     'model': MLPClassifier(random_state=models_tuning_seed),
    #     'params': {
    #         'hidden_layer_sizes':[(100,), (100,100,), (100,50,100,)],
    #         'activation': ['logistic', 'tanh', 'relu'],
    #         'solver': ['lbfgs', 'sgd', 'adam'],
    #         'learning_rate': ['constant', 'invscaling', 'adaptive']
    #     }
    # }

    return models_params

## Run experiments

### Experiment iteration 1

In [14]:
# Configs for an experiment iteration
exp_iter_num = 1
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
columns_nulls_percentage_dct = {
    'NumberOfTimes90DaysLate': 0.05,
    'NumberRealEstateLoansOrLines': 0.1,
    'NumberOfTime60-89DaysPastDueNotWorse': 0.1,
}
generator = RandomNullsGenerator(experiment_seed, columns_nulls_percentage_dct)

custom_table_fields_dct['preprocessing_technique'] = 'cat: mode, num: median'
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'
models_params_for_tuning = get_models_params_for_tuning(experiment_seed)
# Create a transformed data loader
exp_iter_data_loader = create_experiment_data_loader(data_loader, generator)
preprocessor = get_null_imputer_preprocessor(exp_iter_data_loader)

In [15]:
# Run the experiment iteration. Per the logged output, this preprocesses the dataset,
# tunes the models and then computes metrics over multiple runs.
multiple_run_metrics_dct = run_exp_iteration(
    data_loader=exp_iter_data_loader,
    experiment_seed=experiment_seed,
    test_set_fraction=TEST_SET_FRACTION,
    db_writer_func=db_writer_func,
    preprocessor=preprocessor,
    models_params_for_tuning=models_params_for_tuning,
    metrics_computation_config=metrics_computation_config,
    custom_table_fields_dct=custom_table_fields_dct,
    with_tuning=True,
    save_results_dir_path=SAVE_RESULTS_DIR_PATH,
    tuned_params_df_path=None,
)

2023-04-07 01:12:14 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 100,
 'experiment_iteration': 'Exp_iter_1',
 'model_init_seed': 100,
 'preprocessing_technique': 'cat: mode, num: median',
 'session_uuid': '22ffa685-b892-474d-91f5-091cde823d4e'}




2023-04-07 01:12:14 experiment_interface.py INFO    : The dataset is preprocessed


2023/04/07, 01:12:14: Tuning DecisionTreeClassifier...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
2023/04/07, 01:12:16: Tuning for DecisionTreeClassifier is finished [F1 score = 0.4829009142796696, Accuracy = 0.9338666666666665]

2023/04/07, 01:12:16: Tuning LogisticRegression...
Fitting 3 folds for each of 8 candidates, totalling 24 fits


2023-04-07 01:12:16 experiment_interface.py INFO    : Models are tuned and saved to a file
2023-04-07 01:12:17 experiment_interface.py INFO    : Connected to MongoDB


2023/04/07, 01:12:16: Tuning for LogisticRegression is finished [F1 score = 0.5016224277219355, Accuracy = 0.9344]



Multiple runs progress:   0%|          | 0/2 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
# Preview the metrics of the first model in the tuning dict (dict insertion order)
first_model_name = list(models_params_for_tuning)[0]
sample_model_metrics_df = multiple_run_metrics_dct[first_model_name]
sample_model_metrics_df.head(20)

Unnamed: 0,Metric,overall,age_priv,age_dis,Model_Seed,Model_Name,Model_Params,Run_Number,Dataset_Name,Num_Estimators
0,Mean,0.935559,0.89295,0.93881,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
1,Std,0.024406,0.036197,0.023507,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
2,IQR,0.032097,0.048222,0.030867,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
3,Entropy,0.0,0.0,0.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
4,Jitter,0.00922,0.019339,0.008448,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
5,Per_Sample_Accuracy,0.93309,0.871791,0.937768,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
6,Label_Stability,0.98966,0.978279,0.990528,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
7,TPR,0.0,0.0,0.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
8,TNR,1.0,1.0,1.0,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10
9,PPV,,,,101,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",Run_1,Credit,10


### Experiment iteration 2

In [None]:
# Close the client returned by connect_to_mongodb earlier in the notebook
client.close()