# Performance showcase of added "to_sql" functionality in mlinspect

Here, the performance of the proposed SQL-based inspection is compared to that of the original pandas-based one. Parts of
the "healthcare" and "compas" pipelines are used.

## Required packages:
See: requirements/requirements.txt and requirements/requirements.dev.txt

## Some parameters you might want to set:

In [1]:
import os
import sys
import time

# Make the parent of the current working directory importable, so that the
# project-local imports further down in this cell can be resolved.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from mlinspect.utils import get_project_root
from mlinspect import PipelineInspector, OperatorType
from mlinspect.inspections import HistogramForColumns, RowLineage, MaterializeFirstOutputRows
from mlinspect.checks import NoBiasIntroducedFor, NoIllegalFeatures
from demo.feature_overview.no_missing_embeddings import NoMissingEmbeddings
from example_pipelines.healthcare import custom_monkeypatching
from mlinspect.to_sql.dbms_connectors.postgresql_connector import PostgresqlConnector

from mlinspect.to_sql.dbms_connectors.umbra_connector import UmbraConnector


# DBMS related:
# Every setting keeps the value originally hard-coded here as its default, but
# can be overridden through an environment variable named "MLINSPECT_<NAME>".
# This way credentials do not have to be edited into (and committed with) the
# notebook. Ports are converted to int because environment values are strings.
UMBRA_USER = os.environ.get("MLINSPECT_UMBRA_USER", "postgres")
UMBRA_PW = os.environ.get("MLINSPECT_UMBRA_PW", "")
UMBRA_DB = os.environ.get("MLINSPECT_UMBRA_DB", "")
UMBRA_PORT = int(os.environ.get("MLINSPECT_UMBRA_PORT", "5433"))
UMBRA_HOST = os.environ.get("MLINSPECT_UMBRA_HOST", "/tmp/")

POSTGRES_USER = os.environ.get("MLINSPECT_POSTGRES_USER", "luca")
POSTGRES_PW = os.environ.get("MLINSPECT_POSTGRES_PW", "password")
POSTGRES_DB = os.environ.get("MLINSPECT_POSTGRES_DB", "healthcare_benchmark")
POSTGRES_PORT = int(os.environ.get("MLINSPECT_POSTGRES_PORT", "5432"))
POSTGRES_HOST = os.environ.get("MLINSPECT_POSTGRES_HOST", "localhost")

# The project root is resolved once; all pipeline paths are built relative to it.
_PROJECT_ROOT = str(get_project_root())

# Example pipelines as shipped in "example_pipelines":
_EXAMPLE_PIPELINES = os.path.join(_PROJECT_ROOT, "example_pipelines")
HEALTHCARE_FILE_PY = os.path.join(_EXAMPLE_PIPELINES, "healthcare", "healthcare.py")
COMPAS_FILE_PY = os.path.join(_EXAMPLE_PIPELINES, "compas", "compas.py")
ADULT_SIMPLE_FILE_PY = os.path.join(_EXAMPLE_PIPELINES, "adult_simple", "adult_simple.py")
ADULT_COMPLEX_FILE_PY = os.path.join(_EXAMPLE_PIPELINES, "adult_complex", "adult_complex.py")

# No model training:
_PIPELINES_FOR_TESTS = os.path.join(_PROJECT_ROOT, "test", "monkeypatchingSQL", "pipelines_for_tests")
HEALTHCARE_FILE_PY_R = os.path.join(_PIPELINES_FOR_TESTS, "healthcare", "healthcare_res.py")
COMPAS_FILE_PY_R = os.path.join(_PIPELINES_FOR_TESTS, "compas", "compas_res.py")
ADULT_SIMPLE_FILE_PY_R = os.path.join(_PIPELINES_FOR_TESTS, "adult_simple", "adult_simple_res.py")
ADULT_COMPLEX_FILE_PY_R = os.path.join(_PIPELINES_FOR_TESTS, "adult_complex", "adult_complex_res.py")

# Column names handed to the benchmark as the "bias" argument, one list per pipeline.
HEALTHCARE_BIAS = ["age_group", "race"]
COMPAS_BIAS = ["sex", "race"]
ADULT_SIMPLE_BIAS = ["race"]

## Benchmark setup:

In [2]:
def run_inspection(file_location, bias, to_sql, dbms_connector=None, mode=None, materialize=None):
    """
    Run the mlinspect inspection for one pipeline file and plot the extracted DAG.

    Args:
        file_location: Path of the pipeline ``.py`` file to inspect.
        bias: Columns intended for the ``NoBiasIntroducedFor`` check. Currently unused,
            because all checks and inspections are commented out below.
        to_sql: If truthy, the pipeline is run with ``execute_in_sql``; otherwise with
            ``execute``.
        dbms_connector: Forwarded to ``execute_in_sql``; ignored when ``to_sql`` is falsy.
        mode: Forwarded to ``execute_in_sql``; ignored when ``to_sql`` is falsy.
        materialize: Forwarded to ``execute_in_sql``; ignored when ``to_sql`` is falsy.

    Returns:
        Always ``None``: the textual bias report is disabled (see the commented-out code
        at the end of the function).
    """
    from PIL import Image
    import matplotlib.pyplot as plt
    from mlinspect.visualisation import save_fig_to_path

    inspector_builder = (
        PipelineInspector
        .on_pipeline_from_py_file(file_location)
        .add_custom_monkey_patching_module(custom_monkeypatching)
        # Currently disabled checks and inspections:
        # .add_check(NoBiasIntroducedFor(bias))
        # .add_check(NoIllegalFeatures())
        # .add_check(NoMissingEmbeddings())
        # .add_required_inspection(RowLineage(5))
        # .add_required_inspection(MaterializeFirstOutputRows(5))
    )

    if to_sql:
        inspector_result = inspector_builder.execute_in_sql(dbms_connector=dbms_connector, mode=mode,
                                                            materialize=materialize)
    else:
        inspector_result = inspector_builder.execute()

    extracted_dag = inspector_result.dag
    # NOTE: the DAG image is written to this one file for every pipeline, so each run
    # overwrites the image of the previous one.
    filename = os.path.join(str(get_project_root()), "demo", "feature_overview", "healthcare.png")
    save_fig_to_path(extracted_dag, filename)
    # Image.open keeps the file open lazily; the context manager guarantees it is closed
    # again. imshow is called inside the block so the pixel data is read while it is open.
    with Image.open(filename) as dag_image:
        plt.imshow(dag_image)

    # Only consumed by the disabled report below.
    check_results = inspector_result.check_to_check_results
    # no_bias_check_result = check_results[NoBiasIntroducedFor(bias)]

    # distribution_changes_overview_df = NoBiasIntroducedFor.get_distribution_changes_overview_as_df(
    #     no_bias_check_result)
    # result = ""
    # result += distribution_changes_overview_df.to_markdown()

    # for i in list(no_bias_check_result.bias_distribution_change.items()):
    #     _, join_distribution_changes = i
    #     for column, distribution_change in join_distribution_changes.items():
    #         result += "\n"
    #         result += f"\033[1m Column '{column}'\033[0m"
    #         result += distribution_change.before_and_after_df.to_markdown()

    # print(result)
    return None

## Benchmark of default inspection using CTEs:

In [3]:
# Umbra connection (currently disabled):
# dbms_connector_u = UmbraConnector(dbname=UMBRA_DB, user=UMBRA_USER, password=UMBRA_PW, port=UMBRA_PORT,
#                                   host=UMBRA_HOST)

# PostgreSQL connection, built from the POSTGRES_* settings of the setup cell.
dbms_connector_p = PostgresqlConnector(
    host=POSTGRES_HOST,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DB,
    user=POSTGRES_USER,
    password=POSTGRES_PW,
)

def run_for_all(file_location, bias, mode="", materialize=None):
    """
    Time ``run_inspection`` for one pipeline: first without SQL, then with PostgreSQL.

    The SQL run uses the module-level ``dbms_connector_p``. Both durations are printed
    in seconds.

    Args:
        file_location: Path of the pipeline ``.py`` file, forwarded to ``run_inspection``.
        bias: Forwarded to ``run_inspection``.
        mode: Forwarded to the SQL run only.
        materialize: Forwarded to the SQL run only.
    """
    # perf_counter is used instead of time.time: it is monotonic and has the highest
    # available resolution, so system clock adjustments cannot distort the measurement.
    start = time.perf_counter()
    run_inspection(file_location=file_location, bias=bias, to_sql=False)
    elapsed = time.perf_counter() - start
    print("\nTime spend with original (pandas): " + str(elapsed))

    start = time.perf_counter()
    run_inspection(file_location=file_location, bias=bias, to_sql=True, dbms_connector=dbms_connector_p, mode=mode,
                   materialize=materialize)
    elapsed = time.perf_counter() - start
    print("\nTime spend with modified SQL inspections (PSQL): " + str(elapsed))

    # Umbra run (currently disabled):
    # if not materialize:  # Materialized not supported by Umbra -> main-memory performance
    #     start = time.perf_counter()
    #     run_inspection(file_location=file_location, bias=bias, to_sql=True, dbms_connector=dbms_connector_u,
    #                    mode=mode, materialize=materialize)
    #     elapsed = time.perf_counter() - start
    #     print("\nTime spend with modified SQL inspections (Umbra): " + str(elapsed))


## End-to-End example of the preprocessing-pipeline inspection + model training:

Slightly different inspection results are expected because of the random split. Still, the resulting model accuracy should
be similar.

In [4]:
# End-to-end healthcare pipeline, mode "VIEW" with materialize=False.
run_for_all(file_location=HEALTHCARE_FILE_PY, bias=HEALTHCARE_BIAS, mode="VIEW", materialize=False)

XXXXXXXXXXXXXXXXXXXXXX HIER IST DER MODIFIED CODE XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
from mlinspect.instrumentation._pipeline_executor import set_code_reference_call, set_code_reference_subscript, monkey_patch, undo_monkey_patch
monkey_patch()
"""Predicting which patients are at a higher risk of complications"""
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer, MyKerasClassifier, create_model
from mlinspect.utils import get_project_root, store_timestamp
import time
from mlinspect.to_sql.dbms_connectors.postgresql_connector import PostgresqlConnector
COUNTIES_OF_INTEREST = ['county2', 'county3']
patients = pd.read_csv(os.path.join(str(get_project_root(**
    set_code_reference_call(25, 40, 25,

TypeError: execute_inspections() takes 5 positional arguments but 8 were given

In [None]:
# End-to-end healthcare pipeline, mode "VIEW" with materialize=True.
run_for_all(file_location=HEALTHCARE_FILE_PY, bias=HEALTHCARE_BIAS, mode="VIEW", materialize=True)

## Execute and inspect just the inspections:

In [5]:
# Healthcare pipeline variant without model training, mode "CTE".
run_for_all(file_location=HEALTHCARE_FILE_PY_R, bias=HEALTHCARE_BIAS, mode="CTE", materialize=None)


from mlinspect.instrumentation._pipeline_executor import set_code_reference_call, set_code_reference_subscript, monkey_patch, undo_monkey_patch
monkey_patch()
"""Predicting which patients are at a higher risk of complications"""
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer, MyKerasClassifier, create_model
from mlinspect.utils import get_project_root
COUNTIES_OF_INTEREST = ['county2', 'county3']
patients = pd.read_csv(os.path.join(str(get_project_root(**
    set_code_reference_call(20, 40, 20, 58)), **set_code_reference_call(20,
    36, 20, 59)), 'test', 'monkeypatchingSQL', 'pipelines_for_tests',
    'healthcare', 'patients.csv', **set_code_reference_call(20, 23, 21, 51)
    ), **set_code_

In [6]:
# Compas pipeline variant without model training, mode "CTE".
run_for_all(file_location=COMPAS_FILE_PY_R, bias=COMPAS_BIAS, mode="CTE", materialize=None)







from mlinspect.instrumentation._pipeline_executor import set_code_reference_call, set_code_reference_subscript, monkey_patch, undo_monkey_patch
monkey_patch()
"""
An example pipeline
"""
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, label_binarize
from mlinspect.utils import get_project_root
train_file = os.path.join(str(get_project_root(**set_code_reference_call(15,
    30, 15, 48)), **set_code_reference_call(15, 26, 15, 49)), 'test',
    'monkeypatchingSQL', 'pipelines_for_tests', 'compas',
    'compas_train.csv', **set_code_reference_call(15, 13, 15, 132))
train_data = pd.read_csv(train_file, **set_code_reference_call(16, 13, 16, 
    63, na_values='', index_col=0))
test_file = os.path.join(str(get_project_root(**set_code_reference_call(17,
    29, 17, 47)), 

In [7]:
# Adult-simple pipeline variant without model training, mode "CTE".
run_for_all(file_location=ADULT_SIMPLE_FILE_PY_R, bias=ADULT_SIMPLE_BIAS, mode="CTE", materialize=None)







pipeline start

Time spend with original (pandas): 0.06773543357849121
pipeline start

Time spend with modified SQL inspections (PSQL): 0.10215044021606445


In [8]:
# Adult-complex pipeline variant without model training, mode "CTE".
# Reuses ADULT_SIMPLE_BIAS: the setup cell defines no separate bias list for adult_complex.
run_for_all(file_location=ADULT_COMPLEX_FILE_PY_R, bias=ADULT_SIMPLE_BIAS, mode="CTE", materialize=None)







from mlinspect.instrumentation._pipeline_executor import set_code_reference_call, set_code_reference_subscript, monkey_patch, undo_monkey_patch
monkey_patch()
"""
An example pipeline
"""
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from mlinspect.utils import get_project_root
train_file = os.path.join(str(get_project_root(**set_code_reference_call(17,
    30, 17, 48)), **set_code_reference_call(17, 26, 17, 49)), 'test',
    'monkeypatchingSQL', 'pipelines_for_tests', 'adult_complex',
    'adult_train.csv', **set_code_reference_call(17, 13, 17, 138))
train_data = pd.read_csv(train_file, **set_code_reference_call(18, 13, 18, 
    63, na_values='', index_col=0))
test_file = os.path.join(str(get_project_root(**set_code_

  mode = stats.mode(array)
  mode = stats.mode(array)
  mode = stats.mode(array)
  mode = stats.mode(array)


In [1]:
# Scratch cell: bind a throwaway value.
z = 4

In [2]:
# Scratch cell: print the value bound to z in the previous cell.
print(z)

4
