In [2]:
import json
import uuid

from hana_ml.algorithms.pal.auto_ml import AutomaticClassification
from hana_ml.dataframe import ConnectionContext
from hana_ml.model_storage import ModelStorage
from hana_ml.visualizers.automl_progress import PipelineProgressStatusMonitor
from hana_ml.visualizers.automl_report import BestPipelineReport
from hana_ml.visualizers.unified_report import UnifiedReport

## Check the setup and connect to the database

In [3]:
# Execute the setup-check notebook inside this kernel.
# NOTE(review): `myconn` is used below but never assigned in this notebook, so it
# is presumably created by 010-check_setup.ipynb — confirm.
%run 010-check_setup.ipynb

SAP HANA Client for Python: 2.17.23080800
Connected to SAP HANA db version 4.00.000.00.1693299409 (fa/CE2023.16) 
at c5889dd5-e0f6-4930-8408-94d53ca61dbf.hna0.prod-us10.hanacloud.ondemand.com:443 as user CODEJAM00
Current time on the SAP HANA server: 2023-09-06 20:45:05.800000


# Tables from SAP HANA

In [4]:
# Key column and target column of the training table.
column_id = "PassengerId"
column_label = "Survived"

# Feature columns selected from the TRAIN_FE / TEST_FE tables.
features_subset = [
    "NameFirstLetter",
    "FarePerPerson",
    "SibSp",
    "ParCh",
    "Gender",
    "PClass",
    "Embarked",
    "Title",
    "Age",
    "PersonsPerTicket",
]

In [5]:
# Training frame: key column, the selected features and the label,
# with the label column cast to NVARCHAR(1).
hdf_titanic_train = (
    myconn.table("TRAIN_FE")
    .select([column_id, *features_subset, column_label])
    .cast(column_label, 'NVARCHAR(1)')
)

# Instantiate AutoML `auto_c` object

Parametrization of `auto_c` instance: https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2023_1_QRC/en-US/pal/topics/genetic_optimization_automl.html#control-parameters

In [6]:
# Build a unique progress indicator id for this run and hand it to the
# AutomaticClassification instance.
progress_id = f"automl_{uuid.uuid1()}"
auto_c = AutomaticClassification(progress_indicator_id=progress_id)

In [7]:
# Associate `auto_c` with the workload class named PAL_AUTOML_WORKLOAD.
# NOTE(review): presumably this workload class must already exist in the
# database — confirm against 010-check_setup.ipynb / the DB setup.
auto_c.enable_workload_class(workload_class_name="PAL_AUTOML_WORKLOAD")

Display the current configuration

In [8]:
# Print the operators currently in use and, per operator, its parameter configs.
auto_c.display_config_dict()

        Used Operators     Category
0         LabelEncoder  Transformer
1        OneHotEncoder  Transformer
2   PolynomialFeatures  Transformer
3               CATPCA  Transformer
4        FS_supervised  Transformer
5      FS_unsupervised  Transformer
6      HGBT_Classifier   Classifier
7       MLP_Classifier   Classifier
8    M_LOGR_Classifier   Classifier
9        NB_Classifier   Classifier
10      RDT_Classifier   Classifier
11               SCALE  Transformer
12      SVM_Classifier   Classifier
13       DT_Classifier   Classifier
14            SAMPLING    Resampler
15               SMOTE    Resampler
16          SMOTETomek    Resampler
17          TomekLinks    Resampler

------------------------------------------------------------

LabelEncoder
............

            Param Config
0  IGNORE_UNKNOWN    [1]
------------------------------------------------------------

OneHotEncoder
.............

              Param                        Config
0  MINIMUM_FRACTION  [0.05, 0.1, 0.

## Get an overview of pipeline operators and parameters

In [9]:
from hana_ml.algorithms.pal import auto_ml

In [10]:
# List the available pipeline operators with their category and parameter schema.
auto_ml.get_pipeline_info(myconn).collect()

Unnamed: 0,NAME,CATEGORY,PARAMETERS
0,NB_Classifier,Classifier,"{""LAPLACE"":{""type"":""number""},""DISCRETIZATION"":..."
1,M_LOGR_Classifier,Classifier,"{""MAX_ITERATION"":{""type"":""integer""},""ENET_ALPH..."
2,SVM_Classifier,Classifier,"{""KERNEL_TYPE"":{""enum"":[0,1,2,3]},""POLY_DEGREE..."
3,MLP_Classifier,Classifier,"{""ACTIVATION"":{""enum"":[1,2,3,4,5,6,7,8,9,10,11..."
4,DT_Classifier,Classifier,"{""ALGORITHM"":{""enum"":[1,2,3]},""ALLOW_MISSING_D..."
5,RDT_Classifier,Classifier,"{""TREES_NUM"":{""type"":""integer""},""TRY_NUM"":{""ty..."
6,HGBT_Classifier,Classifier,"{""ITER_NUM"":{""type"":""integer""},""SEED"":{""type"":..."
7,MLR_Regressor,Regressor,"{""ADJUSTED_R2"":{""type"":""integer""},""ALG"":{""enum..."
8,MLP_Regressor,Regressor,"{""ACTIVATION"":{""enum"":[1,2,3,4,5,6,7,8,9,10,11..."
9,POL_Regressor,Regressor,"{""POLYNOMIAL_NUM"":{""type"":""integer""},""ADJUSTED..."


In [11]:
# Show only the RDT_Classifier row, rendered as an HTML table (the plain
# DataFrame display above truncates the PARAMETERS column).
from IPython.display import HTML
HTML(auto_ml.get_pipeline_info(myconn).filter("NAME='RDT_Classifier'").collect().to_html())

Unnamed: 0,NAME,CATEGORY,PARAMETERS
0,RDT_Classifier,Classifier,"{""TREES_NUM"":{""type"":""integer""},""TRY_NUM"":{""type"":""integer""},""NODE_SIZE"":{""type"":""integer""},""MAX_DEPTH"":{""type"":""integer""},""SPLIT_THRESHOLD"":{""type"":""number""},""SEED"":{""type"":""integer""},""ALLOW_MISSING_DEPENDENT"":{""type"":""integer""},""SAMPLE_FRACTION"":{""type"":""number""},""COMPRESSION"":{""type"":""integer""},""MAX_BITS"":{""type"":""integer""},""QUANTIZE_RATE"":{""type"":""number""},""STRATA"":{""type"":""string""},""CALCULATE_OOB"":{""type"":""integer""},""PRIOR"":{""type"":""string""}}"


In [12]:
# Parse the PARAMETERS value of the RDT_Classifier row into a dict.
# The value is JSON text coming from the database, so it is decoded with
# json.loads instead of eval(): eval would execute arbitrary expressions and
# fails on JSON literals such as true/false/null.
json.loads(
    auto_ml.get_pipeline_info(myconn)
    .filter("NAME='RDT_Classifier'")
    .collect()["PARAMETERS"][0]
)

{'TREES_NUM': {'type': 'integer'},
 'TRY_NUM': {'type': 'integer'},
 'NODE_SIZE': {'type': 'integer'},
 'MAX_DEPTH': {'type': 'integer'},
 'SPLIT_THRESHOLD': {'type': 'number'},
 'SEED': {'type': 'integer'},
 'ALLOW_MISSING_DEPENDENT': {'type': 'integer'},
 'SAMPLE_FRACTION': {'type': 'number'},
 'COMPRESSION': {'type': 'integer'},
 'MAX_BITS': {'type': 'integer'},
 'QUANTIZE_RATE': {'type': 'number'},
 'STRATA': {'type': 'string'},
 'CALCULATE_OOB': {'type': 'integer'},
 'PRIOR': {'type': 'string'}}

## Experiment with [pipeline setting](https://help.sap.com/doc/cd94b08fe2e041c2ba778374572ddba9/2023_1_QRC/en-US/pal/algorithms/hana_ml.algorithms.pal.auto_ml.AutomaticClassification.html#hana_ml.algorithms.pal.auto_ml.AutomaticClassification.delete_config_dict)

In [13]:
# Remove every operator of these categories from the search configuration.
for category in ("Resampler", "Transformer"):
    auto_c.delete_config_dict(category=category)

In [14]:
# Drop the classifiers that should not take part in the pipeline search.
classifiers_to_drop = (
    'MLP_Classifier',
    'M_LOGR_Classifier',
    'NB_Classifier',
    'HGBT_Classifier',
    'SVM_Classifier',
)
for operator_name in classifiers_to_drop:
    auto_c.delete_config_dict(operator_name=operator_name)

In [15]:
# Search space overrides for RDT_Classifier, applied in this order.
# NOTE(review): the 'range' triple is presumably [start, step, end] — confirm
# against the PAL AutoML documentation.
rdt_overrides = {
    "TREES_NUM": [10, 50, 100],
    "MAX_DEPTH": {'range': [3, 2, 8]},
    "NODE_SIZE": [1, 100],
}
for param_name, param_config in rdt_overrides.items():
    auto_c.update_config_dict("RDT_Classifier", param_name, param_config)

# SAMPLE_FRACTION is removed from the RDT_Classifier configuration.
auto_c.delete_config_dict(operator_name='RDT_Classifier', param_name='SAMPLE_FRACTION')

In [16]:
# Print the configuration again to verify the deletions and updates made above.
auto_c.display_config_dict()

   Used Operators    Category
0  RDT_Classifier  Classifier
1   DT_Classifier  Classifier

------------------------------------------------------------

RDT_Classifier
..............

           Param                Config
0      TREES_NUM         [10, 50, 100]
1      NODE_SIZE              [1, 100]
2  CALCULATE_OOB                   [0]
3      MAX_DEPTH  {'range': [3, 2, 8]}
------------------------------------------------------------

DT_Classifier
.............

                   Param                 Config
0              ALGORITHM              [1, 2, 3]
1              MAX_DEPTH  {'range': [1, 1, 11]}
2  MIN_RECORDS_OF_PARENT  {'range': [2, 1, 21]}
3    MIN_RECORDS_OF_LEAF  {'range': [1, 1, 21]}
------------------------------------------------------------



## Start Progress Status Monitor

In [17]:
# invoke a PipelineProgressStatusMonitor
progress_status_monitor = PipelineProgressStatusMonitor(
    # need to create a connection context different from the one of AutoML
    connection_context=myconn, 
    automatic_obj=auto_c)

In [18]:
# Start the progress monitor before training is launched.
progress_status_monitor.start()

# Train AutoML on the training frame, keyed by the id column.
# The former `try/except Exception as e: raise e` wrapper only re-raised the
# same exception, so it was dropped; errors still propagate unchanged.
auto_c.fit(data=hdf_titanic_train.set_index(column_id))

<IPython.core.display.Javascript object>

## Get a report for the Best Pipeline

In [None]:
# Build the best-pipeline report for the fitted `auto_c` and embed it in the
# notebook (as an iframe, going by the method name).
BestPipelineReport(auto_c).generate_notebook_iframe()

In [None]:
# To generate an HTML page for the same:
# BestPipelineReport(auto_c).generate_html('auto-ml')

In [None]:
# Preview the first five rows of the first frame stored in `auto_c.model_`.
auto_c.model_[0].head(5).collect()

In [None]:
# Fetch the full content of the second frame stored in `auto_c.model_`.
auto_c.model_[1].collect()

## Save the model in SAP HANA db

In [None]:
# Name and version under which the fitted model is stored.
auto_c.name = 'CodeJam-Titanic-AutoML'
auto_c.version = 1

# Persist the model through a ModelStorage on the current connection,
# replacing an existing entry with the same name and version.
ms = ModelStorage(myconn)
ms.save_model(model=auto_c, if_exists='replace')

In [None]:
# List the models currently held in the model storage.
ms.list_models()

In [None]:
# Decode the JSON column of the stored model's entry into Python objects.
# json.loads maps null/false/true natively, so the former `null`/`false`/`true`
# shim variables and the eval() call on database-sourced text are not needed.
json.loads(ms.list_models(name=auto_c.name, version=auto_c.version).at[0, 'JSON'])

# Call prediction

In [None]:
# Test frame: key column plus the same feature subset used for training
# (no label column).
hdf_titanic_test = myconn.table('TEST_FE').select([column_id, *features_subset])

In [None]:
# Score the test frame, keyed by the id column (column_id is "PassengerId").
hdf_res = auto_c.predict(hdf_titanic_test.set_index(column_id))

In [None]:
# Preview the first four prediction rows.
hdf_res.head(4).collect()

# Compare to the ground truth

In [None]:
# Reference the COMPLETE table in the TITANIC schema; it is joined to the
# predictions below to obtain the true `survived` value.
hdf_titanic_complete=myconn.table('COMPLETE', schema='TITANIC')

In [None]:
# Extend the predictions (keyed by ID) with the TEST_FE columns
# (keyed by PassengerId).
hdf_res_ext = (
    hdf_res.set_index('ID')
    .join(myconn.table("TEST_FE").set_index('PassengerId'))
)

In [None]:
# Preview the first three rows of the extended prediction frame.
hdf_res_ext.head(3).collect()

In [None]:
# Match predictions to the complete data set on (Name, Ticket) / (name, ticket),
# keep the prediction next to the true `survived` value, and add IS_CORRECT,
# computed in SQL as 1-ABS(SCORES-"survived").
# NOTE(review): IS_CORRECT is 1 for a match and 0 for a mismatch only if both
# columns hold 0/1 values — confirm.
hdf_res_incl_groundtruth = (
    hdf_res_ext.set_index(['Name', 'Ticket'])
    .join(hdf_titanic_complete.set_index(['name', 'ticket']))
    .select(
        'ID',
        'Name',
        'Ticket',
        'SCORES',
        'survived',
        ('1-ABS(SCORES-"survived")', 'IS_CORRECT'),
    )
    .cast('SCORES', 'INT')
)

In [None]:
# Preview the first three rows of predictions joined with the ground truth.
hdf_res_incl_groundtruth.head(3).collect()

## Using `metrics`

In [None]:
import hana_ml.algorithms.pal.metrics as pal_metrics

In [None]:
# Accuracy of the predicted label (SCORES) against the true label (survived).
pal_metrics.accuracy_score(data=hdf_res_incl_groundtruth, label_true='survived', label_pred='SCORES')

In [None]:
# Confusion matrix of SCORES vs. survived; the call returns two frames.
# NOTE(review): the second frame (hdf_cr) is presumably a classification
# report — confirm against the hana_ml metrics documentation.
hdf_cm, hdf_cr = pal_metrics.confusion_matrix(
    data=hdf_res_incl_groundtruth,
    key='ID',
    label_true='survived',
    label_pred='SCORES',
)

In [None]:
# Fetch the first frame returned by confusion_matrix (the confusion matrix).
hdf_cm.collect()

In [None]:
# Fetch the second frame returned by confusion_matrix.
hdf_cr.collect()

In [None]:
# Plot the confusion matrix with normalize=False; the trailing `;` suppresses
# the cell's return-value output.
from hana_ml.visualizers.metrics import MetricsVisualizer
MetricsVisualizer().plot_confusion_matrix(hdf_cm, normalize=False);

🤓 **Let's discuss**:
- Comparison of the last two models you trained