## Check software and hardware

In [1]:
import cudf
import cuml
import cupy

# Stop early unless CuPy reports CUDA as available; the cuDF-based cells
# below presumably need a working GPU (TODO confirm for your environment).
assert cupy.cuda.is_available(), 'CUDA not ready!'

## Prepare train_data and test_data

In [2]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [3]:
# Load the breast-cancer dataset as pandas objects (features and target).
X, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=335,
)

In [4]:
# Training frame: the feature columns with the label column appended, since the
# experiment below is given a single frame plus the target column name.
train_data = pd.concat([X_train, y_train], axis='columns')

# Copy the pandas objects into cuDF counterparts.
train_data_cu = cudf.from_pandas(train_data)
X_test_cu, y_test_cu = (cudf.from_pandas(part) for part in (X_test, y_test))

## Make HyperGBM experiment with cuDF dataframe and run it

In [5]:
%%time
from hypergbm import make_experiment

experiment = make_experiment(train_data_cu, target='target', reward_metric='precision')
estimator = experiment.run()

CPU times: user 34.4 s, sys: 5.7 s, total: 40.1 s
Wall time: 14.4 s


In [6]:
# Display the repr of the pipeline returned by experiment.run().
estimator

LocalizablePipeline(steps=[('data_clean',
                            DataCleanStep(cv=True,
                                          data_cleaner_args={'correct_object_dtype': True,
                                                             'drop_columns': None,
                                                             'drop_constant_columns': True,
                                                             'drop_duplicated_columns': False,
                                                             'drop_idness_columns': True,
                                                             'drop_label_nan_rows': True,
                                                             'int_convert_to': 'float',
                                                             'nan_chars': None,
                                                             'reduce_mem_usage': False,
                                                             'reserve_columns': None},
                       

## Export the trained model

In [7]:
import pickle

# Serialise the trained pipeline to 'model.pkl' in the working directory.
# NOTE(review): this is the cuDF-based pipeline, so loading it presumably needs
# the same GPU stack installed - confirm. Only unpickle files you trust.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(estimator, model_file)

## Scoring with cuML

In [8]:
from cuml.metrics import accuracy_score

# Predict on the cuDF test features, then score against the cuDF labels
# with cuML's accuracy metric.
preds = estimator.predict(X_test_cu)
accuracy_score(y_test_cu, preds)

0.9649122953414917

## Scoring with Hypernets.tabular toolbox

In [9]:
from hypernets.tabular import get_tool_box

# Ask Hypernets for the toolbox associated with cuDF DataFrames.
tb = get_tool_box(cudf.DataFrame)

# Both hard labels and class probabilities are passed to the scorer
# (the metric list includes 'auc').
preds = estimator.predict(X_test_cu)
proba = estimator.predict_proba(X_test_cu)

# Compute several binary-classification metrics in one call.
scores = tb.metrics.calc_score(
    y_test_cu,
    preds,
    proba,
    task='binary',
    metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'],
)
scores

{'auc': 0.9776472383277388,
 'accuracy': 0.9649122807017544,
 'f1': 0.9736842105263158,
 'recall': 0.9823008849557522,
 'precision': 0.9652173913043478}

## Convert the trained model *as local* for use with pandas data and sklearn utilities

In [10]:
# as_local() returns the pipeline shown below as a plain `Pipeline` (see the repr);
# the following cells feed it pandas data and sklearn utilities.
lestimator = estimator.as_local()
lestimator

Pipeline(steps=[('data_clean',
                 DataCleanStep(cv=True,
                               data_cleaner_args={'correct_object_dtype': True,
                                                  'drop_columns': None,
                                                  'drop_constant_columns': True,
                                                  'drop_duplicated_columns': False,
                                                  'drop_idness_columns': True,
                                                  'drop_label_nan_rows': True,
                                                  'int_convert_to': 'float',
                                                  'nan_chars': None,
                                                  'reduce_mem_usage': False,
                                                  'reserve_columns': None},
                               name='data_clean')),
                ('est...
                 GreedyEnsemble(weight=[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [11]:
from sklearn.metrics import get_scorer

# Look up an sklearn scorer by name. Other names can be swapped in here,
# e.g. 'accuracy', 'recall' or 'roc_auc_ovo'.
scorer = get_scorer('precision')

# A scorer is called as scorer(estimator, X, y); here it gets the local
# pipeline together with the pandas test split.
score = scorer(lestimator, X_test, y_test)
score

0.9652173913043478

In [12]:
from sklearn.metrics import classification_report

# Predict with the local pipeline on the pandas test features.
y_pred = lestimator.predict(X_test)

# The report is a multi-line string, so print() is used to render the table
# rather than showing the raw repr; digits=5 sets the displayed precision.
print(
    classification_report(y_test, y_pred, digits=5)
)

              precision    recall  f1-score   support

           0    0.96429   0.93103   0.94737        58
           1    0.96522   0.98230   0.97368       113

    accuracy                        0.96491       171
   macro avg    0.96475   0.95667   0.96053       171
weighted avg    0.96490   0.96491   0.96476       171

