# This notebook shows example usage of the `automl-pn` automatic binary classification library

In [1]:
# !pip install automl-pn==0.1.3
from automl_pn.binary_classifier import BinaryClassifier

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

**Load/make dataset**

In [2]:
# Synthetic binary-classification problem: the 20 informative and 5 redundant
# features account for all 25 columns, and the fixed seed makes the data reproducible.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    n_clusters_per_class=1,
    n_features=25,
    n_informative=20,
    n_redundant=5,
    random_state=0,
)

**Optionally split it into train/test sets**

In [3]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Create a classifier instance (see the docs for the available parameters)**

In [4]:
# Build the AutoML classifier with default settings, except for the metric.
# metric='roc_auc' presumably selects the score used to compare candidate
# models - confirm in the library docs.
cls = BinaryClassifier(metric='roc_auc')  

**Fit**

In [5]:
# Train on the training split only; X_test / y_test stay untouched for evaluation.
# The captured log shows the work being run as parallel joblib tasks (n_jobs=-1).
cls.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of  12 | elapsed:   15.1s remaining:  2.8min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:   15.9s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:   22.0s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:   26.4s remaining:   52.9s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:   35.4s remaining:   49.6s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:   35.5s remaining:   35.5s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:   36.2s remaining:   25.8s
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:   36.6s remaining:   18.2s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:   40.4s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:   41.1s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   42.1s remaining:    0.0s
[Parallel(n_jobs=-1)]

**Predict**

In [6]:
# Hard class labels (0 or 1) for every held-out sample.
cls.predict(X_test)

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1], dtype=int64)

In [7]:
# Column 1 of the predicted probabilities - presumably P(class 1), following the
# scikit-learn column order (confirm in the library docs).
# Selecting a single column already yields a 1-D array, so no reshape is needed.
cls.predict_proba(X_test)[:, 1]

array([0.38341786, 0.9609645 , 0.0110818 , 0.82318462, 0.97697142,
       0.92453127, 0.79868153, 0.02375711, 0.01596364, 0.01314334,
       0.9736499 , 0.70950239, 0.81677857, 0.12732937, 0.96653539,
       0.00852472, 0.94823861, 0.20382568, 0.91945591, 0.98796377,
       0.96737098, 0.92993095, 0.97905196, 0.02047091, 0.98433438,
       0.92029294, 0.29086089, 0.8466406 , 0.97834811, 0.97876439,
       0.61071294, 0.93517268, 0.77295865, 0.94816278, 0.08124   ,
       0.38064044, 0.06842935, 0.92769078, 0.0163628 , 0.99065491,
       0.90009463, 0.56807794, 0.01805846, 0.62270024, 0.02783804,
       0.01118544, 0.90552341, 0.01586985, 0.03922581, 0.81817642,
       0.99557259, 0.00492311, 0.9830045 , 0.1524079 , 0.07887718,
       0.12175741, 0.66456967, 0.98470174, 0.01251034, 0.01914846,
       0.58861822, 0.17976004, 0.92038291, 0.01277547, 0.44533397,
       0.97365288, 0.65238791, 0.18651193, 0.1015901 , 0.99515655,
       0.52845915, 0.91090072, 0.04634121, 0.07200316, 0.01411

In [8]:
# Per-class precision / recall / F1 on the held-out split.
# classification_report returns a formatted multi-line string, so print() is
# used to render its line breaks. Predictions are recomputed here, not reused.
print(classification_report(y_test, cls.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95        95
           1       0.97      0.94      0.96       105

    accuracy                           0.95       200
   macro avg       0.95      0.96      0.95       200
weighted avg       0.96      0.95      0.96       200



In [9]:
# Score of each candidate model, keyed by model name.
# Presumably the metric chosen at construction ('roc_auc'); how and on which
# data it is evaluated is not shown here - confirm in the library docs.
cls.models_score

{'CatBoostClassifier': 0.931161118924832,
 'LGBMClassifier': 0.8999843725582122,
 'Nearest Neighbors': 0.6448663853727145,
 'Linear SVM': 0.6241600250039068,
 'RBF SVM': 0.5,
 'Gaussian Process': 0.5,
 'Decision Tree': 0.8066104078762306,
 'Random Forest': 0.5872792623847476,
 'Neural Net': 0.7184716361931552,
 'AdaBoost': 0.8876386935458666,
 'Naive Bayes': 0.6423659946866698,
 'QDA': 0.5832942647288639}

In [10]:
# The estimator object the classifier selected; in the captured run it matches
# the top-scoring entry of models_score.
cls.best_model

<catboost.core.CatBoostClassifier at 0x24f2075f3d0>

**For some datasets (mostly synthetic ones) and some models, data preprocessing can make predictions worse; you can disable it with `preprocess_data=False`**

In [11]:
# Same metric as before, but with the library's data preprocessing switched off.
# NOTE: this rebinds `cls`, so the previously fitted classifier is no longer
# reachable; any earlier cell re-run after this one will use the new instance.
cls = BinaryClassifier(metric='roc_auc', preprocess_data=False)  

**Fit**

In [12]:
# Refit on the same training split so the two configurations are compared on
# identical data.
cls.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend MultiprocessingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of  12 | elapsed:    3.5s remaining:   39.9s
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:    3.6s remaining:   18.2s
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    4.6s remaining:   14.0s
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    5.7s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    6.9s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed:    8.7s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    9.2s remaining:    6.6s
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:   10.4s remaining:    5.1s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:   12.6s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:   12.9s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=-1)]

**Predict**

In [13]:
# Evaluate the classifier currently bound to `cls` on the held-out split.
# classification_report returns a formatted multi-line string, hence the print().
print(classification_report(y_test, cls.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        95
           1       1.00      0.99      1.00       105

    accuracy                           0.99       200
   macro avg       0.99      1.00      0.99       200
weighted avg       1.00      0.99      1.00       200



In [14]:
# Score of each candidate model, keyed by model name, for the classifier
# currently bound to `cls`. Presumably the 'roc_auc' metric chosen at
# construction - confirm in the library docs.
cls.models_score

{'CatBoostClassifier': 0.90014064697609,
 'LGBMClassifier': 0.90014064697609,
 'Nearest Neighbors': 0.8756055633692765,
 'Linear SVM': 0.8559931239256133,
 'RBF SVM': 0.5,
 'Gaussian Process': 0.8506016565088295,
 'Decision Tree': 0.7126113455227379,
 'Random Forest': 0.7746522894202219,
 'Neural Net': 0.925144553836537,
 'AdaBoost': 0.8189560868885764,
 'Naive Bayes': 0.8681825285200813,
 'QDA': 0.9938271604938271}