In [3]:
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_validate, HalvingGridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo
from NaDropper import HighNaNDropper
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [4]:
def build_pipeline(**args):
    """Assemble the full modelling pipeline.

    The pipeline chains three steps, in order: a ``HighNaNDropper`` created
    with ``threshold=0.3``, a default ``KNNImputer``, and a
    ``DecisionTreeClassifier``.

    Parameters
    ----------
    **args
        Keyword arguments forwarded unchanged to ``DecisionTreeClassifier``.

    Returns
    -------
    Pipeline
        An unfitted pipeline whose steps are named ``'dropper'``,
        ``'imputer'`` and ``'classifier'``.
    """
    steps = [
        ('dropper', HighNaNDropper(threshold=0.3)),
        ('imputer', KNNImputer()),
        ('classifier', DecisionTreeClassifier(**args)),
    ]
    return Pipeline(steps)

In [5]:
def build_preprocess_pipeline():
    """Create the preprocessing-only pipeline.

    Returns
    -------
    Pipeline
        An unfitted pipeline with a single step, ``'imputer'``, holding a
        default ``KNNImputer``.
    """
    imputation_step = ('imputer', KNNImputer())
    return Pipeline([imputation_step])

In [2]:
def process_dataset(x, y, random_state=None):
    """Split a dataset, impute the training part and oversample it with SMOTE.

    Parameters
    ----------
    x
        Feature table passed to ``train_test_split``.
    y
        Targets aligned with ``x``; also used to stratify the split.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to both ``train_test_split`` and ``SMOTE`` so that the
        split and the synthetic samples can be reproduced. ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled, x_test, y_test)``. The training part has
        been imputed and oversampled; the test part is returned exactly as
        produced by the split (neither imputed nor resampled).
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, shuffle=True, random_state=random_state
    )
    # Flatten the training targets to 1-D before handing them to SMOTE.
    y_train = np.ravel(y_train)
    preprocess_pipeline = build_preprocess_pipeline()
    # The imputer is fitted on the training split only.
    x_train_processed = preprocess_pipeline.fit_transform(x_train)
    x_resampled, y_resampled = SMOTE(random_state=random_state).fit_resample(
        x_train_processed, y_train
    )
    return x_resampled, y_resampled, x_test, y_test

In [8]:
def get_hyperparams_for_dataset(x, y, scoring):
    """Search decision-tree hyperparameters with successive halving.

    Parameters
    ----------
    x, y
        Training features and targets handed to the search's ``fit``.
    scoring
        Scoring specification forwarded to ``HalvingGridSearchCV``.

    Returns
    -------
    tuple
        ``(ccp_alpha, max_depth, min_samples_leaf)`` taken from the search's
        ``best_params_``.
    """
    tuned_keys = (
        'classifier__ccp_alpha',
        'classifier__max_depth',
        'classifier__min_samples_leaf',
    )
    # Candidate values, listed in the same order as ``tuned_keys``.
    candidate_values = (
        np.linspace(0, 0.25, 25),
        [5, 10, 20, None],
        [2, 3, 5, 10],
    )
    grid = dict(zip(tuned_keys, candidate_values))

    halving_search = HalvingGridSearchCV(
        build_pipeline(),
        param_grid=grid,
        aggressive_elimination=True,
        n_jobs=2,
        scoring=scoring,
    )
    halving_search.fit(x, y)

    best = halving_search.best_params_
    return tuple(best[key] for key in tuned_keys)

In [22]:
def test_hyperparams(x, y, x_test, y_test, ccp, max_depth, min_samples, average):
    """Fit the full pipeline with the given tree settings and score it with F1.

    Parameters
    ----------
    x, y
        Training features and targets used to fit the pipeline.
    x_test, y_test
        Held-out features and true targets used for scoring. ``x_test`` may be
        a DataFrame or an array.
    ccp, max_depth, min_samples
        Passed to the classifier as ``ccp_alpha``, ``max_depth`` and
        ``min_samples_leaf`` respectively.
    average
        Averaging mode forwarded to ``f1_score``.

    Returns
    -------
    float
        The F1 score of the predictions on ``x_test``.
    """
    pipeline = build_pipeline(ccp_alpha=ccp, max_depth=max_depth, min_samples_leaf=min_samples)
    pipeline.fit(x, y)
    # np.asarray accepts DataFrames and plain arrays alike, whereas
    # ``.to_numpy()`` raised AttributeError for inputs that were already arrays.
    y_pred = pipeline.predict(np.asarray(x_test))
    # f1_score's signature is (y_true, y_pred); the arguments used to be
    # swapped, which skews support-dependent averages such as 'weighted'.
    return f1_score(y_test, y_pred, average=average)

In [6]:
# Fetch both datasets by their UCI repository ids.
polish_companies_bankruptcy = fetch_ucirepo(id=365)
iris = fetch_ucirepo(id=53)
# Flatten the target column before encoding: handing LabelEncoder a
# column-shaped table triggers sklearn's `column_or_1d` conversion warning.
iris.data.targets = LabelEncoder().fit_transform(np.ravel(iris.data.targets))

  y = column_or_1d(y, warn=True)


In [7]:
# Split, impute and oversample each dataset; the *_test parts stay untouched.
x_companies, y_companies, x_companies_test, y_companies_test = process_dataset(
    x=polish_companies_bankruptcy.data.features,
    y=polish_companies_bankruptcy.data.targets,
)
x_iris, y_iris, x_iris_test, y_iris_test = process_dataset(
    x=iris.data.features,
    y=iris.data.targets,
)

In [9]:
# Tune the tree on the iris training data, scored with micro-averaged F1.
hyperparams_iris = get_hyperparams_for_dataset(
    x=x_iris,
    y=y_iris,
    scoring='f1_micro',
)

In [10]:
# Tune the tree on the bankruptcy training data, scored with binary F1.
hyperparams_pcb = get_hyperparams_for_dataset(
    x=x_companies,
    y=y_companies,
    scoring='f1',
)

In [23]:
# Score the tuned iris settings on the held-out iris split.
test_hyperparams(
    x_iris,
    y_iris,
    x_iris_test,
    y_iris_test,
    *hyperparams_iris,
    average='micro',
)

0.8947368421052632

In [26]:
# Score the tuned bankruptcy settings on the held-out bankruptcy split.
test_hyperparams(
    x_companies,
    y_companies,
    x_companies_test,
    y_companies_test,
    *hyperparams_pcb,
    average='binary',
)

0.2304199772985244

In [32]:
from sklearn.tree import export_text

# `test_pipeline` was not assigned by any cell of this notebook, so this cell
# raised NameError after Restart & Run All. Fit the pipeline explicitly here.
# NOTE(review): assumes the tree of interest is the bankruptcy model trained
# with the tuned hyperparameters — confirm this matches the original intent.
test_pipeline = build_pipeline(
    ccp_alpha=hyperparams_pcb[0],
    max_depth=hyperparams_pcb[1],
    min_samples_leaf=hyperparams_pcb[2],
)
test_pipeline.fit(x_companies, y_companies)

# Render the fitted classifier step as indented text rules.
tree_rules = export_text(test_pipeline['classifier'])
print(tree_rules)

|--- feature_6 <= -0.00
|   |--- feature_0 <= 2.00
|   |   |--- feature_0 <= 2.00
|   |   |   |--- feature_0 <= 1.00
|   |   |   |   |--- feature_26 <= 0.13
|   |   |   |   |   |--- feature_27 <= 0.73
|   |   |   |   |   |   |--- feature_57 <= -1.21
|   |   |   |   |   |   |   |--- feature_62 <= 122.42
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- feature_62 >  122.42
|   |   |   |   |   |   |   |   |--- feature_29 <= 4.58
|   |   |   |   |   |   |   |   |   |--- feature_23 <= -0.07
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |--- feature_23 >  -0.07
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- feature_29 >  4.58
|   |   |   |   |   |   |   |   |   |--- feature_10 <= 0.09
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- feature_10 >  0.09
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |-