In [1]:
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo
from NaDropper import HighNaNDropper
from sklearn.pipeline import Pipeline
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def build_pipeline(**args):
    pipeline = Pipeline([
        ('dropper', HighNaNDropper(threshold=0.3)),
        ('imputer', KNNImputer()),
        ('classifier', DecisionTreeClassifier(**args))
    ])
    return pipeline

In [3]:
def build_preprocess_pipeline():
    pipeline = Pipeline([
        ('imputer', KNNImputer()),
    ])
    return pipeline

In [22]:
def process_dataset(x, y, balance=False):
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, shuffle=True)
    y_train = np.ravel(y_train)
    preprocess_pipeline = build_preprocess_pipeline()
    x_train_processed = preprocess_pipeline.fit_transform(x_train)
    if balance:
        x_resampled, y_resampled = SMOTE().fit_resample(x_train_processed, y_train)
        return x_resampled, y_resampled, x_test, y_test
    else:
        return x_train_processed, y_train, x_test, y_test

In [19]:
def get_hyperparams_for_dataset(pipeline, x, y, scoring):
    param_grid = {
    'classifier__ccp_alpha': np.linspace(0, 0.25, 25),
    'classifier__max_depth': [5, 10, 20, None],
    'classifier__min_samples_leaf': [2, 3, 5, 10]
    }
    search = HalvingGridSearchCV(pipeline, param_grid=param_grid, aggressive_elimination=True, n_jobs=2, scoring=scoring)
    search.fit(x, y)
    ccp_alpha, max_depth, min_samples = search.best_params_['classifier__ccp_alpha'], search.best_params_['classifier__max_depth'], search.best_params_['classifier__min_samples_leaf']
    return ccp_alpha, max_depth, min_samples

In [14]:
def build_for_hyperparams(ccp, max_depth, min_samples_leaf):
    pipeline = build_pipeline(ccp_alpha=ccp, max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    return pipeline

In [15]:
from sklearn.model_selection import cross_val_score

def dump_cv_metrics(pipeline, x, y, stratified=False, scoring='f1_micro'):
    cv_results = cross_val_score(pipeline, x, y, cv=StratifiedKFold(shuffle=True) if stratified else 5, scoring=scoring)
    result_array = np.array(cv_results)
    print(f"Average F1 Score: {result_array}")

In [16]:
def dump_metrics(pipeline, x_test, y_test):
    y_pred = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [17]:
import os.path
import pickle as pkl

def load_cache_dataset(filename: str, id: int):
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            dataset = pkl.load(f)
    else:
        dataset = fetch_ucirepo(id=id)
        with open(filename, 'wb') as f:
            pkl.dump(dataset, f)
    return dataset

polish_companies_bankruptcy = load_cache_dataset('pcb.pkl', 365)
iris = load_cache_dataset('iris.pkl', 53)
iris.data.targets = LabelEncoder().fit_transform(iris.data.targets)

  y = column_or_1d(y, warn=True)


In [10]:
iris.data.targets

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [24]:
x_companies, y_companies, x_companies_test, y_companies_test = process_dataset(polish_companies_bankruptcy.data.features, polish_companies_bankruptcy.data.targets)
x_iris, y_iris, x_iris_test, y_iris_test = process_dataset(iris.data.features, iris.data.targets)

In [30]:
y_companies

array([0, 0, 0, ..., 0, 1, 0], shape=(32553,))

In [36]:
iris_unweighted = build_pipeline()
hyperparams_iris_unweighted = get_hyperparams_for_dataset(iris_unweighted, x_iris, y_iris, 'f1_micro')
hyperparams_iris_unweighted

(np.float64(0.09375), 5, 2)

In [20]:
iris_weighted = build_pipeline(class_weight='balanced')
hyperparams_iris_weighted = get_hyperparams_for_dataset(iris_weighted, x_iris, y_iris, 'f1_micro')
hyperparams_iris_weighted

(np.float64(0.09375), 20, 3)

In [31]:
pcb_unweighted = build_pipeline()
hyperparams_pcb_unweighted = get_hyperparams_for_dataset(pcb_unweighted, x_companies, y_companies, 'f1')
hyperparams_pcb_unweighted

(np.float64(0.0), None, 3)

In [32]:
pcb_weighted = build_pipeline(class_weight='balanced')
hyperparams_pcb_weighted = get_hyperparams_for_dataset(pcb_weighted, x_companies, y_companies, 'f1')
hyperparams_pcb_weighted

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

(np.float64(0.05208333333333333), 10, 3)

In [44]:
iris_pipeline_unweighted = build_for_hyperparams(*hyperparams_iris_unweighted)
iris_pipeline_unweighted.fit(x_iris, y_iris)

Accuracy: 0.8947
Precision: 0.8947
Recall: 0.8947
F1 Score: 0.8947




# Cross-validation stratification effect

In [37]:
iris_pipeline_unweighted = build_for_hyperparams(*hyperparams_iris_unweighted)
pcb_pipeline_unweighted = build_for_hyperparams(*hyperparams_pcb_unweighted)

In [38]:
iris_pipeline_weighted = build_for_hyperparams(*hyperparams_iris_weighted)
pcb_pipeline_weighted = build_for_hyperparams(*hyperparams_pcb_weighted)

## Unweighted iris pipeline with good hyperparams

In [39]:
dump_cv_metrics(iris_pipeline_unweighted, x_iris, y_iris, False)
dump_cv_metrics(iris_pipeline_unweighted, x_iris, y_iris, True)

Average F1 Score: [1.         0.95652174 1.         0.95454545 1.        ]
Average F1 Score: [1.         1.         0.95454545 0.95454545 0.95454545]


## Weighted iris pipeline with good hyperparams

In [40]:
dump_cv_metrics(iris_pipeline_weighted, x_iris, y_iris, False)
dump_cv_metrics(iris_pipeline_weighted, x_iris, y_iris, True)

Average F1 Score: [1.         0.95652174 1.         0.95454545 1.        ]
Average F1 Score: [1.         1.         0.95454545 0.95454545 0.95454545]


## Unweighted PCB pipeline with good hyperparams

In [41]:
dump_cv_metrics(pcb_pipeline_unweighted, x_companies, y_companies, False)
dump_cv_metrics(pcb_pipeline_unweighted, x_companies, y_companies, True)

Average F1 Score: [0.93994778 0.9419444  0.9419444  0.93963134 0.93932412]
Average F1 Score: [0.94071571 0.94378744 0.93917985 0.93717358 0.93794163]


## Weighted PCB piepline with good hyperparams

In [42]:
dump_cv_metrics(pcb_pipeline_weighted, x_companies, y_companies, False)
dump_cv_metrics(pcb_pipeline_weighted, x_companies, y_companies, True)

Average F1 Score: [0.95177392 0.95177392 0.95177392 0.95192012 0.95192012]
Average F1 Score: [0.95177392 0.95177392 0.95177392 0.95192012 0.95192012]


# Class weight in decision tree

In [12]:
weighted_pipeline = build_pipeline(class_weight='balanced', ccp_alpha=0.01)
unweighted_pipeline = build_pipeline(ccp_alpha=0.01)

In [13]:
x_train, y_train, x_test, y_test = process_dataset(polish_companies_bankruptcy.data.features, polish_companies_bankruptcy.data.targets, balance=False)

In [14]:
weighted_pipeline.fit(x_train, y_train)
unweighted_pipeline.fit(x_train, y_train)

In [18]:
dump_cv_metrics(weighted_pipeline, x_test, y_test, False, scoring='recall')
dump_cv_metrics(unweighted_pipeline, x_test, y_test, False, scoring='recall')

Average F1 Score: [0.76190476 0.6        0.63461538 0.75961538 0.76190476]
Average F1 Score: [0. 0. 0. 0. 0.]
