In [42]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingGridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo
from NaDropper import HighNaNDropper
from sklearn.pipeline import Pipeline
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
def build_pipeline(**args):
    pipeline = Pipeline([
        ('dropper', HighNaNDropper(threshold=0.3)),
        ('imputer', KNNImputer()),
        ('classifier', DecisionTreeClassifier(**args))
    ])
    return pipeline

In [7]:
def build_preprocess_pipeline():
    pipeline = Pipeline([
        ('imputer', KNNImputer()),
    ])
    return pipeline

In [8]:
def process_dataset(x, y, balance=True):
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, shuffle=True)
    y_train = np.ravel(y_train)
    preprocess_pipeline = build_preprocess_pipeline()
    x_train_processed = preprocess_pipeline.fit_transform(x_train)
    if balance:
        x_resampled, y_resampled = SMOTE().fit_resample(x_train_processed, y_train)
        return x_resampled, y_resampled, x_test, y_test
    else:
        return x_train_processed, y_train, x_test, y_test

In [9]:
def get_hyperparams_for_dataset(x, y, scoring, **args):
    param_grid = {
    'classifier__ccp_alpha': np.linspace(0, 0.25, 25),
    'classifier__max_depth': [5, 10, 20, None],
    'classifier__min_samples_leaf': [2, 3, 5, 10]
    }
    pipeline = build_pipeline(**args)
    search = HalvingGridSearchCV(pipeline, param_grid=param_grid, aggressive_elimination=True, n_jobs=-1, scoring=scoring)
    search.fit(x, y)
    ccp_alpha, max_depth, min_samples = search.best_params_['classifier__ccp_alpha'], search.best_params_['classifier__max_depth'], search.best_params_['classifier__min_samples_leaf']
    return {'ccp_alpha': ccp_alpha, 'max_depth': max_depth, 'min_samples': min_samples}

In [10]:
def build_for_hyperparams(param_dict):
    ccp = param_dict['ccp_alpha']
    max_depth = param_dict['max_depth']
    min_samples_leaf = param_dict['min_samples']
    pipeline = build_pipeline(ccp_alpha=ccp, max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    return pipeline

In [51]:
from sklearn.model_selection import cross_val_score

def get_cv_metrics(pipeline, x, y, stratified=False, scoring='f1_micro') -> np.ndarray:
    cv_results = cross_val_score(pipeline, x, y, cv=StratifiedKFold(shuffle=True) if stratified else 5, scoring=scoring)
    result_array = np.array(cv_results)
    return result_array

In [12]:
def dump_metrics(pipeline, x_test, y_test):
    y_pred = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [13]:
import os.path
import pickle as pkl

def load_cache_dataset(filename: str, id: int):
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            dataset = pkl.load(f)
    else:
        dataset = fetch_ucirepo(id=id)
        with open(filename, 'wb') as f:
            pkl.dump(dataset, f)
    return dataset

polish_companies_bankruptcy = load_cache_dataset('pcb.pkl', 365)
iris = load_cache_dataset('iris.pkl', 53)
iris.data.targets = LabelEncoder().fit_transform(iris.data.targets)

  y = column_or_1d(y, warn=True)


In [14]:
x_companies, y_companies, x_companies_test, y_companies_test = process_dataset(polish_companies_bankruptcy.data.features, polish_companies_bankruptcy.data.targets, balance=False)
x_iris, y_iris, x_iris_test, y_iris_test = process_dataset(iris.data.features, iris.data.targets, balance=False)

In [15]:
hyperparams_iris_unweighted = get_hyperparams_for_dataset(x_iris, y_iris, 'f1_micro')
hyperparams_iris_weighted = get_hyperparams_for_dataset(x_iris, y_iris, 'f1_micro', class_weight='balanced')
print(f'iris hyperparameters for unweighted classifier {hyperparams_iris_unweighted}')
print(f'iris hyperparameters for weighted classifier {hyperparams_iris_weighted}')

iris hyperparameters for unweighted classifier {'ccp_alpha': np.float64(0.125), 'max_depth': 5, 'min_samples': 3}
iris hyperparameters for weighted classifier {'ccp_alpha': np.float64(0.010416666666666666), 'max_depth': 20, 'min_samples': 3}


In [16]:
hyperparams_pcb_unweighted = get_hyperparams_for_dataset(x_companies, y_companies, 'f1')
hyperparams_pcb_weighted = get_hyperparams_for_dataset(x_companies, y_companies, 'f1', class_weight='balanced')
print(f'PCB hyperparameters for unweighted classifier {hyperparams_pcb_unweighted}')
print(f'PCB hyperparameters for weighted classifier {hyperparams_pcb_weighted}')

PCB hyperparameters for unweighted classifier {'ccp_alpha': np.float64(0.0), 'max_depth': None, 'min_samples': 5}
PCB hyperparameters for weighted classifier {'ccp_alpha': np.float64(0.0625), 'max_depth': 10, 'min_samples': 10}


In [17]:
iris_pipeline_weighted = build_for_hyperparams(hyperparams_iris_weighted)
iris_pipeline_unweighted = build_for_hyperparams(hyperparams_iris_unweighted)
iris_pipeline_weighted.fit(x_iris, y_iris)
iris_pipeline_unweighted.fit(x_iris, y_iris)

In [33]:
pcb_pipeline_weighted = build_for_hyperparams(hyperparams_pcb_weighted)
pcb_pipeline_unweighted = build_for_hyperparams(hyperparams_pcb_unweighted)
pcb_pipeline_weighted.fit(x_companies, y_companies)
pcb_pipeline_unweighted.fit(x_companies, y_companies)


# Cross-validation stratification effect

In [55]:
iris_list = [[*get_cv_metrics(iris_pipeline_weighted, x_iris, y_iris, False), True, False],
             [*get_cv_metrics(iris_pipeline_weighted, x_iris, y_iris, True), True, True],
             [*get_cv_metrics(iris_pipeline_unweighted, x_iris, y_iris, False), False, False],
             [*get_cv_metrics(iris_pipeline_unweighted, x_iris, y_iris, True), False, True]]

In [56]:
pcb_list = [
    [*get_cv_metrics(pcb_pipeline_weighted, x_companies, y_companies, False), True, False],
    [*get_cv_metrics(pcb_pipeline_weighted, x_companies, y_companies, True), True, True],
    [*get_cv_metrics(pcb_pipeline_unweighted, x_companies, y_companies, False), False, False],
    [*get_cv_metrics(pcb_pipeline_unweighted, x_companies, y_companies, True), False, True]
]

In [63]:
pcb_cv_dataframe = pd.DataFrame(pcb_list, columns=['results 1', 'results 2', 'results 3', 'results 4', 'results 5', 'weighted', 'stratified']).melt(value_vars=['results 1', 'results 2', 'results 3', 'results 4', 'results 5'], id_vars=['weighted', 'stratified'])
iris_cv_dataframe = pd.DataFrame(iris_list, columns=['results 1', 'results 2', 'results 3', 'results 4', 'results 5', 'weighted', 'stratified']).melt(value_vars=['results 1', 'results 2', 'results 3', 'results 4', 'results 5'], id_vars=['weighted', 'stratified'])

In [62]:
pcb_cv_dataframe

Unnamed: 0,weighted,stratified,variable,value
0,True,False,results 1,0.951774
1,True,True,results 1,0.951774
2,False,False,results 1,0.943634
3,False,True,results 1,0.944709
4,True,False,results 2,0.951774
5,True,True,results 2,0.951774
6,False,False,results 2,0.943787
7,False,True,results 2,0.946398
8,True,False,results 3,0.951774
9,True,True,results 3,0.951774


In [64]:
pcb_pipeline_unweighted['classifier']

In [65]:
pcb_pipeline_weighted['classifier']

In [28]:
weighted_pipeline.fit(x_companies, y_companies)

In [29]:
##%%
get_cv_metrics(weighted_pipeline, x_companies_test, y_companies_test, False, scoring='recall')
##%%
get_cv_metrics(unweighted_pipeline, x_companies_test, y_companies_test, False, scoring='recall')

Average F1 Score: [0.59047619 0.67619048 0.61538462 0.71153846 0.84761905]
Average F1 Score: [0. 0. 0. 0. 0.]


In [30]:
get_cv_metrics(weighted_pipeline, x_companies_test, y_companies_test, False, scoring='recall')

KeyboardInterrupt: 

In [31]:
get_cv_metrics(unweighted_pipeline, x_companies_test, y_companies_test, False, scoring='recall')

Average F1 Score: [0. 0. 0. 0. 0.]
