In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import optuna
from optuna.trial import Trial
from typing import List, Any, Tuple

In [6]:
# Data Loading
# Read the feature table first and the target table second, both from the
# relative ../data directory.
print("Loading data...")
X_model, Y_model = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_model.csv', '../data/Y_model.csv')
)

Loading data...


In [77]:
# Module-level scaler that maps each feature into the [0, 1] range.
# preprocess() calls fit_transform on this object, so it is refit every time
# preprocess() runs.
scaler = MinMaxScaler(feature_range=(0,1))

In [97]:
# Define preprocessors
# The factories in this cell (column, rangesum, one_hot_encode) each return a
# function that maps the raw frame to a list of [column name, values] pairs.
print("Defining preprocessors...")
def column(colnames: List[str]):
    """Build a processor that passes the named columns through unchanged.

    Args:
        colnames: Names of the columns to copy out of the input frame.

    Returns:
        A function that takes a DataFrame and returns a list of
        ``[column name, values array]`` pairs, one per requested column and
        in the order given. Missing values are replaced by 0 beforehand.
    """
    def _column(X: pd.DataFrame):
        filled = X.fillna(0)
        extracted = []
        for wanted in colnames:
            extracted.append([wanted, filled[wanted].values])
        return extracted
    return _column

def rangesum(
    name: str,
    regex: str,
    prefixes: str,
    dist: np.ndarray
):
    """Build a processor that computes one weighted column sum per prefix.

    Args:
        name: Suffix of each output column name (output is ``prefix + name``).
        regex: Pattern appended to each prefix to select the input columns.
        prefixes: Iterable of prefixes; a plain string is walked character by
            character.
        dist: Weight vector. Its length has to equal the number of columns
            selected for a prefix, otherwise the dot product fails.

    Returns:
        A function that takes a DataFrame and returns a list of
        ``[prefix + name, weighted row sums]`` pairs, one per prefix. Missing
        values are replaced by 0 before summing.
    """
    def _rangesum(X: pd.DataFrame):
        filled = X.fillna(0)
        features = []
        for prefix in prefixes:
            # Select the columns whose name matches this prefix's pattern.
            matched = filled.filter(regex=(prefix + regex), axis=1)
            features.append([prefix + name, matched.values.dot(dist)])
        return features
    return _rangesum

def _fillna(X: np.ndarray) -> np.ndarray:
    return np.nan_to_num(X, copy=True, nan=0)

def array_divide(
    numerator: List[Tuple[str, np.ndarray]],
    denominator: List[Tuple[str, np.ndarray]]
) -> List[Any]:
    """Divide paired feature columns element-wise, mapping undefined ratios to 0.

    The i-th numerator column is divided by the i-th denominator column.
    Entries whose ratio is not finite (0/0, x/0, or a NaN operand) become 0.
    They are not left as NaN, and they are not turned into the huge finite
    stand-ins that ``np.nan_to_num`` substitutes for infinities.

    Args:
        numerator: List of ``[column name, values]`` pairs.
        denominator: List of ``[column name, values]`` pairs, the same length
            as ``numerator``. Only the values are used.

    Returns:
        A list of ``["r" + numerator column name, ratio array]`` pairs.

    Raises:
        AssertionError: If the two lists differ in length.
    """
    assert len(numerator) == len(denominator), (
        "numerator and denominator must contain the same number of columns"
    )

    ratios = []
    for (numerator_colname, numerator_col), (_, denominator_col) in zip(numerator, denominator):
        # Zero denominators are expected; silence the warnings and clean up
        # the resulting NaN/inf entries explicitly below.
        with np.errstate(divide="ignore", invalid="ignore"):
            quotient = np.asarray(
                np.divide(numerator_col, denominator_col), dtype=float
            )
        quotient = np.where(np.isfinite(quotient), quotient, 0.0)
        ratios.append(["r" + numerator_colname, quotient])
    return ratios

def one_hot_encode(column: str):
    """Build a processor that one-hot encodes a single column.

    Args:
        column: Name of the column to encode. It is also used as the prefix
            of the generated indicator column names.

    Returns:
        A function that takes a DataFrame and returns a list of
        ``[indicator column name, values array]`` pairs, one per distinct
        value of the column. Missing values are replaced by 0 first, so they
        form their own category ``0``.
    """
    def _one_hot_encode(X: pd.DataFrame):
        # Only the encoded column is read, so fill that Series alone instead
        # of copying and filling the entire frame.
        df_dummies = pd.get_dummies(X[column].fillna(0), prefix=column)
        return [
            [colname, df_dummies[colname].values] for colname in df_dummies.columns
        ]
    return _one_hot_encode

Defining preprocessors...


In [98]:
def preprocess(X: pd.DataFrame, processors: List[Any]) -> pd.DataFrame:
    """Assemble a scaled feature matrix from a list of processors.

    Args:
        X: Raw input frame handed to every callable processor.
        processors: Each item is either a ready-made list of
            ``[column name, values]`` pairs or a callable that returns such a
            list when given ``X``. If two processors produce the same column
            name, the later values replace the earlier ones.

    Returns:
        A new DataFrame holding the collected columns, with missing values
        replaced by 0 and every column passed through the module-level
        ``scaler``.

    Note:
        ``scaler.fit_transform`` is used, so the shared scaler is refit on
        whatever data is passed in on each call.
    """
    # Collect every column first and build the frame in a single step.
    # Inserting columns one at a time into an empty DataFrame fragments it.
    collected = {}
    for processor in processors:
        pairs = processor if isinstance(processor, list) else processor(X)
        for colname, col in pairs:
            collected[colname] = col

    X_new = pd.DataFrame(collected).fillna(0)

    return pd.DataFrame(scaler.fit_transform(X_new), columns=X_new.columns)

In [11]:
def equal_dist(length: int) -> np.ndarray:
    """Return a weight vector of ``length`` ones, so every position counts equally."""
    uniform_weights = np.full(length, 1.0)
    return uniform_weights

def linear_dist(length: int) -> np.ndarray:
    """Return ``length`` linearly increasing weights ``0, 1/length, ..., (length-1)/length``.

    The weights come from an integer range divided by ``length``. A
    fractional ``np.arange`` step is avoided because its element count is
    subject to floating-point rounding and may not equal ``length``.

    Args:
        length: Number of weights to produce.

    Returns:
        A float array with exactly ``length`` elements. It is empty when
        ``length`` is zero or negative.
    """
    if length <= 0:
        return np.zeros(0)
    return np.arange(length) / length

In [106]:
print("Data preprocessing...")

# Equal-weight sums over the 31 columns matching 202205dd, for prefixes c/t/s.
dist_GIT = rangesum(
    name='GIT',
    regex=r"202205[0-9]{2}",
    prefixes="cts",
    dist=equal_dist(31),
)(X_model)

# Equal-weight sums over columns matching 202201dd / 202207dd with a
# two-digit suffix in 00-25, for prefixes t/s only. The weight vector has
# 25 entries per month, so 50 matching columns per prefix are expected.
dist_VAT = rangesum(
    name='VAT',
    regex=r"20220[17](?:[01][0-9]|2[0-5])",
    prefixes="ts",
    dist=np.concatenate([equal_dist(25)] * 2),
)(X_model)

# Number of columns expected to match 2022dddd for each prefix.
# NOTE(review): the second term is 29 although February 2022 has 28 days.
# The total (238) has to equal the number of matching columns or the dot
# product inside rangesum fails; confirm against the data.
entire_days = sum((31, 29, 31, 30, 31, 30, 31, 25))
entire = rangesum(
    name='Entire',
    regex=r"2022[0-9]{4}",
    prefixes="cts",
    dist=equal_dist(entire_days),
)(X_model)

Data preprocessing...


In [107]:
# Assemble the model matrix: the raw age code, one-hot encoded gender and
# region, and the precomputed range sums.
X_processed = preprocess(
    X=X_model,
    processors=[
        column(['age_code']),
        one_hot_encode('gender'),
        one_hot_encode('region_code'),
        dist_GIT,
        dist_VAT,
        entire,
        # Ratio features, currently disabled:
        # array_divide(dist_GIT, entire), # rel_GIT
        # array_divide(dist_VAT, entire[1:]), # rel_VAT
    ],
)

In [108]:
# Preview the first rows of the scaled feature matrix.
X_processed.head()

Unnamed: 0,age_code,gender_1,gender_2,region_code_0,region_code_1,region_code_2,region_code_4,region_code_5,region_code_6,region_code_7,...,region_code_17,region_code_18,cGIT,tGIT,sGIT,tVAT,sVAT,cEntire,tEntire,sEntire
0,0.923077,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00016,0.000255,1.9e-05
1,0.307692,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002172,0.0,2.8e-05,0.0,0.000167,0.006241,0.0,0.00016
2,0.384615,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006515,0.004902,0.003714,0.0,0.001549,0.004161,0.000764,0.001039
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002172,0.0,0.000564,0.0,0.0,0.0008,0.0,0.000131
4,0.307692,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002172,0.0,0.00045,0.0,9.4e-05,0.0016,0.000255,0.000123


In [109]:
def _construct_and_cross_validate(**kwargs):
    """Cross-validate a decision tree built from the given hyperparameters.

    Required keyword arguments: ``splitter``, ``max_depth``,
    ``min_samples_split``, ``min_samples_leaf``, ``min_weight_fraction_leaf``
    and ``min_impurity_decrease``. Any other keyword is ignored, and a
    missing one raises ``KeyError``.

    The tree is evaluated on the module-level ``X_processed`` / ``Y_model``
    with a shuffled, seeded 5-fold stratified split.

    Returns:
        The array of per-fold ROC AUC scores.
    """
    tuned_names = (
        'splitter',
        'max_depth',
        'min_samples_split',
        'min_samples_leaf',
        'min_weight_fraction_leaf',
        'min_impurity_decrease',
    )
    tuned = {param: kwargs[param] for param in tuned_names}

    classifier = DecisionTreeClassifier(
        criterion="gini",
        random_state=100,
        class_weight={0: 1, 1: 14.291397},  # Super imbalanced data
        **tuned,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

    # 'roc_auc' is the scorer for binary classification.
    return cross_val_score(
        classifier,
        X_processed,
        Y_model,
        cv=folds,
        scoring='roc_auc',
    )

In [110]:
print("Preparing for hyperparameter tuning...")
# Task: Hyperparameter tuning with Optuna
def objective(trial: Trial):
    """Optuna objective: mean cross-validated ROC AUC of one sampled tree.

    Args:
        trial: The Optuna trial used to sample the six hyperparameters.

    Returns:
        The mean of the per-fold scores returned by
        ``_construct_and_cross_validate``.
    """
    # Suggestions are requested in a fixed order, one per hyperparameter.
    sampled = {
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 40),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.5),
    }

    fold_scores = _construct_and_cross_validate(**sampled)
    return fold_scores.mean()

print("Hyperparameter tuning started...")
# Seed the sampler so that a rerun proposes the same sequence of trials.
# Without a seed, the best parameters differ from run to run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=100),
)
study.optimize(objective, n_trials=100)

# Print the best parameters
print("Best params")
print(study.best_params)

[32m[I 2022-11-18 19:11:58,297][0m A new study created in memory with name: no-name-1add8815-81d6-4e85-bdef-a0a9b5438969[0m


Preparing for hyperparameter tuning...
Hyperparameter tuning started...


[32m[I 2022-11-18 19:11:59,445][0m Trial 0 finished with value: 0.5 and parameters: {'splitter': 'random', 'max_depth': 5, 'min_samples_split': 15, 'min_samples_leaf': 16, 'min_weight_fraction_leaf': 0.38071947494177427, 'min_impurity_decrease': 0.31990497011887997}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-11-18 19:12:01,543][0m Trial 1 finished with value: 0.5 and parameters: {'splitter': 'best', 'max_depth': 6, 'min_samples_split': 15, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.019193986717413125, 'min_impurity_decrease': 0.34852154127200086}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-11-18 19:12:02,611][0m Trial 2 finished with value: 0.5 and parameters: {'splitter': 'random', 'max_depth': 9, 'min_samples_split': 24, 'min_samples_leaf': 6, 'min_weight_fraction_leaf': 0.14599304137233765, 'min_impurity_decrease': 0.04545269383014444}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-11-18 19:12:04,718][0m Trial 3 finished with value: 0.5 and paramete

Best params
{'splitter': 'best', 'max_depth': 4, 'min_samples_split': 34, 'min_samples_leaf': 3, 'min_weight_fraction_leaf': 0.09868801833773594, 'min_impurity_decrease': 0.0006510732133421264}


In [111]:
print("Finalizing model...")
# Re-run cross-validation with the best hyperparameters found by the study.
# This uses the same data and the same seeded folds as the tuning trials, so
# the reported score is not an independent estimate.
scores = _construct_and_cross_validate(**study.best_params)

print("Average ROC AUC Score", scores.mean())
print("Standard Deviation of ROC AUC Score", scores.std())

Finalizing model...
Average ROC AUC Score 0.8460058027968331
Standard Deviation of ROC AUC Score 0.0007123669765399855
