In [8]:
#@formatter:off
%load_ext autoreload
%autoreload 2
#@formatter:on

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Maps a pandas dtype name to the SimpleImputer strategy used by
# examine_dataset for columns of that dtype that contain missing values.
# The lookup there is a plain dict index, so a column with missing values
# whose dtype is not listed here raises KeyError.
type_to_imputer_strategy = {'float64': 'mean', 'object': 'most_frequent'}

In [10]:
# Fixed random_state passed to train_test_split in examine_dataset so the
# train/test partition is the same on every run.
SEED = 42

In [11]:
def get_categorical_columns_by_range_of_uniqueness(df, min_unique, max_unique, target_column):
    """List the feature columns whose distinct-value count is within a range.

    Args:
        df: DataFrame containing the features and the target column.
        min_unique: Smallest number of distinct values a column may have
            (inclusive).
        max_unique: Largest number of distinct values a column may have
            (inclusive).
        target_column: Label of the target column; it is dropped before the
            counts are taken, so it is never part of the result.

    Returns:
        A list of column labels, in the frame's column order, for which
        ``min_unique <= nunique <= max_unique``.
    """
    feature_frame = df.drop(columns=target_column)
    distinct_counts = feature_frame.nunique()

    return [
        column_label
        for column_label, n_distinct in distinct_counts.items()
        if min_unique <= n_distinct <= max_unique
    ]

In [12]:
from src.data_preprocessing import preprocess_data
from src.test_binning import get_score_of_classification_model
from src.binner import apply_binning_on_column
from src.alt_binning import find_optimal_binning_without_frequency
import time


def test_column(df_train, df_test, target_column_name, categorical_column_name):
    """Bin one column, retrain the model, and report the resulting score.

    The binning is found on ``df_train`` only and then applied to copies of
    both splits, so the caller's frames are not modified by this function's
    own assignments.

    Args:
        df_train: Training split, including the target column.
        df_test: Test split, including the target column.
        target_column_name: Label of the target column.
        categorical_column_name: Label of the column to bin.

    Returns:
        A tuple ``(with_binning_score, new_categories, total_time)`` where
        ``with_binning_score`` is the value returned by
        ``get_score_of_classification_model`` on the binned data,
        ``new_categories`` is the array of unique values of the binned
        training column, and ``total_time`` is the wall-clock time in seconds
        spent in ``find_optimal_binning_without_frequency`` alone.
    """
    # Only the binning search is timed; applying it and scoring are excluded.
    search_started_at = time.time()
    optimal_binning = find_optimal_binning_without_frequency(df_train, target_column_name, categorical_column_name)
    total_time = time.time() - search_started_at

    binned_train = df_train.copy()
    binned_test = df_test.copy()

    binned_train_column = apply_binning_on_column(binned_train[categorical_column_name], optimal_binning)
    binned_train[categorical_column_name] = binned_train_column

    binned_test_column = apply_binning_on_column(binned_test[categorical_column_name], optimal_binning)
    binned_test[categorical_column_name] = binned_test_column

    x_train, y_train, x_test, y_test = preprocess_data(binned_train, binned_test, target_column_name)
    with_binning_score = get_score_of_classification_model(x_train, y_train, x_test, y_test)

    # Categories are read from the column returned by the binner for the
    # training split, after scoring, as in the original ordering.
    new_categories = binned_train_column.unique()

    return with_binning_score, new_categories, total_time

In [13]:
import pandas as pd
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


def examine_dataset(df, target_column_name, dataset_name, min_unique, max_unique):
    """Score a dataset with and without binning each candidate column.

    Steps:
      1. Impute every column that contains missing values, choosing the
         strategy from ``type_to_imputer_strategy`` by the column's dtype
         name; all other columns are passed through unchanged.
      2. Split the imputed frame 75/25 with ``random_state=SEED``.
      3. Compute the baseline score on the unbinned data.
      4. For every feature column whose distinct-value count in the training
         split lies in ``[min_unique, max_unique]``, bin it via
         ``test_column`` and keep it if its score beats the baseline.

    Args:
        df: Full dataset, including the target column.
        target_column_name: Label of the target column.
        dataset_name: Name used only in the progress-bar description.
        min_unique: Inclusive lower bound on a candidate column's number of
            distinct values.
        max_unique: Inclusive upper bound on a candidate column's number of
            distinct values.

    Returns:
        A tuple ``(success_columns, score_without_binning)``.
        ``success_columns`` maps each improving column label to a dict with
        keys ``"score"`` (rounded to 3 decimals), ``"og_unique"``,
        ``"og_n_unique"``, ``"new_unique"``, ``"n_unique"`` and
        ``"total_time"``. ``score_without_binning`` is the baseline score
        rounded to 3 decimals.

    Raises:
        KeyError: If a column with missing values has a dtype name that is
            not a key of ``type_to_imputer_strategy``.

    NOTE(review): the imputers are fit on the full frame before the
    train/test split, so test rows influence the imputed values - confirm
    this is acceptable for the experiment.
    NOTE(review): the frame rebuilt from the transformer output may carry
    object dtype for every column - confirm ``preprocess_data`` copes.
    """
    df_columns = df.columns
    has_missing = df.isnull().any()
    nan_columns = set(has_missing.index[has_missing])

    # One single-column transformer per feature, in the frame's column order.
    # ColumnTransformer emits its output columns in transformer order, which
    # is what lets the original column labels be reattached below.
    transformers = []
    for feature in df_columns:
        if feature in nan_columns:
            strategy = type_to_imputer_strategy[df[feature].dtype.name]
            transformers.append((f'{feature}_imputer', SimpleImputer(strategy=strategy), [feature]))
        else:
            transformers.append((f'{feature}_keeper', 'passthrough', [feature]))

    column_trans = ColumnTransformer(transformers)
    df_transformed_data = column_trans.fit_transform(df)
    df_transformed = pd.DataFrame(data=df_transformed_data, columns=df_columns)

    df_train, df_test = train_test_split(df_transformed, test_size=0.25, random_state=SEED)
    # Reassign instead of inplace=True: same result, without mutating the
    # objects handed back by train_test_split.
    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    x_train, y_train, x_test, y_test = preprocess_data(df_train, df_test, target_column_name)

    # Keep the unrounded baseline for the comparison; round only for reporting.
    # Comparing an unrounded candidate against a rounded baseline could flag a
    # column that merely beat the rounded-down value, or miss one that truly
    # improved when the baseline was rounded up.
    raw_score_without_binning = get_score_of_classification_model(x_train=x_train, y_train=y_train,
                                                                  x_test=x_test, y_test=y_test)
    score_without_binning = round(raw_score_without_binning, 3)

    categorical_columns = get_categorical_columns_by_range_of_uniqueness(df_train, min_unique=min_unique,
                                                                         max_unique=max_unique,
                                                                         target_column=target_column_name)

    success_columns = {}

    for column in tqdm(categorical_columns, desc=f"{dataset_name} Progress"):
        with_binning_score, optimal_binning, total_time = test_column(df_train, df_test, target_column_name,
                                                                      categorical_column_name=column)

        if with_binning_score > raw_score_without_binning:
            success_columns[column] = {"score": round(with_binning_score, 3),
                                       "og_unique": sorted(df_train[column].unique()),
                                       "og_n_unique": df_train[column].nunique(),
                                       # Unique values of the binned training column.
                                       "new_unique": optimal_binning,
                                       "n_unique": len(optimal_binning),
                                       "total_time": total_time}

    return success_columns, score_without_binning

In [14]:
def get_results(datasets_df, min_unique, max_unique):
    """Run ``examine_dataset`` on every dataset and flatten the outcome.

    Args:
        datasets_df: DataFrame with one row per dataset and the columns
            ``"Name"``, ``"df"`` (the loaded data) and ``"Target Column"``.
        min_unique: Inclusive lower bound on distinct values, forwarded to
            ``examine_dataset``.
        max_unique: Inclusive upper bound on distinct values, forwarded to
            ``examine_dataset``.

    Returns:
        A list with one record (a list) per improving column, laid out as
        ``[dataset name, column, score, baseline score, og_unique,
        og_n_unique, new_unique, n_unique, total_time]``.
    """
    model_records = []

    dataset_rows = tqdm(datasets_df.iterrows(), desc="Datasets Progress", total=len(datasets_df))
    for _, row in dataset_rows:
        dataset_name = row["Name"]
        success_columns, score_without_binning = examine_dataset(df=row["df"], dataset_name=dataset_name,
                                                                 target_column_name=row["Target Column"],
                                                                 min_unique=min_unique, max_unique=max_unique)
        for col, stats in success_columns.items():
            model_records.append([
                dataset_name,
                col,
                stats["score"],
                score_without_binning,
                stats["og_unique"],
                stats["og_n_unique"],
                stats["new_unique"],
                stats["n_unique"],
                stats["total_time"],
            ])

    return model_records

In [15]:
def read_dataset(x):
    """Load the CSV file at ``data/<x>`` into a DataFrame."""
    return pd.read_csv(f'data/{x}')


# Each entry is [dataset name, loaded DataFrame, target column label].
churn_modeling = ['churn_modeling', read_dataset("churn_modeling/Churn_Modelling.csv"), 'Exited']
titanic = ['titanic', read_dataset("titanic/train.csv"), 'Survived']
# home_credit_risk = ['home_credit_risk', read_dataset("home_credit_risk/train.csv"), "TARGET"]

# One row per dataset; this is the table get_results iterates over.
datasets_df = pd.DataFrame(
    [churn_modeling, titanic],
    columns=['Name', 'df', 'Target Column'],
)

In [16]:
import warnings

# Suppress every Python warning from here on so the progress bars stay
# readable. NOTE(review): this also hides warnings that may matter.
warnings.simplefilter('ignore')

# Inclusive bounds on the number of distinct values a column must have to be
# tried as a binning candidate.
min_unique = 5
max_unique = 11

# One record per column whose binned score beat the dataset's baseline.
model_records = get_results(datasets_df, min_unique=min_unique, max_unique=max_unique)
# Left as the cell's last expression so the table is rendered as its output.
pd.DataFrame(data=model_records,
             columns=["Dataset", "Column Name", "Optimal Binning Model Score", "Score without Binning", "og_unique",
                      "og_n_unique", "new_unique", "n_unique", "total_time"])

Datasets Progress:   0%|          | 0/2 [00:00<?, ?it/s]
churn_modeling Progress:   0%|          | 0/1 [00:00<?, ?it/s][A
churn_modeling Progress: 100%|██████████| 1/1 [00:11<00:00, 11.39s/it][A
Datasets Progress:  50%|█████     | 1/2 [00:15<00:15, 15.96s/it]
titanic Progress:   0%|          | 0/2 [00:00<?, ?it/s][A
titanic Progress:  50%|█████     | 1/2 [00:00<00:00,  1.72it/s][A
titanic Progress: 100%|██████████| 2/2 [00:01<00:00,  1.64it/s][A
Datasets Progress: 100%|██████████| 2/2 [00:17<00:00,  8.72s/it]


Unnamed: 0,Dataset,Column Name,Optimal Binning Model Score,Score without Binning,og_unique,og_n_unique,new_unique,n_unique,total_time
0,churn_modeling,Tenure,84.28,83.76,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",11,"[0, 8, 10, 4, 9, 5]",6,7.533731
1,titanic,SibSp,81.614,80.717,"[0, 1, 2, 3, 4, 5, 8]",7,"[3, 2, 4, 5]",4,0.456467
2,titanic,Parch,81.166,80.717,"[0, 1, 2, 3, 4, 5, 6]",7,"[6, 2, 3, 4]",4,0.502581
