In [1]:
% load_ext autoreload
% autoreload 2

In [117]:
train_file_path = "data/application_train.csv"
target_column_name = "TARGET"

In [119]:
from src.utils import load_dataset

df = load_dataset(train_file_path)
raw_df = df
raw_df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
df_columns = df.columns
nan_columns_summary = df.isnull().sum() != 0
nan_columns = nan_columns_summary.index[nan_columns_summary].tolist()

In [121]:
type_to_imputer_strategy = {'float64': 'mean', 'object': 'most_frequent'}

In [122]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# The transformers must be with the order of the columns, otherwise when we wrap it back to dataframe the columns will mix up
transformers = []
for feature in df_columns:
    if feature in nan_columns:
        transformers.append(
            (f'{feature}_imputer', SimpleImputer(strategy=type_to_imputer_strategy[df[feature].dtype.name]), [feature]))
    else:
        transformers.append((f'{feature}_keeper', 'passthrough', [feature]))

column_trans = ColumnTransformer(transformers)
df_transformed_data = column_trans.fit_transform(df)
df_transformed = pd.DataFrame(data=df_transformed_data, columns=df_columns)
df_transformed

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500,406598,24700.5,...,0,0,0,0,0,0,0,0,0,1
1,100003,0,Cash loans,F,N,N,0,270000,1.2935e+06,35698.5,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500,135000,6750,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,Cash loans,F,N,Y,0,135000,312682,29686.5,...,0,0,0,0,0.00640245,0.00700021,0.0343619,0.267395,0.265474,1.89997
4,100007,0,Cash loans,M,N,Y,0,121500,513000,21865.5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500,254700,27558,...,0,0,0,0,0.00640245,0.00700021,0.0343619,0.267395,0.265474,1.89997
307507,456252,0,Cash loans,F,N,Y,0,72000,269550,12001.5,...,0,0,0,0,0.00640245,0.00700021,0.0343619,0.267395,0.265474,1.89997
307508,456253,0,Cash loans,F,N,Y,0,153000,677664,29979,...,0,0,0,0,1,0,0,1,0,1
307509,456254,1,Cash loans,F,N,Y,0,171000,370107,20205,...,0,0,0,0,0,0,0,0,0,0


In [123]:
from sklearn.model_selection import train_test_split
from src.test_binning import SEED

df_train, df_test = train_test_split(df_transformed, test_size=0.25, random_state=SEED)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [124]:
from src.data_preprocessing import preprocess_data

x_train, y_train, x_test, y_test = preprocess_data(df_train, df_test, target_column_name)

In [125]:
from src.test_binning import get_score_of_classification_model

score_without_binning = get_score_of_classification_model(x_train=x_train, y_train=y_train, x_test=x_test,
                                                          y_test=y_test)
print(f"The score without binning is: {score_without_binning:.3f}")

The score without binning is: 91.869


In [126]:
from src.binner import apply_binning_on_column
from src.alt_binning import find_optimal_binning_without_frequency


def test_column(df_train, target_column_name, categorical_column_name):
    optimal_binning = find_optimal_binning_without_frequency(df_train, target_column_name, categorical_column_name)

    df_train_copy = df_train.copy()
    df_test_copy = df_test.copy()
    new_col_train = apply_binning_on_column(df_train_copy[categorical_column_name], optimal_binning)
    df_train_copy[categorical_column_name] = new_col_train
    new_col_test = apply_binning_on_column(df_test_copy[categorical_column_name], optimal_binning)
    df_test_copy[categorical_column_name] = new_col_test

    x_train, y_train, x_test, y_test = preprocess_data(df_train_copy, df_test_copy, target_column_name)
    with_binning_score = get_score_of_classification_model(x_train, y_train, x_test, y_test)

    return with_binning_score


In [127]:
def get_categorical_columns_by_range_of_uniqueness(df, min_unique, max_unique):
    a = df.nunique()
    a = a.ge(min_unique) & a.le(max_unique)

    return list(a[a].index)

In [128]:
min_unique = 5
max_unique = 10

categorical_columns = get_categorical_columns_by_range_of_uniqueness(df_train, min_unique=min_unique,
                                                                     max_unique=max_unique)
print(f'{len(categorical_columns)} to test')

12 to test


In [129]:
from tqdm import tqdm

success_columns = []
failure_columns = []

for column in tqdm(categorical_columns):
    with_binning_score = test_column(df_train, target_column_name, categorical_column_name=column)

    if with_binning_score > score_without_binning:
        success_columns.append(column)
        print(f'SUCCESS (: {len(success_columns)/len(success_columns) + len(failure_columns)}')
    else:
        failure_columns.append(column)
        print(f'Failure ):  {len(success_columns)/len(categorical_columns)}')

  8%|▊         | 1/12 [00:33<06:09, 33.56s/it]

Failure ):  0.0


 17%|█▋        | 2/12 [01:03<05:14, 31.47s/it]

SUCCESS (: 0.08333333333333333


 25%|██▌       | 3/12 [01:32<04:31, 30.13s/it]

SUCCESS (: 0.16666666666666666


 33%|███▎      | 4/12 [02:03<04:05, 30.64s/it]

SUCCESS (: 0.25


 42%|████▏     | 5/12 [02:33<03:32, 30.29s/it]

Failure ):  0.25


 50%|█████     | 6/12 [03:08<03:11, 31.89s/it]

Failure ):  0.25


 58%|█████▊    | 7/12 [03:47<02:51, 34.28s/it]

SUCCESS (: 0.3333333333333333


 67%|██████▋   | 8/12 [04:30<02:28, 37.13s/it]

Failure ):  0.3333333333333333


 75%|███████▌  | 9/12 [05:05<01:49, 36.42s/it]

SUCCESS (: 0.4166666666666667


 83%|████████▎ | 10/12 [05:35<01:08, 34.40s/it]

SUCCESS (: 0.5


 92%|█████████▏| 11/12 [06:06<00:33, 33.53s/it]

SUCCESS (: 0.5833333333333334


100%|██████████| 12/12 [06:42<00:00, 33.51s/it]

SUCCESS (: 0.6666666666666666





In [133]:
print(f'{len(success_columns)} of {len(categorical_columns)}')

8 of 12


# The pipeline is completed
# TODO: FIND more datasets with categorical features, mushrooms is too easy (100 accuracy with 100 samples without any smart binning)