In [47]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [132]:
train_file_path = "data/diabetes/diabetes.csv"
target_column_name = "Outcome"

In [133]:
import pandas as pd

df = pd.read_csv(train_file_path)
raw_df = df
raw_df

Unnamed: 0,Pregnancies,Gender,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,CalorieIntake,Exercise,SleepDuration
0,0,M,148,72,35,0,33.6,0.627,50,1,2508.3,No,6
1,1,F,85,66,29,0,26.6,0.351,31,0,2760.0,Evening,8
2,0,M,183,64,0,0,23.3,0.672,32,1,6480.0,Morning,8
3,1,F,89,66,23,94,28.1,0.167,21,0,,Evening,8
4,0,M,137,40,35,168,43.1,2.288,33,1,1970.0,No,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,3,F,101,76,48,180,32.9,0.171,63,0,2860.0,Morning,6
764,2,F,122,70,27,0,36.8,0.340,27,0,3320.0,Morning,7
765,0,M,121,72,23,112,26.2,0.245,30,0,3453.0,Evening,4
766,1,F,126,60,0,0,30.1,0.349,47,1,8850.0,No,8


In [134]:
df_columns = df.columns
nan_columns_summary = df.isnull().sum() != 0
nan_columns = nan_columns_summary.index[nan_columns_summary].tolist()

In [135]:
type_to_imputer_strategy = {'float64': 'mean', 'object': 'most_frequent'}

In [136]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# The transformers must be with the order of the columns, otherwise when we wrap it back to dataframe the columns will mix up
transformers = []
for feature in df_columns:
    if feature in nan_columns:
        transformers.append(
            (f'{feature}_imputer', SimpleImputer(strategy=type_to_imputer_strategy[df[feature].dtype.name]), [feature]))
    else:
        transformers.append((f'{feature}_keeper', 'passthrough', [feature]))

column_trans = ColumnTransformer(transformers)
df_transformed_data = column_trans.fit_transform(df)
df_transformed = pd.DataFrame(data=df_transformed_data, columns=df_columns)
df_transformed

Unnamed: 0,Pregnancies,Gender,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,CalorieIntake,Exercise,SleepDuration
0,0,M,148,72,35,0,33.6,0.627,50,1,2508.3,No,6
1,1,F,85,66,29,0,26.6,0.351,31,0,2760,Evening,8
2,0,M,183,64,0,0,23.3,0.672,32,1,6480,Morning,8
3,1,F,89,66,23,94,28.1,0.167,21,0,3639.6,Evening,8
4,0,M,137,40,35,168,43.1,2.288,33,1,1970,No,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,3,F,101,76,48,180,32.9,0.171,63,0,2860,Morning,6
764,2,F,122,70,27,0,36.8,0.34,27,0,3320,Morning,7
765,0,M,121,72,23,112,26.2,0.245,30,0,3453,Evening,4
766,1,F,126,60,0,0,30.1,0.349,47,1,8850,No,8


In [137]:
from sklearn.model_selection import train_test_split
from src.test_binning import SEED

df_train, df_test = train_test_split(df_transformed, test_size=0.25, random_state=SEED)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [138]:
from src.data_preprocessing import preprocess_data

x_train, y_train, x_test, y_test = preprocess_data(df_train, df_test, target_column_name)

In [139]:
from src.test_binning import get_score_of_classification_model

score_without_binning = get_score_of_classification_model(x_train=x_train, y_train=y_train, x_test=x_test,
                                                          y_test=y_test)
print(f"The score without binning is: {score_without_binning:.3f}")

The score without binning is: 98.958


In [140]:
from src.binner import apply_binning_on_column
from src.alt_binning import find_optimal_binning_without_frequency


def test_column(df_train, df_test, target_column_name, categorical_column_name):
    optimal_binning = find_optimal_binning_without_frequency(df_train, target_column_name, categorical_column_name)

    df_train_copy = df_train.copy()
    df_test_copy = df_test.copy()
    new_col_train = apply_binning_on_column(df_train_copy[categorical_column_name], optimal_binning)
    df_train_copy[categorical_column_name] = new_col_train
    new_col_test = apply_binning_on_column(df_test_copy[categorical_column_name], optimal_binning)
    df_test_copy[categorical_column_name] = new_col_test

    x_train, y_train, x_test, y_test = preprocess_data(df_train_copy, df_test_copy, target_column_name)
    with_binning_score = get_score_of_classification_model(x_train, y_train, x_test, y_test)

    return with_binning_score


In [141]:
def get_categorical_columns_by_range_of_uniqueness(df, min_unique, max_unique, target_column):
    df = df.drop(target_column, axis=1)
    a = df.nunique()
    a = a.ge(min_unique) & a.le(max_unique)

    return list(a[a].index)

In [142]:
min_unique = 5
max_unique = 20

categorical_columns = get_categorical_columns_by_range_of_uniqueness(df_train, min_unique=min_unique,
                                                                     max_unique=max_unique, target_column=target_column_name)
print(f'{len(categorical_columns)} to test')

2 to test


In [143]:
from tqdm import tqdm

success_columns = []
failure_columns = []

for column in tqdm(categorical_columns):
    with_binning_score = test_column(df_train, df_test, target_column_name, categorical_column_name=column)

    if with_binning_score > score_without_binning:
        success_columns.append(column)
        print('SUCCESS (:')
    else:
        failure_columns.append(column)
        print('Failure ):')

    print(f'automatic_binning_score={with_binning_score:.3f}\nvanilla_score={score_without_binning:.3f}\n{len(success_columns)/(len(success_columns) + len(failure_columns))}')

 50%|█████     | 1/2 [00:00<00:00,  1.16it/s]

Failure ):
automatic_binning_score=98.438
vanilla_score=98.958
0.0


100%|██████████| 2/2 [00:17<00:00,  8.91s/it]

SUCCESS (:
automatic_binning_score=99.479
vanilla_score=98.958
0.5





In [144]:
print(f'{len(success_columns)} of {len(categorical_columns)}')

1 of 2


# The pipeline is completed
# TODO: FIND more datasets with categorical features