In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import operator
import itertools
import scipy.stats as stats
import pymit
from tqdm import tqdm
import traceback
from category_encoders import one_hot, target_encoder
created_features_dict = {}

In [2]:

dataset_name = 'horse'
target = 'outcome'
positive_target = 'lived'

In [3]:
df = pd.read_csv(f'datasets/{dataset_name}.csv')

In [30]:
def clean_df(df, target, positive_target):
    
    def clean_target(df, target, positive_target):
        df.loc[df[target] != positive_target, target] = 0
        df.loc[df[target] == positive_target, target] = 1
        y_all = df[[target]].astype(int)
        X_all = df.drop([target], axis=1)
        return X_all, y_all
    
    def nulls(df, th_del=0.8, th_mean=0.2):
        null = df.isnull().sum().sort_values()[::-1]
        null = null[null > 0] / len(df)
        null = pd.concat([df[null.index].dtypes, null],axis=1)
        null.columns=['dtype', 'ratio_null']
        null.index.name = 'feature'
        if null.empty:
            return "No nulls."
        
        for idx, row in null.iterrows():
            if row['ratio_null'] > th_del:
                df = df.drop(idx, axis=1)
            
            elif row['ratio_null'] < th_mean:
                df[idx] = df.fillna(df[idx].mean)
            
            else:
                if row['dtype'] == 'object':
                    df[idx] = df[idx].fillna('my_nan_value')
                else:
                    df[idx] = df[idx].fillna(df[idx].min()*1000)
        
        return df
    
    def clean_groups(df, target, max_group_size=10):
        is_object = ['object']
        object_features = list(df.select_dtypes(include=is_object).columns)
        if target in object_features:
            object_features.remove(target)
        for col in object_features:
            try:
                df[col] = df[col].astype(float)
            except:
                len_unique = len(df[col].unique())
                if len_unique < max_group_size:
                    df = df.rename({col: f"{col}_group"},axis=1)
                else:
                    df = df.rename({col: f"{col}_mean_encode"},axis=1)
        return df
                    
        
    
    df = nulls(df.copy())
    df = clean_groups(df.copy(), target)
    X_all, y_all = clean_target(df.copy(), target, positive_target)
    return X_all, y_all

def prepare_bases_to_modeling(X_train_ori, X_test_ori):
    
    one_hot_cols = [x for x in X_train_ori.columns if 'group' in x]
    mean_encoding_cols = [x for x in X_train_ori.columns if 'encode' in x]

    if len(one_hot_cols) > 0:
        enc = one_hot.OneHotEncoder(cols=one_hot_cols, drop_invariant=True)
        X_train_ori = enc.fit_transform(X_train_ori.copy())
        X_test_ori = enc.transform(X_test_ori.copy())
#         X_train = X_train.drop(ORIGINAL_FEATURES, axis=1)
#         X_train = pd.concat([X_train, X_train_ori], axis=1)
#         X_test = pd.concat([X_test, X_test_ori], axis=1)

    if len(mean_encoding_cols) > 0:
        enc = target_encoder.TargetEncoder(cols=mean_encoding_cols, drop_invariant=True)
        X_train_ori = training_numeric_dataset = enc.fit_transform(X_train_ori.copy(), y_train)
        X_test_ori = testing_numeric_dataset = enc.transform(X_test_ori.copy())
#         X_train = X_train.drop(ORIGINAL_FEATURES, axis=1)
#         X_train = pd.concat([X_train, X_train_ori], axis=1)
#         X_test = pd.concat([X_test, X_test_ori], axis=1)
    
    return X_train_ori, X_test_ori
    

In [31]:
X_all, y_all = clean_df(df.copy(), target, positive_target)
X_train_ori, X_test_ori, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.3, random_state = 0)
X_train_ori, X_test_ori = prepare_bases_to_modeling(X_train_ori.copy(), X_test_ori.copy())

In [32]:

def numpy_discretize(X_train, X_test, gran=10, retry=True):
    """
    multi-granularity discretization
    method. The basic idea is simple: instead of using a fine-tuned
    granularity, we discretize each numerical feature into several, rather
    than only one, categorical features, each with a different granularity.
    
    min granularity = 10
    
    Sometimes de edge values did not permit to execute correct discretization
    if this happens the step is not executed
    """
    global created_features_dict

    # separa dados numericos que precisam de binarizacao
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    # cehca se nao tem _disc somente nos ultimos caracteres
    numeric_features = [feat for feat in numeric_features.columns if 'disc' not in feat[-4:]]
    X_train_numeric_np = X_train[numeric_features].T.to_numpy()
    X_test_numeric_np = X_test[numeric_features].T.to_numpy()
    # cacheando posicoes das features
    dict_feature_order = {}
    for feat in numeric_features:
        dict_feature_order[feat] = X_train.columns.get_loc(feat)
    shape_X_train = X_train.shape[0]
    shape_X_test = X_test.shape[0]
    feat_count = 0
    with tqdm(total=len(numeric_features)) as pbar:
        for feat in numeric_features:
            feat_index = dict_feature_order[feat]
            this_gran = gran
            success = False
            while not success:
                try:
                    D_train = np.zeros([shape_X_train, 1])
                    D_test = np.zeros([shape_X_test, 1])
                    # calc numpy histogram and apply to features
                    hist, bin_edges = np.histogram(X_train_numeric_np[feat_index], bins=this_gran)
                    D_train[:, 0] = np.digitize(X_train_numeric_np[feat_index], bin_edges, right=False)
                    D_test[:, 0] = np.digitize(X_test_numeric_np[feat_index], bin_edges, right=False)

                    # apply back to pandas
                    X_train[f"{feat}_disc"] = D_train
                    X_test[f"{feat}_disc"] = D_test

                    success = True
                except:
#                     traceback.print_exc()
                    if retry:
#                         print(f"Not possible to correct work on cut {feat} > {this_gran}")
                        this_gran = this_gran - 1
                    else:
                        this_gran = 1
                        
                    if this_gran <= 1:
                        success = True

                if success and this_gran > 1:
                    #upoad global dict with feature info
                    created_features_dict[f"{feat}_disc"] = {
                        "num_of_source_features": 1,
                        "source_feature_name": [feat],
                        "source_feature_type": ['numeric'],
                        "target_feature_type": ['discrete'],
                        "operator": "discretizer"
                    }

            feat_count += 1
            pbar.update(1)
    return X_train, X_test



def min_max_scaler(X_train, X_test):
    global created_features_dict
    
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = list(X_train.select_dtypes(include=is_numeric).columns)
    numeric_features = [x for x in numeric_features if 'disc' not in x[-4:]]
    scaler = MinMaxScaler()
    scaler.fit(X_train[numeric_features])
    norm_feats = [f"{x}_norm" for x in ORIGINAL_FEATURES if 'disc' not in x[-4:] and x in numeric_features]
    
    for feat in [x for x in ORIGINAL_FEATURES if 'disc' not in x[-4:]]:
        #upoad global dict with feature info
        created_features_dict[f"{feat}_norm"] = {
            "num_of_source_features": 1,
            "source_feature_name": [feat],
            "source_feature_type": ['numeric'],
            "target_feature_type": ['numeric'],
            "operator": "normalizer"
        }
    X_train = X_train.reindex(columns=X_train.columns.tolist() + norm_feats)
    X_test = X_test.reindex(columns=X_test.columns.tolist() + norm_feats)
    X_train.loc[:, norm_feats] = scaler.transform(X_train[numeric_features])
    X_test.loc[:, norm_feats] = scaler.transform(X_test[numeric_features])
    return X_train, X_test


def binary_operators(df):
    global created_features_dict
    
    # calc all pair columns
    all_columns = list(df)
    all_columns = [x for x in all_columns if "disc" not in x and "group" not in x and "encode" not in x]
    pairwise_cols = list(itertools.combinations(all_columns, 2))
    tmp_dfs = []
    with tqdm(total=len(pairwise_cols)) as pbar:
        for pair in pairwise_cols:
            tmp_df = df[[pair[0], pair[1]]].copy()
            
            tmp_df[f"{pair[0]}_x_{pair[1]}_op_sum"] = tmp_df[pair[0]] + tmp_df[pair[1]]

            #upoad global dict with feature info
            type_pair1 = 'disc' if 'disc' in pair[0] else 'numeric'
            type_pair2 = 'disc' if 'disc' in pair[1] else 'numeric'

            created_features_dict[f"{pair[0]}_x_{pair[1]}_op_sum"] = {
                "num_of_source_features": 2,
                "source_feature_name": [pair[0], pair[1]],
                "source_feature_type": [type_pair1, type_pair2],
                "target_feature_type": ['numeric'],
                "operator": "binary_sum"
            }        

            tmp_df[f"{pair[0]}_x_{pair[1]}_op_sub"] = tmp_df[pair[0]] - tmp_df[pair[1]]
            created_features_dict[f"{pair[0]}_x_{pair[1]}_op_sub"] = {
                "num_of_source_features": 2,
                "source_feature_name": [pair[0], pair[1]],
                "source_feature_type": [type_pair1, type_pair2],
                "target_feature_type": ['numeric'],
                "operator": "binary_sub"
            }             

            tmp_df[f"{pair[0]}_x_{pair[1]}_op_mul"] = tmp_df[pair[0]] * tmp_df[pair[1]]
            created_features_dict[f"{pair[0]}_x_{pair[1]}_op_mul"] = {
                "num_of_source_features": 2,
                "source_feature_name": [pair[0], pair[1]],
                "source_feature_type": [type_pair1, type_pair2],
                "target_feature_type": ['numeric'],
                "operator": "binary_mul"
            }             

            tmp_df[f"{pair[0]}_x_{pair[1]}_op_div"] = tmp_df[pair[0]] / tmp_df[pair[1]]
            created_features_dict[f"{pair[0]}_x_{pair[1]}_op_div"] = {
                "num_of_source_features": 2,
                "source_feature_name": [pair[0], pair[1]],
                "source_feature_type": [type_pair1, type_pair2],
                "target_feature_type": ['numeric'],
                "operator": "binary_div"
            }   
            tmp_df = tmp_df.replace([np.inf, -np.inf], np.nan)
            tmp_dfs.append(tmp_df)
            pbar.update(1)
        tmp_df_concat = pd.concat(tmp_dfs, axis=1)
        df = pd.concat([df, tmp_df_concat], axis=1)
    return df


def high_order_operators(df):
    
    def _update_dict(group_col, columns, op):
        global created_features_dict
        
        for feat in columns:
            created_features_dict[f'{feat}_group_by_{col}_and_{op}'] = {
            "num_of_source_features": 2,
            "source_feature_name": [group_col, feat],
            "source_feature_type": ['discrete', 'numeric'],
            "target_feature_type": ['numeric'],
            "operator": f"group_{op}"
            }
            
    group_columns = [col for col in df.columns if "disc" in col or "group" in col]
    to_group_columns = [col for col in df.columns if "disc" not in col and "group" not in col]
    all_dfs = pd.DataFrame()
    for col in group_columns:
        print(f"Grouping {col}")
        
        df_avg = df[to_group_columns+[col]].groupby(col).transform('mean').add_suffix(f'_group_by_{col}_and_mean')
        _update_dict(col, to_group_columns, 'mean')
        df_min = df[to_group_columns+[col]].groupby(col).transform('min').add_suffix(f'_group_by_{col}_and_min')
        _update_dict(col, to_group_columns, 'min')
        df_max = df[to_group_columns+[col]].groupby(col).transform('max').add_suffix(f'_group_by_{col}_and_max')
        _update_dict(col, to_group_columns, 'max')
        
        all_dfs = pd.concat([all_dfs, df_avg, df_min, df_max], axis=1,  sort=False)
    all_dfs = pd.concat([df,all_dfs], axis=1)
    return all_dfs

def _entropy_based_measures(X_train, y_train, target):

    df_mutual_info = pd.DataFrame()
    fail_count = 0
    with tqdm(total=len( X_train.columns)) as pbar:
        for feat in X_train.columns:
            try:
                df_mutual_info[feat] = [pymit.I(X_train[feat].values, y_train[target].values , bins=[10,2])]
                pbar.update(1)
            except:
                fail_count += 1
    print(fail_count)
    return df_mutual_info

In [33]:
ORIGINAL_FEATURES = X_train_ori.columns

# discretize
X_train, X_test = numpy_discretize(X_train_ori.copy(), X_test_ori.copy(), gran=10)


100%|██████████| 62/62 [00:00<00:00, 1001.04it/s]


In [34]:
# normalize
X_train, X_test = min_max_scaler(X_train.copy(), X_test.copy())

step1_train = X_train.copy()
step1_test = X_test.copy()
step1_features = X_train.columns

In [35]:
X_train = high_order_operators(X_train[step1_features].copy())
X_test = high_order_operators(X_test[step1_features].copy())

step2_train = X_train.copy()
step2_test = X_test.copy()
step2_features = X_train.columns

Grouping surgery_group_1
Grouping surgery_group_2
Grouping age_group_1
Grouping age_group_2
Grouping pulse_group_1
Grouping pulse_group_2
Grouping respiratory_rate_group_1
Grouping respiratory_rate_group_2
Grouping temp_of_extremities_group_1
Grouping temp_of_extremities_group_2
Grouping peripheral_pulse_group_1
Grouping peripheral_pulse_group_2
Grouping peripheral_pulse_group_3
Grouping peripheral_pulse_group_4
Grouping peripheral_pulse_group_5
Grouping mucous_membrane_group_1
Grouping mucous_membrane_group_2
Grouping capillary_refill_time_group_1
Grouping capillary_refill_time_group_2
Grouping pain_group_1
Grouping pain_group_2
Grouping peristalsis_group_1
Grouping peristalsis_group_2
Grouping abdominal_distention_group_1
Grouping abdominal_distention_group_2
Grouping nasogastric_tube_group_1
Grouping nasogastric_tube_group_2
Grouping nasogastric_tube_group_3
Grouping nasogastric_tube_group_4
Grouping nasogastric_reflux_group_1
Grouping nasogastric_reflux_group_2
Grouping nasogastric

In [36]:
# binary operators
X_train = binary_operators(X_train[step1_features].copy())
X_test = binary_operators(X_test[step1_features].copy())

step3_train = X_train.copy()
step3_test = X_test.copy()
step3_features = X_train.columns

100%|██████████| 66/66 [00:00<00:00, 230.83it/s]
100%|██████████| 66/66 [00:00<00:00, 232.86it/s]


In [37]:
X_train = pd.concat([step1_train, step2_train, step3_train], axis=1).dropna(axis=1, thresh=0.03*X_train.shape[0])
X_test = pd.concat([step1_test, step2_test, step3_test], axis=1)[X_train.columns]
X_train = X_train.loc[:,~X_train.columns.duplicated()]
X_test = X_test.loc[:,~X_test.columns.duplicated()]

In [38]:
%%time
# discretize
X_train, X_test = numpy_discretize(X_train.copy(), X_test.copy(), gran=10)


100%|██████████| 6622/6622 [00:20<00:00, 328.94it/s]

CPU times: user 19.8 s, sys: 607 ms, total: 20.5 s
Wall time: 20.2 s





In [39]:
entropy_filter = _entropy_based_measures(X_train, y_train, target)
features_to_keep = np.round(entropy_filter.T.sort_values(by=0), 2)
to_keep = list(features_to_keep.loc[features_to_keep[0]>0].index)
len(to_keep)

100%|█████████▉| 13116/13149 [00:24<00:00, 528.00it/s]

33





6194

In [40]:
def dataset_based_meta_features(X_train, X_test, y_train, y_test):
    """
    Applied in the original set!
    """
    
    
    
    def _general_information(X):
        dataset_info_df= pd.DataFrame()

        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        num_numeric_attr = X.select_dtypes(include=numerics).shape[1]
        num_duscrete_attr = X.shape[1] - num_numeric_attr
            
        
        dataset_info_df['num_instances'] = [X.shape[0]]
        dataset_info_df['num_features'] = X.shape[1]
        
        dataset_info_df['num_numeric_attr'] = num_numeric_attr
        dataset_info_df['num_discrete_attr'] = num_duscrete_attr
        dataset_info_df['ratio_numeric_attr'] = num_numeric_attr/ (num_numeric_attr+num_duscrete_attr)
        dataset_info_df['ratio_discrete_attr'] = num_duscrete_attr/ (num_numeric_attr+num_duscrete_attr)
        
        return dataset_info_df
    
    


    def _initial_evaluation(X_train, X_test, y_train, y_test):
    
        from sklearn.ensemble import RandomForestClassifier
        from sklearn import metrics
        
        def acc(y_true, y_pred):
            from sklearn.metrics import accuracy_score
            y_pred = list(map(lambda k: 0 if k<=0.5 else 1, y_pred))
            return accuracy_score(y_true, y_pred)

        def f1(y_true, y_pred, th):
            from sklearn.metrics import f1_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return f1_score(y_true, y_pred)

        def precision(y_true, y_pred, th):
            from sklearn.metrics import precision_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return precision_score(y_true, y_pred, average='macro') 

        def recall(y_true, y_pred, th):
            from sklearn.metrics import recall_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return recall_score(y_true, y_pred, average='macro')

        def auc(y_true, y_pred):
            from sklearn.metrics import roc_auc_score
            return roc_auc_score(y_true, y_pred)


        df_initial_evaluation = pd.DataFrame()
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:,1]

        for th in [0.4, 0.45, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]:
            df_initial_evaluation[f'f1_{th}'] = [f1(y_test, y_pred, th)]
            df_initial_evaluation[f'precision_{th}'] = precision(y_test, y_pred, th)
            df_initial_evaluation[f'recall_{th}'] = recall(y_test, y_pred, th)

        df_initial_evaluation['auc'] = auc(y_test, y_pred)
        
        df_initial_evaluation['avg_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].mean(axis=1)
        df_initial_evaluation['std_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].std(axis=1)
        df_initial_evaluation['max_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].max(axis=1)
        df_initial_evaluation['min_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].min(axis=1)
    
        df_initial_evaluation['avg_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].mean(axis=1)
        df_initial_evaluation['std_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].std(axis=1)
        df_initial_evaluation['max_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].max(axis=1)
        df_initial_evaluation['min_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].min(axis=1)
    
        df_initial_evaluation['avg_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].mean(axis=1)
        df_initial_evaluation['std_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].std(axis=1)
        df_initial_evaluation['max_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].max(axis=1)
        df_initial_evaluation['min_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].min(axis=1)
    
        return df_initial_evaluation
    
    
    def _entropy_based_measures(X_train, y_train):

        df_mutual_info = pd.DataFrame()

        for feat in X_train.columns:
            df_mutual_info[feat] = [pymit.I(X_train[feat].values, y_train[target].values , bins=[10,2])]
        
        df_mutual_info['avg_mi'] = df_mutual_info.mean(axis=1)
        df_mutual_info['std_mi'] = df_mutual_info.std(axis=1)
        df_mutual_info['min_mi'] = df_mutual_info.min(axis=1)
        df_mutual_info['max_mi'] = df_mutual_info.max(axis=1)
        return df_mutual_info[['avg_mi', 'std_mi', 'min_mi', 'max_mi']]
    
    
    def _feature_diversity(X_train):
        
        df_feature_diversity = pd.DataFrame()
        
        disc_columns = [col for col in X_train if 'disc' in col and 'group' not in col and 'encode' not in col]
        numeric_columns = [col for col in X_train if 'disc' not in col and 'group' not in col and 'encode' not in col]
        
        numeric_pairs = list(itertools.combinations(numeric_columns, 2))
        all_t = []
        for pair in numeric_pairs:
            t_pair = stats.ttest_rel(X_train[pair[0]].values, X_train[pair[1]].values)[0]
            all_t.append(t_pair)
        
        all_chi = []
        disc_pairs = list(itertools.combinations(disc_columns, 2))
        for pair in disc_pairs:
            contingency = pd.crosstab(X_train[pair[0]].values, X_train[pair[1]].values) 
            chi, _, _, _ = stats.chi2_contingency(contingency) 
            all_chi.append(chi)
        
        if len(all_t) == 0:
            df_feature_diversity['avg_t'] = [-99]
            df_feature_diversity['std_t'] = -99
            df_feature_diversity['max_t'] = -99
            df_feature_diversity['min_t'] = -99
        else:
            df_feature_diversity['avg_t'] = [np.mean(all_t)]
            df_feature_diversity['std_t'] = np.std(all_t)
            df_feature_diversity['max_t'] = np.max(all_t)
            df_feature_diversity['min_t'] = np.min(all_t)
            
        if len(all_chi) == 0:
            df_feature_diversity['avg_chi'] = -99
            df_feature_diversity['std_chi'] = -99
            df_feature_diversity['max_chi'] = -99
            df_feature_diversity['min_chi'] = -99
        else:        
            df_feature_diversity['avg_chi'] = np.mean(all_chi)
            df_feature_diversity['std_chi'] = np.std(all_chi)
            df_feature_diversity['max_chi'] = np.max(all_chi)
            df_feature_diversity['min_chi'] = np.min(all_chi)

        return df_feature_diversity
    
    dataset_info_df = _general_information(X_train.copy())
    dataset_initial_eval = _initial_evaluation(X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy())
    dataset_entropy_info = _entropy_based_measures(X_train.copy(), y_train.copy())
    dataset_feature_diversity = _feature_diversity(X_train.copy())

    
    df = pd.concat([dataset_info_df, dataset_initial_eval, dataset_entropy_info, dataset_feature_diversity], axis=1)
    return df
        
        
def candidate_mi_and_stattest(operator_feat, X_train, y_train, X_train_ori, target):
    # passo 1 da parte de features candidatas
    tests_df = pd.DataFrame()
    
    this_feat = created_features_dict[operator_feat]
    original_features = X_train_ori.columns

    all_t = []
    for original_feat in original_features:
        tmp_df_stat_tests = pd.DataFrame()
        tmp_df_stat_tests['feature_name'] = [operator_feat]
        
        # nao faz o teste na feature que deu origem a nova
        if original_feat not in this_feat['source_feature_name']:
            t_pair = stats.ttest_rel(X_train[operator_feat].values, X_train_ori[original_feat].values)[0]
            all_t.append(t_pair)
    
    try:
        mutual_info = pymit.I(X_train[operator_feat].values, y_train[target].values , bins=[10,2])
    except:
        print(operator_feat)
              
    
    if len(all_t) == 0:
        tmp_df_stat_tests[f'{original_feat}_avg_t'] = -99
        tmp_df_stat_tests[f'{original_feat}_std_t'] = -99
        tmp_df_stat_tests[f'{original_feat}_max_t'] = -99
        tmp_df_stat_tests[f'{original_feat}_min_t'] = -99
    else:
        tmp_df_stat_tests[f'{original_feat}_avg_t'] = np.mean(all_t)
        tmp_df_stat_tests[f'{original_feat}_std_t'] = np.std(all_t)
        tmp_df_stat_tests[f'{original_feat}_max_t'] = np.max(all_t)
        tmp_df_stat_tests[f'{original_feat}_min_t'] = np.min(all_t)
    
    tmp_df_stat_tests[f'feat_mutual_info'] = mutual_info
      
    return tmp_df_stat_tests  


def generic_meta_features(operator_feature, X_train):
    # passo 2 das features candidatas
    # https://github.com/giladkatz/ExploreKit/blob/master/src/main/java/explorekit/Evaluation/MLFeatureExtraction/OperatorAssignmentBasedAttributes.java
    op_dict = created_features_dict[operator_feature]
    df_generic_meta_feats = pd.DataFrame()
    df_generic_meta_feats['feature_name'] = [operator_feature]
    df_generic_meta_feats['num_sources'] = op_dict['num_of_source_features']
    df_generic_meta_feats['num_numeric_sources'] = len([x for x in op_dict['source_feature_type'] if 'numeric' in x])
    df_generic_meta_feats['num_discrete_sources'] = len([x for x in op_dict['source_feature_type'] if 'discrete' in x])
    df_generic_meta_feats['discretizer_in_use'] = True if op_dict['operator']=='discretizer' else False
    df_generic_meta_feats['normalizer_in_use'] = True if op_dict['operator']=='normalizer' else False
    df_generic_meta_feats['group_in_use'] = True if 'group' in op_dict['operator'] else False
    df_generic_meta_feats['binary_in_use'] = True if 'binary' in op_dict['operator'] else False
    
    # discrete sources
    indices_discrete = [i for i, x in enumerate(op_dict['source_feature_type']) if x == "discrete"]
    if len(indices_discrete) >= 1:
        discrete_columns = [op_dict['source_feature_name'][i] for i in indices_discrete]
        X_train_numpy = X_train[discrete_columns].astype(float).to_numpy()
        df_generic_meta_feats['max_discrete_source_value'] = X_train_numpy.max()
        df_generic_meta_feats['min_discrete_source_value'] = X_train_numpy.min()
        df_generic_meta_feats['avg_discrete_source_value'] = X_train_numpy.mean()
        df_generic_meta_feats['std_discrete_source_value'] = X_train_numpy.std()
        
        all_chi = []
        for discrete_feat in discrete_columns:
            # transform target feature in discrete
            if 'disc' not in operator_feature:
                this_feat_discrete, _ = numpy_discretize(X_train[[operator_feature]].copy(),
                                                      X_train[[operator_feature]].copy(), gran=10)
                contingency = pd.crosstab(X_train[discrete_feat].values, this_feat_discrete[f"{operator_feature}_disc"].values) 
            else:
                this_feat_discrete = X_train[[operator_feature]]
                contingency = pd.crosstab(X_train[discrete_feat].values, this_feat_discrete[f"{operator_feature}"].values) 
            
            chi, _, _, _ = stats.chi2_contingency(contingency) 
            all_chi.append(chi)
        
        df_generic_meta_feats['max_chi_source_opattr_value'] = np.max(chi)
        df_generic_meta_feats['min_chi_source_opattr_value'] = np.min(chi)
        df_generic_meta_feats['avg_chi_source_opattr_value'] = np.mean(chi)
        df_generic_meta_feats['std_chi_source_opattr_value'] = np.std(chi)
            
        
    else:
        df_generic_meta_feats['max_discrete_source_value'] = 0
        df_generic_meta_feats['min_discrete_source_value'] = 0
        df_generic_meta_feats['avg_discrete_source_value'] = 0
        df_generic_meta_feats['std_discrete_source_value'] = 0
        df_generic_meta_feats['max_chi_source_opattr_value'] = 0
        df_generic_meta_feats['min_chi_source_opattr_value'] = 0
        df_generic_meta_feats['avg_chi_source_opattr_value'] = 0
        df_generic_meta_feats['std_chi_source_opattr_value'] = 0
        
    # numeric sources
    indices_numeric = [i for i, x in enumerate(op_dict['source_feature_type']) if x == "numeric"]
    if len(indices_numeric) >= 1:
        numeric_columns = [ op_dict['source_feature_name'][i] for i in indices_numeric]
        try:
            X_train_numpy = X_train[numeric_columns].to_numpy()
            df_generic_meta_feats['max_numeric_source_value'] = np.max(X_train_numpy)
            df_generic_meta_feats['min_numeric_source_value'] = np.min(X_train_numpy)
            df_generic_meta_feats['avg_numeric_source_value'] = np.mean(X_train_numpy)
            df_generic_meta_feats['std_numeric_source_value'] = np.std(X_train_numpy)
        except:
            print(X_train_numpy)
            print(numeric_columns)
            print(operator_feature)
        all_t = []
        for src_feat in numeric_columns:
            t_pair = stats.ttest_rel(X_train[operator_feature].values,
                                     X_train[src_feat].values)[0]
            all_t.append(t_pair)
            
        df_generic_meta_feats['max_ttest_source_opattr_value'] = np.max(all_t)
        df_generic_meta_feats['min_ttest_source_opattr_value'] = np.min(all_t)
        df_generic_meta_feats['avg_ttest_source_opattr_value'] = np.mean(all_t)
        df_generic_meta_feats['std_ttest_source_opattr_value'] = np.std(all_t)
           
    else:
        df_generic_meta_feats['max_numeric_source_value'] = 0
        df_generic_meta_feats['min_numeric_source_value'] = 0
        df_generic_meta_feats['avg_numeric_source_value'] = 0
        df_generic_meta_feats['std_numeric_source_value'] = 0
        df_generic_meta_feats['max_ttest_source_opattr_value'] = 0
        df_generic_meta_feats['min_ttest_source_opattr_value'] = 0
        df_generic_meta_feats['avg_ttest_source_opattr_value'] = 0
        df_generic_meta_feats['std_ttest_source_opattr_value'] = 0

    return df_generic_meta_feats
        
              

In [41]:
dataset_based_meta_features = dataset_based_meta_features(X_train_ori.copy(), X_test_ori.copy(), y_train.copy(), y_test.copy())



In [42]:
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

In [43]:
fitered_keep_dict = {k: v for k, v in created_features_dict.items() if k in to_keep}

In [46]:
%%time
t_test_statistic_candidate_df = pd.DataFrame()
general_meta_feature_candidates = pd.DataFrame()

with tqdm(total=len(to_keep)) as pbar:
    for k, v in fitered_keep_dict.items():
        all_features = list([k] + v['source_feature_name'])
        tmp_t_test_statistic_candidate_df = candidate_mi_and_stattest(k, X_train[all_features].copy(), y_train.copy(), X_train_ori.copy(), target)
        tmp_general_meta_feature_candidates = generic_meta_features(k, X_train[all_features].copy())

        t_test_statistic_candidate_df = t_test_statistic_candidate_df.append(tmp_t_test_statistic_candidate_df)
        general_meta_feature_candidates = general_meta_feature_candidates.append(tmp_general_meta_feature_candidates)
        pbar.update(1)
        

  6%|▋         | 395/6194 [00:47<11:56,  8.09it/s]
100%|██████████| 1/1 [00:00<00:00, 599.79it/s]
  6%|▋         | 396/6194 [00:47<12:23,  7.80it/s]
100%|██████████| 1/1 [00:00<00:00, 808.93it/s]
  6%|▋         | 397/6194 [00:47<12:40,  7.62it/s]
100%|██████████| 1/1 [00:00<00:00, 602.89it/s]
  6%|▋         | 398/6194 [00:47<12:47,  7.55it/s]
100%|██████████| 1/1 [00:00<00:00, 583.92it/s]
  6%|▋         | 399/6194 [00:47<13:03,  7.39it/s]
100%|██████████| 1/1 [00:00<00:00, 800.90it/s]
  6%|▋         | 400/6194 [00:48<13:01,  7.41it/s]
100%|██████████| 1/1 [00:00<00:00, 801.82it/s]
  6%|▋         | 401/6194 [00:48<13:20,  7.24it/s]
100%|██████████| 1/1 [00:00<00:00, 830.23it/s]
  6%|▋         | 402/6194 [00:48<13:20,  7.23it/s]
100%|██████████| 1/1 [00:00<00:00, 837.02it/s]
  7%|▋         | 403/6194 [00:48<13:14,  7.29it/s]
100%|██████████| 1/1 [00:00<00:00, 767.06it/s]
  7%|▋         | 404/6194 [00:48<13:14,  7.29it/s]
100%|██████████| 1/1 [00:00<00:00, 518.07it/s]
  7%|▋         | 405

CPU times: user 13min 23s, sys: 10.3 s, total: 13min 33s
Wall time: 13min 13s





In [58]:
final_df = general_meta_feature_candidates.merge(t_test_statistic_candidate_df, on='feature_name')
final_df.index = final_df['feature_name']
final_df = final_df.drop(['feature_name'], axis=1)

In [49]:
%%time
from fangorn.training import classifiers

base_clf = classifiers.logistic_regression_classifier(train_set= [X_train_ori, y_train],
                         test_set= [X_test_ori, y_test],
                         features= X_train_ori.columns,
                         target= 'Class',
                         test_metrics= ['auc'],
                         project_name= dataset_name
                         ) 
base_auc = base_clf['calc_metrics']['auc']
base_auc

CPU times: user 44.1 ms, sys: 4.05 ms, total: 48.2 ms
Wall time: 46.7 ms


0.6235

In [62]:
%%time
dict_feature_error_diff = {}
with tqdm(total=final_df.shape[0]) as pbar:
    for idx, row in final_df.iterrows():
        this_X_train = X_train_ori.copy()
        this_X_test = X_test_ori.copy()

        this_X_train[idx] = X_train[idx]
        this_X_test[idx] = X_test[idx]

        this_clf = classifiers.random_forest_classifier(train_set= [this_X_train, y_train],
                             test_set= [this_X_test, y_test],
                             features= this_X_train.columns,
                             target= 'Class',
                             test_metrics= ['auc'],
                             project_name= 'explore_kit'
                             ) 
        this_auc = this_clf['calc_metrics']['auc']
        error_diff = this_auc - base_auc
        dict_feature_error_diff[idx] = error_diff
        pbar.update(1)

100%|██████████| 6148/6148 [14:59<00:00,  6.84it/s]

CPU times: user 14min 58s, sys: 6.54 s, total: 15min 4s
Wall time: 14min 59s





6148

In [86]:
tt = final_df.reset_index()
tt['feature_goodness'] = tt['feature_name']
tt['feature_goodness'] = tt['feature_goodness'].map(dict_feature_error_diff)

In [78]:
th_value =  np.round(tt['feature_goodness'].quantile(.8),3)
th_value

0.12

In [117]:
keep = list({i[1]:i[0] for i in sorted(zip(dict_feature_error_diff.values(), dict_feature_error_diff.keys()), reverse=True)[:300]}.keys())

In [79]:
only_features = {k: v for k, v in dict_feature_error_diff.items() if v > th_value}
keep = list(only_features.keys())
len(keep)

1776

In [98]:
def save_dataset_info(final_df, dict_feature_error_diff, dataset_name, base_clf, keep, dataset_based_meta_features):
    import joblib
    joblib.dump(keep, f'ExploreKit/{dataset_name}_{len(keep)}_features_to_keep')
    joblib.dump(dict_feature_error_diff, f'ExploreKit/{dataset_name}_dict_feature_error_diff')
    joblib.dump(base_clf, f'ExploreKit/{dataset_name}_base_clf')
    # save meta feature csv
    final_df = final_df.reset_index()
    final_df.to_csv(f'ExploreKit/{dataset_name}_meta_features.csv', index=False)
    final_df.index = final_df['feature_name']
    final_df = final_df.drop(['feature_name'], axis=1)
    
    # save dataset feature dict
    joblib.dump(dict_feature_error_diff, f'ExploreKit/{dataset_name}.featuredict')

    # join and save final dataset for mL modeling
    for col in dataset_based_meta_features.columns:
        final_df[col] = list(dataset_based_meta_features[col].values) * final_df.shape[0]
    final_df['dataset'] = dataset_name

    tt = final_df.reset_index()
    tt['feature_goodness'] = tt['feature_name']
    tt['feature_goodness'] = tt['feature_goodness'].map(dict_feature_error_diff)
    joblib.dump(tt, f'ExploreKit/{dataset_name}_meta_ml_modeling.df')
    
    return True

## FS

In [118]:
X_train = X_train[keep]
X_test = X_test[keep]

In [119]:
def numpy_discretize_multi_gran(X_train, X_test, max_gran=10):
    """
    multi-granularity discretization
    method. The basic idea is simple: instead of using a fine-tuned
    granularity, we discretize each numerical feature into several, rather
    than only one, categorical features, each with a different granularity.
    
    min granularity = 3
    
    Sometimes de edge values did not permit to execute correct discretization
    if this happens the step is not executed
    """
    
    # separa dados numericos que precisam de binarizacao
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    discrete_features = []
    print(f"Discretizing {len(numeric_features.columns)} features...")
    feat_count = 0
    for feat in numeric_features:
        if feat_count % 50 == 0:
            print(f" Working in {feat}")
        X_train_np = X_train[[feat]].to_numpy()
        X_test_np = X_test[[feat]].to_numpy()
        for gran in range(3, max_gran+1):
            try:
                D_train = np.zeros([X_train.shape[0], 1])
                D_test = np.zeros([X_test.shape[0], 1])
                # calc numpy histogram and apply to features
                hist, bin_edges = np.histogram(X_train_np[:, 0], bins=gran)
                D_train[:, 0] = np.digitize(X_train_np[:,0], bin_edges, right=False)
                D_test[:, 0] = np.digitize(X_test_np[:,0], bin_edges, right=False)

                # apply back to pandas
                X_train[f"{feat}_{gran}"] = D_train
                X_test[f"{feat}_{gran}"] = D_test
            except:
                print(f"Not possible to correct work on cut {feat} > {gran}")
                break
        
        feat_count += 1
        X_train = X_train.drop(feat, axis=1)
        X_test = X_test.drop(feat, axis=1)
        
    return X_train, X_test

In [124]:
X_train_discrete, X_test_discrete = numpy_discretize_multi_gran(X_train.copy(), X_test.copy())

Discretizing 300 features...
 Working in lesion_1_norm_x_lesion_2_norm_op_sub
 Working in abdomo_protein_group_by_lesion_1_disc_and_mean
 Working in rectal_temp_norm_x_lesion_2_norm_op_sum
 Working in abdomo_protein_norm_group_by_abdomo_appearance_group_2_norm_and_mean
 Working in hospital_number_norm_x_lesion_1_norm_op_sub
 Working in hospital_number_norm_group_by_nasogastric_reflux_group_3_disc_and_min_disc


In [125]:
def hjmi_selector(X, y, bins, max_features):
    
    X = X.to_numpy()
    Y = y.to_numpy().ravel()

    [tmp, features] = X.shape
    D = np.zeros([tmp, features])

    for i in range(features):
        N, E = np.histogram(X[:,i], bins=bins)
        D[:,i] = np.digitize(X[:,i], E, right=False)

    selected_features = []
    j_h = 0
    hjmi = None
    for i in range(0,max_features):
        JMI = np.zeros([features], dtype=np.float)
        for X_k in range(features):
            if X_k in selected_features:
                continue
            jmi_1 = pymit.I(D[:,X_k], Y, bins=[bins,2])
            jmi_2 = 0
            for X_j in selected_features:
                tmp1 = pymit.I(D[:,X_k], D[:,X_j], bins=[bins,bins])
                tmp2 = pymit.I_cond(D[:,X_k], D[:,X_j], Y, bins=[bins,bins,2])
                jmi_2 += tmp1 - tmp2
            if len(selected_features) == 0:
                JMI[X_k] += j_h + jmi_1
            else:
                JMI[X_k] += j_h + jmi_1 - jmi_2/len(selected_features)
        
        f = JMI.argmax()
        j_h = JMI[f]
        if (hjmi == None) or ((j_h - hjmi)/hjmi > 0.03):
            r = 0
            if hjmi != None:
                r = ((j_h - hjmi)/hjmi) 

            hjmi = j_h
            selected_features.append(f)
            print("{:0>3d} {:>3d} {} - {}".format(len(selected_features), f, j_h, r))
        else:
            return selected_features

In [126]:
%%time
selected_features = hjmi_selector(X_train_discrete.copy(), y_train.copy(), bins=10, max_features=300)

001 1607 0.2020153150546952 - 0
002 641 0.2563862279892669 - 0.2691425297129127
003 693 0.36851553594969894 - 0.4373452850405291
004 694 0.44658408931513566 - 0.21184603022026405
005 1605 0.5174751812027086 - 0.15874074689111484
006 740 0.5745648035810855 - 0.11032340187927464
007 158 0.6321289686818531 - 0.10018741966439267
008 1603 0.7005171597884138 - 0.10818708601374047
009 741 0.7653990975565872 - 0.09262005485742925
010 1601 0.8260229972320378 - 0.07920560642020948
011 742 0.8833138403727532 - 0.06935744323426128
012 692 0.9399554216529377 - 0.06412395990114009
013 159 0.9938507178716026 - 0.05733814070021383
014 691 1.0538977119708455 - 0.0604185246531164
015  32 1.1099854200400034 - 0.05321930907722623
016 739 1.1721110552869212 - 0.055969775931542176
017 690 1.2318056402220503 - 0.05092912029612799
018 1551 1.2869717738764086 - 0.04478477111406457
019 689 1.3470741688772596 - 0.04670063184044844
020 738 1.4050350850278297 - 0.04302726419205153
021 184 1.4603906522956411 - 0.03

In [121]:
filtered_train = X_train_discrete[X_train_discrete.columns[selected_features]]
filtered_test = X_test_discrete[X_test_discrete.columns[selected_features]]

NameError: name 'selected_features' is not defined