In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import operator
import itertools
import scipy.stats as stats
import pymit


In [2]:
df = pd.read_csv('puma8.csv')
df = df.dropna()
df['thetadd3_bin'] = 1
df.loc[df['thetadd3'] < 1, 'thetadd3_bin'] = 0
Y_all = df[['thetadd3_bin']]
X_all = df.drop(['thetadd3', 'thetadd3_bin'], axis=1)
X_train_ori, X_test_ori, y_train, y_test = train_test_split(X_all, Y_all, test_size = 0.3, random_state = 0)


In [3]:
created_features_dict = {}

In [4]:
def numpy_discretize(X_train, X_test, gran=10):
    """
    multi-granularity discretization
    method. The basic idea is simple: instead of using a fine-tuned
    granularity, we discretize each numerical feature into several, rather
    than only one, categorical features, each with a different granularity.
    
    min granularity = 10
    
    Sometimes de edge values did not permit to execute correct discretization
    if this happens the step is not executed
    """
    global created_features_dict
    # separa dados numericos que precisam de binarizacao
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    numeric_features = [feat for feat in numeric_features.columns if 'disc' not in feat[-4:]]
    discrete_features = []
#     print(f"Discretizing {len(numeric_features.columns)} features...")
    feat_count = 0
    for feat in numeric_features:
        this_gran = gran
        if feat_count % 50 == 0:
            print(f" Working in {feat}")
        X_train_np = X_train[[feat]].to_numpy()
        X_test_np = X_test[[feat]].to_numpy()
        success = False
        while not success:
            try:
                D_train = np.zeros([X_train.shape[0], 1])
                D_test = np.zeros([X_test.shape[0], 1])
                # calc numpy histogram and apply to features
                hist, bin_edges = np.histogram(X_train_np[:, 0], bins=this_gran)
                D_train[:, 0] = np.digitize(X_train_np[:,0], bin_edges, right=False)
                D_test[:, 0] = np.digitize(X_test_np[:,0], bin_edges, right=False)

                # apply back to pandas
                X_train[f"{feat}_disc"] = D_train
                X_test[f"{feat}_disc"] = D_test

                success = True
            except:
                print(f"Not possible to correct work on cut {feat} > {this_gran}")
                this_gran = this_gran - 1
                if this_gran <= 1:
                    success = True
            
            if success and this_gran > 1:
                #upoad global dict with feature info
                created_features_dict[f"{feat}_disc"] = {
                    "num_of_source_features": 1,
                    "source_feature_name": [feat],
                    "source_feature_type": ['numeric'],
                    "target_feature_type": ['discrete'],
                    "operator": "discretizer"
                }
        
        feat_count += 1
    return X_train, X_test



def min_max_scaler(X_train, X_test):
    global created_features_dict
    
    scaler = MinMaxScaler()
    scaler.fit(X_train[ORIGINAL_FEATURES])
    norm_feats = [f"{x}_norm" for x in ORIGINAL_FEATURES if 'disc' not in x[-4:]]
    
    for feat in [x for x in ORIGINAL_FEATURES if 'disc' not in x[-4:]]:
        #upoad global dict with feature info
        created_features_dict[f"{feat}_norm"] = {
            "num_of_source_features": 1,
            "source_feature_name": [feat],
            "source_feature_type": ['numeric'],
            "target_feature_type": ['numeric'],
            "operator": "normalizer"
        }
        
    X_train = X_train.reindex(columns=X_train.columns.tolist() + norm_feats)
    X_test = X_test.reindex(columns=X_test.columns.tolist() + norm_feats)
    X_train.loc[:, norm_feats] = scaler.transform(X_train[ORIGINAL_FEATURES])
    X_test.loc[:, norm_feats] = scaler.transform(X_test[ORIGINAL_FEATURES])
    return X_train, X_test


def binary_operators(df):
    global created_features_dict
    
    # calc all pair columns
    all_columns = list(df)
    pairwise_cols = list(itertools.combinations(all_columns, 2))
    for pair in pairwise_cols:
        
        df[f"{pair[0]}_x_{pair[1]}_op_sum"] = df[pair[0]] + df[pair[1]]
        
        #upoad global dict with feature info
        type_pair1 = 'disc' if 'disc' in pair[1] else 'numeric'
        type_pair2 = 'disc' if 'disc' in pair[1] else 'numeric'
        
        created_features_dict[f"{pair[0]}_x_{pair[1]}_op_sum"] = {
            "num_of_source_features": 2,
            "source_feature_name": [pair[0], pair[1]],
            "source_feature_type": [type_pair1, type_pair2],
            "target_feature_type": ['numeric'],
            "operator": "binary_sum"
        }        
                                      
        df[f"{pair[0]}_x_{pair[1]}_op_sub"] = df[pair[0]] - df[pair[1]]
        created_features_dict[f"{pair[0]}_x_{pair[1]}_op_sub"] = {
            "num_of_source_features": 2,
            "source_feature_name": [pair[0], pair[1]],
            "source_feature_type": [type_pair1, type_pair2],
            "target_feature_type": ['numeric'],
            "operator": "binary_sub"
        }             
                              
        df[f"{pair[0]}_x_{pair[1]}_op_mul"] = df[pair[0]] * df[pair[1]]
        created_features_dict[f"{pair[0]}_x_{pair[1]}_op_mul"] = {
            "num_of_source_features": 2,
            "source_feature_name": [pair[0], pair[1]],
            "source_feature_type": [type_pair1, type_pair2],
            "target_feature_type": ['numeric'],
            "operator": "binary_mul"
        }             
                 
        df[f"{pair[0]}_x_{pair[1]}_op_div"] = df[pair[0]] / df[pair[1]]
        created_features_dict[f"{pair[0]}_x_{pair[1]}_op_div"] = {
            "num_of_source_features": 2,
            "source_feature_name": [pair[0], pair[1]],
            "source_feature_type": [type_pair1, type_pair2],
            "target_feature_type": ['numeric'],
            "operator": "binary_div"
        }   
        df = df.replace([np.inf, -np.inf], np.nan)
    
    return df


def high_order_operators(df):
    
    def _update_dict(group_col, columns, op):
        global created_features_dict
        
        for feat in columns:
            created_features_dict[f'{feat}_group_by_{col}_and_mean'] = {
            "num_of_source_features": 2,
            "source_feature_name": [group_col, feat],
            "source_feature_type": ['discrete', 'numeric'],
            "target_feature_type": ['numeric'],
            "operator": f"group_{op}"
            }
            
    group_columns = [col for col in df.columns if "disc" in col]
    to_group_columns = [col for col in df.columns if "disc" not in col]
    all_dfs = pd.DataFrame()
    for col in group_columns:
        print(f"Grouping {col}")
        
        df_avg = df[to_group_columns+[col]].groupby(col).transform('mean').add_suffix(f'_group_by_{col}_and_mean')
        _update_dict(col, to_group_columns, 'mean')
        df_min = df[to_group_columns+[col]].groupby(col).transform('min').add_suffix(f'_group_by_{col}_and_min')
        _update_dict(col, to_group_columns, 'min')
        df_max = df[to_group_columns+[col]].groupby(col).transform('max').add_suffix(f'_group_by_{col}_and_max')
        _update_dict(col, to_group_columns, 'max')
        
        all_dfs = pd.concat([all_dfs, df_avg, df_min, df_max], axis=1,  sort=False)
    all_dfs = pd.concat([df,all_dfs], axis=1)
    return all_dfs
        

In [5]:
ORIGINAL_FEATURES = X_train_ori.columns

# discretize
X_train, X_test = numpy_discretize(X_train_ori.copy(), X_test_ori.copy(), gran=10)
# normalize
X_train, X_test = min_max_scaler(X_train.copy(), X_test.copy())

step1_train = X_train.copy()
step1_test = X_test.copy()
step1_features = X_train.columns

 Working in theta1


In [6]:
X_train = high_order_operators(X_train[step1_features].copy())
X_test = high_order_operators(X_test[step1_features].copy())

step2_train = X_train.copy()
step2_test = X_test.copy()
step2_features = X_train.columns

Grouping theta1_disc
Grouping theta2_disc
Grouping theta3_disc
Grouping thetad1_disc
Grouping thetad2_disc
Grouping thetad3_disc
Grouping tau1_disc
Grouping tau2_disc
Grouping theta1_disc
Grouping theta2_disc
Grouping theta3_disc
Grouping thetad1_disc
Grouping thetad2_disc
Grouping thetad3_disc
Grouping tau1_disc
Grouping tau2_disc


In [7]:
# binary operators
X_train = binary_operators(X_train[step1_features].copy())
X_test = binary_operators(X_test[step1_features].copy())

step3_train = X_train.copy()
step3_test = X_test.copy()
step3_features = X_train.columns

In [8]:
X_train = pd.concat([step1_train, step2_train, step3_train], axis=1).dropna(axis=1, thresh=0.03*X_train.shape[0])
X_test = pd.concat([step1_test, step2_test, step3_test], axis=1)[X_train.columns]

In [9]:
X_train = X_train.loc[:,~X_train.columns.duplicated()]
X_test = X_test.loc[:,~X_test.columns.duplicated()]

In [10]:
# discretize
X_train, X_test = numpy_discretize(X_train, X_test, gran=10)


 Working in theta1
 Working in theta3_group_by_theta1_disc_and_max
 Working in thetad2_group_by_theta2_disc_and_max
 Working in tau1_group_by_theta3_disc_and_max
 Working in theta1_norm_group_by_thetad1_disc_and_max
 Working in theta3_norm_group_by_thetad2_disc_and_max
 Working in thetad2_norm_group_by_thetad3_disc_and_max
 Working in tau1_norm_group_by_tau1_disc_and_max
 Working in theta1_x_theta2_op_sum
 Working in theta1_x_thetad3_disc_op_mul
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 10
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 9
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 8
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 7
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 6
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 5
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 4
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 3
Not p

In [11]:
def dataset_based_meta_features(X_train, X_test, y_train, y_test):
    """
    Applied in the original set!
    """
    
    
    
    def _general_information(X):
        dataset_info_df= pd.DataFrame()

        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        num_numeric_attr = X.select_dtypes(include=numerics).shape[1]
        num_duscrete_attr = X.shape[1] - num_numeric_attr
            
        
        dataset_info_df['num_instances'] = [X.shape[0]]
        dataset_info_df['num_features'] = X.shape[1]
        
        dataset_info_df['num_numeric_attr'] = num_numeric_attr
        dataset_info_df['num_discrete_attr'] = num_duscrete_attr
        dataset_info_df['ratio_numeric_attr'] = num_numeric_attr/ (num_numeric_attr+num_duscrete_attr)
        dataset_info_df['ratio_discrete_attr'] = num_duscrete_attr/ (num_numeric_attr+num_duscrete_attr)
        
        return dataset_info_df
    
    


    def _initial_evaluation(X_train, X_test, y_train, y_test):
    
        from sklearn.ensemble import RandomForestClassifier
        from sklearn import metrics
        
        def acc(y_true, y_pred):
            from sklearn.metrics import accuracy_score
            y_pred = list(map(lambda k: 0 if k<=0.5 else 1, y_pred))
            return accuracy_score(y_true, y_pred)

        def f1(y_true, y_pred, th):
            from sklearn.metrics import f1_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return f1_score(y_true, y_pred)

        def precision(y_true, y_pred, th):
            from sklearn.metrics import precision_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return precision_score(y_true, y_pred, average='macro') 

        def recall(y_true, y_pred, th):
            from sklearn.metrics import recall_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return recall_score(y_true, y_pred, average='macro')

        def auc(y_true, y_pred):
            from sklearn.metrics import roc_auc_score
            return roc_auc_score(y_true, y_pred)


        df_initial_evaluation = pd.DataFrame()
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:,1]

        for th in [0.4, 0.45, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]:
            df_initial_evaluation[f'f1_{th}'] = [f1(y_test, y_pred, th)]
            df_initial_evaluation[f'precision_{th}'] = precision(y_test, y_pred, th)
            df_initial_evaluation[f'recall_{th}'] = recall(y_test, y_pred, th)

        df_initial_evaluation['auc'] = auc(y_test, y_pred)
        
        df_initial_evaluation['avg_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].mean(axis=1)
        df_initial_evaluation['std_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].std(axis=1)
        df_initial_evaluation['max_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].max(axis=1)
        df_initial_evaluation['min_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].min(axis=1)
    
        df_initial_evaluation['avg_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].mean(axis=1)
        df_initial_evaluation['std_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].std(axis=1)
        df_initial_evaluation['max_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].max(axis=1)
        df_initial_evaluation['min_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].min(axis=1)
    
        df_initial_evaluation['avg_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].mean(axis=1)
        df_initial_evaluation['std_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].std(axis=1)
        df_initial_evaluation['max_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].max(axis=1)
        df_initial_evaluation['min_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].min(axis=1)
    
        return df_initial_evaluation
    
    
    def _entropy_based_measures(X_train, y_train):

        df_mutual_info = pd.DataFrame()

        for feat in X_train.columns:
            df_mutual_info[feat] = [pymit.I(X_train[feat].values, y_train['thetadd3_bin'].values , bins=[10,2])]
        
        df_mutual_info['avg_mi'] = df_mutual_info.mean(axis=1)
        df_mutual_info['std_mi'] = df_mutual_info.std(axis=1)
        df_mutual_info['min_mi'] = df_mutual_info.min(axis=1)
        df_mutual_info['max_mi'] = df_mutual_info.max(axis=1)
        return df_mutual_info[['avg_mi', 'std_mi', 'min_mi', 'max_mi']]
    
    
    def _feature_diversity(X_train):
        
        df_feature_diversity = pd.DataFrame()
        
        disc_columns = [col for col in X_train if 'disc' in col]
        numeric_columns = [col for col in X_train if 'disc' not in col]
        
        numeric_pairs = list(itertools.combinations(numeric_columns, 2))
        all_t = []
        for pair in numeric_pairs:
            t_pair = stats.ttest_rel(X_train[pair[0]].values, X_train[pair[1]].values)[0]
            all_t.append(t_pair)
        
        all_chi = []
        disc_pairs = list(itertools.combinations(disc_columns, 2))
        for pair in disc_pairs:
            contingency = pd.crosstab(X_train[pair[0]].values, X_train[pair[1]].values) 
            chi, _, _, _ = stats.chi2_contingency(contingency) 
            all_chi.append(chi)
        
        if len(all_t) == 0:
            df_feature_diversity['avg_t'] = [-99]
            df_feature_diversity['std_t'] = -99
            df_feature_diversity['max_t'] = -99
            df_feature_diversity['min_t'] = -99
        else:
            df_feature_diversity['avg_t'] = [np.mean(all_t)]
            df_feature_diversity['std_t'] = np.std(all_t)
            df_feature_diversity['max_t'] = np.max(all_t)
            df_feature_diversity['min_t'] = np.min(all_t)
            
        if len(all_chi) == 0:
            df_feature_diversity['avg_chi'] = -99
            df_feature_diversity['std_chi'] = -99
            df_feature_diversity['max_chi'] = -99
            df_feature_diversity['min_chi'] = -99
        else:        
            df_feature_diversity['avg_chi'] = np.mean(all_chi)
            df_feature_diversity['std_chi'] = np.std(all_chi)
            df_feature_diversity['max_chi'] = np.max(all_chi)
            df_feature_diversity['min_chi'] = np.min(all_chi)

        return df_feature_diversity
    
    dataset_info_df = _general_information(X_train.copy())
    dataset_initial_eval = _initial_evaluation(X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy())
    dataset_entropy_info = _entropy_based_measures(X_train.copy(), y_train.copy())
    dataset_feature_diversity = _feature_diversity(X_train.copy())

    
    df = pd.concat([dataset_info_df, dataset_initial_eval, dataset_entropy_info, dataset_feature_diversity], axis=1)
    return df
        
        
        

In [12]:
p = dataset_based_meta_features(X_train_ori.copy(), X_test_ori.copy(), y_train.copy(), y_test.copy())



## Operators based attributes

In [13]:
X_train.head(1)

Unnamed: 0,theta1,theta2,theta3,thetad1,thetad2,thetad3,tau1,tau2,theta1_disc,theta2_disc,...,thetad2_norm_x_tau2_norm_op_mul_disc,thetad3_norm_x_tau1_norm_op_sum_disc,thetad3_norm_x_tau1_norm_op_sub_disc,thetad3_norm_x_tau1_norm_op_mul_disc,thetad3_norm_x_tau2_norm_op_sum_disc,thetad3_norm_x_tau2_norm_op_sub_disc,thetad3_norm_x_tau2_norm_op_mul_disc,tau1_norm_x_tau2_norm_op_sum_disc,tau1_norm_x_tau2_norm_op_sub_disc,tau1_norm_x_tau2_norm_op_mul_disc
7252,-0.994493,1.336377,-0.634258,1.327497,0.902959,-1.622991,0.467831,0.428653,3.0,9.0,...,7.0,5.0,1.0,1.0,5.0,1.0,1.0,9.0,6.0,8.0


In [14]:
created_features_dict['theta1_disc']

{'num_of_source_features': 1,
 'source_feature_name': ['theta1'],
 'source_feature_type': ['numeric'],
 'target_feature_type': ['discrete'],
 'operator': 'discretizer'}

In [15]:
def candidate_mi_and_stattest(operator_feat, X_train, y_train, X_train_ori):
    # passo 1 da parte de features candidatas
    tests_df = pd.DataFrame()
    
    this_feat = created_features_dict[operator_feat]
    original_features = X_train_ori.columns

    all_t = []
    for original_feat in original_features:
        tmp_df_stat_tests = pd.DataFrame()
        tmp_df_stat_tests['feature_name'] = [operator_feat]
        
        # nao faz o teste na feature que deu origem a nova
        if original_feat not in this_feat['source_feature_name']:
            t_pair = stats.ttest_rel(X_train[operator_feat].values, X_train_ori[original_feat].values)[0]
            all_t.append(t_pair)
    
    try:
        mutual_info = pymit.I(X_train[operator_feat].values, y_train['thetadd3_bin'].values , bins=[10,2])
    except:
        print(operator_feat)
              
    
    if len(all_t) == 0:
        tmp_df_stat_tests[f'{original_feat}_avg_t'] = -99
        tmp_df_stat_tests[f'{original_feat}_std_t'] = -99
        tmp_df_stat_tests[f'{original_feat}_max_t'] = -99
        tmp_df_stat_tests[f'{original_feat}_min_t'] = -99
    else:
        tmp_df_stat_tests[f'{original_feat}_avg_t'] = np.mean(all_t)
        tmp_df_stat_tests[f'{original_feat}_std_t'] = np.std(all_t)
        tmp_df_stat_tests[f'{original_feat}_max_t'] = np.max(all_t)
        tmp_df_stat_tests[f'{original_feat}_min_t'] = np.min(all_t)
    
    tmp_df_stat_tests[f'feat_mutual_info'] = mutual_info
      
    return tmp_df_stat_tests  
        
        

In [16]:
def generic_meta_features(operator_feature, X_train):
    # passo 2 das features candidatas
    # https://github.com/giladkatz/ExploreKit/blob/master/src/main/java/explorekit/Evaluation/MLFeatureExtraction/OperatorAssignmentBasedAttributes.java
    op_dict = created_features_dict[operator_feature]
    df_generic_meta_feats = pd.DataFrame()
    df_generic_meta_feats['feature_name'] = [operator_feature]
    df_generic_meta_feats['num_sources'] = op_dict['num_of_source_features']
    df_generic_meta_feats['num_numeric_sources'] = len([x for x in op_dict['source_feature_type'] if 'numeric' in x])
    df_generic_meta_feats['num_discrete_sources'] = len([x for x in op_dict['source_feature_type'] if 'discrete' in x])
    df_generic_meta_feats['discretizer_in_use'] = True if op_dict['operator']=='discretizer' else False
    df_generic_meta_feats['normalizer_in_use'] = True if op_dict['operator']=='normalizer' else False
    df_generic_meta_feats['group_in_use'] = True if 'group' in op_dict['operator'] else False
    df_generic_meta_feats['binary_in_use'] = True if 'binary' in op_dict['operator'] else False
    
    # discrete sources
    indices_discrete = [i for i, x in enumerate(op_dict['source_feature_type']) if x == "discrete"]
    if len(indices_discrete) >= 1:
        discrete_columns = [op_dict['source_feature_name'][i] for i in indices_discrete]
        X_train_numpy = X_train[discrete_columns].to_numpy()
        df_generic_meta_feats['max_discrete_source_value'] = X_train_numpy.max()
        df_generic_meta_feats['min_discrete_source_value'] = X_train_numpy.min()
        df_generic_meta_feats['avg_discrete_source_value'] = X_train_numpy.mean()
        df_generic_meta_feats['std_discrete_source_value'] = X_train_numpy.std()
        
        all_chi = []
        for discrete_feat in discrete_columns:
            # transform target feature in discrete
            if 'disc' not in operator_feature:
                this_feat_discrete, _ = numpy_discretize(X_train[[operator_feature]].copy(),
                                                      X_train[[operator_feature]].copy(), gran=10)
                contingency = pd.crosstab(X_train[discrete_feat].values, this_feat_discrete[f"{operator_feature}_disc"].values) 
            else:
                this_feat_discrete = X_train[[operator_feature]]
                contingency = pd.crosstab(X_train[discrete_feat].values, this_feat_discrete[f"{operator_feature}"].values) 
            
            chi, _, _, _ = stats.chi2_contingency(contingency) 
            all_chi.append(chi)
        
        df_generic_meta_feats['max_chi_source_opattr_value'] = np.max(chi)
        df_generic_meta_feats['min_chi_source_opattr_value'] = np.min(chi)
        df_generic_meta_feats['avg_chi_source_opattr_value'] = np.mean(chi)
        df_generic_meta_feats['std_chi_source_opattr_value'] = np.std(chi)
            
        
    else:
        df_generic_meta_feats['max_discrete_source_value'] = 0
        df_generic_meta_feats['min_discrete_source_value'] = 0
        df_generic_meta_feats['avg_discrete_source_value'] = 0
        df_generic_meta_feats['std_discrete_source_value'] = 0
        df_generic_meta_feats['max_chi_source_opattr_value'] = 0
        df_generic_meta_feats['min_chi_source_opattr_value'] = 0
        df_generic_meta_feats['avg_chi_source_opattr_value'] = 0
        df_generic_meta_feats['std_chi_source_opattr_value'] = 0
        
    # numeric sources
    indices_numeric = [i for i, x in enumerate(op_dict['source_feature_type']) if x == "numeric"]
    if len(indices_numeric) >= 1:
        numeric_columns = [ op_dict['source_feature_name'][i] for i in indices_numeric]
        try:
            X_train_numpy = X_train[numeric_columns].to_numpy()
            df_generic_meta_feats['max_numeric_source_value'] = np.max(X_train_numpy)
            df_generic_meta_feats['min_numeric_source_value'] = np.min(X_train_numpy)
            df_generic_meta_feats['avg_numeric_source_value'] = np.mean(X_train_numpy)
            df_generic_meta_feats['std_numeric_source_value'] = np.std(X_train_numpy)
        except:
            print(X_train_numpy)
            print(numeric_columns)
            print(operator_feature)
        all_t = []
        for src_feat in numeric_columns:
            t_pair = stats.ttest_rel(X_train[operator_feature].values,
                                     X_train[src_feat].values)[0]
            all_t.append(t_pair)
            
        df_generic_meta_feats['max_ttest_source_opattr_value'] = np.max(all_t)
        df_generic_meta_feats['min_ttest_source_opattr_value'] = np.min(all_t)
        df_generic_meta_feats['avg_ttest_source_opattr_value'] = np.mean(all_t)
        df_generic_meta_feats['std_ttest_source_opattr_value'] = np.std(all_t)
           
    else:
        df_generic_meta_feats['max_numeric_source_value'] = 0
        df_generic_meta_feats['min_numeric_source_value'] = 0
        df_generic_meta_feats['avg_numeric_source_value'] = 0
        df_generic_meta_feats['std_numeric_source_value'] = 0
        df_generic_meta_feats['max_ttest_source_opattr_value'] = 0
        df_generic_meta_feats['min_ttest_source_opattr_value'] = 0
        df_generic_meta_feats['avg_ttest_source_opattr_value'] = 0
        df_generic_meta_feats['std_ttest_source_opattr_value'] = 0

    return df_generic_meta_feats

In [17]:
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

In [18]:
%%time
t_test_statistic_candidate_df = pd.DataFrame()
general_meta_feature_candidates = pd.DataFrame()

for k, v in created_features_dict.items():
    tmp_t_test_statistic_candidate_df = candidate_mi_and_stattest(k, X_train.copy(), y_train.copy(), X_train_ori.copy())
    tmp_general_meta_feature_candidates = generic_meta_features(k, X_train.copy())

    t_test_statistic_candidate_df = t_test_statistic_candidate_df.append(tmp_t_test_statistic_candidate_df)
    general_meta_feature_candidates = general_meta_feature_candidates.append(tmp_general_meta_feature_candidates)

        

CPU times: user 4min 2s, sys: 3min 32s, total: 7min 34s
Wall time: 7min 34s


In [19]:
final_df = general_meta_feature_candidates.merge(t_test_statistic_candidate_df, on='feature_name')

final_df.to_csv('ExploreKit/puma8_meta_features.csv', index=False)
final_df.index = final_df['feature_name']
final_df = final_df.drop(['feature_name'], axis=1)

In [21]:
final_df

Unnamed: 0_level_0,num_sources,num_numeric_sources,num_discrete_sources,discretizer_in_use,normalizer_in_use,group_in_use,binary_in_use,max_discrete_source_value,min_discrete_source_value,avg_discrete_source_value,...,std_numeric_source_value,max_ttest_source_opattr_value,min_ttest_source_opattr_value,avg_ttest_source_opattr_value,std_ttest_source_opattr_value,tau2_avg_t,tau2_std_t,tau2_max_t,tau2_min_t,feat_mutual_info
feature_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
theta1_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,1.080683,235.763217,235.763217,235.763217,0.0,140.054816,3.947758,146.566915,136.278777,0.002284
theta2_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,1.094214,231.582869,231.582869,231.582869,0.0,138.436936,3.716240,144.276192,135.015260,0.155866
theta3_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,1.082332,233.205621,233.205621,233.205621,0.0,138.713998,3.839769,145.012416,135.832228,0.093736
thetad1_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,1.092286,230.066283,230.066283,230.066283,0.0,136.302116,3.783544,142.368977,133.264380,0.001927
thetad2_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,1.092324,229.786020,229.786020,229.786020,0.0,135.888182,3.948007,142.160947,132.987019,0.000951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
thetad3_norm_x_tau2_norm_op_sub_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,0.411288,244.062151,244.062151,244.062151,0.0,189.836203,30.269086,268.128787,175.247843,0.000650
thetad3_norm_x_tau2_norm_op_mul_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,0.219990,107.083117,107.083117,107.083117,0.0,103.246152,13.171095,132.631338,94.024597,0.001575
tau1_norm_x_tau2_norm_op_sum_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,0.404946,200.319611,200.319611,200.319611,0.0,187.638601,19.813108,222.260032,174.961106,0.001314
tau1_norm_x_tau2_norm_op_sub_disc,1,1,0,True,False,False,False,0.0,0.0,0.0,...,0.411486,243.661500,243.661500,243.661500,0.0,181.157643,14.977883,220.740699,174.098345,0.000913


In [23]:
%%time
base_clf = classifiers.logistic_regression_classifier(train_set= [X_train_ori, y_train],
                         test_set= [X_test_ori, y_test],
                         features= X_train_ori.columns,
                         target= 'thetadd3_bin',
                         test_metrics= ['auc'],
                         project_name= 'explore_kit'
                         ) 
base_auc = base_clf['calc_metrics']['auc']

CPU times: user 181 ms, sys: 145 ms, total: 326 ms
Wall time: 22.5 ms


In [46]:
from fangorn.training import classifiers


0.8546998983716289

In [28]:
%%time
dict_feature_error_diff = {}
for idx, row in final_df.iterrows():
    print(idx)
    this_X_train = X_train_ori.copy()
    this_X_test = X_test_ori.copy()
    
    this_X_train[idx] = X_train[idx]
    this_X_test[idx] = X_test[idx]
    
    this_clf = classifiers.random_forest_classifier(train_set= [this_X_train, y_train],
                         test_set= [this_X_test, y_test],
                         features= this_X_train.columns,
                         target= 'thetadd3_bin',
                         test_metrics= ['auc'],
                         project_name= 'explore_kit'
                         ) 
    this_auc = this_clf['calc_metrics']['auc']
    error_diff = this_auc - base_auc
    dict_feature_error_diff[idx] = error_diff
    

theta1_disc
theta2_disc
theta3_disc
thetad1_disc
thetad2_disc
thetad3_disc
tau1_disc
tau2_disc
theta1_norm
theta2_norm
theta3_norm
thetad1_norm
thetad2_norm
thetad3_norm
tau1_norm
tau2_norm
theta1_group_by_theta1_disc_and_mean
theta2_group_by_theta1_disc_and_mean
theta3_group_by_theta1_disc_and_mean
thetad1_group_by_theta1_disc_and_mean
thetad2_group_by_theta1_disc_and_mean
thetad3_group_by_theta1_disc_and_mean
tau1_group_by_theta1_disc_and_mean
tau2_group_by_theta1_disc_and_mean
theta1_norm_group_by_theta1_disc_and_mean
theta2_norm_group_by_theta1_disc_and_mean
theta3_norm_group_by_theta1_disc_and_mean
thetad1_norm_group_by_theta1_disc_and_mean
thetad2_norm_group_by_theta1_disc_and_mean
thetad3_norm_group_by_theta1_disc_and_mean
tau1_norm_group_by_theta1_disc_and_mean
tau2_norm_group_by_theta1_disc_and_mean
theta1_group_by_theta2_disc_and_mean
theta2_group_by_theta2_disc_and_mean
theta3_group_by_theta2_disc_and_mean
thetad1_group_by_theta2_disc_and_mean
thetad2_group_by_theta2_disc_an

In [1]:
dict_feature_error_diff

NameError: name 'dict_feature_error_diff' is not defined

In [75]:
only_features = {k: v for k, v in dict_feature_error_diff.items() if v > 0.005}
keep = list(only_features.keys())
len(keep)

279

In [77]:
X_train[keep].to_csv('X_train_keep.csv', index=False)
X_test[keep].to_csv('X_test_keep.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)


In [74]:
tt = classifiers.random_forest_classifier(train_set= [X_train[keep], y_train],
                         test_set= [X_test[keep], y_test],
                         features= keep,
                         target= 'thetadd3_bin',
                         test_metrics= ['auc'],
                         project_name= 'explore_kit'
                         ) 
tt['calc_metrics']

{'auc': 0.9107232828281157}

In [30]:
dict(sorted(dict_feature_error_diff.items(), key=lambda item: item[1], reverse=False))

{'thetad3_group_by_theta3_disc_and_mean': -0.005154577745704048,
 'thetad3_norm_group_by_theta3_disc_and_mean': -0.005154577745704048,
 'thetad2_group_by_theta2_disc_and_max_disc': -0.004537524703639106,
 'thetad2_norm_group_by_theta2_disc_and_max_disc': -0.004537524703639106,
 'thetad3_group_by_theta3_disc_and_mean_disc': -0.0035814897328860784,
 'thetad3_norm_group_by_theta3_disc_and_mean_disc': -0.0035814897328860784,
 'thetad1_group_by_theta3_disc_and_mean_disc': -0.002603606317510887,
 'thetad1_norm_group_by_theta3_disc_and_mean_disc': -0.002603606317510887,
 'theta1_norm_x_theta2_norm_op_div': -0.0015072116419106996,
 'thetad1_group_by_theta3_disc_and_mean': -0.0014698044564206425,
 'thetad1_norm_group_by_theta3_disc_and_mean': -0.0014698044564206425,
 'theta1_disc_x_theta2_norm_op_div': -0.0009540487485144578,
 'thetad1_group_by_theta2_disc_and_mean': -0.0006743224499390355,
 'thetad1_norm_group_by_theta2_disc_and_mean': -0.0006743224499390355,
 'thetad2_x_theta2_disc_op_mul': -

In [66]:
indices_to_keep = ~this_X_train.isin([np.nan, np.inf, -np.inf]).any(1)

In [80]:
np.isfinite(this_X_test).all()

theta1                     True
theta2                     True
theta3                     True
thetad1                    True
thetad2                    True
thetad3                    True
tau1                       True
tau2                       True
theta1_x_theta2_op_sum    False
dtype: bool

In [79]:
theta1_x_theta2_op_sum

NameError: name 'theta1_x_theta2_op_sum' is not defined

In [67]:
indices_to_keep

7252    True
238     True
1524    True
4827    True
2165    True
        ... 
4931    True
3264    True
1653    True
2607    True
2732    True
Length: 5734, dtype: bool

In [89]:
X_test['theta3_disc']

2310     6.0
1916     4.0
3585     6.0
7404     8.0
5278    10.0
        ... 
6752     6.0
6234     9.0
3563     1.0
7644     8.0
5673     5.0
Name: theta3_disc, Length: 2458, dtype: float64

In [87]:
X_test[[col for col in X_test.columns if 'sum' in col]] 

Unnamed: 0,theta1_x_theta2_op_sum,theta1_x_theta3_op_sum,theta1_x_thetad1_op_sum,theta1_x_thetad2_op_sum,theta1_x_thetad3_op_sum,theta1_x_tau1_op_sum,theta1_x_tau2_op_sum,theta1_x_theta1_disc_op_sum,theta1_x_theta2_disc_op_sum,theta1_x_theta3_disc_op_sum,...,thetad1_norm_x_thetad2_norm_op_sum_disc,thetad1_norm_x_thetad3_norm_op_sum_disc,thetad1_norm_x_tau1_norm_op_sum_disc,thetad1_norm_x_tau2_norm_op_sum_disc,thetad2_norm_x_thetad3_norm_op_sum_disc,thetad2_norm_x_tau1_norm_op_sum_disc,thetad2_norm_x_tau2_norm_op_sum_disc,thetad3_norm_x_tau1_norm_op_sum_disc,thetad3_norm_x_tau2_norm_op_sum_disc,tau1_norm_x_tau2_norm_op_sum_disc
2310,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
1916,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
3585,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
7404,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
5278,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6752,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
6234,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
3563,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
7644,,,,,,,,,,,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0


In [54]:
base_auc

0.8546998983716289

In [53]:
this_auc

0.8571763864658817

In [52]:
error_diff

-0.0024764880942528267