In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import operator
import itertools

In [63]:
df = pd.read_csv('puma8.csv')
df['thetadd3_bin'] = 1
df.loc[df['thetadd3'] < 1, 'thetadd3_bin'] = 0
Y_all = df[['thetadd3_bin']]
X_all = df.drop(['thetadd3', 'thetadd3_bin'], axis=1)
X_train_ori, X_test_ori, y_train, y_test = train_test_split(X_all, Y_all, test_size = 0.3, random_state = 0)


In [4]:
def numpy_discretize(X_train, X_test, gran=10):
    """
    multi-granularity discretization
    method. The basic idea is simple: instead of using a fine-tuned
    granularity, we discretize each numerical feature into several, rather
    than only one, categorical features, each with a different granularity.
    
    min granularity = 10
    
    Sometimes de edge values did not permit to execute correct discretization
    if this happens the step is not executed
    """
    
    # separa dados numericos que precisam de binarizacao
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    numeric_features = [feat for feat in numeric_features.columns if 'disc' not in feat[-4:]]
    discrete_features = []
#     print(f"Discretizing {len(numeric_features.columns)} features...")
    feat_count = 0
    for feat in numeric_features:
        this_gran = gran
        if feat_count % 50 == 0:
            print(f" Working in {feat}")
        X_train_np = X_train[[feat]].to_numpy()
        X_test_np = X_test[[feat]].to_numpy()
        success = False
        while not success:
            try:
                D_train = np.zeros([X_train.shape[0], 1])
                D_test = np.zeros([X_test.shape[0], 1])
                # calc numpy histogram and apply to features
                hist, bin_edges = np.histogram(X_train_np[:, 0], bins=this_gran)
                D_train[:, 0] = np.digitize(X_train_np[:,0], bin_edges, right=False)
                D_test[:, 0] = np.digitize(X_test_np[:,0], bin_edges, right=False)

                # apply back to pandas
                X_train[f"{feat}_disc"] = D_train
                X_test[f"{feat}_disc"] = D_test
                success = True
            except:
                print(f"Not possible to correct work on cut {feat} > {this_gran}")
                this_gran = this_gran - 1
                if this_gran <= 1:
                    success = True
        
        feat_count += 1
    return X_train, X_test



def min_max_scaler(X_train, X_test):
    scaler = MinMaxScaler()
    scaler.fit(X_train[ORIGINAL_FEATURES])
    norm_feats = [f"{x}_norm" for x in ORIGINAL_FEATURES if 'disc' not in x[-4:]]
    X_train = X_train.reindex(columns=X_train.columns.tolist() + norm_feats)
    X_test = X_test.reindex(columns=X_test.columns.tolist() + norm_feats)
    X_train.loc[:, norm_feats] = scaler.transform(X_train[ORIGINAL_FEATURES])
    X_test.loc[:, norm_feats] = scaler.transform(X_test[ORIGINAL_FEATURES])
    return X_train, X_test


def binary_operators(df):
    # calc all pair columns
    all_columns = list(df)
    pairwise_cols = list(itertools.combinations(all_columns, 2))
    for pair in pairwise_cols:
        df[f"{pair[0]}_x_{pair[1]}_op_sum"] = df[pair[0]] + X_train[pair[1]]
        df[f"{pair[0]}_x_{pair[1]}_op_sub"] = df[pair[0]] - X_train[pair[1]]
        df[f"{pair[0]}_x_{pair[1]}_op_mul"] = df[pair[0]] * X_train[pair[1]]
        df[f"{pair[0]}_x_{pair[1]}_op_div"] = df[pair[0]] / X_train[pair[1]]
        df = df.replace([np.inf, -np.inf], np.nan)
    
    return df


def high_order_operators(df):
    group_columns = [col for col in df.columns if "disc" in col]
    to_group_columns = [col for col in df.columns if "disc" not in col]
    all_dfs = pd.DataFrame()
    for col in group_columns:
        print(f"Grouping {col}")
        df_avg = df[to_group_columns+[col]].groupby(col).transform('mean').add_suffix(f'_group_by_{col}_and_mean')
        df_min = df[to_group_columns+[col]].groupby(col).transform('min').add_suffix(f'_group_by_{col}_and_min')
        df_max = df[to_group_columns+[col]].groupby(col).transform('max').add_suffix(f'_group_by_{col}_and_max')
        all_dfs = pd.concat([all_dfs, df_avg, df_min, df_max], axis=1,  sort=False)
    all_dfs = pd.concat([df,all_dfs], axis=1)
    return all_dfs
        

In [4]:
ORIGINAL_FEATURES = X_train.columns

# discretize
X_train, X_test = numpy_discretize(X_train_ori, X_test_ori, gran=10)
# normalize
X_train, X_test = min_max_scaler(X_train, X_test)

step1_train = X_train.copy()
step1_test = X_test.copy()
step1_features = X_train.columns

 Working in theta1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
X_train = high_order_operators(X_train[step1_features].copy())
X_test = high_order_operators(X_test[step1_features].copy())

step2_train = X_train.copy()
step2_test = X_test.copy()
step2_features = X_train.columns

Grouping theta1_disc
Grouping theta2_disc
Grouping theta3_disc
Grouping thetad1_disc
Grouping thetad2_disc
Grouping thetad3_disc
Grouping tau1_disc
Grouping tau2_disc
Grouping theta1_disc
Grouping theta2_disc
Grouping theta3_disc
Grouping thetad1_disc
Grouping thetad2_disc
Grouping thetad3_disc
Grouping tau1_disc
Grouping tau2_disc


In [6]:
# binary operators
X_train = binary_operators(X_train[step1_features].copy())
X_test = binary_operators(X_test[step1_features].copy())

step3_train = X_train.copy()
step3_test = X_test.copy()
step3_features = X_train.columns

In [7]:
X_train = pd.concat([step1_train, step2_train, step3_train], axis=1).dropna(axis=1, thresh=0.03*X_train.shape[0])
X_test = pd.concat([step1_test, step2_test, step3_test], axis=1)[X_train.columns]

In [8]:
# discretize
X_train, X_test = numpy_discretize(X_train, X_test, gran=10)


 Working in theta1
 Working in theta3_group_by_theta1_disc_and_min
 Working in thetad2_group_by_theta2_disc_and_min
 Working in tau1_group_by_theta3_disc_and_min
 Working in theta1_norm_group_by_thetad1_disc_and_min
 Working in theta3_norm_group_by_thetad2_disc_and_min
 Working in thetad2_norm_group_by_thetad3_disc_and_min
 Working in tau1_norm_group_by_tau1_disc_and_min
 Working in theta1_group_by_tau2_disc_and_max
 Working in theta1_x_thetad3_op_mul
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 10
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 9
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 8
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 7
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 6
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 5
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 4
Not possible to correct work on cut theta1_x_theta1_norm_op_div > 3

In [138]:
def dataset_based_meta_features(X_train, X_test, y_train, y_test):
    """
    Applied in the original set!
    """
    
    
    
    def _general_information(X):
        dataset_info_df= pd.DataFrame()

        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        num_numeric_attr = X.select_dtypes(include=numerics).shape[1]
        num_duscrete_attr = X.shape[1] - num_numeric_attr
            
        
        dataset_info_df['num_instances'] = [X.shape[0]]
        dataset_info_df['num_features'] = X.shape[1]
        
        dataset_info_df['num_numeric_attr'] = num_numeric_attr
        dataset_info_df['num_discrete_attr'] = num_duscrete_attr
        dataset_info_df['ratio_numeric_attr'] = num_numeric_attr/ (num_numeric_attr+num_duscrete_attr)
        dataset_info_df['ratio_discrete_attr'] = num_duscrete_attr/ (num_numeric_attr+num_duscrete_attr)
        
        return dataset_info_df
    
    


    def _initial_evaluation(X_train, X_test, y_train, y_test):
    
        from sklearn.ensemble import RandomForestClassifier
        from sklearn import metrics
        
        def acc(y_true, y_pred):
            from sklearn.metrics import accuracy_score
            y_pred = list(map(lambda k: 0 if k<=0.5 else 1, y_pred))
            return accuracy_score(y_true, y_pred)

        def f1(y_true, y_pred, th):
            from sklearn.metrics import f1_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return f1_score(y_true, y_pred)

        def precision(y_true, y_pred, th):
            from sklearn.metrics import precision_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return precision_score(y_true, y_pred, average='macro') 

        def recall(y_true, y_pred, th):
            from sklearn.metrics import recall_score
            y_pred = list(map(lambda k: 0 if k<=th else 1, y_pred))
            return recall_score(y_true, y_pred, average='macro')

        def auc(y_true, y_pred):
            from sklearn.metrics import roc_auc_score
            return roc_auc_score(y_true, y_pred)


        df_initial_evaluation = pd.DataFrame()
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:,1]

        for th in [0.4, 0.45, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]:
            df_initial_evaluation[f'f1_{th}'] = [f1(y_test, y_pred, th)]
            df_initial_evaluation[f'precision_{th}'] = precision(y_test, y_pred, th)
            df_initial_evaluation[f'recall_{th}'] = recall(y_test, y_pred, th)

        df_initial_evaluation['auc'] = auc(y_test, y_pred)
        
        df_initial_evaluation['avg_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].mean(axis=1)
        df_initial_evaluation['std_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].std(axis=1)
        df_initial_evaluation['max_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].max(axis=1)
        df_initial_evaluation['min_f1'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'f1' in col]].min(axis=1)
    
        df_initial_evaluation['avg_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].mean(axis=1)
        df_initial_evaluation['std_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].std(axis=1)
        df_initial_evaluation['max_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].max(axis=1)
        df_initial_evaluation['min_precision'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'precision' in col]].min(axis=1)
    
        df_initial_evaluation['avg_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].mean(axis=1)
        df_initial_evaluation['std_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].std(axis=1)
        df_initial_evaluation['max_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].max(axis=1)
        df_initial_evaluation['min_recall'] = df_initial_evaluation[[col for col in df_initial_evaluation.columns if 'recall' in col]].min(axis=1)
    
        return df_initial_evaluation
    
    
    def _entropy_based_measures(X_train, y_train):
        import pymit

        df_mutual_info = pd.DataFrame()

        for feat in X_train.columns:
            df_mutual_info[feat] = [pymit.I(X_train[feat].values, y_train['thetadd3_bin'].values , bins=[10,2])]
        
        df_mutual_info['avg_mi'] = df_mutual_info.mean(axis=1)
        df_mutual_info['std_mi'] = df_mutual_info.std(axis=1)
        df_mutual_info['min_mi'] = df_mutual_info.min(axis=1)
        df_mutual_info['max_mi'] = df_mutual_info.max(axis=1)
        return df_mutual_info[['avg_mi', 'std_mi', 'min_mi', 'max_mi']]
        
        
        
        
    
    dataset_info_df = _general_information(X_train.copy())
    dataset_initial_eval = _initial_evaluation(X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy())
    dataset_entropy_info = _entropy_based_measures(X_train.copy(), y_train.copy())

    
    df = pd.concat([dataset_info_df, dataset_initial_eval, dataset_entropy_info], axis=1)
    return df
        
        
        

In [139]:
p = dataset_based_meta_features(X_train_ori.copy(), X_test_ori.copy(), y_train.copy(), y_test.copy())



In [140]:
p

Unnamed: 0,num_instances,num_features,num_numeric_attr,num_discrete_attr,ratio_numeric_attr,ratio_discrete_attr,f1_0.4,precision_0.4,recall_0.4,f1_0.45,...,max_precision,min_precision,avg_recall,std_recall,max_recall,min_recall,avg_mi,std_mi,min_mi,max_mi
0,5734,8,8,0,1.0,0.0,0.825077,0.820065,0.81638,0.825879,...,0.834082,0.020401,0.777027,0.065971,0.831138,0.065971,0.032418,0.055554,0.000951,0.155866


In [112]:
x = X_train.head(10)['theta1'].values

In [126]:
p = X_train.tail(10)['theta1'].values

In [122]:
y = y_train['thetadd3_bin'].head(10).values

In [137]:
_entropy_based_measures(X_train_ori, y_train)

Unnamed: 0,theta1,theta2,theta3,thetad1,thetad2,thetad3,tau1,tau2
0,0.002284,0.155866,0.093736,0.001927,0.000951,0.001575,0.001431,0.001576


In [131]:
df_mutual_info

NameError: name 'df_mutual_info' is not defined

In [128]:
??pymit.I

[0;31mSignature:[0m [0mpymit[0m[0;34m.[0m[0mI[0m[0;34m([0m[0mX[0m[0;34m,[0m [0mY[0m[0;34m,[0m [0mbins[0m[0;34m)[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mI[0m[0;34m([0m[0mX[0m[0;34m,[0m[0mY[0m[0;34m,[0m[0mbins[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Calculates the mutual information of X and Y.[0m
[0;34m    If X and Y are already discretised, set \ref bins to the amount of bins, aka states of X and Y.[0m
[0;34m    If X and Y are not discretised, \ref bins will be used, to diskretise X and Y into \ref bins states[0m
[0;34m    [0m
[0;34m    @param X numpy vector[0m
[0;34m    @param Y numpy vector[0m
[0;34m    @param bins If X and Y are already diskretised specify the amount of bins of X and Y. If X and Y are not discretised specifies the amount of bins to diskretise X and Y into[0m
[0;34m                bins can be spcified as tuple, e.g. bins = (bins_x, bins_y), to diskretise X and Y

In [127]:
pymit.H_cond(x, p, bins=10)

0.18061799739838869