In order to automate discretization and spare its dependence
on human experts, we propose a multi-granularity discretization
method. The basic idea is simple: instead of using a fine-tuned
granularity, we discretize each numerical feature into several, rather
than only one, categorical features, each with a different granularity.
Figure 5 gives an illustration of discretizing a numerical feature
with four levels of granularity. Since more levels of granularity are
considered, it is more likely to get a promising result

Vamos considerar p = 5 primeiramente

In [1]:
import sklearn
import sklearn.datasets
import pandas as pd
from sklearn import linear_model
import operator
import numpy as np
import itertools
from sklearn import metrics
from scipy.optimize import minimize 
import multiprocessing as mp
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

from fangorn.files_prep import get_data, data_to_pandas
from fangorn.preprocessing import splitting, feature_selection
from fangorn.training import classifiers

from category_encoders import OneHotEncoder

In [2]:
# read dataset
all_datasets = get_data.get_all_data(only='ml_challenge')
this_dataset = 'madeline'

All ML_CHALLENGE files ready!


In [3]:
# configure dataset
X_all, y_all = data_to_pandas.read_prepare_data(this_dataset)
new_X_all = X_all.copy()
for col in X_all.columns:
    new_X_all[col] = pd.cut(X_all[col], bins=10, labels=[0,1,2,3,4,5,6,7,8,9])

X_all = new_X_all.copy()
# X_all = feature_selection.extra_trees_feature_selection(X_all, y_all)
dataset_split_dict = splitting.simple_train_test_val_split(X_all, y_all)

X_train = dataset_split_dict['train']['X']
y_train = dataset_split_dict['train']['y']
X_test = dataset_split_dict['test']['X']
y_test = dataset_split_dict['test']['y']

A remaining problem is how to determine the
levels of granularity. For an experienced user, it can set a group of
potentially good values. If no values are specified, AutoCross will
use {10^p
}
P
p=1
as default values, where P is an integer determined
by a rule-based mechanism that considers the available memory,
data size and feature numbers.

In [4]:
def discretize_numeric_features(X_train, X_test, max_gran=10):
    """
    multi-granularity discretization
    method. The basic idea is simple: instead of using a fine-tuned
    granularity, we discretize each numerical feature into several, rather
    than only one, categorical features, each with a different granularity.
    
    min granularity = 3
    
    Sometimes de edge values did not permit to execute correct discretization
    if this happens the step is not executed
    """
    
    # separa dados numericos que precisam de binarizacao
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    discrete_features = []
    print(f"Discretizing {len(numeric_features.columns)} features...")
    for feat in numeric_features:
        print(f" Working in {feat}")
        for gran in range(2, max_gran+1):
            try:
                # calcula retibins no treino e aplica no teste
                X_train[f"{feat}_{gran}"], this_bins = pd.qcut(X_train[feat],
                                               gran,
                                               labels= [f"bin_{i}" for i in range(gran)],
                                               retbins = True
                                              )
                X_train[f"{feat}_{gran}"] = X_train[f"{feat}_{gran}"].cat.codes
                X_train[f"{feat}_{gran}"] = X_train[f"{feat}_{gran}"].astype(int)
                # aumenta range dos bins para garantir abrangencia
                this_bins = np.concatenate(([-np.inf], this_bins[1:-1], [np.inf]))
                
                # aplicando no teste
                X_test[f"{feat}_{gran}"] = pd.cut(X_test[feat], bins=this_bins, labels=[f"bin_{i}" for i in range(gran)])
                
                discrete_features.append(f"{feat}_{gran}")
            except:
                print(f"Not possible to correct work on cut {feat} > {gran}")
                break
        X_train = X_train.drop(feat, axis=1)
        X_test = X_test.drop(feat, axis=1)
        
    return X_train, X_test, discrete_features

def get_dummies(X_train, X_test):
    ohe = OneHotEncoder(cols=X_train.columns).fit(X=X_train)
    X_train = ohe.transform(X_train)
    X_test = ohe.transform(X_test)
    return X_train, X_test

def run_field_wise_minibatch_gradient_descent_lr(this_feature,
                                                 X_all,
                                                 y_all,
                                                 y_feature,
                                                 all_features=False,
                                                 return_clf = False):
    """
    Run field wise logistic regression
    """
        
    clf = linear_model.SGDClassifier(loss='log',
                                     n_jobs= -1,
                                     warm_start = True)
    if all_features:
        this_X = X_all
        
    else:
        this_X = pd.get_dummies(X_all[this_feature], columns=this_feature)
        
#     for i in range(0, 1000):
#         this_batch_samples = this_X.sample(frac=0.8)
#         this_y = y_all.loc[y_all.index.isin(this_batch_samples.index)]
#         clf.partial_fit(this_batch_samples, this_y[y_feature], classes=this_y[y_feature].unique())
    
    clf = LogisticRegression(random_state=0, max_iter=10000).fit(this_X, y_all[y_feature])

    if return_clf:
        all_preds = clf.predict_proba(this_X)
        y_all['preds'] = all_preds
        final_score = metrics.log_loss(y_all[y_feature], y_all['preds'])
        return clf, final_score, y_all
   
    return clf.score(this_X, y_all)


def run_initial_logit(X_train, y_train, X_test, y_test):
    clf = LogisticRegression().fit(X_train, y_train[y_train.columns[0]])
    all_preds = clf.predict_proba(X_test)[:, 1]
    logloss = metrics.log_loss(y_test[y_test.columns[0]], all_preds)
    fpr, tpr, thresholds = metrics.roc_curve(y_test[y_test.columns[0]], all_preds, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    return clf, logloss, auc
    

def measure_and_clean_discrete_features(X_train, y_train, X_test, y_test):
    """
    In order to avoid the dramatic increase in feature number caused
    by discretization, once these features are generated, we use fieldwise LR to evaluate them and keep
    only the best half. 
    """
    
    def _select_discrete_features(feature_score, abs=True):
        """
        Select only the best half of measured features
        """
        if abs:
            feature_score = {k: np.abs(v) for k,v in feature_score.items()}
            
        sorted_score = sorted(feature_score.items(), key=operator.itemgetter(1), reverse=True)
        half_features = int(np.floor(len(sorted_score)*0.2))
        return [k[0] for k in sorted_score[:half_features]]
    
    feature_score = {}
    # while what you describe is properly called minibatch learning.
    # That's implemented in sklearn.linear_model.SGDClassifier,
    # which fits a logistic regression model if you give it the option loss="log".
    this_classifier, logloss, auc = run_initial_logit(X_train, y_train, X_test, y_test)
    
    coef_dict = dict(list(zip(X_train.columns, this_classifier.coef_[0])))
    print(coef_dict, auc)
#     selected_features = _select_discrete_features(coef_dict)
    
    return X_train, X_test

In [65]:
X_train_discretized, X_test_discretized, discrete_features = discretize_numeric_features(X_train.copy(), X_test.copy())

Discretizing 259 features...
 Working in 0
 Working in 1
 Working in 2
 Working in 3
 Working in 4
 Working in 5
 Working in 6
 Working in 7
 Working in 8
 Working in 9
 Working in 10
 Working in 11
 Working in 12
 Working in 13
 Working in 14
 Working in 15
 Working in 16
 Working in 17
 Working in 18
 Working in 19
 Working in 20
 Working in 21
Not possible to correct work on cut 21 > 4
 Working in 22
 Working in 23
 Working in 24
 Working in 25
 Working in 26
 Working in 27
 Working in 28
 Working in 29
 Working in 30
 Working in 31
 Working in 32
 Working in 33
 Working in 34
 Working in 35
 Working in 36
 Working in 37
 Working in 38
 Working in 39
 Working in 40
 Working in 41
 Working in 42
 Working in 43
 Working in 44
 Working in 45
 Working in 46
 Working in 47
 Working in 48
 Working in 49
 Working in 50
 Working in 51
 Working in 52
 Working in 53
 Working in 54
 Working in 55
 Working in 56
 Working in 57
 Working in 58
 Working in 59
 Working in 60
 Working in 61
 Working

In [16]:
from pyitlib import discrete_random_variable as drv
def hjmi_selector(X, y, n_iter):
    max_features = n_iter
    y[y.columns[0]] = y[y.columns[0]].astype(int)
    selected_features = []
    collected_hjmi = []
    j_h = 0
    all_jmi = {}
    feat_mi_selected = {}
    for i in range(n_iter):
        print(i)
        count = 1
        for feat in X.columns:
            X[feat] = X[feat].astype(int)
            if feat in selected_features:
                if feat in all_jmi:
                    all_jmi.pop(feat)
                continue
                
            mi = drv.information_mutual(X[feat].values, y[y.columns[0]].values)
            jmi_2 = 0
            for selected_feature in selected_features:
                
                if f'{feat}_&_{selected_feature}' in feat_mi_selected:
                    jmi_2 = feat_mi_selected[f'{feat}_&_{selected_feature}']
                else:
                    tmp_mi = drv.information_mutual(X[feat].values, X[selected_feature].values)
                    tmp_cmi = drv.information_mutual_conditional(X[feat].values, X[selected_feature].values, y[y.columns[0]].values)
                    jmi_2 = jmi_2 + tmp_mi - tmp_cmi
                    feat_mi_selected[f'{feat}_&_{selected_feature}'] = jmi_2

            try:
                all_jmi[feat] = j_h + mi - jmi_2 / count - len(selected_feature)
            except:
                all_jmi[feat] = j_h + mi - jmi_2 / count - 1
            count += 1
            
        j_h_max_feat = max(all_jmi.items(), key=operator.itemgetter(1))[0]
        j_h_max_value = max(all_jmi.items(), key=operator.itemgetter(1))[1]
        j_h = j_h_max_value
        
        if not selected_features:
            hjmi = j_h_max_value
            selected_features.append(j_h_max_feat)
            collected_hjmi.append(hjmi)
        
        else:
            print(((j_h_max_value-hjmi)/hjmi))
            if (((j_h_max_value-hjmi)/hjmi) > 0.03) and len(selected_features) < max_features:
                hjmi = j_h_max_value
                selected_features.append(j_h_max_feat)
                collected_hjmi.append(hjmi)
            else:
                return selected_features, collected_hjmi  

t = hjmi_selector(X_train.copy(), y_train.copy(), n_iter=300)

0
1
1.0020876033345303
2
0.5044357151149288
3
0.3268466944321082
4
0.2379796925699773
5
0.18562080534721795
6
0.14953347665871117
7
0.12743935271008086
8
0.11326428565679861
9
0.09700921463608089
10
0.08703989783382106
11
0.0695966146160475
12
0.06815196148635208
13
0.061634644594839506
14
0.059187102855009464
15
0.05464931310521193
16
0.04657591944818497
17
0.04234338418536994
18
0.044149785200028165
19
0.03461888111723691
20
0.04014900846079075
21
0.03454116847279897
22
0.029788053122393434


In [17]:
features_to_keep = t[0]

In [18]:
X_train_discretized = X_train[features_to_keep]
X_test_discretized = X_test[features_to_keep]

In [19]:
X_train_dummy, X_test_dummy = get_dummies(X_train_discretized.copy(), X_test_discretized.copy())


In [20]:
final_X_train, final_X_test = measure_and_clean_discrete_features(X_train_dummy.copy(), y_train.copy(), X_test_dummy.copy(), y_test.copy())

{'191_1': -0.5215963026061161, '191_2': 0.055769789495850104, '191_3': -0.16952481873889605, '191_4': -0.4659260524121722, '191_5': -0.3002992474980817, '191_6': 0.3641007260452931, '191_7': 0.28862965900992105, '191_8': 0.5260745736918807, '191_9': 0.6715897321724065, '191_10': -0.43568371513139853, '237_1': 0.2737812978897422, '237_2': 0.2592135557927553, '237_3': -0.676332767687875, '237_4': -0.2595636537496368, '237_5': -0.03670278999619111, '237_6': 0.1975930698781436, '237_7': 0.26573528925816603, '237_8': 0.3951275193318306, '237_9': -0.35834707932310284, '237_10': -0.04737009736513244, '0_1': 0.46437269580441015, '0_2': 0.30519150238584825, '0_3': -0.22437602476417537, '0_4': -0.08989178770107276, '0_5': -0.2956756533950654, '0_6': -0.12388030068803356, '0_7': -0.1442572155272359, '0_8': -0.006048269245042318, '0_9': 0.11946019536035628, '0_10': 0.008239201798535574, '1_1': -0.09881835660518387, '1_2': -0.2856241658311096, '1_3': 0.061226488933176185, '1_4': 0.02806864104646024



In [21]:
start_classifier, logloss, auc = run_initial_logit(final_X_train.copy(), y_train.copy(), final_X_test.copy(), y_test.copy())
print(f"LOGLOSS: {logloss} - AUC: {auc}")

LOGLOSS: 0.7035338290326201 - AUC: 0.6006808943089431




In [24]:
def _calc_logloss(y_true, preds):
#     return metrics.log_loss(y_true[y_true.columns[0]], preds['preds'])
    fpr, tpr, thresholds = metrics.roc_curve(y_true[y_true.columns[0]],  preds['preds'], pos_label=1)
    auc = metrics.auc(fpr, tpr)
    return -1 * auc

def _obj(x, 
         X_train,
         y_train,
         combined_features,
         bsum):

    # add x to bsum
    this_w = np.array(x).reshape(-1,1)
    this_value = np.array(combined_features).reshape(1,-1)
    bsum_with_new_feature = bsum + np.matmul(this_w.T, this_value)
    this_preds = 1/(1 + np.exp(-bsum_with_new_feature)) 
    preds = pd.DataFrame(this_preds.reshape(-1,1), columns=['preds'])
    logloss = _calc_logloss(y_train, preds)
    return logloss

In [40]:
current_training_set = final_X_train.copy()
current_test_set = final_X_test.copy()
start_classifier, start_logloss, start_auc = run_initial_logit(current_training_set, y_train, current_test_set, y_test)

coef_dict = dict(list(zip(current_training_set.columns,start_classifier.coef_[0])))
intercept = start_classifier.intercept_[0]

In [43]:
feature_pair = ('month_8','poutcome_4')

In [44]:
col_coefs = np.array(list(coef_dict.values())).reshape(1,-1)
bsum = np.add(np.matmul(col_coefs, current_training_set.values.T), intercept)

combined_features = current_training_set[feature_pair[0]] | current_training_set[feature_pair[1]]

In [45]:
minimize(_obj, 0, args=(current_training_set,
                             y_train,
                             combined_features,
                             bsum), method='Nelder-Mead')

 final_simplex: (array([[0.000125 ],
       [0.0001875]]), array([-0.90176082, -0.90176079]))
           fun: -0.9017608175039736
       message: 'Optimization terminated successfully.'
          nfev: 8
           nit: 4
        status: 0
       success: True
             x: array([0.000125])

In [80]:
def score_one_pair(X_train, y_train, X_test, y_test, feature_pair, coef_dict, intercept):
    
    def _calc_logloss(y_true, preds):
        return metrics.log_loss(y_true[y_true.columns[0]], preds['preds'])

    def _obj(x, 
             X_train,
             y_train,
             combined_features,
             bsum):
        
        # add x to bsum
        this_w = np.array(x).reshape(-1,1)
        this_value = np.array(combined_features).reshape(1,-1)
        bsum_with_new_feature = bsum + np.matmul(this_w.T, this_value)
        this_preds = 1/(1 + np.exp(-bsum_with_new_feature)) 
        preds = pd.DataFrame(this_preds.reshape(-1,1), columns=['preds'])
        return _calc_logloss(y_train, preds)

    col_coefs = np.array(list(coef_dict.values())).reshape(1,-1)
    bsum = np.add(np.matmul(col_coefs, X_train.values.T), intercept)
    
    combined_features = X_train[feature_pair[0]] | X_train[feature_pair[1]]
        
    if combined_features.equals(X_train[feature_pair[0]]) or combined_features.equals(X_train[feature_pair[1]]):
        return  {"coef": 0, "logloss" : 99}

    result = minimize(_obj, 1, args=(X_train,
                                 y_train,
                                 combined_features,
                                 bsum))
    
    this_coef = result['x'][0]
    this_logloss = result['fun']

#     start_classifier, start_logloss = run_initial_logit(X_train, y_train, X_test, y_test)
#     coef_dict = dict(list(zip(X_train.columns,start_classifier.coef_[0])))
#     intercept = start_classifier.intercept_[0]
#     this_coef = 0
    dict_result_combination = {"coef":this_coef,
                               "logloss" : this_logloss}
    
    return dict_result_combination

def iter_one_level(X_train, y_train, X_test, y_test, coef_dict, intercept):
    
    all_columns = list(X_train)
    pairwise_cols = list(itertools.combinations(all_columns, 2))
    all_results = {}
    with tqdm(total=len(pairwise_cols)) as pbar:
        for feature_pair in pairwise_cols:
            feature_name = str(feature_pair)
            if feature_name not in X_train.columns:
                all_results[feature_name] = score_one_pair(X_train, y_train, X_test, y_test, feature_pair, coef_dict, intercept)
                pbar.update(1)
    return all_results

def predict_logit(coef_dict, intercept, X_test, y_test):
    col_coefs = np.array(list(coef_dict.values())).reshape(1,-1)
    bsum = np.add(np.matmul(col_coefs, X_test.values.T), intercept)
    this_preds = 1/(1 + np.exp(-bsum)) 
    preds = pd.DataFrame(this_preds.reshape(-1,1), columns=['preds'])
    logloss = metrics.log_loss(y_test[y_test.columns[0]], preds['preds'])
    fpr, tpr, thresholds = metrics.roc_curve(y_test[y_test.columns[0]],  preds['preds'], pos_label=1)
    auc = metrics.auc(fpr, tpr)
    return logloss, auc

def beam_search(X_train, y_train, X_test, y_test):
    
    def _choose_best_feature(dict_level_results):
        min_logloss = 9999
        bsf_feature = None
        bst_coef = None
        for key, val in dict_level_results.items():
            if val['logloss'] < min_logloss:
                min_logloss = val['logloss']
                bsf_feature = key
                bst_coef = val['coef']
        print(f"Level - choose {bsf_feature} -{bst_coef} -{min_logloss}")
        return bsf_feature, bst_coef, min_logloss
    
    current_training_set = X_train.copy()
    current_test_set = X_test.copy()
    start_classifier, start_logloss, start_auc = run_initial_logit(current_training_set, y_train, current_test_set, y_test)
    
    coef_dict = dict(list(zip(current_training_set.columns,start_classifier.coef_[0])))
    intercept = start_classifier.intercept_[0]
    
    print(f"Start logloss : {start_logloss} - Start AUC {start_auc}")
    last_logloss = start_logloss
    this_logloss = -np.inf
    accepted_features = []
    while this_logloss <= last_logloss:
        # eval one level
        dict_level_results = iter_one_level(current_training_set, y_train, current_test_set, y_test, coef_dict, intercept)
        bst_feature, this_coef, this_logloss = _choose_best_feature(dict_level_results)
        
        # update X_train an X_test
        current_training_set[str(bst_feature)] = current_training_set[str(eval(bst_feature)[0])] | current_training_set[str(eval(bst_feature)[1])]
        current_test_set[str(bst_feature)] = current_test_set[str(eval(bst_feature)[0])] | current_test_set[str(eval(bst_feature)[1])]
        
        # retrain logit with new feature
#         this_clf, this_logloss, this_auc = run_initial_logit(current_training_set, y_train, current_test_set, y_test)
#         coef_dict = dict(list(zip(current_training_set.columns,this_clf.coef_[0])))
#         intercept = start_classifier.intercept_[0]
        coef_dict[bst_feature] = this_coef
        
        this_logloss, this_auc = predict_logit(coef_dict, intercept, current_test_set, y_test)
        print(f" new  logloss: {this_logloss} - new auc {this_auc}")
        
        coef_dict[bst_feature] = this_logloss
    
        current_logloss_diff = last_logloss - this_logloss
        
        if this_logloss > last_logloss:
            current_training_set =  current_training_set.drop([str(bst_feature)], axis=1)
            current_test_set =  current_test_set.drop([str(bst_feature)], axis=1)
            coef_dict.pop(str(bst_feature), None)
            
        else:
            last_logloss = this_logloss
            
        print(f"logloss gain with {bst_feature}: {current_logloss_diff}")
    return current_training_set, current_test_set, coef_dict, intercept   

In [81]:
complete_X_train, complete_X_test, coef_dict, intercept = beam_search(final_X_train, y_train, final_X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  0%|          | 1/1326 [00:00<04:18,  5.12it/s]

Start logloss : 0.2327655985368599 - Start AUC 0.9061106658764809


100%|██████████| 1326/1326 [03:42<00:00,  5.96it/s]
  0%|          | 0/1378 [00:00<?, ?it/s]

Level - choose ('contact_3', 'month_3') -0.16735013114575673 -0.2278955972596195
 new  logloss: 0.23231604045654392 - new auc 0.9066090048552927
logloss gain with ('contact_3', 'month_3'): 0.00044955808031599265


100%|█████████▉| 1377/1378 [03:57<00:00,  5.80it/s]
  0%|          | 0/1431 [00:00<?, ?it/s]

Level - choose ('contact_3', 'duration_9_9') --0.1780470059229591 -0.22749771691697013
 new  logloss: 0.23184214232056638 - new auc 0.906786992382682
logloss gain with ('contact_3', 'duration_9_9'): 0.00047389813597753516


100%|█████████▉| 1429/1431 [03:59<00:00,  5.97it/s]
  0%|          | 0/1485 [00:00<?, ?it/s]

Level - choose ('month_6', "('contact_3', 'duration_9_9')") --0.38356909435351466 -0.22738999798148193
 new  logloss: 0.23160808367028515 - new auc 0.9071357905210332
logloss gain with ('month_6', "('contact_3', 'duration_9_9')"): 0.00023405865028122697


100%|█████████▉| 1482/1485 [04:08<00:00,  5.97it/s]
  0%|          | 0/1540 [00:00<?, ?it/s]

Level - choose ('duration_9_1', '(\'month_6\', "(\'contact_3\', \'duration_9_9\')")') --0.6138723049671846 -0.227392864340058
 new  logloss: 0.2315266823035643 - new auc 0.9071753288060764
logloss gain with ('duration_9_1', '(\'month_6\', "(\'contact_3\', \'duration_9_9\')")'): 8.140136672085685e-05


100%|█████████▉| 1536/1540 [04:13<00:00,  6.07it/s]


Level - choose ('duration_10_1', '(\'month_6\', "(\'contact_3\', \'duration_9_9\')")') --0.8449532393229401 -0.22740935562055742
 new  logloss: 0.23154991996186344 - new auc 0.907156364348124
logloss gain with ('duration_10_1', '(\'month_6\', "(\'contact_3\', \'duration_9_9\')")'): -2.3237658299146702e-05


In [15]:
col_coefs = np.array(list(coef_dict.values())).reshape(1,-1)
bsum = np.add(np.matmul(col_coefs, complete_X_test.values.T), intercept)


In [16]:
this_preds = 1/(1 + np.exp(-bsum)) 
preds = pd.DataFrame(this_preds.reshape(-1,1), columns=['preds'])

In [17]:
all_preds =  preds['preds']
logloss = metrics.log_loss(y_test[y_test.columns[0]], all_preds)
fpr, tpr, thresholds = metrics.roc_curve(y_test[y_test.columns[0]], all_preds, pos_label=1)
auc = metrics.auc(fpr, tpr)

In [162]:
t.columnsa

Index(['NOX_10_bin_8', 'NOX_5_bin_4', 'RM_8_bin_2', 'AGE_8_bin_0',
       'CRIM_8_bin_7', 'CRIM_8_bin_5', 'NOX_10_bin_5', 'AGE_10_bin_0',
       'NOX_5_bin_2', 'NOX_9_bin_5',
       ...
       'CRIM_10_bin_1', '('NOX_10_bin_8', 'NOX_5_bin_4')',
       '('NOX_10_bin_8', 'RM_8_bin_2')', '('NOX_10_bin_8', 'AGE_8_bin_0')',
       '('NOX_10_bin_8', 'CRIM_8_bin_7')', '('NOX_10_bin_8', 'CRIM_8_bin_5')',
       '('NOX_10_bin_8', 'NOX_10_bin_5')', '('NOX_10_bin_8', 'AGE_10_bin_0')',
       '('NOX_10_bin_8', 'NOX_5_bin_2')', '('NOX_10_bin_8', 'NOX_9_bin_5')'],
      dtype='object', length=168)

In [49]:
all_columns = t.columns
pairwise_cols = list(itertools.combinations(all_columns, 2))

In [163]:
run_field_wise_minibatch_gradient_descent_lr(t.columns,
                                            'bin_lstat',
                                            t.copy(),
                                            y_all.copy(),
                                            all_features=True,
                                            return_clf=True)

(LogisticRegression(random_state=0),
 1.228669507272974,
      bin_lstat  preds
 0            0      0
 1            0      0
 2            0      0
 3            0      0
 4            0      0
 ..         ...    ...
 501          0      0
 502          0      0
 503          0      0
 504          0      0
 505          0      1
 
 [506 rows x 2 columns])

In [83]:
for col in pairwise_cols:
    t['x'] = t[col[0]] | t[col[1]]
    _,_, preds = run_field_wise_minibatch_gradient_descent_lr(t.columns,
                                            'bin_lstat',
                                            t.copy(),
                                            y_all.copy(),
                                            all_features=True,
                                            return_clf=True)
    print(roc_auc_score(preds['bin_lstat'], preds['preds']))

0.5351057228771896
0.514191844462476
0.44325648735939416
0.47713712949262566
0.5020683181391501
0.3120932970582152
0.38352186848678665
0.2823015607846881
0.42472912987446904
0.5869091371931332
0.3718836014191844
0.4879480693045678
0.4160421936900387
0.35687238477081445
0.3774521502553578
0.5063799659523014
0.3926781537874088
0.5935675305872432
0.48568087442126867
0.6574228755986189
0.4016116971345839
0.3151003134297488
0.25449063688288553
0.5003818433487662
0.3852162983469365
0.5034922756272573
0.3169140693363881
0.3631250696068605
0.3733155139770576
0.5591300335703944
0.5270154169252065
0.47917362735271196
0.5523523141297949
0.46956390307543
0.4224698900609359
0.6431355702989515
0.6554500182966605
0.44380538717324547
0.6431992108570792
0.5859465737514519
0.38883585509044916
0.364692218350755
0.5630598380347795
0.5472053839912177
0.436820835918731
0.4804543935850317
0.633637216998393
0.4551572717292731
0.5200785960892876
0.4042050498782874
0.6557523109477671
0.3814217300685727
0.463557

KeyboardInterrupt: 

In [47]:
from sklearn.metrics import roc_auc_score


0.4837637026076718

In [23]:
result_dict = _

In [12]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())
all_columns = list(original_feature_set)
pairwise_cols = list(itertools.combinations(all_columns, 2))
all_results = {}


Number of processors:  4


In [19]:
pairwise_cols = pairwise_cols[:2]

In [17]:
def make_preds(original_feature_set, new_coef_dict):
    col_coefs = np.array(list(new_coef_dict.values())).reshape(-1,1)
    this_pred = np.add(np.round(original_feature_set.values.dot(col_coefs),3), intercept)
    return 1/(1 + np.exp(-this_pred))

0.218

In [19]:
new_coef_dict = coef_dict

In [20]:
original_feature_set['teste_feature'] = original_feature_set['DIS_9_bin_1'] | original_feature_set['DIS_9_bin_3']

In [24]:
new_coef_dict['teste_feature'] = 0.218

In [25]:
y_all['new_preds'] = make_preds(original_feature_set, new_coef_dict)

In [26]:
metrics.log_loss(y_all['bin_lstat'], y_all['new_preds'])

12.60009471884184

In [72]:
y_all

Unnamed: 0,bin_lstat,new_preds
0,0,0.0
1,0,0.0
2,0,0.0
3,0,1.0
4,0,0.0
...,...,...
501,0,1.0
502,0,1.0
503,0,1.0
504,0,1.0


In [50]:
result

      fun: -18.59017554414894
 hess_inv: array([[1]])
      jac: array([0.])
  message: 'Optimization terminated successfully.'
     nfev: 9
      nit: 1
     njev: 3
   status: 0
  success: True
        x: array([-5.05])

In [172]:
def fun(x, a,b,c):
    return a*x**2 + b*x + c

minimize(fun, 100, args=(1,0,0))

      fun: 5.552074997367714e-17
 hess_inv: array([[0.50000004]])
      jac: array([-1.28826571e-12])
  message: 'Optimization terminated successfully.'
     nfev: 21
      nit: 4
     njev: 7
   status: 0
  success: True
        x: array([-7.45122473e-09])

In [169]:
result

      fun: -0.5860022592398136
 hess_inv: array([[1]])
      jac: array([0.])
  message: 'Optimization terminated successfully.'
     nfev: 3
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([0.])

In [100]:
preds['teste'] = np.round(s)

In [103]:
fpr, tpr, thresholds = metrics.roc_curve(preds['bin_lstat'], preds['teste'], pos_label=1)
metrics.auc(fpr, tpr)


0.5860022592398136

In [154]:
residuals.sort_values(['residual']).sample(10)

Unnamed: 0,bin_lstat,preds,residual
169,1,1.0,0.0
447,1,0.484,0.516
490,1,1.0,0.0
473,1,1.0,0.0
438,1,1.0,0.0
149,1,1.0,0.0
165,0,1.0,-1.0
496,1,1.0,0.0
337,1,0.0,1.0
227,0,1.0,-1.0


In [149]:
residuals['residual'].value_counts()

 0.000    295
 1.000    133
-1.000     53
 0.996      2
-0.199      2
 0.990      1
 0.125      1
-0.108      1
-0.044      1
 0.218      1
-0.036      1
-0.120      1
-0.790      1
-0.976      1
-0.998      1
-0.999      1
 0.516      1
-0.001      1
-0.846      1
-0.958      1
-0.917      1
-0.996      1
-0.952      1
 0.979      1
 0.998      1
 0.975      1
Name: residual, dtype: int64

In [105]:
-1/ (1 * -1)

1.0

In [106]:
-1 + 1

0

In [None]:
start_classifier.