In order to automate discretization and remove its dependence
on human experts, we propose a multi-granularity discretization
method. The basic idea is simple: instead of using a single fine-tuned
granularity, we discretize each numerical feature into several, rather
than only one, categorical features, each with a different granularity.
Figure 5 gives an illustration of discretizing a numerical feature
with four levels of granularity. Since more levels of granularity are
considered, it is more likely to get a promising result.

Vamos considerar p = 5 primeiramente

In [1]:
import sklearn
import sklearn.datasets
import pandas as pd
from sklearn import linear_model
import operator
import numpy as np
import itertools
from sklearn import metrics
from scipy.optimize import minimize 
import multiprocessing as mp
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

from fangorn.files_prep import get_data, data_to_pandas
from fangorn.preprocessing import splitting, feature_selection
from fangorn.training import classifiers

from category_encoders import OneHotEncoder
import ray
ray.init()

2020-07-15 20:52:55,780	INFO resource_spec.py:212 -- Starting Ray with 53.66 GiB memory available for workers and up to 26.84 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-15 20:52:56,268	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '10.96.22.5',
 'raylet_ip_address': '10.96.22.5',
 'redis_address': '10.96.22.5:6379',
 'object_store_address': '/tmp/ray/session_2020-07-15_20-52-55_778754_1582/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-07-15_20-52-55_778754_1582/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-07-15_20-52-55_778754_1582'}

In [2]:
# Read dataset: fetch the 'ml_challenge' collection through the project's
# fangorn helpers and choose the dataset used by the following cells.
# NOTE(review): `all_datasets` is not referenced again in the visible cells;
# presumably the call is kept for its download/prepare side effect - confirm.
all_datasets = get_data.get_all_data(only='ml_challenge')
this_dataset = 'madeline'

All ML_CHALLENGE files ready!


In [3]:
# Configure dataset: load features/target for the chosen dataset, run the
# project's feature-selection helper, then split the result.
# NOTE(review): the helper name suggests an ExtraTrees-based selection and a
# train/test/val split; only the 'train' and 'test' parts are used here.
X_all, y_all = data_to_pandas.read_prepare_data(this_dataset)
X_all = feature_selection.extra_trees_feature_selection(X_all, y_all)
dataset_split_dict = splitting.simple_train_test_val_split(X_all, y_all)

# Unpack the splits used by the rest of the notebook.
X_train = dataset_split_dict['train']['X']
y_train = dataset_split_dict['train']['y']
X_test = dataset_split_dict['test']['X']
y_test = dataset_split_dict['test']['y']
print(X_train.shape)

(2009, 28)


In [2]:
# Alternative data source: the semicolon-separated 'bank.csv' file.
# NOTE(review): this cell defines X / y independently of the 'madeline'
# cells; which dataset is active depends on the order the cells were run.
from sklearn.model_selection import train_test_split

bank = pd.read_csv('bank.csv', sep=";")



# Encode the target as 1 ('yes') / 0 ('no'); y is kept as a one-column
# DataFrame because the helpers below read the target as y[y.columns[0]].
bank['y'] = bank['y'].map({'yes': 1, 'no':0})
y = bank[['y']]
X = bank.drop(['y'],axis=1)

In [3]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

A remaining problem is how to determine the
levels of granularity. An experienced user can set a group of
potentially good values. If no values are specified, AutoCross will
use $\{10^p\}_{p=1}^{P}$ as default values, where $P$ is an integer determined
by a rule-based mechanism that considers the available memory,
data size and number of features.

In [4]:
def discretize_numeric_features(X_train, X_test, max_gran=5):
    """
    Multi-granularity discretization of the numeric features.

    The basic idea is simple: instead of using one fine-tuned granularity,
    each numerical feature is replaced by several categorical features named
    ``"{feat}_{gran}"``, one per granularity ``gran`` in ``2..max_gran``.

    Bin edges are the quantiles of the training column (``pd.qcut``). The same
    edges are applied to the test column, with the two outer edges widened to
    -inf / +inf so that test values outside the training range still fall
    into a bin.

    Sometimes the edge values do not allow a valid discretization (for
    example duplicated quantile edges). When that happens the current and all
    finer granularities are skipped for that feature. The original numeric
    column is dropped in every case.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with the same columns. They are not modified; the function
        works on copies.
    max_gran : int, default 5
        Largest number of quantile bins to generate (the smallest is 2).

    Returns
    -------
    tuple
        ``(X_train, X_test, discrete_features)`` where the frames carry the
        new categorical columns instead of the numeric ones and
        ``discrete_features`` lists the names of the columns that were added.
    """
    # Work on copies so the caller's frames are never touched.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Select the numeric columns that need binning.
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=is_numeric)
    discrete_features = []
    print(f"Discretizing {len(numeric_features.columns)} features...")
    for feat in numeric_features:
        print(f" Working in {feat}")
        for gran in range(2, max_gran + 1):
            new_name = f"{feat}_{gran}"
            labels = [f"bin_{i}" for i in range(gran)]
            try:
                # Compute the bin edges on train ...
                train_binned, this_bins = pd.qcut(X_train[feat],
                                                  gran,
                                                  labels=labels,
                                                  retbins=True)
                # ... widen the outer edges so every test value is covered ...
                this_bins = np.concatenate(([-np.inf], this_bins[1:-1], [np.inf]))
                # ... and apply the same edges to test.
                test_binned = pd.cut(X_test[feat], bins=this_bins, labels=labels)
            except (ValueError, TypeError):
                print(f"Not possible to correct work on cut {feat} > {gran}")
                break

            # Assign only after both sides succeeded, so train and test
            # always end up with the same set of columns.
            X_train[new_name] = train_binned
            X_test[new_name] = test_binned
            discrete_features.append(new_name)

        X_train = X_train.drop(feat, axis=1)
        X_test = X_test.drop(feat, axis=1)

    return X_train, X_test, discrete_features

def get_dummies(X_train, X_test):
    """
    One-hot encode every column of both frames.

    The encoder is fitted on the training frame only and then applied to the
    training and the test frame.
    """
    encoder = OneHotEncoder(cols=X_train.columns)
    encoder = encoder.fit(X=X_train)
    encoded_train = encoder.transform(X_train)
    encoded_test = encoder.transform(X_test)
    return encoded_train, encoded_test

def run_field_wise_minibatch_gradient_descent_lr(this_feature,
                                                 X_all,
                                                 y_all,
                                                 y_feature,
                                                 all_features=False,
                                                 return_clf = False):
    """
    Run field-wise logistic regression.

    Despite the name, the model is currently a full-batch
    ``LogisticRegression``; no minibatch SGD training is performed.

    Parameters
    ----------
    this_feature : label or list of labels
        Field to evaluate; it is one-hot encoded before fitting. Ignored when
        ``all_features`` is True.
    X_all : pandas.DataFrame
        Feature frame.
    y_all : pandas.DataFrame
        Frame holding the target column. It is not modified.
    y_feature : label
        Name of the target column inside ``y_all``.
    all_features : bool, default False
        Fit on ``X_all`` as-is instead of on the single encoded field.
    return_clf : bool, default False
        Selects the return value (see below).

    Returns
    -------
    float
        Mean accuracy on the fitting data when ``return_clf`` is False.
    tuple
        ``(clf, log_loss, scored)`` when ``return_clf`` is True, where
        ``scored`` is a copy of ``y_all`` with an extra ``'preds'`` column
        holding the predicted probability of the second class in
        ``clf.classes_`` (assumes a binary target).
    """
    if all_features:
        this_X = X_all
    else:
        this_X = pd.get_dummies(X_all[this_feature], columns=this_feature)

    target = y_all[y_feature]
    clf = LogisticRegression(random_state=0, max_iter=10000).fit(this_X, target)

    if return_clf:
        # predict_proba returns one column per class; keep the positive-class
        # column so it fits into a single 'preds' column.
        scored = y_all.copy()
        scored['preds'] = clf.predict_proba(this_X)[:, 1]
        final_score = metrics.log_loss(scored[y_feature], scored['preds'])
        return clf, final_score, scored

    # Score against the target column only, matching what the model was fitted on.
    return clf.score(this_X, target)


def run_initial_logit(X_train, y_train, X_test, y_test):
    """
    Fit a default LogisticRegression on the training data and evaluate it on
    the test data.

    The first column of ``y_train`` / ``y_test`` is used as the target.
    Returns ``(fitted classifier, test log loss, test AUC)``.
    """
    train_target = y_train[y_train.columns[0]]
    test_target = y_test[y_test.columns[0]]

    model = LogisticRegression()
    model = model.fit(X_train, train_target)

    # Probability of the second class in model.classes_.
    positive_proba = model.predict_proba(X_test)[:, 1]

    test_logloss = metrics.log_loss(test_target, positive_proba)
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_target,
                                                         positive_proba,
                                                         pos_label=1)
    test_auc = metrics.auc(false_pos_rate, true_pos_rate)
    return model, test_logloss, test_auc
    

def measure_and_clean_discrete_features(X_train, y_train, X_test, y_test):
    """
    Keep only the best half of the discretized features.

    Discretization increases the number of features dramatically, so once
    they are generated a logistic regression is fitted on all of them and the
    columns are ranked by the absolute value of their coefficient. The top
    half (rounded down) is kept in both frames.

    Returns ``(X_train[selected], X_test[selected])``.
    """

    def _top_half(scores, use_magnitude=True):
        """Return the names of the best-scoring half, best first."""
        if use_magnitude:
            scores = {name: np.abs(value) for name, value in scores.items()}
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return [name for name, _ in ranked[:len(ranked) // 2]]

    fitted_model, _, _ = run_initial_logit(X_train, y_train, X_test, y_test)

    # Map every column to the coefficient the model learned for it.
    weight_by_column = dict(zip(X_train.columns, fitted_model.coef_[0]))
    kept_columns = _top_half(weight_by_column)

    return X_train[kept_columns], X_test[kept_columns]

In [5]:
X_train_disretized, X_test_discretized, discrete_features = discretize_numeric_features(X_train.copy(), X_test.copy())

Discretizing 28 features...
 Working in 0
 Working in 1
 Working in 2
 Working in 3
 Working in 4
 Working in 5
 Working in 6
 Working in 7
 Working in 8
 Working in 9
 Working in 10
 Working in 11
 Working in 12
 Working in 13
 Working in 14
 Working in 15
 Working in 16
 Working in 17
 Working in 18
 Working in 19
 Working in 20
 Working in 21
 Working in 22
 Working in 23
 Working in 24
 Working in 25
 Working in 26
 Working in 27


In [6]:
X_train_disretized

Unnamed: 0,0_2,0_3,0_4,0_5,1_2,1_3,1_4,1_5,2_2,2_3,...,25_4,25_5,26_2,26_3,26_4,26_5,27_2,27_3,27_4,27_5
2953,bin_0,bin_0,bin_0,bin_0,bin_0,bin_0,bin_0,bin_0,bin_1,bin_2,...,bin_1,bin_2,bin_1,bin_2,bin_3,bin_4,bin_0,bin_0,bin_0,bin_0
1560,bin_1,bin_2,bin_2,bin_3,bin_0,bin_0,bin_0,bin_0,bin_1,bin_1,...,bin_3,bin_4,bin_1,bin_1,bin_2,bin_2,bin_0,bin_0,bin_0,bin_0
2098,bin_0,bin_0,bin_0,bin_0,bin_0,bin_0,bin_1,bin_1,bin_1,bin_2,...,bin_2,bin_2,bin_1,bin_2,bin_3,bin_4,bin_0,bin_0,bin_1,bin_1
71,bin_0,bin_0,bin_0,bin_0,bin_0,bin_1,bin_1,bin_2,bin_0,bin_1,...,bin_2,bin_2,bin_0,bin_1,bin_1,bin_2,bin_1,bin_1,bin_2,bin_3
1636,bin_1,bin_2,bin_2,bin_3,bin_0,bin_0,bin_0,bin_0,bin_0,bin_0,...,bin_2,bin_2,bin_0,bin_0,bin_0,bin_0,bin_0,bin_1,bin_1,bin_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,bin_1,bin_2,bin_3,bin_4,bin_1,bin_2,bin_3,bin_4,bin_0,bin_0,...,bin_1,bin_2,bin_0,bin_0,bin_1,bin_1,bin_1,bin_2,bin_3,bin_4
1854,bin_1,bin_2,bin_3,bin_4,bin_0,bin_0,bin_0,bin_1,bin_1,bin_1,...,bin_3,bin_4,bin_1,bin_1,bin_2,bin_3,bin_1,bin_1,bin_2,bin_3
217,bin_1,bin_1,bin_2,bin_3,bin_0,bin_0,bin_0,bin_0,bin_1,bin_2,...,bin_2,bin_3,bin_1,bin_2,bin_3,bin_3,bin_0,bin_0,bin_0,bin_0
2164,bin_1,bin_2,bin_3,bin_3,bin_1,bin_2,bin_3,bin_4,bin_0,bin_0,...,bin_0,bin_1,bin_0,bin_0,bin_0,bin_0,bin_1,bin_2,bin_3,bin_3


In [17]:
X_train_dummy

Unnamed: 0,0_2_1,0_2_2,0_3_1,0_3_2,0_3_3,0_4_1,0_4_2,0_4_3,0_4_4,0_5_1,...,27_3_3,27_4_1,27_4_2,27_4_3,27_4_4,27_5_1,27_5_2,27_5_3,27_5_4,27_5_5
2953,1,0,1,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
1560,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2098,1,0,1,0,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
71,1,0,1,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1636,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,0,1,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
1854,0,1,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
217,0,1,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2164,0,1,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,0,0,1,0


In [7]:
%%time
X_train_dummy, X_test_dummy = get_dummies(X_train_disretized.copy(), X_test_discretized.copy())


CPU times: user 3.14 s, sys: 320 ms, total: 3.46 s
Wall time: 3.17 s


In [8]:
final_X_train, final_X_test = measure_and_clean_discrete_features(X_train_dummy.copy(), y_train.copy(), X_test_dummy.copy(), y_test.copy())



In [9]:
start_classifier, logloss, auc = run_initial_logit(final_X_train.copy(), y_train.copy(), final_X_test.copy(), y_test.copy())
print(f"LOGLOSS: {logloss} - AUC: {auc}")

LOGLOSS: 0.6649533504730653 - AUC: 0.6765447154471544




In [10]:
def to_iterator(obj_ids):
    """
    Yield the results of Ray object references one at a time, in the order
    ``ray.wait`` reports them as done.

    Source: https://github.com/ray-project/ray/issues/5554
    """
    pending = obj_ids
    while pending:
        finished, pending = ray.wait(pending)
        yield ray.get(finished[0])

    
def score_one_pair_parallel(combined_features, bsum, y_train):
    """
    Fit one coefficient for a candidate crossed feature while the current
    model is held fixed.

    ``bsum`` is the linear predictor of the current model for every training
    row. The only free parameter is the weight of the new feature, found by
    minimising the log loss of ``sigmoid(bsum + w * feature)`` with
    ``scipy.optimize.minimize`` starting from ``w = 1``.

    ``combined_features`` is a dict with the keys ``'feature_name'`` and
    ``'feature_value'``. The target is the first column of ``y_train``.

    Returns a dict with the keys ``"coef"``, ``"logloss"`` and ``"feature"``.
    """
    target = y_train[y_train.columns[0]]
    feature_row = np.array(combined_features['feature_value']).reshape(1, -1)

    def _logloss_for_weight(weight):
        # Add the contribution of the new feature to the fixed linear predictor.
        weight_column = np.array(weight).reshape(-1, 1)
        logits = bsum + np.matmul(weight_column.T, feature_row)
        probabilities = 1 / (1 + np.exp(-logits))
        return metrics.log_loss(target, probabilities.reshape(-1))

    fit = minimize(_logloss_for_weight, 1)

    return {"coef": fit['x'][0],
            "logloss": fit['fun'],
            "feature": combined_features['feature_name']}


def iter_one_level(X_train, y_train, X_test, y_test, coef_dict, intercept):
    """
    Score every not-yet-present pairwise crossed feature of ``X_train``.

    A crossed feature is the element-wise OR of two columns. Pairs whose
    name already exists as a column, or whose OR is identical to one of the
    two parents, are skipped. Each remaining candidate is scored in a Ray
    task by ``score_one_pair_parallel`` against the linear predictor of the
    current model (``coef_dict`` / ``intercept``).

    ``coef_dict`` must list its coefficients in the same order as the
    columns of ``X_train``. ``X_test`` and ``y_test`` are accepted for
    interface compatibility and are not used.

    Returns a flat list with one result dict per scored candidate (empty
    when there is nothing to score).
    """

    def _combine_all_features(current_training_set, pairwise_cols):
        # Build the candidate list, dropping crosses that add no information.
        all_features_combined = []
        for feature_pair in pairwise_cols:
            feature_name = str(feature_pair)
            if feature_name in current_training_set.columns:
                continue
            left = current_training_set[feature_pair[0]]
            right = current_training_set[feature_pair[1]]
            combined_features = left | right
            if combined_features.equals(left) or combined_features.equals(right):
                continue
            all_features_combined.append({'feature_name': feature_name,
                                          'feature_value': combined_features})
        return all_features_combined

    def _chunker(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    all_columns = list(X_train)
    pairwise_cols = list(itertools.combinations(all_columns, 2))
    col_coefs = np.array(list(coef_dict.values())).reshape(1,-1)
    bsum = np.add(np.matmul(col_coefs, X_train.values.T), intercept)

    print("Combinando features do nivel...")
    all_features_combined = _combine_all_features(X_train, pairwise_cols)
    print(f"{len(all_features_combined)} pares criados")

    all_results = []

    print("Iniciando paralelismo do nivel...")
    if not all_features_combined:
        return all_results

    # Everything that is identical for all tasks is prepared once: the remote
    # wrapper, and a single object-store copy of bsum and y_train that all
    # tasks share instead of re-serialising them for every pair.
    score_one_pair_parallel_ray = ray.remote(score_one_pair_parallel)
    bsum_ref = ray.put(bsum)
    y_train_ref = ray.put(y_train)

    chunk = 0
    chuncksize = int(10e3)
    for group in _chunker(all_features_combined, chuncksize):
        print(f"\t working in chunk {chunk}")
        results = [score_one_pair_parallel_ray.remote(ray.put(one_pair), bsum_ref, y_train_ref)
                   for one_pair in group]
        # Drain the tasks as they finish; this only drives the progress bar.
        for _ in tqdm(to_iterator(results), total=len(results)):
            pass

        all_results.extend(ray.get(results))
        chunk += chuncksize

    return all_results

def predict_logit(coef_dict, intercept, X_test, y_test):
    """
    Score a hand-assembled logistic model on ``X_test``.

    The coefficients are taken from ``coef_dict`` in insertion order and are
    multiplied positionally with the columns of ``X_test``, so both must be
    in the same order. The target is the first column of ``y_test``.

    Returns ``(log loss, AUC)``.
    """
    weights = np.array(list(coef_dict.values())).reshape(1, -1)
    logits = np.matmul(weights, X_test.values.T) + intercept
    probabilities = pd.Series((1 / (1 + np.exp(-logits))).reshape(-1), name='preds')

    target = y_test[y_test.columns[0]]
    logloss = metrics.log_loss(target, probabilities)
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, probabilities, pos_label=1)
    return logloss, metrics.auc(false_pos_rate, true_pos_rate)

def beam_search(X_train, y_train, X_test, y_test):
    """
    Greedily add pairwise crossed features while the test log loss does not
    get worse.

    Starting from a logistic regression fitted on ``X_train``, every round
    scores all candidate pairs with ``iter_one_level``, picks the pair with
    the lowest training log loss, appends it (element-wise OR of its two
    parent columns) to both sets together with the coefficient fitted for
    it, and re-evaluates on the test set. The existing coefficients and the
    intercept are kept frozen; the model is not refitted.

    The search stops at the first candidate that increases the test log loss
    (that candidate is rolled back) or when no candidate is left.

    NOTE(review): acceptance is decided on the test set, so the reported
    test metrics are optimistically biased; a separate validation split
    would avoid that.

    Returns
    -------
    tuple
        ``(train_set, test_set, coef_dict, intercept)`` where the frames
        contain the accepted crossed columns and ``coef_dict`` maps every
        column, in column order, to its coefficient.
    """
    # Local import: only needed here, to parse the "(left, right)" pair names.
    import ast

    def _choose_best_feature(level_results):
        # Lowest training log loss wins; the first one wins on ties.
        min_logloss = np.inf
        bsf_feature = None
        bst_coef = None
        for dict_feature in level_results:
            if dict_feature['logloss'] < min_logloss:
                min_logloss = dict_feature['logloss']
                bsf_feature = dict_feature['feature']
                bst_coef = dict_feature['coef']
        print(f"Level - choose {bsf_feature} -{bst_coef} -{min_logloss}")
        return bsf_feature, bst_coef, min_logloss

    current_training_set = X_train.copy()
    current_test_set = X_test.copy()
    start_classifier, start_logloss, start_auc = run_initial_logit(current_training_set, y_train, current_test_set, y_test)

    coef_dict = dict(zip(current_training_set.columns, start_classifier.coef_[0]))
    intercept = start_classifier.intercept_[0]

    print(f"Start logloss : {start_logloss} - Start AUC {start_auc}")
    last_logloss = start_logloss
    while True:
        # Evaluate one level of candidate crosses.
        level_results = iter_one_level(current_training_set, y_train, current_test_set, y_test, coef_dict, intercept)
        bst_feature, this_coef, _ = _choose_best_feature(level_results)
        if bst_feature is None:
            # Nothing left to combine (or no candidate had a usable score).
            break

        # The feature name is the repr of the (left, right) column pair;
        # literal_eval recovers the pair without executing arbitrary code.
        left, right = ast.literal_eval(bst_feature)

        # Add the crossed column to train and test.
        current_training_set[bst_feature] = current_training_set[left] | current_training_set[right]
        current_test_set[bst_feature] = current_test_set[left] | current_test_set[right]

        # Extend the model with the coefficient fitted for the new column.
        coef_dict[bst_feature] = this_coef

        this_logloss, this_auc = predict_logit(coef_dict, intercept, current_test_set, y_test)
        print(f" new  logloss: {this_logloss} - new auc {this_auc}")

        current_logloss_diff = last_logloss - this_logloss

        improved = this_logloss <= last_logloss
        if improved:
            last_logloss = this_logloss
        else:
            # Roll the rejected candidate back so the returned frames and
            # coefficients describe the last accepted model.
            current_training_set = current_training_set.drop([bst_feature], axis=1)
            current_test_set = current_test_set.drop([bst_feature], axis=1)
            coef_dict.pop(bst_feature, None)

        print(f"logloss gain with {bst_feature}: {current_logloss_diff}")
        if not improved:
            break

    return current_training_set, current_test_set, coef_dict, intercept

In [11]:
complete_X_train, complete_X_test, coef_dict, intercept = beam_search(final_X_train, y_train, final_X_test, y_test)



Start logloss : 0.6649533504730653 - Start AUC 0.6765447154471544
Combinando features do nivel...
18915 pares criados
Iniciando paralelismo do nivel...
	 working in chunk 0


100%|██████████| 10000/10000 [00:17<00:00, 574.58it/s]


	 working in chunk 10000


100%|██████████| 8915/8915 [00:13<00:00, 640.94it/s] 


Level - choose ('12_2_2', '9_2_2') --0.20469146092846477 -0.5545985753777033
 new  logloss: 0.6674501797484571 - new auc 0.6802642276422765
logloss gain with ('12_2_2', '9_2_2'): -0.002496829275391854


In [100]:
# Recompute the linear predictor of the final model by hand.
# The coefficients are read from coef_dict in insertion order and multiplied
# positionally with the columns of complete_X_test, so both must be in the
# same order.
col_coefs = np.array(list(coef_dict.values())).reshape(1,-1)
bsum = np.add(np.matmul(col_coefs, complete_X_test.values.T), intercept)


In [101]:
this_preds = 1/(1 + np.exp(-bsum)) 
preds = pd.DataFrame(this_preds.reshape(-1,1), columns=['preds'])

In [102]:
all_preds =  preds['preds']
logloss = metrics.log_loss(y_test[y_test.columns[0]], all_preds)
fpr, tpr, thresholds = metrics.roc_curve(y_test[y_test.columns[0]], all_preds, pos_label=1)
auc = metrics.auc(fpr, tpr)

In [103]:
auc

0.9078039396921537

In [162]:
# Show the column labels of `t`.
t.columns

Index(['NOX_10_bin_8', 'NOX_5_bin_4', 'RM_8_bin_2', 'AGE_8_bin_0',
       'CRIM_8_bin_7', 'CRIM_8_bin_5', 'NOX_10_bin_5', 'AGE_10_bin_0',
       'NOX_5_bin_2', 'NOX_9_bin_5',
       ...
       'CRIM_10_bin_1', '('NOX_10_bin_8', 'NOX_5_bin_4')',
       '('NOX_10_bin_8', 'RM_8_bin_2')', '('NOX_10_bin_8', 'AGE_8_bin_0')',
       '('NOX_10_bin_8', 'CRIM_8_bin_7')', '('NOX_10_bin_8', 'CRIM_8_bin_5')',
       '('NOX_10_bin_8', 'NOX_10_bin_5')', '('NOX_10_bin_8', 'AGE_10_bin_0')',
       '('NOX_10_bin_8', 'NOX_5_bin_2')', '('NOX_10_bin_8', 'NOX_9_bin_5')'],
      dtype='object', length=168)

In [49]:
all_columns = t.columns
pairwise_cols = list(itertools.combinations(all_columns, 2))

In [163]:
run_field_wise_minibatch_gradient_descent_lr(t.columns,
                                            'bin_lstat',
                                            t.copy(),
                                            y_all.copy(),
                                            all_features=True,
                                            return_clf=True)

(LogisticRegression(random_state=0),
 1.228669507272974,
      bin_lstat  preds
 0            0      0
 1            0      0
 2            0      0
 3            0      0
 4            0      0
 ..         ...    ...
 501          0      0
 502          0      0
 503          0      0
 504          0      0
 505          0      1
 
 [506 rows x 2 columns])

In [83]:
# For every candidate column pair: overwrite the scratch column 'x' with the
# OR of the two columns, refit, and print the AUC of the returned predictions.
# NOTE(review): 't["x"]' is reused on every iteration, so only the last
# pair's column is left in `t` afterwards.
# NOTE(review): the positional arguments below do not line up with the
# signature (this_feature, X_all, y_all, y_feature) of the definition earlier
# in this notebook - this call probably targets an older version; confirm.
for col in pairwise_cols:
    t['x'] = t[col[0]] | t[col[1]]
    _,_, preds = run_field_wise_minibatch_gradient_descent_lr(t.columns,
                                            'bin_lstat',
                                            t.copy(),
                                            y_all.copy(),
                                            all_features=True,
                                            return_clf=True)
    print(roc_auc_score(preds['bin_lstat'], preds['preds']))

0.5351057228771896
0.514191844462476
0.44325648735939416
0.47713712949262566
0.5020683181391501
0.3120932970582152
0.38352186848678665
0.2823015607846881
0.42472912987446904
0.5869091371931332
0.3718836014191844
0.4879480693045678
0.4160421936900387
0.35687238477081445
0.3774521502553578
0.5063799659523014
0.3926781537874088
0.5935675305872432
0.48568087442126867
0.6574228755986189
0.4016116971345839
0.3151003134297488
0.25449063688288553
0.5003818433487662
0.3852162983469365
0.5034922756272573
0.3169140693363881
0.3631250696068605
0.3733155139770576
0.5591300335703944
0.5270154169252065
0.47917362735271196
0.5523523141297949
0.46956390307543
0.4224698900609359
0.6431355702989515
0.6554500182966605
0.44380538717324547
0.6431992108570792
0.5859465737514519
0.38883585509044916
0.364692218350755
0.5630598380347795
0.5472053839912177
0.436820835918731
0.4804543935850317
0.633637216998393
0.4551572717292731
0.5200785960892876
0.4042050498782874
0.6557523109477671
0.3814217300685727
0.463557

KeyboardInterrupt: 

In [47]:
from sklearn.metrics import roc_auc_score


0.4837637026076718

In [23]:
result_dict = _

In [12]:
# Scratch setup: report the CPU count and enumerate all column pairs.
# NOTE(review): `original_feature_set` is not defined in the visible cells;
# it must come from a cell that is not shown here.
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())
all_columns = list(original_feature_set)
# Every unordered pair of columns is a candidate crossed feature.
pairwise_cols = list(itertools.combinations(all_columns, 2))
all_results = {}


Number of processors:  4


In [19]:
pairwise_cols = pairwise_cols[:2]

In [17]:
def make_preds(original_feature_set, new_coef_dict):
    """
    Turn a coefficient mapping into sigmoid predictions for every row.

    The coefficients are read in insertion order and multiplied positionally
    with the columns of ``original_feature_set``. The product is rounded to
    three decimals before the module-level ``intercept`` is added.
    """
    weights = np.array(list(new_coef_dict.values())).reshape(-1, 1)
    rounded_linear_term = np.round(original_feature_set.values.dot(weights), 3)
    logits = rounded_linear_term + intercept
    return 1 / (1 + np.exp(-logits))

0.218

In [19]:
new_coef_dict = coef_dict

In [20]:
original_feature_set['teste_feature'] = original_feature_set['DIS_9_bin_1'] | original_feature_set['DIS_9_bin_3']

In [24]:
new_coef_dict['teste_feature'] = 0.218

In [25]:
y_all['new_preds'] = make_preds(original_feature_set, new_coef_dict)

In [26]:
metrics.log_loss(y_all['bin_lstat'], y_all['new_preds'])

12.60009471884184

In [72]:
y_all

Unnamed: 0,bin_lstat,new_preds
0,0,0.0
1,0,0.0
2,0,0.0
3,0,1.0
4,0,0.0
...,...,...
501,0,1.0
502,0,1.0
503,0,1.0
504,0,1.0


In [50]:
result

      fun: -18.59017554414894
 hess_inv: array([[1]])
      jac: array([0.])
  message: 'Optimization terminated successfully.'
     nfev: 9
      nit: 1
     njev: 3
   status: 0
  success: True
        x: array([-5.05])

In [172]:
def fun(x, a,b,c):
    """Evaluate the quadratic a*x**2 + b*x + c at x."""
    quadratic_term = a * x ** 2
    linear_term = b * x
    return quadratic_term + linear_term + c

minimize(fun, 100, args=(1,0,0))

      fun: 5.552074997367714e-17
 hess_inv: array([[0.50000004]])
      jac: array([-1.28826571e-12])
  message: 'Optimization terminated successfully.'
     nfev: 21
      nit: 4
     njev: 7
   status: 0
  success: True
        x: array([-7.45122473e-09])

In [169]:
result

      fun: -0.5860022592398136
 hess_inv: array([[1]])
      jac: array([0.])
  message: 'Optimization terminated successfully.'
     nfev: 3
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([0.])

In [100]:
preds['teste'] = np.round(s)

In [103]:
fpr, tpr, thresholds = metrics.roc_curve(preds['bin_lstat'], preds['teste'], pos_label=1)
metrics.auc(fpr, tpr)


0.5860022592398136

In [154]:
residuals.sort_values(['residual']).sample(10)

Unnamed: 0,bin_lstat,preds,residual
169,1,1.0,0.0
447,1,0.484,0.516
490,1,1.0,0.0
473,1,1.0,0.0
438,1,1.0,0.0
149,1,1.0,0.0
165,0,1.0,-1.0
496,1,1.0,0.0
337,1,0.0,1.0
227,0,1.0,-1.0


In [149]:
residuals['residual'].value_counts()

 0.000    295
 1.000    133
-1.000     53
 0.996      2
-0.199      2
 0.990      1
 0.125      1
-0.108      1
-0.044      1
 0.218      1
-0.036      1
-0.120      1
-0.790      1
-0.976      1
-0.998      1
-0.999      1
 0.516      1
-0.001      1
-0.846      1
-0.958      1
-0.917      1
-0.996      1
-0.952      1
 0.979      1
 0.998      1
 0.975      1
Name: residual, dtype: int64

In [105]:
-1/ (1 * -1)

1.0

In [106]:
-1 + 1

0

In [None]:
start_classifier.