In order to automate discretization and spare its dependence
on human experts, we propose a multi-granularity discretization
method. The basic idea is simple: instead of using a fine-tuned
granularity, we discretize each numerical feature into several, rather
than only one, categorical features, each with a different granularity.
Figure 5 gives an illustration of discretizing a numerical feature
with four levels of granularity. Since more levels of granularity are
considered, it is more likely to get a promising result

Vamos considerar p = 5 primeiramente

In [1]:
import sklearn
import sklearn.datasets
import pandas as pd
from sklearn import linear_model
import operator
import numpy as np
import itertools
from sklearn import metrics
from scipy.optimize import minimize 
import multiprocessing as mp
from tqdm import tqdm

In [2]:
boston = sklearn.datasets.load_boston()
boston = pd.DataFrame(boston.data, columns=boston.feature_names)

boston['bin_lstat'] = boston['LSTAT'].apply(lambda x: 0 if x < 10 else 1)
boston = boston.drop(['LSTAT'],axis=1)

A remaining problem is how to determine the
levels of granularity. For an experienced user, it can set a group of
potentially good values. If no values are specified, AutoCross will
use {10^p
}
P
p=1
as default values, where P is an integer determined
by a rule-based mechanism that considers the available memory,
data size and feature numbers.

In [124]:
def discretize_numeric_features(df, y_feature, max_gran=10):
    """
    multi-granularity discretization
    method. The basic idea is simple: instead of using a fine-tuned
    granularity, we discretize each numerical feature into several, rather
    than only one, categorical features, each with a different granularity.
    
    min granularity = 3
    
    Sometimes de edge values did not permit to execute correct discretization
    if this happens the step is not executed
    """
    is_numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = df.select_dtypes(include=is_numeric)
    if y_feature in numeric_features:
        numeric_features = numeric_features.drop([y_feature], axis=1)
    discrete_features = []
    print(f"Discretizing {len(numeric_features.columns)} features...")
    for feat in numeric_features:
        print(f" Working in {feat}")
        for gran in range(3, max_gran+1):
            try:
                df[f"{feat}_{gran}"] = pd.qcut(df[feat],
                                               gran,
                                               labels= [f"bin_{i}" for i in range(gran)]
                                              )
                discrete_features.append(f"{feat}_{gran}")
            except:
                print(f"Not possible to correct work on cut {feat} > {gran}")
                break
        df = df.drop(feat, axis=1)
        
    return df, discrete_features


def run_field_wise_minibatch_gradient_descent_lr(this_feature,
                                                 y_feature,
                                                 X_all,
                                                 y_all,
                                                 all_features=False,
                                                 return_clf = False):
    """
    Run field wise logistic regression
    """
        
    clf = linear_model.SGDClassifier(loss='log',
                                     n_jobs= -1,
                                     warm_start = True)
    if all_features:
        this_X = X_all
        
    else:
        this_X = pd.get_dummies(X_all[this_feature], columns=this_feature)
        
#     for i in range(0, 1000):
#         this_batch_samples = this_X.sample(frac=0.8)
#         this_y = y_all.loc[y_all.index.isin(this_batch_samples.index)]
#         clf.partial_fit(this_batch_samples, this_y[y_feature], classes=this_y[y_feature].unique())
    
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(random_state=0).fit(this_X, y_all[y_feature])

    if return_clf:
        all_preds = clf.predict(this_X)
        y_all['preds'] = all_preds
        final_score = metrics.log_loss(y_all[y_feature], y_all['preds'])
        return clf, final_score, y_all
   
    return clf.score(this_X, y_all)

def measure_and_clean_discrete_features(df, discrete_features, y_feature):
    """
    In order to avoid the dramatic increase in feature number caused
    by discretization, once these features are generated, we use fieldwise LR to evaluate them and keep
    only the best half. 
    """
    
    def _select_discrete_features(feature_score, abs=True):
        """
        Select only the best half of measured features
        """
        if abs:
            feature_score = {k: np.abs(v) for k,v in feature_score.items()}
            
        sorted_score = sorted(feature_score.items(), key=operator.itemgetter(1), reverse=True)
        half_features = int(np.floor(len(sorted_score)/2))
        return [k[0] for k in sorted_score[:half_features]]
    
    feature_score = {}
    # while what you describe is properly called minibatch learning.
    # That's implemented in sklearn.linear_model.SGDClassifier,
    # which fits a logistic regression model if you give it the option loss="log".
    y_all = df[[y_feature]]
    X_all = df.drop([y_feature], axis=1)
    this_classifier, final_score, preds = run_field_wise_minibatch_gradient_descent_lr(X_all,
                                                                          y_feature,
                                                                          X_all,
                                                                          y_all,
                                                                         all_features=True,
                                                                         return_clf=True)
    
    coef_dict = dict(list(zip(X_all.columns, this_classifier.coef_[0])))
    return df[_select_discrete_features(coef_dict)], y_all

In [125]:
boston_discretize, discrete_features = discretize_numeric_features(boston.copy(), y_feature='bin_lstat')

Discretizing 12 features...
 Working in CRIM
 Working in ZN
Not possible to correct work on cut ZN > 3
 Working in INDUS
Not possible to correct work on cut INDUS > 6
 Working in CHAS
Not possible to correct work on cut CHAS > 3
 Working in NOX
 Working in RM
 Working in AGE
 Working in DIS
 Working in RAD
Not possible to correct work on cut RAD > 4
 Working in TAX
Not possible to correct work on cut TAX > 8
 Working in PTRATIO
Not possible to correct work on cut PTRATIO > 6
 Working in B
Not possible to correct work on cut B > 5


In [126]:
original_feature_set = pd.get_dummies(boston_discretize)
original_feature_set, y_all = measure_and_clean_discrete_features(original_feature_set.copy(),
                                                    discrete_features,
                                                    y_feature='bin_lstat')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [127]:
start_classifier, final_score, preds = run_field_wise_minibatch_gradient_descent_lr(original_feature_set.columns,
                                                                                        'bin_lstat',
                                                                                        original_feature_set.copy(),
                                                                                        y_all.copy(),
                                                                                        all_features=True,
                                                                                        return_clf=True)
print(final_score)

1.1604110558996328


In [160]:
def score_one_pair(original_feature_set, y_feature, feature_pair, coef_dict, intercept, y_all):
    
    def _calc_logloss(y_true, y_feature, preds):
        return metrics.log_loss(y_true[y_feature], preds['preds'])


    def _obj(x, 
            y_feature,
            original_feature_set,
            feature_pair,
            coef_dict,
            intercept,
            y_true):

        combined_features = original_feature_set[feature_pair[0]] | original_feature_set[feature_pair[1]]
        col_coefs = np.array(list(coef_dict.values())).reshape(-1,1)
        bsum = np.add(original_feature_set.values.dot(col_coefs), intercept)
        # add x to bsum
        bsum_with_new_feature = bsum + np.sum(np.dot(combined_features.values.reshape(-1,1), x))
        this_preds = 1/(1 + np.exp(-bsum_with_new_feature)) 

        preds = pd.DataFrame(this_preds, columns=['preds'])
        return _calc_logloss(y_true, y_feature, preds)
    
    

    result = minimize(_obj, 0, args=(y_feature,
                                    original_feature_set.copy(),
                                    feature_pair,
                                    coef_dict,
                                    intercept,
                                    y_all),
                     method = 'Nelder-Mead')
    
    this_coef = np.round(result['x'][0],3)
    this_logloss = result['fun']
    
    dict_result_combination = {"coef":this_coef,
                               "logloss" : this_logloss}
    
    return dict_result_combination

def iter_one_level(original_feature_set, y_feature, coef_dict, intercept, y_all):
    
    all_columns = list(original_feature_set)
    pairwise_cols = list(itertools.combinations(all_columns, 2))
    all_results = {}
    with tqdm(total=len(pairwise_cols)) as pbar:
        for pair in pairwise_cols:
            feature_name = str(pair)
            if feature_name not in original_feature_set.columns:
                all_results[feature_name] = score_one_pair(original_feature_set.copy(),
                               y_feature,
                               pair,
                               coef_dict,
                               intercept,
                               y_all.copy())
                pbar.update(1)
    return all_results


def beam_search(original_feature_set, y_feature, y_all):
    
    def _choose_best_feature(dict_level_results):
        min_logloss = 9999
        min_key = None
        this_coef = None
        for key, val in dict_level_results.items():
            if val['logloss'] < min_logloss:
                min_logloss = val['logloss']
                min_key = key
                this_coef = val['coef']
        print(f"Level - choose {min_key} - logloss of {min_logloss}")
        return min_key, this_coef, min_logloss
    
    current_feature_set = original_feature_set.copy()
    start_classifier, start_logloss, preds = run_field_wise_minibatch_gradient_descent_lr(original_feature_set.columns,
                                                                                        'bin_lstat',
                                                                                        original_feature_set.copy(),
                                                                                        y_all.copy(),
                                                                                        all_features=True,
                                                                                        return_clf=True)
    
    coef_dict = dict(list(zip(current_feature_set.columns,start_classifier.coef_[0])))
    intercept = start_classifier.intercept_[0]
    
    print(f"Start score : {start_logloss}")
    
    current_logloss_diff = start_logloss
    accepted_features = []
    while current_logloss_diff >= 0.1:
    
        dict_level_results = iter_one_level(current_feature_set.copy(),
                   y_feature,
                   coef_dict,
                   intercept,
                   y_all)

        bst_feature, this_coef, this_logloss = _choose_best_feature(dict_level_results)
        current_feature_set[str(bst_feature)] = current_feature_set[str(eval(bst_feature)[0])] | current_feature_set[str(eval(bst_feature)[1])]
        coef_dict[bst_feature] = this_coef
        current_logloss_diff = current_logloss_diff - this_logloss
        if current_logloss_diff < 0.1:
            current_feature_set =  current_feature_set.drop([str(bst_feature)], axis=1)
        print(f"logloss gain with {bst_feature}: {current_logloss_diff}")
    return current_feature_set    

In [161]:
y_feature = 'bin_lstat'
t = beam_search(original_feature_set.copy(),
            y_feature,
            y_all)

  0%|                                                                                | 9/12561 [00:00<02:23, 87.59it/s]

Start score : 1.1604110558996328


100%|████████████████████████████████████████████████████████████████████████████| 12561/12561 [02:09<00:00, 97.24it/s]
  0%|                                                                               | 10/12720 [00:00<02:15, 93.68it/s]

Level - choose ('NOX_10_bin_8', 'NOX_5_bin_4') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'NOX_5_bin_4'): 1.045798820998491


100%|███████████████████████████████████████████████████████████████████████████▉| 12719/12720 [02:10<00:00, 97.59it/s]
  0%|                                                                               | 10/12880 [00:00<02:19, 91.96it/s]

Level - choose ('NOX_10_bin_8', 'RM_8_bin_2') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'RM_8_bin_2'): 0.9311865860973492


100%|███████████████████████████████████████████████████████████████████████████▉| 12878/12880 [02:12<00:00, 96.88it/s]
  0%|                                                                               | 10/13041 [00:00<02:17, 94.59it/s]

Level - choose ('NOX_10_bin_8', 'AGE_8_bin_0') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'AGE_8_bin_0'): 0.8165743511962074


100%|███████████████████████████████████████████████████████████████████████████▉| 13038/13041 [02:13<00:00, 97.41it/s]
  0%|                                                                               | 10/13203 [00:00<02:22, 92.84it/s]

Level - choose ('NOX_10_bin_8', 'CRIM_8_bin_7') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'CRIM_8_bin_7'): 0.7019621162950656


100%|███████████████████████████████████████████████████████████████████████████▉| 13199/13203 [02:16<00:00, 97.01it/s]
  0%|                                                                               | 10/13366 [00:00<02:25, 91.96it/s]

Level - choose ('NOX_10_bin_8', 'CRIM_8_bin_5') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'CRIM_8_bin_5'): 0.5873498813939237


100%|███████████████████████████████████████████████████████████████████████████▉| 13361/13366 [02:17<00:00, 96.90it/s]
  0%|                                                                               | 10/13530 [00:00<02:22, 94.57it/s]

Level - choose ('NOX_10_bin_8', 'NOX_10_bin_5') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'NOX_10_bin_5'): 0.472737646492782


100%|███████████████████████████████████████████████████████████████████████████▉| 13524/13530 [02:21<00:00, 95.90it/s]
  0%|                                                                               | 10/13695 [00:00<02:17, 99.27it/s]

Level - choose ('NOX_10_bin_8', 'AGE_10_bin_0') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'AGE_10_bin_0'): 0.35812541159164024


100%|███████████████████████████████████████████████████████████████████████████▉| 13688/13695 [02:21<00:00, 97.04it/s]
  0%|                                                                               | 10/13861 [00:00<02:27, 93.72it/s]

Level - choose ('NOX_10_bin_8', 'NOX_5_bin_2') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'NOX_5_bin_2'): 0.2435131766904985


100%|███████████████████████████████████████████████████████████████████████████▉| 13853/13861 [02:23<00:00, 96.85it/s]
  0%|                                                                               | 10/14028 [00:00<02:26, 95.47it/s]

Level - choose ('NOX_10_bin_8', 'NOX_9_bin_5') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'NOX_9_bin_5'): 0.12890094178935674


100%|███████████████████████████████████████████████████████████████████████████▉| 14019/14028 [02:25<00:00, 96.53it/s]

Level - choose ('NOX_10_bin_8', 'NOX_9_bin_7') - logloss of 0.11461223490114177
logloss gain with ('NOX_10_bin_8', 'NOX_9_bin_7'): 0.014288706888214972





In [162]:
t.columns

Index(['NOX_10_bin_8', 'NOX_5_bin_4', 'RM_8_bin_2', 'AGE_8_bin_0',
       'CRIM_8_bin_7', 'CRIM_8_bin_5', 'NOX_10_bin_5', 'AGE_10_bin_0',
       'NOX_5_bin_2', 'NOX_9_bin_5',
       ...
       'CRIM_10_bin_1', '('NOX_10_bin_8', 'NOX_5_bin_4')',
       '('NOX_10_bin_8', 'RM_8_bin_2')', '('NOX_10_bin_8', 'AGE_8_bin_0')',
       '('NOX_10_bin_8', 'CRIM_8_bin_7')', '('NOX_10_bin_8', 'CRIM_8_bin_5')',
       '('NOX_10_bin_8', 'NOX_10_bin_5')', '('NOX_10_bin_8', 'AGE_10_bin_0')',
       '('NOX_10_bin_8', 'NOX_5_bin_2')', '('NOX_10_bin_8', 'NOX_9_bin_5')'],
      dtype='object', length=168)

In [49]:
all_columns = t.columns
pairwise_cols = list(itertools.combinations(all_columns, 2))

In [163]:
run_field_wise_minibatch_gradient_descent_lr(t.columns,
                                            'bin_lstat',
                                            t.copy(),
                                            y_all.copy(),
                                            all_features=True,
                                            return_clf=True)

(LogisticRegression(random_state=0),
 1.228669507272974,
      bin_lstat  preds
 0            0      0
 1            0      0
 2            0      0
 3            0      0
 4            0      0
 ..         ...    ...
 501          0      0
 502          0      0
 503          0      0
 504          0      0
 505          0      1
 
 [506 rows x 2 columns])

In [83]:
for col in pairwise_cols:
    t['x'] = t[col[0]] | t[col[1]]
    _,_, preds = run_field_wise_minibatch_gradient_descent_lr(t.columns,
                                            'bin_lstat',
                                            t.copy(),
                                            y_all.copy(),
                                            all_features=True,
                                            return_clf=True)
    print(roc_auc_score(preds['bin_lstat'], preds['preds']))

0.5351057228771896
0.514191844462476
0.44325648735939416
0.47713712949262566
0.5020683181391501
0.3120932970582152
0.38352186848678665
0.2823015607846881
0.42472912987446904
0.5869091371931332
0.3718836014191844
0.4879480693045678
0.4160421936900387
0.35687238477081445
0.3774521502553578
0.5063799659523014
0.3926781537874088
0.5935675305872432
0.48568087442126867
0.6574228755986189
0.4016116971345839
0.3151003134297488
0.25449063688288553
0.5003818433487662
0.3852162983469365
0.5034922756272573
0.3169140693363881
0.3631250696068605
0.3733155139770576
0.5591300335703944
0.5270154169252065
0.47917362735271196
0.5523523141297949
0.46956390307543
0.4224698900609359
0.6431355702989515
0.6554500182966605
0.44380538717324547
0.6431992108570792
0.5859465737514519
0.38883585509044916
0.364692218350755
0.5630598380347795
0.5472053839912177
0.436820835918731
0.4804543935850317
0.633637216998393
0.4551572717292731
0.5200785960892876
0.4042050498782874
0.6557523109477671
0.3814217300685727
0.463557

KeyboardInterrupt: 

In [47]:
from sklearn.metrics import roc_auc_score


0.4837637026076718

In [23]:
result_dict = _

In [12]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())
all_columns = list(original_feature_set)
pairwise_cols = list(itertools.combinations(all_columns, 2))
all_results = {}


Number of processors:  4


In [19]:
pairwise_cols = pairwise_cols[:2]

In [17]:
def make_preds(original_feature_set, new_coef_dict):
    col_coefs = np.array(list(new_coef_dict.values())).reshape(-1,1)
    this_pred = np.add(np.round(original_feature_set.values.dot(col_coefs),3), intercept)
    return 1/(1 + np.exp(-this_pred))

0.218

In [19]:
new_coef_dict = coef_dict

In [20]:
original_feature_set['teste_feature'] = original_feature_set['DIS_9_bin_1'] | original_feature_set['DIS_9_bin_3']

In [24]:
new_coef_dict['teste_feature'] = 0.218

In [25]:
y_all['new_preds'] = make_preds(original_feature_set, new_coef_dict)

In [26]:
metrics.log_loss(y_all['bin_lstat'], y_all['new_preds'])

12.60009471884184

In [72]:
y_all

Unnamed: 0,bin_lstat,new_preds
0,0,0.0
1,0,0.0
2,0,0.0
3,0,1.0
4,0,0.0
...,...,...
501,0,1.0
502,0,1.0
503,0,1.0
504,0,1.0


In [50]:
result

      fun: -18.59017554414894
 hess_inv: array([[1]])
      jac: array([0.])
  message: 'Optimization terminated successfully.'
     nfev: 9
      nit: 1
     njev: 3
   status: 0
  success: True
        x: array([-5.05])

In [172]:
def fun(x, a,b,c):
    return a*x**2 + b*x + c

minimize(fun, 100, args=(1,0,0))

      fun: 5.552074997367714e-17
 hess_inv: array([[0.50000004]])
      jac: array([-1.28826571e-12])
  message: 'Optimization terminated successfully.'
     nfev: 21
      nit: 4
     njev: 7
   status: 0
  success: True
        x: array([-7.45122473e-09])

In [169]:
result

      fun: -0.5860022592398136
 hess_inv: array([[1]])
      jac: array([0.])
  message: 'Optimization terminated successfully.'
     nfev: 3
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([0.])

In [100]:
preds['teste'] = np.round(s)

In [103]:
fpr, tpr, thresholds = metrics.roc_curve(preds['bin_lstat'], preds['teste'], pos_label=1)
metrics.auc(fpr, tpr)


0.5860022592398136

In [154]:
residuals.sort_values(['residual']).sample(10)

Unnamed: 0,bin_lstat,preds,residual
169,1,1.0,0.0
447,1,0.484,0.516
490,1,1.0,0.0
473,1,1.0,0.0
438,1,1.0,0.0
149,1,1.0,0.0
165,0,1.0,-1.0
496,1,1.0,0.0
337,1,0.0,1.0
227,0,1.0,-1.0


In [149]:
residuals['residual'].value_counts()

 0.000    295
 1.000    133
-1.000     53
 0.996      2
-0.199      2
 0.990      1
 0.125      1
-0.108      1
-0.044      1
 0.218      1
-0.036      1
-0.120      1
-0.790      1
-0.976      1
-0.998      1
-0.999      1
 0.516      1
-0.001      1
-0.846      1
-0.958      1
-0.917      1
-0.996      1
-0.952      1
 0.979      1
 0.998      1
 0.975      1
Name: residual, dtype: int64

In [105]:
-1/ (1 * -1)

1.0

In [106]:
-1 + 1

0

In [None]:
start_classifier.