In [36]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import KBinsDiscretizer
from scipy import stats

%matplotlib inline

In [8]:
def roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` measured against the true labels ``y``.

    Both arguments are forwarded, in this order, to ``roc_auc_score``.
    """
    auc_value = roc_auc_score(y, y_pred)
    return auc_value


def gini_score(y, y_pred):
    """Return the Gini coefficient, computed as ``2 * AUC - 1``.

    The AUC is obtained from ``roc_auc(y, y_pred)``.
    """
    auc_value = roc_auc(y, y_pred)
    return 2 * auc_value - 1

Binning

In [9]:
# Candidate values for the `strategy` argument of k_binning (which hands it to
# KBinsDiscretizer). Nothing else visible in this part of the notebook reads it.
strategies = ['uniform', 'kmeans', 'quantile']

def k_binning(data, feature, n_bins, strategy = 'uniform'):
    """Replace one or more columns of ``data`` with ordinal bin indices.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column(s) to discretize. It is modified in place
        (the binned values overwrite the original columns) and also returned.
    feature : column label, or list / array / Index of column labels
        Column(s) to discretize.
    n_bins : int or array-like
        Forwarded to ``KBinsDiscretizer(n_bins=...)``.
    strategy : str, default 'uniform'
        Forwarded to ``KBinsDiscretizer(strategy=...)``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the selected columns holding bin codes.
    """
    if isinstance(feature, (list, np.ndarray, pd.Index)):
        columns = list(feature)
    else:
        columns = [feature]

    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)

    # Select with a list so the estimator always receives a 2-D array; a single
    # column pulled out as a Series would be 1-D, which scikit-learn rejects.
    binned = est.fit_transform(data[columns].to_numpy())

    # Write back positionally from the ndarray. Wrapping the result in a new
    # DataFrame (fresh RangeIndex) would align on index labels and yield NaN or
    # misplaced bins whenever ``data`` does not carry a default 0..n-1 index.
    for position, column in enumerate(columns):
        data[column] = binned[:, position]

    return data


In [40]:
def generate_bins(df, bins_num, feature, label):
    """Return a copy of ``df[[feature, label]]`` with an added ``'bin'`` column.

    ``'bin'`` holds, for every row, the left edge (as float) of the quantile
    interval that ``pd.qcut`` assigns to the row's ``feature`` value. Duplicate
    quantile edges are dropped, so fewer than ``bins_num`` distinct bins may
    come back. Rows whose ``feature`` is missing get ``NaN`` in ``'bin'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    bins_num : int
        Requested number of quantile bins.
    feature : column label
        Numeric column to bin.
    label : column label
        Column carried along unchanged next to ``feature``.

    Returns
    -------
    pandas.DataFrame
        Columns ``[feature, label, 'bin']``.
    """
    # Explicit copy: adding a column to a bare column selection is chained
    # assignment (SettingWithCopyWarning) on the caller's frame.
    subset = df[[feature, label]].copy()

    intervals = pd.qcut(subset[feature], bins_num, duplicates='drop')

    # Vectorized lookup of each row's interval left edge via the category codes.
    left_edges = intervals.cat.categories.left.to_numpy(dtype=float)
    codes = intervals.cat.codes.to_numpy()

    # Code -1 marks a missing value; mask it instead of letting the negative
    # index silently pick the last edge.
    subset['bin'] = np.where(codes >= 0, left_edges[codes], np.nan)

    return subset

def generate_correct_bins(df_origin, feature, label, bins_max=20, bin_min_size = 5):
    """Search for the largest quantile binning that passes three checks.

    Starting at ``bins_max`` and counting down to 2, the frame is binned with
    ``generate_bins`` and the first binning is accepted for which

    * the Spearman correlation between bin left edge and per-bin sum of
      ``label`` is +1 or -1 (strictly monotonic),
    * every bin has strictly more than ``bin_min_size`` rows, and
    * no bin has a row count equal to its ``label`` sum.

    If no candidate passes, the last one tried (2 requested bins) is returned
    as a fallback, exactly as before.

    Parameters
    ----------
    df_origin : pandas.DataFrame
        Source frame containing ``feature`` and ``label``.
    feature, label : column labels
    bins_max : int, default 20
        Largest number of bins to try; must be at least 2.
    bin_min_size : number, default 5
        Exclusive lower bound on rows per bin.

    Returns
    -------
    pandas.DataFrame
        Output of ``generate_bins`` for the chosen number of bins.

    Raises
    ------
    ValueError
        If ``bins_max`` is below 2 (previously an UnboundLocalError, because
        the loop body never ran).
    """
    if bins_max < 2:
        raise ValueError(
            "bins_max must be at least 2, got {0}".format(bins_max)
        )

    df = None

    for bins_num in range(bins_max, 1, -1):

        df = generate_bins(df_origin, bins_num, feature, label)

        df_grouped = (
            df.groupby('bin')
              .agg({feature: 'count', label: 'sum'})
              .reset_index()
        )

        # duplicates='drop' can collapse everything into one bin; a rank
        # correlation is undefined there, so move on to the next candidate.
        if len(df_grouped) < 2:
            continue

        # Public scipy.stats entry point; the `scipy.stats.stats` namespace
        # used before is deprecated.
        r, _ = stats.spearmanr(df_grouped['bin'], df_grouped[label])

        # Tolerant comparison instead of `abs(r) == 1`; NaN compares False.
        is_monotonic = bool(np.isclose(abs(r), 1.0))
        bins_large_enough = df_grouped[feature].min() > bin_min_size
        has_full_bin = (df_grouped[feature] == df_grouped[label]).any()

        if is_monotonic and bins_large_enough and not has_full_bin:
            break

    return df



def group_by_feature(df, feature, label):
    """Summarize ``label`` per value of the ``'bin'`` column.

    The result has one row per bin and four columns: ``feature`` (the bin
    value), ``'count'`` (non-null labels in the bin), ``'good'`` (sum of the
    labels) and ``'bad'`` (``count - good``).
    """
    per_bin = df.groupby('bin')[label].agg(['count', 'sum']).reset_index()

    # Positional rename: bin key -> feature, count stays, sum -> good.
    per_bin.columns = [feature, 'count', 'good']

    n_bad = per_bin['count'] - per_bin['good']
    per_bin['bad'] = n_bad

    return per_bin
    
    
def perc_share(df, group_name):
    """Return column ``group_name`` divided by its own total."""
    column = df[group_name]
    total = column.sum()
    return column / total

def calculate_perc_share(df, feature, label):
    """Extend the per-bin counts with good/bad shares and their difference.

    Takes the table from ``group_by_feature`` and adds ``'perc_good'``,
    ``'perc_bad'`` (each bin's fraction of all goods / bads, via
    ``perc_share``) and ``'perc_diff'`` (``perc_good - perc_bad``).
    """
    counts = group_by_feature(df, feature, label)

    good_share = perc_share(counts, 'good')
    bad_share = perc_share(counts, 'bad')

    counts['perc_good'] = good_share
    counts['perc_bad'] = bad_share
    counts['perc_diff'] = counts['perc_good'] - counts['perc_bad']

    return counts

def woe_binning(df, feature, label):
    """Build the per-bin table and append a ``'woe'`` column.

    ``woe`` is ``log(perc_good / perc_bad)`` for each bin; infinite or
    undefined values (a bin with no goods or no bads) are stored as 0.
    """
    table = calculate_perc_share(df, feature, label)

    ratio = table['perc_good'] / table['perc_bad']
    raw_woe = np.log(ratio)

    # +/-inf -> NaN -> 0, in that order.
    table['woe'] = raw_woe.replace([np.inf, -np.inf], np.nan).fillna(0)

    return table

In [120]:
# Demo input: build a small frame from scikit-learn's bundled iris dataset.
from sklearn.datasets import load_iris

data = load_iris()

# Columns 0 and 1 of the feature matrix become 'col1'/'col2'; the class target becomes 'label'.
df = pd.DataFrame({'col1' : data['data'][:,0], 'col2' : data['data'][:,1], 'label' : data['target']})

# Keep only rows whose label is 0 or 1, so summing 'label' per bin counts the 1s.
df = df[df['label'].isin([0, 1])]

In [127]:
# Try from 20 bins downward on 'col1'; 4.1 is bin_min_size, i.e. every bin's
# row count must be strictly greater than it. Then build the WoE table.
df_bined = generate_correct_bins(df, 'col1', 'label', 20, 4.1)
df_res = woe_binning(df_bined, 'col1', 'label')

In [128]:
# Show the per-bin WoE table computed in the previous cell.
df_res

Unnamed: 0,col1,count,good,bad,perc_good,perc_bad,perc_diff,woe
0,4.299,40,4,36,0.08,0.72,-0.64,-2.197225
1,5.1,30,17,13,0.34,0.26,0.08,0.268264
2,5.7,30,29,1,0.58,0.02,0.56,3.367296


In [12]:
def iv_binning(df, feature=None, label=None):
    """Compute per-bin Information Value (IV) and its total.

    Two call forms are supported:

    * ``iv_binning(df, feature, label)``: ``df`` is a binned frame (with a
      ``'bin'`` column); the WoE table is built with
      ``woe_binning(df, feature, label)`` first.
    * ``iv_binning(woe_table)``: ``df`` already is a WoE table containing
      ``'perc_diff'`` and ``'woe'`` columns; it is copied, not modified.

    The previous body called ``woe_binning(df)`` without the ``feature`` and
    ``label`` arguments that function requires, so every call raised
    ``TypeError``.

    Returns
    -------
    (pandas.DataFrame, float)
        The table with an added ``'iv'`` column (``perc_diff * woe``) and the
        sum of that column.

    Raises
    ------
    ValueError
        If only one of ``feature`` / ``label`` is given, or if neither is
        given and ``df`` lacks the ``'perc_diff'`` / ``'woe'`` columns.
    """
    if (feature is None) != (label is None):
        raise ValueError("feature and label must be passed together")

    if feature is not None:
        table = woe_binning(df, feature, label)
    else:
        missing = [name for name in ('perc_diff', 'woe') if name not in df.columns]
        if missing:
            raise ValueError(
                "df has no column(s) {0}; pass feature and label so the WoE "
                "table can be computed".format(missing)
            )
        # Work on a copy so the caller's table does not gain an 'iv' column.
        table = df.copy()

    table['iv'] = table['perc_diff'] * table['woe']

    return table, table['iv'].sum()

In [None]:
def baseline_model(train_x, train_y, run_num = 10, fold = 5, random_state = None):
    """Repeated stratified K-fold evaluation of an L2 logistic regression.

    For each of ``run_num`` repetitions the data is split into ``fold``
    shuffled stratified folds; a ``LogisticRegression(solver="liblinear",
    penalty="l2")`` is fitted on each training part and ROC AUC is computed on
    both the training and the validation part.

    AUC is computed from the predicted probability of the positive class
    (``predict_proba(...)[:, 1]``). The previous version passed hard
    ``predict()`` labels to ``roc_auc_score``, which reduces the ROC curve to a
    single operating point and misstates the ranking quality. Like the
    original, this assumes a binary target.

    Parameters
    ----------
    train_x, train_y : pandas objects
        Features and target; rows are selected with ``.iloc``.
    run_num : int, default 10
        Number of repetitions of the whole K-fold procedure.
    fold : int, default 5
        Number of folds per repetition.
    random_state : int or None, default None
        Optional base seed. Repetition ``i`` shuffles with seed
        ``random_state + i`` so results are reproducible while repetitions
        still differ. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (list[list[float]], list[list[float]])
        Train and validation AUCs, indexed ``[repetition][fold]``.
    """
    train_result, test_result = [], []

    for i in range(run_num):

        # per-repetition results
        train_fold, test_fold = [], []

        seed = None if random_state is None else random_state + i
        skf = StratifiedKFold(n_splits = fold, shuffle = True, random_state = seed)

        splits = skf.split(train_x, train_y)
        for fold_num, (train_index, valid_index) in enumerate(splits, start = 1):

            # dataset
            X_train, X_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
            y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]

            # model
            reg = LogisticRegression(solver = "liblinear", penalty = "l2")
            reg.fit(X_train, y_train)

            # Scores, not labels: column 1 is the probability of reg.classes_[1].
            y_train_score = reg.predict_proba(X_train)[:, 1]
            y_valid_score = reg.predict_proba(X_valid)[:, 1]

            # result AUC
            train_auc = roc_auc_score(y_train, y_train_score)
            test_auc = roc_auc_score(y_valid, y_valid_score)

            # NOTE(review): fold scores are printed only for the second
            # repetition (i == 1), so nothing is printed when run_num == 1.
            # Kept as-is; confirm whether i == 0 was intended.
            if i == 1:
                print("TRAIN Fold {0}, AUC score: {1}".format(fold_num, round(train_auc, 4)))
                print("TEST Fold {0}, AUC score: {1}".format(fold_num, round(test_auc, 4)))

            train_fold.append(train_auc)
            test_fold.append(test_auc)

        train_result.append(train_fold)
        test_result.append(test_fold)

    return train_result, test_result