In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path of the source data set; the file is read as ';'-delimited, cp1251-encoded text.
# NOTE(review): absolute local path — adjust for your environment.
DATA_PATH = "C:/Trees/Bankloan.csv"
df = pd.read_csv(DATA_PATH, sep=';', encoding='cp1251')
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,28,2,7,2,44,177,2990592,4797408,0
1,64,5,34,17,116,147,5047392,12004608,0
2,40,1,20,12,61,48,1042368,1885632,0
3,30,1,11,3,27,345,175122,756378,0
4,25,1,2,2,30,224,",75936",596064,1


In [3]:
# These columns are read as text that uses ',' as the decimal separator:
# replace the comma with a dot and cast to float.
# Columns that are already numeric are skipped, so re-running this cell
# does not fail on the .str accessor (which only exists for string data).
for col in ['debtinc', 'creddebt', 'othdebt']:
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].str.replace(',', '.').astype('float')

In [4]:
# Store the listed columns with object dtype (skipping any column that is absent).
categorical_cols = ('ed', 'default')
for col in (name for name in categorical_cols if name in df.columns):
    df[col] = df[col].astype(object)

In [5]:
# Print a summary of the frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
age         1500 non-null int64
ed          1500 non-null object
employ      1500 non-null int64
address     1500 non-null int64
income      1500 non-null int64
debtinc     1500 non-null float64
creddebt    1500 non-null float64
othdebt     1500 non-null float64
default     1500 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 105.5+ KB


In [6]:
# Build the contingency table ed x default
# (rows: values of ed, columns: values of default).
biv = pd.crosstab(index=df["ed"], columns=df["default"])
biv

default,0,1
ed,Unnamed: 1_level_1,Unnamed: 2_level_1
1,182,64
2,330,197
3,213,120
4,170,140
5,57,27


In [7]:
# Compute WoE for every level of ed.
# Multiplying by 1.0 forces floating-point arithmetic, and the constant
# a = 0.01 is added to each share to avoid division by zero.
a = 0.01
share_0 = 1.0 * biv[0] / biv[0].sum() + a   # smoothed share of default == 0 per level
share_1 = 1.0 * biv[1] / biv[1].sum() + a   # smoothed share of default == 1 per level
WoE = np.log(share_0 / share_1)
WoE

ed
1    0.461664
2   -0.035398
3    0.020581
4   -0.342042
5    0.164588
dtype: float64

In [8]:
# Compute IV: sum over levels of (share_0 - share_1) * log(share_0 / share_1),
# using the same smoothed shares as in the WoE cell.
share_0 = 1.0 * biv[0] / biv[0].sum() + a
share_1 = 1.0 * biv[1] / biv[1].sum() + a
IV = sum((share_0 - share_1) * np.log(share_0 / share_1))
IV

0.062944488850102398

In [9]:
# Function that computes the Information Value (IV)
# of every predictor in a data frame.
def _iv_usefulness(iv):
    """Map an IV value to its usefulness label (thresholds per Siddiqi, 2006).

    NaN compares false against every threshold and therefore yields
    'Not Useful'.
    """
    # NOTE: the spelling 'Suspicous' is kept unchanged on purpose so that any
    # code matching on the existing label keeps working.
    if iv > 0.5:
        return 'Suspicous'
    if iv > 0.3:
        return 'Strong'
    if iv > 0.1:
        return 'Medium'
    if iv > 0.02:
        return 'Weak'
    return 'Not Useful'


def auto_woe(df, target="default", a=0.01):
    """Compute the Information Value (IV) of every column against a 0/1 target.

    For each column other than ``target`` a contingency table against the
    target is built; every distinct value of the column is its own level
    (bin continuous columns beforehand if that is not what you want).
    With ``p0`` / ``p1`` the per-level shares of target == 0 / target == 1,
    each increased by ``a``::

        IV = sum((p0 - p1) * log(p0 / p1))

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the predictors and the target column.
    target : str, default "default"
        Name of the target column; its values must be 0 and 1.
    a : float, default 0.01
        Constant added to each share to avoid division by zero / log(0)
        when a level has no observations of one class.

    Returns
    -------
    pandas.DataFrame
        Columns 'Column Name', 'IV' and 'Usefulness', sorted by 'IV' in
        descending order.

    Raises
    ------
    KeyError
        If the contingency table has no column labelled 0 or 1 (i.e. the
        target does not contain both classes).
    """
    predictors = df.drop(target, axis=1)
    iv_list = []
    for var_name in predictors.columns:
        biv = pd.crosstab(df[var_name], df[target])
        # Smoothed class shares per level; 1.0 * forces float arithmetic.
        share_0 = 1.0 * biv[0] / biv[0].sum() + a
        share_1 = 1.0 * biv[1] / biv[1].sum() + a
        iv_list.append(((share_0 - share_1) * np.log(share_0 / share_1)).sum())

    results = pd.DataFrame({'Column Name': list(predictors.columns), 'IV': iv_list})
    results['Usefulness'] = [_iv_usefulness(x) for x in results['IV']]
    return results.sort_values(by='IV', ascending=False)

In [10]:
# Compute IV for all predictors
auto_woe(df)

Unnamed: 0,Column Name,IV,Usefulness
2,employ,0.365597,Strong
3,address,0.290419,Medium
0,age,0.28139,Medium
5,debtinc,0.278861,Medium
6,creddebt,0.267687,Medium
7,othdebt,0.266615,Medium
4,income,0.166448,Medium
1,ed,0.062944,Weak


In [11]:
# Largest value of the age column.
df['age'].max()

79

In [12]:
# Bin the age variable using the edges below, store the result as
# age_binned, and recompute IV for all predictors.
bins = [0, 25, 45, 60, 79]
df['age_binned'] = pd.cut(x=df['age'], bins=bins)
auto_woe(df)

Unnamed: 0,Column Name,IV,Usefulness
2,employ,0.365597,Strong
8,age_binned,0.35194,Strong
3,address,0.290419,Medium
0,age,0.28139,Medium
5,debtinc,0.278861,Medium
6,creddebt,0.267687,Medium
7,othdebt,0.266615,Medium
4,income,0.166448,Medium
1,ed,0.062944,Weak


In [13]:
# Look at the first 10 observations of the age_binned variable
df["age_binned"].head(10)

0    (25, 45]
1    (60, 79]
2    (25, 45]
3    (25, 45]
4     (0, 25]
5    (25, 45]
6    (25, 45]
7     (0, 25]
8    (60, 79]
9     (0, 25]
Name: age_binned, dtype: category
Categories (4, interval[int64]): [(0, 25] < (25, 45] < (45, 60] < (60, 79]]