In [60]:
# import packages
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

In [61]:
df_modelling_set = pd.read_csv('../1.data-preparation/processed-data/modelling_data.csv')

# Weight of Evidence & Information Value Calculation
The formula to calculate WOE and IV is provided below.

[![N|Solid](https://miro.medium.com/max/384/1*6Aw782wiyiFtzvK7EOY8CA.png)](https://medium.com/@sundarstyles89/weight-of-evidence-and-information-value-using-python-6f05072e83eb)
[![N|Solid](https://miro.medium.com/max/700/1*xWA7a2KsTQOhaQ9MZFJJeQ.png)](https://medium.com/@sundarstyles89/weight-of-evidence-and-information-value-using-python-6f05072e83eb)

or simply

[![N|Solid](https://miro.medium.com/max/600/1*9Gi0fGyTpxfwM2TpV4GZQQ.png)](https://medium.com/@sundarstyles89/weight-of-evidence-and-information-value-using-python-6f05072e83eb)

In [62]:
# Set constants
max_bin = 20
force_bin = 3

In [63]:
# FUNCTIONS FOR CALCULATING INFORMATION VALUE

# define a binning function
def mono_bin(Y, X, n = max_bin):
    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [64]:
final_iv, IV = data_vars(df_modelling_set,df_modelling_set['DaysInHospital'])

In [65]:
final_iv

Unnamed: 0,VAR_NAME,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
0,MemberID,4,49931940,73737,33867,0.459295,39870,0.540705,0.507006,0.494205,0.025573,0.000656
1,MemberID,49932307,99998824,73736,32931,0.446607,40805,0.553393,0.492994,0.505795,-0.025635,0.000656
2,ClaimsTruncated,0,0,139458,54497,0.390777,84961,0.609223,0.815848,1.053127,-0.255291,0.060575
3,ClaimsTruncated,1,1,8015,12301,1.534747,-4286,-0.534747,0.184152,-0.053127,,0.060575
4,Year,Y1,Y1,76038,35517,0.467095,40521,0.532905,0.531708,0.502275,0.056947,0.003470
...,...,...,...,...,...,...,...,...,...,...,...,...
396,AAFC_MISS,1,1,9163,8731,0.952854,432,0.047146,0.130708,0.005355,3.194965,0.417383
397,no_drug_instances_MISS,0,0,96050,50886,0.529787,45164,0.470213,0.761789,0.559826,0.308043,0.186221
398,no_drug_instances_MISS,1,1,51423,15912,0.309434,35511,0.690566,0.238211,0.440174,-0.614013,0.186221
399,no_lab_instances_MISS,0,0,104495,54274,0.519393,50221,0.480607,0.812509,0.622510,0.266368,0.183574


Below are the features with the **highest** information values

In [66]:
IV = IV.sort_values('IV', ascending = False)
IV.head(20)

Unnamed: 0,VAR_NAME,IV
51,no_PlaceSvcs,1.131554
125,ps1,1.001983
47,age_80+,0.945133
137,sexMISS,0.917605
55,no_Specialties,0.91758
36,PayDelay_sum,0.885694
53,no_ProcedureGroups,0.802507
7,DSFS_max,0.796983
16,DrugCount_std,0.707619
10,DSFS_range,0.688119


We then use this guideline to classify the features according to their information values


[![N|Solid](https://miro.medium.com/max/578/1*5S_5aAHWe0_knDGZUK3W8w.png)](https://medium.com/@sundarstyles89/weight-of-evidence-and-information-value-using-python-6f05072e83eb)


In [67]:
def predictive_power_classifier(row):
    if row.IV < 0.02:
        return '1. useless'
    elif row.IV < 0.1:
        return '2. weak'
    elif row.IV < 0.3:
        return '3. medium'
    elif row.IV < 0.5:
        return '4. strong'
    else:
        return '5. too good to be true'

IV['Predictive Power'] = IV.apply(predictive_power_classifier, axis = 1)

In [68]:
IV.groupby(['Predictive Power']).agg({'IV':'count'}).sort_values('Predictive Power', ascending=False)

Unnamed: 0_level_0,IV
Predictive Power,Unnamed: 1_level_1
5. too good to be true,19
4. strong,24
3. medium,24
2. weak,24
1. useless,61


In [69]:
IV.to_excel('Information Values.xlsx', index=False)

Even though group 5 variables have such high Information Value scores that they are considered **too good to be true** according to the guideline, close examination has revealed that all these variables make intuitive sense that justifies their predictive power.

**We will use all variables that  have weak prediction power above for our model**

In [70]:
chosen_features = list(IV[(IV['Predictive Power'] == '2. weak') | (IV['Predictive Power'] == '3. medium') | (IV['Predictive Power'] == '4. strong') | (IV['Predictive Power']== '5. too good to be true')]['VAR_NAME'])

In [71]:
df_modelling_set_trunc = df_modelling_set[['DaysInHospital', 'Year', 'MemberID'] + chosen_features]

In [72]:
df_modelling_set_trunc.to_csv('../1.data-preparation/processed-data/modelling_data_trunc.csv', index=False)