In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
import warnings 
warnings.filterwarnings("ignore")
from sklearn.metrics import roc_auc_score

In [3]:
# Load the training sample into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific location; consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv('/Users/hawaii/Documents/Aviana/Aviana/train_sample.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Row_ID,Household_ID,Vehicle,Calendar_Year,Model_Year,Blind_Make,Blind_Model,Blind_Submodel,Cat1,...,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
0,0,1,1,3,2005,2005,K,K.78,K.78.2,D,...,1.008912,0.26104,0.907793,-0.077998,M,-0.23153,-0.266117,-0.272337,-0.251419,0.0
1,1,2,2,2,2005,2003,Q,Q.22,Q.22.3,B,...,1.240851,0.432987,-0.726459,0.204785,O,-0.23153,-0.266117,-0.272337,-0.251419,0.0
2,2,3,3,1,2005,1998,AR,AR.41,AR.41.1,B,...,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.23153,-0.266117,-0.272337,-0.251419,0.0
3,3,4,3,1,2006,1998,AR,AR.41,AR.41.1,B,...,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.23153,-0.266117,-0.272337,-0.251419,0.0
4,4,5,3,2,2005,2001,D,D.20,D.20.0,J,...,0.812656,2.112691,1.534462,2.34726,F,-0.23153,-0.266117,-0.272337,-0.251419,0.0


In [4]:
# Drop the first column by position.
# NOTE(review): presumably a leftover 'Unnamed' index column written into the
# CSV -- confirm; re-running this cell drops one more column each time.
df = df.iloc[:,1:]
# Load the table describing each variable; the functions below read its
# 'Variable' and 'Type' columns.
# NOTE(review): absolute, machine-specific path.
df_type = pd.read_csv('/Users/hawaii/Documents/Aviana/Aviana/column_type.csv')

In [5]:
# Report, for every column, whether it contains at least one missing value.
df.isnull().any()

Row_ID            False
Household_ID      False
Vehicle           False
Calendar_Year     False
Model_Year        False
Blind_Make        False
Blind_Model       False
Blind_Submodel    False
Cat1              False
Cat2              False
Cat3              False
Cat4              False
Cat5              False
Cat6              False
Cat7              False
Cat8              False
Cat9              False
Cat10             False
Cat11             False
Cat12              True
OrdCat            False
Var1              False
Var2              False
Var3              False
Var4              False
Var5              False
Var6              False
Var7              False
Var8              False
NVCat             False
NVVar1            False
NVVar2            False
NVVar3            False
NVVar4            False
Claim_Amount      False
dtype: bool

In [14]:
# Function to get the declared type of a column
def column_type(column_name, df_type):
    """Return the 'Type' entry recorded for ``column_name`` in ``df_type``.

    Parameters
    ----------
    column_name : value compared against the 'Variable' column.
    df_type : DataFrame with 'Variable' and 'Type' columns.

    Returns
    -------
    The 'Type' value of the first row whose 'Variable' equals ``column_name``.

    Raises
    ------
    IndexError
        If no row of ``df_type`` matches ``column_name``.
    """
    is_match = df_type['Variable'] == column_name
    matching_types = df_type.loc[is_match, 'Type']
    # Only the first match is used when a variable is listed more than once.
    return matching_types.iloc[0]

In [15]:
# Function to get the target variable
def get_target(df, df_type):
    """Return the name of the first column of ``df`` declared as a target.

    A column counts as a target when ``column_type`` reports its type as
    'Flag_Continuous' or 'Flag_Categorical'. Columns are examined in the
    order they appear in ``df``.

    Returns
    -------
    The matching column name, or ``None`` when no column qualifies.
    """
    target_types = ('Flag_Continuous', 'Flag_Categorical')
    for name in df:
        declared = column_type(name, df_type)
        if declared in target_types:
            return name
    return None

In [7]:
# Function to get the columns which are continuous
# Input: the data including variable and column type
# Output: the list of columns which are continuous variables
def get_cont_column(df_type):
    """Collect the variable names whose declared type is 'Continuous'.

    The first column of ``df_type`` is read as the variable name and the
    second column as its type (both accessed by position, not by label).

    Returns
    -------
    list
        First-column values of the rows whose second column equals
        'Continuous', in row order.
    """
    return [
        row[0]
        for row in df_type.itertuples(index=False, name=None)
        if row[1] == 'Continuous'
    ]

In [8]:
# Function to get the best depth, which helps to train the optimal model
# Input: data of each column, data of decision variable
# Output: best depth
def get_best_depth(d_column, d_flag, depth_list=None, cv=3):
    """Pick the tree depth with the highest mean cross-validated ROC AUC.

    Parameters
    ----------
    d_column : 2-D feature data (e.g. a single-column DataFrame).
    d_flag : binary target; a Series, array, or single-column DataFrame.
    depth_list : iterable of int, optional
        Candidate ``max_depth`` values. Defaults to 1..10.
    cv : int, optional
        Number of cross-validation folds (default 3).

    Returns
    -------
    int
        The candidate depth with the largest mean ROC AUC. When several
        depths tie, the shallowest one is returned.

    Raises
    ------
    ValueError
        If ``depth_list`` is empty.
    """
    depth_list = list(range(1, 11)) if depth_list is None else list(depth_list)
    if not depth_list:
        raise ValueError("depth_list must contain at least one depth")

    # scikit-learn expects a 1-D target; flatten a single-column frame.
    target = np.ravel(d_flag)

    score_mean = []  # mean ROC AUC for each candidate depth
    for depth in depth_list:
        tree_model = DecisionTreeClassifier(max_depth=depth)
        # calculate roc_auc value
        score = cross_val_score(tree_model, d_column, target, cv=cv, scoring='roc_auc')
        score_mean.append(np.mean(score))

    # create a dataframe to store depth and roc_auc value
    table = pd.DataFrame({'depth': depth_list, 'roc_auc_mean': score_mean})

    # Sort by score (best first) and break ties by depth (shallowest first) so
    # the result does not depend on the sort algorithm's handling of ties.
    table_sort = table.sort_values(by=['roc_auc_mean', 'depth'], ascending=[False, True])
    best_depth = int(table_sort['depth'].iloc[0])
    print(table_sort)
    print('Best Depth:', best_depth)
    return best_depth


In [26]:
# Function to do supervised binning, based on a single-variable decision tree model
# Input: all data processed in the previous step, the data including variable and column type
# Output: the input data with every continuous column replaced by its bin value
def supervised_binning(df,df_type):
    """Bin every continuous column with a single-variable decision tree.

    For each column listed as 'Continuous' in ``df_type`` a
    ``DecisionTreeClassifier`` is fitted on that column alone against the
    binarised target (1 where the target is non-zero, else 0). The column is
    then replaced by the predicted positive-class probability, so every leaf
    of the tree becomes one bin.

    Parameters
    ----------
    df : DataFrame holding the continuous columns and the target column.
    df_type : DataFrame describing each variable and its type.

    Returns
    -------
    DataFrame
        ``df`` itself. NOTE: the continuous columns of ``df`` are overwritten
        in place, so re-running on the same frame bins already-binned data.

    Raises
    ------
    KeyError
        If no target column is found, or a listed continuous column is
        missing from ``df``.
    """
    # get all continuous variables
    new_df = df[get_cont_column(df_type)]

    # Binarise whichever column is declared as the target instead of assuming
    # it is called 'Claim_Amount'; a 1-D Series is what scikit-learn expects.
    target = get_target(df, df_type)
    if target is None:
        raise KeyError("no 'Flag_Continuous' or 'Flag_Categorical' column found in df")
    d_flag = (df[target] != 0).astype(int)

    num_row = len(new_df)

    for column in new_df.columns.values.tolist():

        # get data of a certain column
        d_column = new_df[[column]]
        # get the number of unique values
        num_unique = len(new_df[column].unique())

        print('')
        print('Column name:', column)

        # select best parameter (max_depth, max_leaf_node):
        # small data sets or low-cardinality columns tune the depth by
        # cross-validation; otherwise cap the number of leaves instead.
        if num_row <= 10000 or num_unique <= 64:
            num_bins = None
            depth = get_best_depth(d_column,d_flag)
            print("do not set 'max_leaf_node'")
        else:
            depth = None
            # max_leaf_nodes must be an integer >= 2; np.sqrt returns a float.
            num_bins = max(2, int(np.sqrt(num_unique)))
            print("do not set 'max_depth'")

        # train optimal single variable model to do supervised binning
        optimal_model = DecisionTreeClassifier(max_depth=depth,max_leaf_nodes=num_bins)
        optimal_model.fit(d_column, d_flag)
        y_pred = optimal_model.predict_proba(d_column)[:,1]
        # In-sample score: computed on the same rows the tree was fitted on.
        score = roc_auc_score(d_flag,y_pred)
        df[column]=y_pred
        print('The number of original unique value (bins):', num_unique)
        print('The number of unique value (bins):', len(df[column].unique()))
        print('The value of each bins:', df[column].unique() )
        print('Roc_Auc value:', score)

    return (df)

In [27]:
# Run supervised binning on the loaded data. The function overwrites the
# continuous columns of `df` in place and also returns the frame, which is
# displayed as this cell's output.
supervised_binning(df,df_type)


Column name: Vehicle
   depth  roc_auc_mean
4      5      0.576985
5      6      0.576985
6      7      0.576985
7      8      0.576985
8      9      0.576985
9     10      0.576985
3      4      0.559208
1      2      0.550903
2      3      0.536333
0      1      0.504728
Best Depth: 5
do not set 'max_leaf_node'
The number of original unique value (bins): 14
The number of unique value (bins): 6
The value of each bins: [0.01423488 0.00818554 0.01140487 0.01030928 0.02617801 0.        ]
Roc_Auc value: 0.5884989429175476

Column name: Var1
   depth  roc_auc_mean
4      5      0.546146
3      4      0.540855
0      1      0.528005
5      6      0.525092
6      7      0.511957
1      2      0.508252
7      8      0.507624
2      3      0.506539
8      9      0.495103
9     10      0.483341
Best Depth: 5
do not set 'max_leaf_node'
The number of original unique value (bins): 217
The number of unique value (bins): 11
The value of each bins: [0.00673401 0.07142857 0.01290323 0.         0.0156

Unnamed: 0,Row_ID,Household_ID,Vehicle,Calendar_Year,Model_Year,Blind_Make,Blind_Model,Blind_Submodel,Cat1,Cat2,...,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
0,1,1,0.014235,2005,2005,K,K.78,K.78.2,D,C,...,0.010797,0.013166,0.000000,0.011858,M,0.010563,0.009319,0.010822,0.010269,0.0
1,2,2,0.008186,2005,2003,Q,Q.22,Q.22.3,B,C,...,0.010797,0.013166,0.009749,0.011858,O,0.010563,0.009319,0.010822,0.010269,0.0
2,3,3,0.011405,2005,1998,AR,AR.41,AR.41.1,B,?,...,0.020661,0.090909,0.036585,0.002304,F,0.010563,0.009319,0.010822,0.010269,0.0
3,4,3,0.011405,2006,1998,AR,AR.41,AR.41.1,B,?,...,0.020661,0.090909,0.036585,0.002304,F,0.010563,0.009319,0.010822,0.010269,0.0
4,5,3,0.008186,2005,2001,D,D.20,D.20.0,J,C,...,0.010797,0.000000,0.000000,0.011858,F,0.010563,0.009319,0.010822,0.010269,0.0
5,6,3,0.008186,2006,2001,D,D.20,D.20.0,J,C,...,0.010797,0.000000,0.000000,0.011858,F,0.010563,0.009319,0.010822,0.010269,0.0
6,7,4,0.011405,2006,2001,AJ,AJ.129,AJ.129.3,G,C,...,0.010797,0.000000,0.000000,0.011858,M,0.010563,0.009319,0.010822,0.010269,0.0
7,8,4,0.008186,2006,2002,AQ,AQ.17,AQ.17.1,B,C,...,0.010797,0.013166,0.009749,0.011858,M,0.010563,0.009319,0.010822,0.010269,0.0
8,9,4,0.014235,2005,2002,AQ,AQ.17,AQ.17.1,B,C,...,0.010797,0.013166,0.009749,0.011858,M,0.010563,0.009319,0.010822,0.010269,0.0
9,10,5,0.011405,2005,1995,BW,BW.3,BW.3.0,D,?,...,0.010797,0.013166,0.000000,0.011858,N,0.008584,0.009319,0.010822,0.010269,0.0
