In [1]:
import pandas as pd
import numpy as np


# Load Data

In [73]:
df = pd.read_csv('../data/raw/Kaggle/train_sample.csv')

In [74]:
df_type = pd.read_csv('../data/raw/Kaggle/column_type.csv')

# qz-Basic stats functions

In [75]:
#function to get number of missing values in a column
def get_na_num(column): #input the whole column
    if column.dtype == np.object:
        return column.isnull().sum() + column[column == ''].count() + column[column == '?'].count()
    else:
        return column.isnull().sum()

In [76]:
#function to get number of valid values in a column
def get_valid_num(column): #input the whole column
    return len(column) - get_na_num(column)

In [77]:
#function to get minimum value in a column
def get_min(column): #input the whole column
    return min(column)

In [78]:
#function to get maximum value in a column
def get_max(column): #input the whole column
    return max(column)

In [79]:
#function to get mean in a column
def get_mean(column): #input the whole column
    return column.mean()

In [80]:
#function to get std in a column
def get_std(column): #input the whole column
    return column.std()

In [81]:
#function to get skewness in a column
def get_skew(column): #input the whole column
    return column.skew()

In [82]:
#function to get number of distict values in a column
def get_distinct_num(column): #input the whole column
    return len(column.unique().tolist())

In [83]:
#function to get count of each distict value in a column
def get_distinct_count(column): #input the whole column
    dis_count={}
    values = column.value_counts().tolist()       
    for i in range(get_distinct_num(column)):
        key = column.value_counts().index[i]
        value = values[i]
        print(key,value)
        dis_count[key] = value
    return dis_count

In [84]:
#function to get median in a column
import statistics
def get_median(column):
    if get_distinct_num(column) > 5:
        print('Number of distict values is larger than 5. We do not calculate median')
    else:
        return statistics.median(column)

In [85]:
#function to get mode and count for the mode in a column
def get_mode(column):
    return (column.mode()[0],column[column==column.mode()[0]].count())

In [86]:
#Function to get target variable
def get_target(df,df_type):
    for c in df:
        if (column_type(c,df_type) == 'Flag_Continuous' or column_type(c,df_type) == 'Flag_Categorical'):
            return(c)

In [87]:
# funtion to get column type
def column_type(column_name,df_type):
    return (df_type.loc[df_type['Variable'] == column_name, 'Type'].iloc[0])

# 1-qz-Basic variable screening and creating stats report

In [88]:
#function to do basic variable screening and create basic statistical report
def Stats_Collection(df,df_type):
    for c in df:
        #exclude Target 
        if (column_type(c,df_type) != 'Flag_Continuous' and column_type(c,df_type) != 'Flag_Categorical'):
            print('Variable name: ',c)

            #Basic variable screening
            if get_na_num(df[c])/len(df[c]) > 0.5:
                print('More 50% missing values, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c,df_type) == 'Delete'):
                print('Column type is Delete, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c,df_type) == 'Continuous') and (get_min(df[c]) == get_max(df[c])):
                print('All same value, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c,df_type) == 'Ordinal' or column_type(c,df_type) == 'Nominal') and (get_mode(df[c])[1]/get_valid_num(df[c]) > 0.95):
                print('Mode contains more than 95% cases, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c,df_type) == 'Nominal') and (get_distinct_num(df[c]) > 100):
                print('More than 100 categories, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue

            #Basic statistic report
            print('Variable type: ', column_type(c,df_type))
            print ('Number of missing values: ',get_na_num(df[c]))
            print ('Number of valid values: ',get_valid_num(df[c]))
            if column_type(c,df_type) == 'Continuous' or column_type(c,df_type) == 'Ordinal':
                print('Minimum value: ', get_min(df[c]))
                print('Maximum value: ', get_max(df[c]))
            if column_type(c,df_type) == 'Continuous':
                print('Mean: ',get_mean(df[c]))
                print('Standard Deviation: ',get_std(df[c]))
                print('Skewness: ',get_skew(df[c]))
                print('Number of distinct values: ',get_distinct_num(df[c]))
                print('Number of cases for each distinct value: ')
                if get_distinct_num(df[c]) > 5:
                    print('Number of distict values is larger than 5. We stop updating the number of distinct values\n')
                else:
                    print(get_distinct_count(df[c]))
            else:
                print('Number of categories: ', get_distinct_num(df[c]))
                print('The counts of each category: ')
                if get_distinct_num(df[c]) > 5:
                    print('Number of distict values is larger than 5. We stop updating the number of distinct values\n')
                else:
                    print(get_distinct_count(df[c]))
                print('Mode: ', get_mode(df[c])[0],'Count: ',get_mode(df[c])[1])                
        print()
    return(df,df_type)

In [89]:
d = Stats_Collection(df,df_type)

Variable name:  Row_ID
Column type is Delete, drop this column

Variable name:  Household_ID
More than 100 categories, drop this column

Variable name:  Vehicle
Variable type:  Continuous
Number of missing values:  0
Number of valid values:  5000
Minimum value:  1
Maximum value:  14
Mean:  2.2438
Standard Deviation:  1.4993369075135676
Skewness:  2.0299344354564077
Number of distinct values:  14
Number of cases for each distinct value: 
Number of distict values is larger than 5. We stop updating the number of distinct values


Variable name:  Calendar_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  2
The counts of each category: 
2005 4026
2006 974
{2005: 4026, 2006: 974}
Mode:  2005 Count:  4026

Variable name:  Model_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  27
The counts of each category: 
Number of distict values is larger than 5. We stop updating the 

## After deleting useless variables, new dataset and new column type dataset are named as new_df and new_df_type

In [90]:
new_df_type = d[1]

In [91]:
new_df = d[0]

# 2-hz&tcz-Outliers handling

In [92]:
new_df.columns

Index(['Vehicle', 'Calendar_Year', 'Model_Year', 'Blind_Make', 'Cat1', 'Cat2',
       'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat',
       'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'NVCat',
       'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Claim_Amount'],
      dtype='object')

In [93]:
# function to identify outliers in continuous variables
# returns a bool panda series, a lower cutoff value and an upper cutoff value(used in outlier handling)
def outlier_identification(column):
    ori_mean = get_mean(column.dropna())
    ori_std = get_std(column.dropna())
    N_i = []
    X_i = []
    M_i = []
    for i in range(-3, 5):
        lower = ori_mean + (i - 1) * ori_std if i != -3 else -float('inf')
        upper = ori_mean + i * ori_std if i != 4 else float('inf')
        temp1 = column[(column <= upper) & (column > lower)]
        N_i.append(len(temp1))
        X_i.append(get_mean(temp1))
        M_i.append(np.var(temp1) * len(temp1))
    l = -3
    r = 4
    p = 0
    while 1:
        if N_i[l + 3] <= N_i[r + 3]:
            p_current = N_i[l + 3] / get_valid_num(column)
            if p + p_current < 0.05:
                l = l + 1
                p = p + p_current
            else:
                break
        else:
            p_current = N_i[r + 3] / get_valid_num(column)
            if p + p_current < 0.05:
                r = r - 1
                p = p + p_current
            else:
                break
    lower = ori_mean + (l - 1) * ori_std if l != -3 else -float('inf')
    upper = ori_mean + r * ori_std if r != 4 else float('inf')
    temp1 = column[(column <= upper) & (column > lower)]
    x_robust = get_mean(temp1)
    M_robust = 0
    N_sum = 0
    for i in range(l, r + 1):
        A_i = M_i[i + 3] + N_i[i + 3] * (x_robust - X_i[i + 3])**2
        M_robust = M_robust + A_i
        N_sum = N_sum + N_i[i + 3]
    sd_robust = np.sqrt(M_robust / (N_sum - 1))
    result = (column - x_robust < -3 * sd_robust) | (column + x_robust > 3 * sd_robust)
    lower_cutoff_value = x_robust - 3 * sd_robust
    upper_cutoff_value = x_robust + 3 * sd_robust
    return(result, lower_cutoff_value, upper_cutoff_value)
outlier_identification(new_df['Var1'])

(0       False
 1       False
 2       False
 3       False
 4       False
 5       False
 6       False
 7       False
 8       False
 9       False
 10      False
 11      False
 12      False
 13      False
 14      False
 15      False
 16      False
 17      False
 18      False
 19      False
 20      False
 21      False
 22      False
 23      False
 24      False
 25      False
 26      False
 27      False
 28      False
 29      False
         ...  
 4970    False
 4971    False
 4972    False
 4973    False
 4974    False
 4975    False
 4976    False
 4977    False
 4978    False
 4979    False
 4980    False
 4981    False
 4982    False
 4983    False
 4984    False
 4985    False
 4986    False
 4987    False
 4988     True
 4989     True
 4990    False
 4991    False
 4992    False
 4993    False
 4994    False
 4995    False
 4996    False
 4997     True
 4998    False
 4999    False
 Name: Var1, Length: 5000, dtype: bool, -2.827643758741096, 3.1088128074921366)

In [94]:
# function to trim outliers to cutoff values
def outlier_trim(df,column):
    (flag, lower, upper) = outlier_identification(column)
    df.loc[flag, column.name] = pd.Series(map(lambda x : (lower if x <= lower else upper), column[flag]))

In [95]:
# function to set outliers to missing values
def outlier_toNone(df,column):
    (flag, lower, upper) = outlier_identification(column)
    df.loc[flag, column.name] = None

In [96]:
new_df.columns

Index(['Vehicle', 'Calendar_Year', 'Model_Year', 'Blind_Make', 'Cat1', 'Cat2',
       'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat',
       'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'NVCat',
       'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Claim_Amount'],
      dtype='object')

In [97]:
print(sum(new_df['Var1'] > 3.1088128074921366))
print(sum(new_df['Var1'] < -2.827643758741096))
for c in new_df:
    if column_type(c,new_df_type) == 'Continuous':
        outlier_trim(new_df,new_df[c])
print(sum(new_df['Var1'] > 3.1088128074921366))
print(sum(new_df['Var1'] < -2.827643758741096))

124
0
0
0


# 3-wyx-Missing values handling

In [98]:
# function to fill missing value and update statistic
def fill_missing_value(mydata, column_type):
    # filling missing value and updata statistic
    column_list = mydata.columns.values.tolist()
    typelist = list(column_type.iloc[:,1])
    i = 0
    for typ in typelist:
        column_data = mydata[column_list[i]].dropna()
        if (typ == 'Continuous'):
            mean_value = column_data.mean() # calculate mean
            mydata[column_list[i]] = mydata[column_list[i]].fillna(mean_value) # fill missing value with mean
            cont_sd = mydata[column_list[i]].std() # calculate standard deviation
            cont_skew = mydata[column_list[i]].skew() # calculate skewness
            print('')
            print('Column:', column_list[i])
            print('Column type: continuous')
            print('Mean:', mean_value)
            print('Standard deviation:', cont_sd)
            print('Skewness:', cont_skew)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        elif (typ == 'Ordinal' and mydata[column_list[i]].dtype != 'object'):
            num_median = column_data.median() # calculate median 
            mydata[column_list[i]] = mydata[column_list[i]].fillna(num_median) # fill missing value with median
            count_median_num = mydata[column_list[i]][mydata[column_list[i]] == num_median].count() # count the the number of cases in the median category
            print('')
            print('Column:', column_list[i])
            print('Column type: num_ordinal')
            print('Median:', num_median)
            print('The number of cases in the median category:', count_median_num)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        elif (typ == 'Ordinal' and mydata[column_list[i]].dtype == 'object'):
            mode_value = column_data.mode()[0] # calculate mode
            mydata[column_list[i]] = mydata[column_list[i]].fillna(mode_value) # fill missing valye with mode
            count_mode = mydata[column_list[i]][mydata[column_list[i]] == mode_value].count() # count the the number of cases in the modal category
            print('')
            print('Column:', column_list[i])
            print('Column type: cat_ordinal')
            print('Mode:', mode_value)
            print('The number of cases in the modal category:', count_mode)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        else:
            mode_value = column_data.mode()[0] # calculate mode
            mydata[column_list[i]] = mydata[column_list[i]].fillna(mode_value) # fill missing valye with mode
            count_mode = mydata[column_list[i]][mydata[column_list[i]] == mode_value].count() # count the the number of cases in the modal category
            print('')
            print('Column:', column_list[i])
            print('Column type: nominal')
            print('Mode:', mode_value)
            print('The number of cases in the modal category:', count_mode)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        i = i + 1
    # add column type at the last row
    print('add column type at the last row:')
    return (mydata)

In [99]:
for c in new_df:
    print(get_na_num(new_df[c]))

1180
0
0
1
22
1747
2
22
0
0
0
5
5
5
127
9
13
11
93
36
0
144
0
0
0
0
0
0


In [100]:
new_df = new_df.replace('?',np.NaN)
fill_missing_value(new_df,new_df_type)


Column: Vehicle
Column type: continuous
Mean: 3.1836131972203994
Standard deviation: 1.9279074114234207
Skewness: 0.02276959772090265
The number of missing values: 0
The number of valid values: 5000

Column: Calendar_Year
Column type: nominal
Mode: 2005
The number of cases in the modal category: 4026
The number of missing values: 0
The number of valid values: 5000

Column: Model_Year
Column type: nominal
Mode: 2002
The number of cases in the modal category: 506
The number of missing values: 0
The number of valid values: 5000

Column: Blind_Make
Column type: nominal
Mode: AJ
The number of cases in the modal category: 858
The number of missing values: 0
The number of valid values: 5000

Column: Cat1
Column type: nominal
Mode: D
The number of cases in the modal category: 1352
The number of missing values: 0
The number of valid values: 5000

Column: Cat2
Column type: nominal
Mode: C
The number of cases in the modal category: 4138
The number of missing values: 0
The number of valid values:

Unnamed: 0,Vehicle,Calendar_Year,Model_Year,Blind_Make,Cat1,Cat2,Cat3,Cat6,Cat8,Cat9,...,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
0,5.411107,2005,2005,K,D,C,F,C,C,A,...,1.008912,0.261040,0.907793,-0.077998,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
1,5.411107,2005,2003,Q,B,C,A,E,A,B,...,1.240851,0.432987,-0.726459,0.204785,O,-0.231530,-0.266117,-0.272337,-0.251419,0.0
2,1.000000,2005,1998,AR,B,C,A,C,A,B,...,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
3,1.000000,2006,1998,AR,B,C,A,C,A,B,...,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
4,5.411107,2005,2001,D,J,C,B,D,A,B,...,0.812656,2.112691,1.534462,2.347260,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
5,5.411107,2006,2001,D,J,C,B,D,A,B,...,0.812656,2.112691,1.534462,2.347260,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
6,1.000000,2006,2001,AJ,G,C,A,E,A,B,...,0.580718,0.551128,0.416289,-0.024395,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
7,5.411107,2006,2002,AQ,B,C,B,D,B,A,...,0.527193,-0.023200,-0.701884,0.226664,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
8,5.411107,2005,2002,AQ,B,C,B,D,B,A,...,0.527193,-0.023200,-0.701884,0.226664,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
9,1.000000,2005,1995,BW,D,C,E,D,A,B,...,0.176312,0.283264,0.969232,-0.792339,N,2.054683,-0.266117,-0.272337,-0.251419,0.0


In [101]:
for c in new_df:
    print(get_na_num(new_df[c]))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [102]:
new_df.columns

Index(['Vehicle', 'Calendar_Year', 'Model_Year', 'Blind_Make', 'Cat1', 'Cat2',
       'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat',
       'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'NVCat',
       'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Claim_Amount'],
      dtype='object')

# 4-tcz-Continuous variable transformation (z-score or min-max)

In [103]:
from scipy import stats
import sklearn
from sklearn import preprocessing

In [104]:
#function to do z-score transformation of a column
def zscore(df, column,df_type):
    if column_type(column,df_type) == 'Continuous':
        return(stats.zscore(df[column]))
    else:
        return(df[column])

In [105]:
 #function to do min-max transformation of a column
def minmax(df, column,df_type):
    if column_type(column,df_type) == 'Continuous':
        return(preprocessing.minmax_scale(df[column], feature_range=(0, 100)))
    else:
        return(df[column])

In [106]:
# zscore for continuous predictor
#check on Var1
print(new_df['Var1'])
get_mean(new_df['Var1'])
get_std(new_df['Var1'])

for c in new_df:
    new_df[c] = zscore(new_df, c,new_df_type)

#check on Var1 result
print(new_df['Var1'])
get_mean(new_df['Var1'])
get_std(new_df['Var1'])

0       1.270533
1       0.217951
2      -0.754282
3      -0.754282
4       0.563454
5       0.563454
6       0.700049
7       0.017076
8       0.017076
9       0.442930
10      0.442930
11      0.322405
12      0.322405
13     -0.633757
14     -0.633757
15     -0.376638
16     -0.376638
17      0.322405
18      0.322405
19      0.322405
20      1.849051
21     -0.681967
22     -0.175763
23     -0.537337
24     -0.537337
25     -0.931051
26      0.017076
27      0.017076
28     -0.408778
29     -0.408778
          ...   
4970    0.491140
4971    1.447302
4972    1.447302
4973   -0.633757
4974   -0.633757
4975    1.366952
4976    1.366952
4977    0.563454
4978    0.563454
4979    2.130275
4980    2.130275
4981   -0.714107
4982    1.447302
4983    1.447302
4984   -1.011401
4985   -1.011401
4986   -1.027471
4987   -1.027471
4988    0.170694
4989    0.170694
4990   -0.400743
4991   -0.400743
4992   -0.031134
4993   -0.031134
4994   -0.119519
4995    0.442930
4996    0.442930
4997    0.1706

1.0001000150025041

# 5-tcz-target-handling

# 6-jbl-Categorical-variable-handling(Reorder and Supervised Merged)

In [107]:
# Function to sort the Series by value and then by index(Lexical order)
def sort_data(Series):
    return Series.iloc[np.lexsort([Series.index, Series.values])]

In [108]:
# Function to supervised merged categories in categorical variables
from CHAID import Tree
import re

# df = dataset, Predictor_type = Nominal or Ordinal, dependent_variable_name = target name, indep_column_num = column index
def Supervised_Merged (df, Predictor_type, dependent_variable_name, indep_column_num, Categorical = True):
    
    
    # Get the names of Independent and Dependent variables
    independent_variable_column = [df.columns[indep_column_num]]
    dep_variable = dependent_variable_name
    
    # Check for Target variable type to decide which CHAID TREE to implement
    if Categorical == True:
        
        # fit the Chaid tree model to supervised merged the categories in category predictor
        tree = Tree.from_pandas_df(df, dict(zip(independent_variable_column, [Predictor_type] *1)), 
                                   dep_variable, max_depth = 1)
        
    else:
        
        # Convert the target variable to numeric  
        df[dependent_variable_name] = pd.to_numeric(df[dependent_variable_name],errors='coerce')
        
        # fit the Chaid tree model to supervised merged the categories in category predictor
        tree = Tree.from_pandas_df(df, dict(zip(independent_variable_column, [Predictor_type] * 1)), 
                                   dep_variable, dep_variable_type='continuous',max_depth = 1)
    
    # Print the fitted tree
    print('The CHAID TREE is presented below:')
    print('')
    tree.print_tree()

    # Get the merged categoriess string from the tree
    Merged_group = tree.tree_store[0].split.groupings.split('],')
    # Get numbers of merged caegroeis
    length_Merged_group = np.arange(0,len(Merged_group))
    
    if len(Merged_group) >= 2: 
        
        # Etract the number from the string 
        New_Merged_Categories = {}
        for i in length_Merged_group:
            group = list(map(int, re.findall(r'\d+',Merged_group[i])))
            New_Merged_Categories[i] = group 
        print('The P-Values of this node is',tree.tree_store[0].split.p)
        print('The new categories are:' )
        print(New_Merged_Categories)
        print('')
        
        # Convert the dict_format to match the previous dic
        # For example: new_merged: {0:[1,2,3,4,5],1:[6,7,8],2:[0,9]}
        #              map_dict: {0:2, 1:0, 2:0, 3:0, 4:0, 5:0, 6:1, 7:1, 8:1, 9:2}               
        new_dict={}
        length_New_Merged = np.arange(0,len(New_Merged_Categories))
        for j in length_New_Merged:
            values = New_Merged_Categories.get(j) 
            for k in np.arange(0,len(values)):
                new_dict[values[k]]=j
    else:
        print('The P-Values of this node is',tree.tree_store[0].split.p)
        print('The P-values is too large.')
        print('There is no categories can be merged in this variables.')
        print('')
        new_dict={}
    return new_dict

In [109]:
# Function to Rearrange categories and Supervised Merged for Categorical Predictors
# dataset = original dataset, column_type = dataset includes the columns type, dep_variable_name = target name.
def Reorder_Categories (dataset,column_type):
    dep_variable_name = get_target(dataset,column_type)
    
    # Get the target column index
    T_colnumber = dataset.columns.get_loc(dep_variable_name)
    
    # Get the type of Target column
    Flag_type = 'Flag_Continuous'
    Flag_type1 = Flag_type in column_type.iloc[:,1].values
    
    # Get the row and column number of Dataset
    n_columns=np.arange(0,len(dataset.columns),1)
    length_data = len(dataset)-1
    
    
    # Loop through all columns 
    for i in n_columns:
        
        Predictor_type = column_type.iloc[i,1]
        
        # Check the type of Categorical predictor
        if Predictor_type == 'Nominal':
            
            Pre_type = 'nominal'
            
            # Get the total counts of each category in each column
            Column_name = dataset.columns[i]
            Count_Each_Level = dataset.iloc[:length_data-1,i].value_counts()
            
            # Sort the categories  
            Count_Each_Level = sort_data(Count_Each_Level)
            print('Column name:',Column_name.upper())
            print(Count_Each_Level.to_frame())


            # Assign each category a number, starting from 0 to N, by counts.
            n_distinct = np.arange(0,len(Count_Each_Level),1)
            dict_Level={}
            for j in n_distinct:
                Level_name = Count_Each_Level.index[j]
                dict_Level[Level_name]= j 


            print('Reorder Categories :')
            print(dict_Level)
            print('')

            # Substitute orignal Categories to number
            dataset[Column_name] = dataset[Column_name].map(dict_Level)


            # Supervised Merged
            print('Supervised Merged:')
            
            New_Categories ={}
            if T_colnumber != i:
                
                # Check if target is Categorical or Continuous
                if Flag_type1 == True:
                    New_Categories = Supervised_Merged(dataset, Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i, Categorical = False)
                else:
                    New_Categories = Supervised_Merged(dataset, Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i)
            else:
                print('')
                print('This is the Target column.')
                
            # Check if there the New_Categories is empty set
            if len(New_Categories) != 0:
                dataset[Column_name] = dataset[Column_name].map(New_Categories)
                
            print('------------------------------------------------------------------------------------------------------------------')
            print('')
            
         
        # If the Predictor type is Ordinal
        if Predictor_type == 'Ordinal':
            
            Column_name = dataset.columns[i]
            Pre_type ='ordinal'
            
            New_Categories ={}
            
            # Check if target is Categorical or Continuous
            if Flag_type1 == True:
                New_Categories = Supervised_Merged(dataset, Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i, Categorical = False)
            else:
                New_Categories = Supervised_Merged(dataset,Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i)
                
            if len(New_Categories) != 0:
                    dataset[Column_name] = dataset[Column_name].map(New_Categories)
            
    return dataset

In [110]:
new_df = Reorder_Categories(new_df,new_df_type)

Column name: CALENDAR_YEAR
      Calendar_Year
2006            973
2005           4025
Reorder Categories :
{2006: 0, 2005: 1}

Supervised Merged:
The CHAID TREE is presented below:

([], {'mean': 1.1324685878199998, 's.t.d': 28.210900917931}, <Invalid Chaid Split> - splitting would create nodes with less than the minimum child node size)

The P-Values of this node is None
The P-values is too large.
There is no categories can be merged in this variables.

------------------------------------------------------------------------------------------------------------------

Column name: MODEL_YEAR
      Model_Year
1982           7
2007           9
1983          13
1981          16
1984          25
1987          28
1986          33
1985          35
1988          50
1989          59
1990          65
1992          95
1991          97
2006         105
1993         138
1994         169
1996         198
1995         208
1998         269
1997         293
1999         367
2005         377
2000     


invalid value encountered in double_scalars



([], {'mean': 1.1324685878199998, 's.t.d': 28.210900917931}, (Model_Year, p=5.241546592811847e-06, score=20.792136682582637, groups=[[0, 1, 2, 3, 4, 7, 12, 23, 26, 20, 8, 9, 10, 11, 17, 22, 14, 13, 18, 6, 19, 21, 15, 24, 25], [5, 16]]), dof=4998))
|-- ([0, 1, 2, 3, 4, 7, 12, 23, 26, 20, 8, 9, 10, 11, 17, 22, 14, 13, 18, 6, 19, 21, 15, 24, 25], {'mean': 0.7373932463552577, 's.t.d': 16.847092670367292}, <Invalid Chaid Split> - the max depth has been reached)
+-- ([5, 16], {'mean': 9.477998146017699, 's.t.d': 107.41967106142823}, <Invalid Chaid Split> - the max depth has been reached)

The P-Values of this node is 5.241546592811847e-06
The new categories are:
{0: [0, 1, 2, 3, 4, 7, 12, 23, 26, 20, 8, 9, 10, 11, 17, 22, 14, 13, 18, 6, 19, 21, 15, 24, 25], 1: [5, 16]}

------------------------------------------------------------------------------------------------------------------

Column name: BLIND_MAKE
    Blind_Make
CB           1
E            1
U            1
BD           3
AW        

([], {'mean': 1.1324685878199998, 's.t.d': 28.210900917931}, <Invalid Chaid Split> - splitting would create nodes with less than the minimum child node size)

The P-Values of this node is None
The P-values is too large.
There is no categories can be merged in this variables.

Column name: NVCAT
   NVCat
D      7
I     55
C     62
G     77
B     84
A    101
E    127
K    134
H    160
F    199
L    305
J    323
O    615
N    730
M   2019
Reorder Categories :
{'D': 0, 'I': 1, 'C': 2, 'G': 3, 'B': 4, 'A': 5, 'E': 6, 'K': 7, 'H': 8, 'F': 9, 'L': 10, 'J': 11, 'O': 12, 'N': 13, 'M': 14}

Supervised Merged:
The CHAID TREE is presented below:

([], {'mean': 1.1324685878199998, 's.t.d': 28.210900917931}, (NVCat, p=0.006536703028126455, score=7.402272238386896, groups=[[0, 1, 4, 14, 7, 5, 6, 8, 11, 12, 9, 13, 10], [2, 3]]), dof=4998))
|-- ([0, 1, 4, 14, 7, 5, 6, 8, 11, 12, 9, 13, 10], {'mean': 0.9490156889734622, 's.t.d': 26.9028274132576}, <Invalid Chaid Split> - the max depth has been reached)


In [111]:
new_df.columns

Index(['Vehicle', 'Calendar_Year', 'Model_Year', 'Blind_Make', 'Cat1', 'Cat2',
       'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat',
       'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'NVCat',
       'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4', 'Claim_Amount'],
      dtype='object')

# hz-p-value calculation

In [112]:
def p_value_continuous(column1,column2):
    column2=pd.to_numeric(column2,errors='ignore')
    F = np.var(column1) / np.var(column2)
    #degree of freedom
    df1 = len(column1) - 1
    df2 = len(column2) - 1
    p_value = round(stats.f.sf(F, df1, df2),5)
    return p_value

In [113]:
def p_value_target_predictor(target,column,df_type):
    target_name = target.name
    if column_type(target_name,df_type)=="Flag_Continuous":
        return (p_value_continuous(target,column))
    else:
        contingency = pd.crosstab(target, column)
        c, p, dof, expected=stats.chi2_contingency(contingency)
        return p

# 7.1-wyx-Continuous_variable_handling(for categorical target):supervised binning

In [69]:
#Function to get all continuous variables. Preparing for PCA
#Input: a record dataset and a column type dataset after predictors handling
#Output: a dataset contains only continuous variables
def get_continuous_variables(new_df,new_df_type):
    continuous_predictor_name = []
    for c in new_df:get_continuous_variables
        if column_type(c,new_df_type) == "Continuous":
            continuous_predictor_name.append(c)
    return (new_df[continuous_predictor_name])

In [70]:
get_cont_column(new_df_type)

['Vehicle',
 'Var1',
 'Var2',
 'Var3',
 'Var4',
 'Var5',
 'Var6',
 'Var7',
 'Var8',
 'NVVar1',
 'NVVar2',
 'NVVar3',
 'NVVar4']

In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [60]:
# Function to get best depth which help to train optimal model
# Input: data of each column, data of decision varaiable
# Output: best depth
def get_best_depth(d_column, d_flag):
    score_mean = [] # here I will store the roc auc
    depth_list = [1,2,3,4,5,6,7,8,9,10]
    
    for depth in depth_list:
        tree_model = DecisionTreeClassifier(max_depth=depth)
        # calculate roc_auc value
        score = cross_val_score(tree_model, d_column, d_flag, cv=3, scoring='roc_auc')    
        score_mean.append(np.mean(score))
    
    # create a dataframe to store depth and roc_auc value
    table = pd.concat([pd.Series(depth_list), pd.Series(score_mean)], axis=1)
    table.columns = ['depth', 'roc_auc_mean']
    
    # get best depth
    table_sort = table.sort_values(by='roc_auc_mean', ascending=False) 
    best_depth = table_sort.iloc[0,0] # get depth which lead ot the largest roc_auc
    print(table_sort)
    print('Best Depth:',best_depth)
    return (best_depth)

In [71]:
# Function to do supervised binning, based single variable decision tree model
# Input: all data processed in the previous step，the data including variable and column type
# Output: new data file
def supervised_binning(df,df_type):
    
    # get all continuous variable
    new_df = get_continuous_variables(df,df_type)
    # get target
    d_flag = df[[get_target(df,df_type)]] # get data of target

#     d_flag.loc[d_flag['Claim_Amount'] != 0] = 1
    column_list = new_df.columns.values.tolist()
    num_row = len(new_df)

    i = 0
    for column in column_list:
        
        # get data of a certain column
        d_column = new_df[[column]]
        
        num_unique = len(new_df[column].unique())
        
        # select best parameter (max_depth, max_leaf_node) 
        if num_row <= 10000:
            num_bins = None
            depth = get_best_depth(d_column,d_flag)
            print("do not set 'max_leaf_node'")
        elif (num_row >= 10000 and num_unique <=64):
            num_bins = None
            depth = get_best_depth(d_column,d_flag)
            print("do not set 'max_leaf_node'")
        else:
            depth = None
            num_bins = np.sqrt(num_unique)
            print("do not set 'max_depth'")
         
        # train optimal single variable to do supervised binning
        optimal_model = DecisionTreeClassifier(max_depth=depth,max_leaf_nodes=num_bins)
        optimal_model.fit(d_column, d_flag)
        y_pred = optimal_model.predict_proba(d_column)[:,1]
        score = roc_auc_score(d_flag,y_pred)
        df[column]=y_pred
        print('Column name:', column)
        print('The number of original unique value (bins):', num_unique)
        print('The number of unique value (bins):', len(df[column].unique()))
        print('The value of each bins:', df[column].unique() )
        print('Roc_Auc value:', score)
        i=i+1
    return (df)

In [72]:
new_df1 = new_df.iloc[0:3000,]
supervised_binning(new_df1,new_df_type)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
0      1      0.440512
1      2      0.440512
2      3      0.440512
3      4      0.440512
4      5      0.440512
5      6      0.440512
6      7      0.440512
7      8      0.440512
8      9      0.440512
9     10      0.440512
Best Depth: 1
do not set 'max_leaf_node'
Column name: Vehicle
The number of original unique value (bins): 2
The number of unique value (bins): 2
The value of each bins: [0.01135749 0.01042572]
Roc_Auc value: 0.5101265434935809




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
1      2      0.600193
2      3      0.600193
3      4      0.600193
4      5      0.600193
5      6      0.600193
6      7      0.600193
7      8      0.600193
8      9      0.600193
9     10      0.600193
0      1      0.562751
Best Depth: 2
do not set 'max_leaf_node'
Column name: Var1
The number of original unique value (bins): 3
The number of unique value (bins): 3
The value of each bins: [0.01059495 0.02651515 0.        ]
Roc_Auc value: 0.6001930324478353




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
2      3      0.825275
1      2      0.780035
3      4      0.734228
4      5      0.715446
5      6      0.678003
6      7      0.666896
7      8      0.666743
8      9      0.666743
9     10      0.666743
0      1      0.593728
Best Depth: 3
do not set 'max_leaf_node'
Column name: Var2
The number of original unique value (bins): 18
The number of unique value (bins): 7
The value of each bins: [0.01826484 0.03133159 0.         0.07246377 0.00860215 0.1
 0.33333333]
Roc_Auc value: 0.8631001623923767




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
2      3      0.573153
1      2      0.552333
3      4      0.549315
4      5      0.523179
5      6      0.516346
6      7      0.516346
7      8      0.516346
8      9      0.516346
9     10      0.516346
0      1      0.512792
Best Depth: 3
do not set 'max_leaf_node'
Column name: Var3
The number of original unique value (bins): 8
The number of unique value (bins): 4
The value of each bins: [0.01163242 0.         0.05       0.5       ]
Roc_Auc value: 0.5971290253393389




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
2      3      0.745871
3      4      0.701504
4      5      0.630986
1      2      0.605877
5      6      0.602215
6      7      0.599335
7      8      0.599335
8      9      0.599335
9     10      0.599335
0      1      0.517664
Best Depth: 3
do not set 'max_leaf_node'
Column name: Var4
The number of original unique value (bins): 16
The number of unique value (bins): 5
The value of each bins: [0.01626016 0.03867403 0.00346706 0.33333333 1.        ]
Roc_Auc value: 0.7679831683876174




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
3      4      0.846141
2      3      0.836520
5      6      0.778089
4      5      0.771348
6      7      0.755416
7      8      0.755416
8      9      0.755416
9     10      0.755416
1      2      0.714603
0      1      0.625578
Best Depth: 4
do not set 'max_leaf_node'
Column name: Var5
The number of original unique value (bins): 16
The number of unique value (bins): 11
The value of each bins: [0.         0.01515152 0.01076555 0.02352941 0.2        0.0952381
 0.03571429 0.06818182 0.14285714 0.33333333 0.125     ]
Roc_Auc value: 0.8630746289998059




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
2      3      0.832414
3      4      0.799139
4      5      0.785106
5      6      0.692358
1      2      0.691684
6      7      0.671401
7      8      0.671401
8      9      0.671401
9     10      0.671401
0      1      0.571376
Best Depth: 3
do not set 'max_leaf_node'
Column name: Var6
The number of original unique value (bins): 17
The number of unique value (bins): 7
The value of each bins: [0.         0.05263158 0.01143947 0.26315789 0.11864407 1.
 0.5       ]
Roc_Auc value: 0.9078346661764255




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
1      2      0.654962
3      4      0.602981
4      5      0.600239
5      6      0.600239
6      7      0.600239
7      8      0.600239
8      9      0.600239
9     10      0.600239
2      3      0.596869
0      1      0.538698
Best Depth: 2
do not set 'max_leaf_node'
Column name: Var7
The number of original unique value (bins): 7
The number of unique value (bins): 4
The value of each bins: [0.0076746  0.02747253 0.09090909 0.125     ]
Roc_Auc value: 0.6368385574654534




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
2      3      0.875785
3      4      0.846876
4      5      0.832659
1      2      0.821751
5      6      0.819637
6      7      0.817753
7      8      0.817753
8      9      0.817753
9     10      0.817753
0      1      0.677437
Best Depth: 3
do not set 'max_leaf_node'
Column name: Var8
The number of original unique value (bins): 16
The number of unique value (bins): 7
The value of each bins: [0.00219619 0.09722222 0.19230769 0.02684564 0.5        0.33333333
 1.        ]
Roc_Auc value: 0.8889399556740306




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
0      1      0.494776
1      2      0.467184
2      3      0.467184
3      4      0.467184
4      5      0.467184
5      6      0.467184
6      7      0.467184
7      8      0.467184
8      9      0.467184
9     10      0.467184
Best Depth: 1
do not set 'max_leaf_node'
Column name: NVVar1
The number of original unique value (bins): 3
The number of unique value (bins): 2
The value of each bins: [0.01034126 0.03030303]
Roc_Auc value: 0.529276587921684




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
0      1      0.616953
1      2      0.616953
2      3      0.616953
3      4      0.616953
4      5      0.616953
5      6      0.616953
6      7      0.616953
7      8      0.616953
8      9      0.616953
9     10      0.616953
Best Depth: 1
do not set 'max_leaf_node'
Column name: NVVar2
The number of original unique value (bins): 2
The number of unique value (bins): 2
The value of each bins: [0.0081663  0.03594771]
Roc_Auc value: 0.6169531513313111




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
0      1           0.5
1      2           0.5
2      3           0.5
3      4           0.5
4      5           0.5
5      6           0.5
6      7           0.5
7      8           0.5
8      9           0.5
9     10           0.5
Best Depth: 1
do not set 'max_leaf_node'
Column name: NVVar3
The number of original unique value (bins): 2
The number of unique value (bins): 2
The value of each bins: [0.01067734 0.33333333]
Roc_Auc value: 0.5148144743695805




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



   depth  roc_auc_mean
0      1       0.52036
1      2       0.52036
2      3       0.52036
3      4       0.52036
4      5       0.52036
5      6       0.52036
6      7       0.52036
7      8       0.52036
8      9       0.52036
9     10       0.52036
Best Depth: 1
do not set 'max_leaf_node'
Column name: NVVar4
The number of original unique value (bins): 2
The number of unique value (bins): 2
The value of each bins: [0.01054781 0.03278689]
Roc_Auc value: 0.5203603272359592




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,Vehicle,Calendar_Year,Model_Year,Blind_Make,Cat1,Cat2,Cat3,Cat6,Cat8,Cat9,...,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
0,0.011357,1,0,0,9,2,2,2,0,0,...,0.000000,0.000000,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
1,0.011357,1,0,0,8,2,5,1,1,1,...,0.000000,0.000000,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
2,0.010426,1,0,1,8,2,5,2,1,1,...,0.015152,0.052632,0.027473,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
3,0.010426,0,0,1,8,2,5,2,1,1,...,0.015152,0.052632,0.027473,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
4,0.011357,1,0,0,2,2,4,4,1,1,...,0.000000,0.011439,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
5,0.011357,0,0,0,2,2,4,4,1,1,...,0.000000,0.011439,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
6,0.010426,0,0,1,3,2,5,1,1,1,...,0.000000,0.000000,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
7,0.011357,0,0,0,8,2,4,4,1,0,...,0.015152,0.000000,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
8,0.011357,1,0,0,8,2,4,4,1,0,...,0.015152,0.000000,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000
9,0.010426,1,0,0,9,2,1,4,1,1,...,0.000000,0.000000,0.007675,0.002196,0,0.010341,0.008166,0.010677,0.010548,0.00000


# 7.2.1-qz-Continuous_variable_selection(for continuous target)

In [114]:
#function to get all continuous variables. Preparing for PCA
#input: a record dataset and a column type dataset after predictors handling
#output: a dataset contains only continuous variables
def get_continuous_variables(new_df,new_df_type):
    continuous_predictor_name = []
    for c in new_df:
        if column_type(c,new_df_type) == "Continuous":
            continuous_predictor_name.append(c)
    return (new_df[continuous_predictor_name])

In [115]:
#function to delete non highly correlated features with target
def continuous_selection(new_df,new_df_type):
    target_name = get_target(new_df,new_df_type)
    target = new_df[target_name]
    df_continuous_predictors = get_continuous_variables(new_df,new_df_type)
    for c in df_continuous_predictors:
        if p_value_target_predictor(target,df_continuous_predictors[c],new_df_type) > 0.05:
            df_continuous_predictors = df_continuous_predictors.drop(columns = c)
            new_df = new_df.drop(columns = c)
            new_df_type = new_df_type.drop(index=int(new_df_type[new_df_type.Variable == c].index[0]))
    return (df_continuous_predictors,new_df,new_df_type)

In [116]:
#function to get correlation between a feature and a group
def get_corr_group(variable_index,group_list,new_continuous_predictors):
    corr_list = []
    for i in group_list:
        matrix = np.corrcoef(new_continuous_predictors.iloc[:,variable_index],new_continuous_predictors.iloc[:,i])
        corr_list.append(abs(matrix[0,1]))
    return min(corr_list)

In [117]:
#function to get grouped feature index. 
#input: a datset of all continuous variables. 
#output: a list of names of grouped features.
def get_grouped_features(new_continuous_predictors):
    #correlation matrix
    corre = new_continuous_predictors.corr()
    #triangular matrix
    tri_corre = np.triu(corre)
    #changed to absolute values
    tri_corre = np.absolute(tri_corre)
    #changed correlation to 0 for correlation of feature itself
    for i in range(len(tri_corre)):
        tri_corre[i,i] = 0
    groups = []
    alpha = 0.9
    while alpha > 0.1:
        #get first pair in a group
        if (np.amax(tri_corre) > alpha):
            group = list(np.unravel_index(np.argmax(tri_corre), (len(tri_corre),len(tri_corre))))

            #max features in a group is 5
            while len(group) <= 5:
                #get the next correlated feature to group
                group_var_corr = {}
                for i in range(new_continuous_predictors.shape[1]):
                    if i in group:
                        continue
                    else:
                        group_var_corr[i] = get_corr_group(i,group,new_continuous_predictors)
                best_var_index = max(group_var_corr, key=group_var_corr.get)
                if group_var_corr[best_var_index] > alpha:
                    #add feature i to the group
                    group.append(i)

                else:
                    #remove correlations of grouped features
                    group_name = []
                    for i in group:
                        tri_corre[i] = 0
                        tri_corre[:, i] = 0
                        group_name.append(new_continuous_predictors.iloc[:,i].name)                
                    groups.append(group_name)
                    break

        else:
            alpha -= 0.1
    return(groups)


# 7.2.2-qz-continuous-variable-construction-PCA

In [118]:
#function to get continuous features after PCA
#input: a dataset contains only continuous features with high correlation with target and the new_df from previous steps
#output: a dataset contains only continuous features after PCA 
from sklearn.decomposition import PCA
def get_continuous_after_pca(new_continuous_predictors,new_df,new_df_type):
    pca = PCA(n_components=1)
    groups = get_grouped_features(new_continuous_predictors)
    for group in groups:
        #get new feature name
        for i in range(len(group)):
            if i == 0:
                new_pca = str(group[i])
            else:
                new_pca = new_pca + '_' + str(group[i])
        df_pca = new_continuous_predictors[group]
        pca.fit(df_pca)
        X_pca=pca.transform(df_pca) 
        X_pca = pd.DataFrame(X_pca)
        new_df = new_df.drop(columns=group)
        for c in group:
            new_df_type = new_df_type.drop(index=int(new_df_type[new_df_type.Variable == c].index[0]))
        new_df[new_pca] = X_pca
        new_df_type = new_df_type.append(pd.DataFrame([[new_pca,'Continuous']], columns=['Variable','Type']))
    return(new_df,new_df_type)

In [119]:
target_name = get_target(df,df_type)
target_type = column_type(target_name,df_type)

In [120]:
if target_type == 'Flag_Continuous':
    selection = continuous_selection(new_df,new_df_type)
    new_continuous_predictors = selection[0]
    new_df = selection[1]
    new_df_type = selection[2]

    #Continuous variable construction:
    pca = get_continuous_after_pca(new_continuous_predictors,new_df,new_df_type)
    new_df = pca[0]
    new_df_type = pca[1]

In [121]:
new_df.columns

Index(['Vehicle', 'Calendar_Year', 'Model_Year', 'Blind_Make', 'Cat1', 'Cat2',
       'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat',
       'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'Claim_Amount', 'Var2_Var4',
       'Var1_Var5', 'Var3_Var6', 'Var7_Var8_NVVar4'],
      dtype='object')

In [122]:
Stats_Collection(new_df,new_df_type)

Variable name:  Vehicle
Variable type:  Continuous
Number of missing values:  0
Number of valid values:  5000
Minimum value:  -1.1327471321287792
Maximum value:  1.1555098984010526
Mean:  6.2447346790730955e-15
Standard Deviation:  1.0001000150025021
Skewness:  0.02276959772090277
Number of distinct values:  3
Number of cases for each distinct value: 
-1.1327471321287792 1929
1.1555098984010526 1891
2.1424486944016847e-14 1180
{-1.1327471321287792: 1929, 1.1555098984010526: 1891, 2.1424486944016847e-14: 1180}

Variable name:  Calendar_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  2
The counts of each category: 
1 4026
0 974
{1: 4026, 0: 974}
Mode:  1 Count:  4026

Variable name:  Model_Year
Mode contains more than 95% cases, drop this column

Variable name:  Blind_Make
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  2
The counts of each category: 
0 2614
1 2386
{0:

(           Vehicle  Calendar_Year  Blind_Make  Cat1  Cat2  Cat3  Cat6  Cat8  \
 0     1.155510e+00              1           0     9     2     2     2     0   
 1     1.155510e+00              1           0     8     2     5     1     1   
 2    -1.132747e+00              1           1     8     2     5     2     1   
 3    -1.132747e+00              0           1     8     2     5     2     1   
 4     1.155510e+00              1           0     2     2     4     4     1   
 5     1.155510e+00              0           0     2     2     4     4     1   
 6    -1.132747e+00              0           1     3     2     5     1     1   
 7     1.155510e+00              0           0     8     2     4     4     1   
 8     1.155510e+00              1           0     8     2     4     4     1   
 9    -1.132747e+00              1           0     9     2     1     4     1   
 10   -1.132747e+00              0           0     9     2     1     4     1   
 11    1.155510e+00              1      

# 8.1-hz-linear-regression

# 8.2-jbl-logistic-regression

# 9.1-qz-XG-Boost-regression

# 9.2-tcz-XG-Boost-classification