In [73]:
import pandas as pd
import numpy as np

# Load Data

In [74]:
df = pd.read_csv('data/raw/Kaggle/train_sample.csv')
df=df.drop(columns = ['Unnamed: 0'])

In [75]:
df_type = pd.read_csv('data/raw/Kaggle/column_type.csv')

In [76]:
df.columns

Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3',
       'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11',
       'Cat12', 'OrdCat', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6',
       'Var7', 'Var8', 'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
       'Claim_Amount'],
      dtype='object')

In [77]:
df_type

Unnamed: 0,Variable,Type
0,Row_ID,Delete
1,Household_ID,Nominal
2,Vehicle,Continuous
3,Calendar_Year,Nominal
4,Model_Year,Nominal
5,Blind_Make,Nominal
6,Blind_Model,Nominal
7,Blind_Submodel,Nominal
8,Cat1,Nominal
9,Cat2,Nominal


# Basic stats functions

In [78]:
#function to get number of missing values in a column
def get_na_num(column): #input the whole column
    if column.dtype == np.object:
        return column.isnull().sum() + column[column == ''].count() + column[column == '?'].count()
    else:
        return column.isnull().sum()

In [79]:
#function to get number of valid values in a column
def get_valid_num(column): #input the whole column
    return len(column) - get_na_num(column)

In [50]:
#function to get minimum value in a column
def get_min(column): #input the whole column
    return min(column)

In [51]:
#function to get maximum value in a column
def get_max(column): #input the whole column
    return max(column)

In [52]:
#function to get mean in a column
def get_mean(column): #input the whole column
    return column.mean()

In [53]:
#function to get std in a column
def get_std(column): #input the whole column
    return column.std()

In [54]:
#function to get skewness in a column
def get_skew(column): #input the whole column
    return column.skew()

In [55]:
#function to get number of distict values in a column
def get_distinct_num(column): #input the whole column
    return len(column.unique().tolist())

In [56]:
#function to get count of each distict value in a column
def get_distinct_count(column): #input the whole column
    if get_distinct_num(column) > 5:
        print('Number of distict values is larger than 5. We stop updating the number of distinct values')
    else:
        return column.value_counts()

In [57]:
#function to get median in a column
import statistics
def get_median(column):
    if get_distinct_num(column) > 5:
        print('Number of distict values is larger than 5. We do not calculate median')
    else:
        return statistics.median(column)

In [58]:
#function to get mode and count for the mode in a column
def get_mode(column):
    return (column.mode()[0],column[column==column.mode()[0]].count())

In [59]:
#function to check whether a numerical column is continuous or discrete
def check_cont_or_dis(column):
    if get_distinct_num(column) > max(0.0001 * get_valid_num(column), 10):
        return ('continuous')
    else:
        return ('discrete')

In [60]:
#Function to get target variable
def get_target(df):
    for c in df:
        if (column_type(c) == 'Flag_Continuous' or column_type(c) == 'Flag_Categorical'):
            return(c)

In [61]:
# funtion to get column type
def column_type(column):
    return (df_type.loc[df_type['Variable'] == column, 'Type'].iloc[0])

# Basic variable screening and creating stats report

In [62]:
#function to do basic variable screening and create basic statistical report
def Stats_Collection(df,df_type):
    for c in df:
        #exclude Target 
        if (column_type(c) != 'Flag_Continuous' and column_type(c) != 'Flag_Categorical'):
            print('Variable name: ',c)

            #Basic variable screening
            if get_na_num(df[c])/len(df[c]) > 0.5:
                print('More 50% missing values, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c) == 'Delete'):
                print('Column type is Delete, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c) == 'Continuous') and (get_min(df[c]) == get_max(df[c])):
                print('All same value, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c) == 'Ordinal' or column_type(c) == 'Nominal') and (get_mode(df[c])[1]/get_valid_num(df[c]) > 0.95):
                print('Mode contains more than 95% cases, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue
            if (column_type(c) == 'Nominal') and (get_distinct_num(df[c]) > 100):
                print('More than 100 categories, drop this column\n')
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue

            #Basic statistic report
            print('Variable type: ', column_type(c))
            print ('Number of missing values: ',get_na_num(df[c]))
            print ('Number of valid values: ',get_valid_num(df[c]))
            if column_type(c) == 'Continuous' or column_type(c) == 'Ordinal':
                print('Minimum value: ', get_min(df[c]))
                print('Maximum value: ', get_max(df[c]))
            if column_type(c) == 'Continuous':
                print('Mean: ',get_mean(df[c]))
                print('Standard Deviation: ',get_std(df[c]))
                print('Skewness: ',get_skew(df[c]))
                print('Number of distinct values: ',get_distinct_num(df[c]))
                print('Number of cases for each distinct value: ')
                print(get_distinct_count(df[c]))
            else:
                print('Number of categories: ', get_distinct_num(df[c]))
                print('The counts of each category: ')
                print(get_distinct_count(df[c]))
                print('Mode: ', get_mode(df[c])[0],'Count: ',get_mode(df[c])[1])                
        print()
    return(df,df_type)

In [80]:
d = Stats_Collection(df,df_type)

Variable name:  Row_ID
Column type is Delete, drop this column

Variable name:  Household_ID
More than 100 categories, drop this column

Variable name:  Vehicle
Variable type:  Continuous
Number of missing values:  0
Number of valid values:  5000
Minimum value:  1
Maximum value:  14
Mean:  2.2438
Standard Deviation:  1.4993369075136003
Skewness:  2.0299344354564077
Number of distinct values:  14
Number of cases for each distinct value: 
Number of distict values is larger than 5. We stop updating the number of distinct values
None

Variable name:  Calendar_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  2
The counts of each category: 
2005    4026
2006     974
Name: Calendar_Year, dtype: int64
Mode:  2005 Count:  4026

Variable name:  Model_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  27
The counts of each category: 
Number of distict values is larger than 5. 

## After deleting useless variables, new dataset and new column type dataset are named as new_df and new_df_type

In [81]:
new_df_type = d[1]

In [82]:
new_df = d[0]

# Outliers handling

In [66]:
# function to identify outliers in continuous variables
# returns a bool panda series, a lower cutoff value and an upper cutoff value(used in outlier handling)
def outlier_identification(column):
    ori_mean = get_mean(column.dropna())
    ori_std = get_std(column.dropna())
    N_i = []
    X_i = []
    M_i = []
    for i in range(-3, 5):
        lower = ori_mean + (i - 1) * ori_std if i != -3 else -float('inf')
        upper = ori_mean + i * ori_std if i != 4 else float('inf')
        temp1 = column[(column <= upper) & (column > lower)]
        N_i.append(len(temp1))
        X_i.append(get_mean(temp1))
        M_i.append(np.var(temp1) * len(temp1))
    l = -3
    r = 4
    p = 0
    while 1:
        if N_i[l + 3] <= N_i[r + 3]:
            p_current = N_i[l + 3] / get_valid_num(column)
            if p + p_current < 0.05:
                l = l + 1
                p = p + p_current
            else:
                break
        else:
            p_current = N_i[r + 3] / get_valid_num(column)
            if p + p_current < 0.05:
                r = r - 1
                p = p + p_current
            else:
                break
    lower = ori_mean + (l - 1) * ori_std if l != -3 else -float('inf')
    upper = ori_mean + r * ori_std if r != 4 else float('inf')
    temp1 = column[(column <= upper) & (column > lower)]
    x_robust = get_mean(temp1)
    M_robust = 0
    N_sum = 0
    for i in range(l, r + 1):
        A_i = M_i[i + 3] + N_i[i + 3] * (x_robust - X_i[i + 3])**2
        M_robust = M_robust + A_i
        N_sum = N_sum + N_i[i + 3]
    sd_robust = np.sqrt(M_robust / (N_sum - 1))
    result = (column - x_robust < -3 * sd_robust) | (column + x_robust > 3 * sd_robust)
    lower_cutoff_value = x_robust - 3 * sd_robust
    upper_cutoff_value = x_robust + 3 * sd_robust
    return(result, lower_cutoff_value, upper_cutoff_value)
outlier_identification(new_df['Var1'])

(0       False
 1       False
 2       False
 3       False
 4       False
 5       False
 6       False
 7       False
 8       False
 9       False
 10      False
 11      False
 12      False
 13      False
 14      False
 15      False
 16      False
 17      False
 18      False
 19      False
 20      False
 21      False
 22      False
 23      False
 24      False
 25      False
 26      False
 27      False
 28      False
 29      False
         ...  
 4970    False
 4971    False
 4972    False
 4973    False
 4974    False
 4975    False
 4976    False
 4977    False
 4978    False
 4979    False
 4980    False
 4981    False
 4982    False
 4983    False
 4984    False
 4985    False
 4986    False
 4987    False
 4988     True
 4989     True
 4990    False
 4991    False
 4992    False
 4993    False
 4994    False
 4995    False
 4996    False
 4997     True
 4998    False
 4999    False
 Name: Var1, Length: 5000, dtype: bool, -2.8276437587410994, 3.10881280749214)

In [67]:
# function to trim outliers to cutoff values
def outlier_trim(df,column):
    (flag, lower, upper) = outlier_identification(column)
    df.loc[flag, column.name] = pd.Series(map(lambda x : (lower if x <= lower else upper), column[flag]))

In [68]:
# function to set outliers to missing values
def outlier_toNone(df,column):
    (flag, lower, upper) = outlier_identification(column)
    df.loc[flag, column.name] = None

In [83]:
print(sum(new_df['Var1'] > 3.1088128074921366))
print(sum(new_df['Var1'] < -2.827643758741096))
for c in new_df:
    if column_type(c) == 'Continuous':
        outlier_trim(new_df,new_df[c])
print(sum(new_df['Var1'] > 3.1088128074921366))
print(sum(new_df['Var1'] < -2.827643758741096))

124
0
0
0


# Missing values handling

In [70]:
# function to fill missing value and update statistic
def fill_missing_value(mydata, column_type):
    # filling missing value and updata statistic
    column_list = mydata.columns.values.tolist()
    typelist = list(column_type.iloc[:,1])
    i = 0
    for typ in typelist:
        column_data = mydata[column_list[i]].dropna()
        if (typ == 'Continuous'):
            mean_value = column_data.mean() # calculate mean
            mydata[column_list[i]] = mydata[column_list[i]].fillna(mean_value) # fill missing value with mean
            cont_sd = mydata[column_list[i]].std() # calculate standard deviation
            cont_skew = mydata[column_list[i]].skew() # calculate skewness
            print('')
            print('Column:', column_list[i])
            print('Column type: continuous')
            print('Mean:', mean_value)
            print('Standard deviation:', cont_sd)
            print('Skewness:', cont_skew)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        elif (typ == 'Ordinal' and mydata[column_list[i]].dtype != 'object'):
            num_median = column_data.median() # calculate median 
            mydata[column_list[i]] = mydata[column_list[i]].fillna(num_median) # fill missing value with median
            count_median_num = mydata[column_list[i]][mydata[column_list[i]] == num_median].count() # count the the number of cases in the median category
            print('')
            print('Column:', column_list[i])
            print('Column type: num_ordinal')
            print('Median:', num_median)
            print('The number of cases in the median category:', count_median_num)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        elif (typ == 'Ordinal' and mydata[column_list[i]].dtype == 'object'):
            mode_value = column_data.mode()[0] # calculate mode
            mydata[column_list[i]] = mydata[column_list[i]].fillna(mode_value) # fill missing valye with mode
            count_mode = mydata[column_list[i]][mydata[column_list[i]] == mode_value].count() # count the the number of cases in the modal category
            print('')
            print('Column:', column_list[i])
            print('Column type: cat_ordinal')
            print('Mode:', mode_value)
            print('The number of cases in the modal category:', count_mode)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        else:
            mode_value = column_data.mode()[0] # calculate mode
            mydata[column_list[i]] = mydata[column_list[i]].fillna(mode_value) # fill missing valye with mode
            count_mode = mydata[column_list[i]][mydata[column_list[i]] == mode_value].count() # count the the number of cases in the modal category
            print('')
            print('Column:', column_list[i])
            print('Column type: nominal')
            print('Mode:', mode_value)
            print('The number of cases in the modal category:', count_mode)
            print('The number of missing values:', get_na_num(mydata[column_list[i]]))
            print('The number of valid values:', get_valid_num(mydata[column_list[i]]))
        i = i + 1
    # add column type at the last row
    print('add column type at the last row:')
    return (mydata)

In [84]:
for c in new_df:
    print(get_na_num(new_df[c]))

1180
0
0
1
22
1747
2
22
0
0
0
5
5
5
127
9
13
11
93
36
0
144
0
0
0
0
0
0


In [85]:
new_df = new_df.replace('?',np.NaN)
fill_missing_value(new_df,new_df_type)


Column: Vehicle
Column type: continuous
Mean: 3.183613197220352
Standard deviation: 1.9279074114234616
Skewness: 0.022769597720921908
The number of missing values: 0
The number of valid values: 5000

Column: Calendar_Year
Column type: nominal
Mode: 2005
The number of cases in the modal category: 4026
The number of missing values: 0
The number of valid values: 5000

Column: Model_Year
Column type: nominal
Mode: 2002
The number of cases in the modal category: 506
The number of missing values: 0
The number of valid values: 5000

Column: Blind_Make
Column type: nominal
Mode: AJ
The number of cases in the modal category: 858
The number of missing values: 0
The number of valid values: 5000

Column: Cat1
Column type: nominal
Mode: D
The number of cases in the modal category: 1352
The number of missing values: 0
The number of valid values: 5000

Column: Cat2
Column type: nominal
Mode: C
The number of cases in the modal category: 4138
The number of missing values: 0
The number of valid values:

Unnamed: 0,Vehicle,Calendar_Year,Model_Year,Blind_Make,Cat1,Cat2,Cat3,Cat6,Cat8,Cat9,...,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
0,5.411107,2005,2005,K,D,C,F,C,C,A,...,1.008912,0.261040,0.907793,-0.077998,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
1,5.411107,2005,2003,Q,B,C,A,E,A,B,...,1.240851,0.432987,-0.726459,0.204785,O,-0.231530,-0.266117,-0.272337,-0.251419,0.0
2,1.000000,2005,1998,AR,B,C,A,C,A,B,...,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
3,1.000000,2006,1998,AR,B,C,A,C,A,B,...,-0.971487,-1.405797,-0.837048,-1.176858,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
4,5.411107,2005,2001,D,J,C,B,D,A,B,...,0.812656,2.112691,1.534462,2.347260,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
5,5.411107,2006,2001,D,J,C,B,D,A,B,...,0.812656,2.112691,1.534462,2.347260,F,-0.231530,-0.266117,-0.272337,-0.251419,0.0
6,1.000000,2006,2001,AJ,G,C,A,E,A,B,...,0.580718,0.551128,0.416289,-0.024395,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
7,5.411107,2006,2002,AQ,B,C,B,D,B,A,...,0.527193,-0.023200,-0.701884,0.226663,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
8,5.411107,2005,2002,AQ,B,C,B,D,B,A,...,0.527193,-0.023200,-0.701884,0.226663,M,-0.231530,-0.266117,-0.272337,-0.251419,0.0
9,1.000000,2005,1995,BW,D,C,E,D,A,B,...,0.176312,0.283264,0.969232,-0.792339,N,2.054683,-0.266117,-0.272337,-0.251419,0.0


In [86]:
for c in new_df:
    print(get_na_num(new_df[c]))

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


# Continuous variable transformation (z-score or min-max)

In [87]:
from scipy import stats
import sklearn
from sklearn import preprocessing

In [88]:
#function to do z-score transformation of a column
def zscore(df, column):
    if column_type(column) == 'Continuous':
        return(stats.zscore(df[column]))
    else:
        return(df[column])

In [89]:
 #function to do min-max transformation of a column
def minmax(df, column):
    if column_type(column) == 'Continuous':
        return(preprocessing.minmax_scale(df[column], feature_range=(0, 100)))
    else:
        return(df[column])

In [90]:
# zscore for continuous predictor
#check on Var1
print(new_df['Var1'])
get_mean(new_df['Var1'])
get_std(new_df['Var1'])

for c in new_df:
    new_df[c] = zscore(new_df, c)

#check on Var1 result
print(new_df['Var1'])
get_mean(new_df['Var1'])
get_std(new_df['Var1'])

0       1.270533
1       0.217950
2      -0.754282
3      -0.754282
4       0.563454
5       0.563454
6       0.700049
7       0.017076
8       0.017076
9       0.442930
10      0.442930
11      0.322405
12      0.322405
13     -0.633757
14     -0.633757
15     -0.376638
16     -0.376638
17      0.322405
18      0.322405
19      0.322405
20      1.849051
21     -0.681967
22     -0.175763
23     -0.537337
24     -0.537337
25     -0.931051
26      0.017076
27      0.017076
28     -0.408778
29     -0.408778
          ...   
4970    0.491140
4971    1.447302
4972    1.447302
4973   -0.633757
4974   -0.633757
4975    1.366952
4976    1.366952
4977    0.563454
4978    0.563454
4979    2.130275
4980    2.130275
4981   -0.714107
4982    1.447302
4983    1.447302
4984   -1.011401
4985   -1.011401
4986   -1.027471
4987   -1.027471
4988    0.170694
4989    0.170694
4990   -0.400743
4991   -0.400743
4992   -0.031134
4993   -0.031134
4994   -0.119519
4995    0.442930
4996    0.442930
4997    0.1706

1.0001000150025003

# Reorder Categories and Supervised Merged

In [91]:
# Function to sort the Series by value and then by index(Lexical order)
def sort_data(Series):
    return Series.iloc[np.lexsort([Series.index, Series.values])]

In [92]:
# Function to supervised merged categories in categorical variables
from CHAID import Tree
import re

# df = dataset, Predictor_type = Nominal or Ordinal, dependent_variable_name = target name, indep_column_num = column index
def Supervised_Merged (df, Predictor_type, dependent_variable_name, indep_column_num, Categorical = True):
    
    
    # Get the names of Independent and Dependent variables
    independent_variable_column = [df.columns[indep_column_num]]
    dep_variable = dependent_variable_name
    
    # Check for Target variable type to decide which CHAID TREE to implement
    if Categorical == True:
        
        # fit the Chaid tree model to supervised merged the categories in category predictor
        tree = Tree.from_pandas_df(df, dict(zip(independent_variable_column, [Predictor_type] *1)), 
                                   dep_variable, max_depth = 1)
        
    else:
        
        # Convert the target variable to numeric  
        df[dependent_variable_name] = pd.to_numeric(df[dependent_variable_name],errors='coerce')
        
        # fit the Chaid tree model to supervised merged the categories in category predictor
        tree = Tree.from_pandas_df(df, dict(zip(independent_variable_column, [Predictor_type] * 1)), 
                                   dep_variable, dep_variable_type='continuous',max_depth = 1)
    
    # Print the fitted tree
    print('The CHAID TREE is presented below:')
    print('')
    tree.print_tree()

    # Get the merged categoriess string from the tree
    Merged_group = tree.tree_store[0].split.groupings.split('],')
    # Get numbers of merged caegroeis
    length_Merged_group = np.arange(0,len(Merged_group))
    
    if len(Merged_group) >= 2: 
        
        # Etract the number from the string 
        New_Merged_Categories = {}
        for i in length_Merged_group:
            group = list(map(int, re.findall(r'\d+',Merged_group[i])))
            New_Merged_Categories[i] = group 
        print('The P-Values of this node is',tree.tree_store[0].split.p)
        print('The new categories are:' )
        print(New_Merged_Categories)
        print('')
        
        # Convert the dict_format to match the previous dic
        # For example: new_merged: {0:[1,2,3,4,5],1:[6,7,8],2:[0,9]}
        #              map_dict: {0:2, 1:0, 2:0, 3:0, 4:0, 5:0, 6:1, 7:1, 8:1, 9:2}               
        new_dict={}
        length_New_Merged = np.arange(0,len(New_Merged_Categories))
        for j in length_New_Merged:
            values = New_Merged_Categories.get(j) 
            for k in np.arange(0,len(values)):
                new_dict[values[k]]=j
    else:
        print('The P-Values of this node is',tree.tree_store[0].split.p)
        print('The P-values is too large.')
        print('There is no categories can be merged in this variables.')
        print('')
        new_dict={}
    return new_dict

In [37]:
# Function to Rearrange categories and Supervised Merged for Categorical Predictors
# dataset = original dataset, column_type = dataset includes the columns type, dep_variable_name = target name.
def Reorder_Categories (dataset,column_type):
    dep_variable_name = get_target(dataset)
    
    # Get the target column index
    T_colnumber = dataset.columns.get_loc(dep_variable_name)
    
    # Get the type of Target column
    Flag_type = 'Flag_Continuous'
    Flag_type1 = Flag_type in column_type.iloc[:,1].values
    
    # Get the row and column number of Dataset
    n_columns=np.arange(0,len(dataset.columns),1)
    length_data = len(dataset)-1
    
    
    # Loop through all columns 
    for i in n_columns:
        
        Predictor_type = column_type.iloc[i,1]
        
        # Check the type of Categorical predictor
        if Predictor_type == 'Nominal':
            
            Pre_type = 'nominal'
            
            # Get the total counts of each category in each column
            Column_name = dataset.columns[i]
            Count_Each_Level = dataset.iloc[:length_data-1,i].value_counts()
            
            # Sort the categories  
            Count_Each_Level = sort_data(Count_Each_Level)
            print('Column name:',Column_name.upper())
            print(Count_Each_Level.to_frame())


            # Assign each category a number, starting from 0 to N, by counts.
            n_distinct = np.arange(0,len(Count_Each_Level),1)
            dict_Level={}
            for j in n_distinct:
                Level_name = Count_Each_Level.index[j]
                dict_Level[Level_name]= j 


            print('Reorder Categories :')
            print(dict_Level)
            print('')

            # Substitute orignal Categories to number
            dataset[Column_name] = dataset[Column_name].map(dict_Level)


            # Supervised Merged
            print('Supervised Merged:')
            
            New_Categories ={}
            if T_colnumber != i:
                
                # Check if target is Categorical or Continuous
                if Flag_type1 == True:
                    New_Categories = Supervised_Merged(dataset, Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i, Categorical = False)
                else:
                    New_Categories = Supervised_Merged(dataset, Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i)
            else:
                print('')
                print('This is the Target column.')
                
            # Check if there the New_Categories is empty set
            if len(New_Categories) != 0:
                dataset[Column_name] = dataset[Column_name].map(New_Categories)
                
            print('------------------------------------------------------------------------------------------------------------------')
            print('')
            
         
        # If the Predictor type is Ordinal
        if Predictor_type == 'Ordinal':
            
            Column_name = dataset.columns[i]
            Pre_type ='ordinal'
            
            New_Categories ={}
            
            # Check if target is Categorical or Continuous
            if Flag_type1 == True:
                New_Categories = Supervised_Merged(dataset, Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i, Categorical = False)
            else:
                New_Categories = Supervised_Merged(dataset,Pre_type, dependent_variable_name = dep_variable_name, indep_column_num = i)
                
            if len(New_Categories) != 0:
                    dataset[Column_name] = dataset[Column_name].map(New_Categories)
            
    return dataset

In [93]:
Reorder_Categories(new_df,new_df_type)

Column name: CALENDAR_YEAR
      Calendar_Year
2006            973
2005           4025
Reorder Categories :
{2006: 0, 2005: 1}

Supervised Merged:
The CHAID TREE is presented below:

([], {'mean': 1.13246858782, 's.t.d': 28.210900917931}, <Invalid Chaid Split> - splitting would create nodes with less than the minimum child node size)

The P-Values of this node is None
The P-values is too large.
There is no categories can be merged in this variables.

------------------------------------------------------------------------------------------------------------------

Column name: MODEL_YEAR
      Model_Year
1982           7
2007           9
1983          13
1981          16
1984          25
1987          28
1986          33
1985          35
1988          50
1989          59
1990          65
1992          95
1991          97
2006         105
1993         138
1994         169
1996         198
1995         208
1998         269
1997         293
1999         367
2005         377
2000         4


Column name: CAT12
   Cat12
A      1
F    177
E    462
D   1333
C   1383
B   1642
Reorder Categories :
{'A': 0, 'F': 1, 'E': 2, 'D': 3, 'C': 4, 'B': 5}

Supervised Merged:
The CHAID TREE is presented below:

([], {'mean': 1.13246858782, 's.t.d': 28.210900917931}, <Invalid Chaid Split> - splitting would create nodes with less than the minimum child node size)

The P-Values of this node is None
The P-values is too large.
There is no categories can be merged in this variables.

------------------------------------------------------------------------------------------------------------------

The CHAID TREE is presented below:

([], {'mean': 1.13246858782, 's.t.d': 28.210900917931}, <Invalid Chaid Split> - splitting would create nodes with less than the minimum child node size)

The P-Values of this node is None
The P-values is too large.
There is no categories can be merged in this variables.

Column name: NVCAT
   NVCat
D      7
I     55
C     62
G     77
B     84
A    101
E    127
K   

Unnamed: 0,Vehicle,Calendar_Year,Model_Year,Blind_Make,Cat1,Cat2,Cat3,Cat6,Cat8,Cat9,...,Var5,Var6,Var7,Var8,NVCat,NVVar1,NVVar2,NVVar3,NVVar4,Claim_Amount
0,1.155510e+00,1,0,0,9,2,2,2,0,0,...,7.980141e-01,1.218645e-01,0.685056,1.953377e-02,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
1,1.155510e+00,1,0,0,8,2,5,1,1,1,...,1.027066e+00,2.929631e-01,-0.859782,3.793475e-01,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
2,-1.132747e+00,1,0,1,8,2,5,2,1,1,...,-1.157740e+00,-1.536745e+00,-0.964320,-1.378659e+00,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
3,-1.132747e+00,0,0,1,8,2,5,2,1,1,...,-1.157740e+00,-1.536745e+00,-0.964320,-1.378659e+00,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
4,1.155510e+00,1,0,0,2,2,4,4,1,1,...,6.042006e-01,1.964376e+00,1.277438,3.105440e+00,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
5,1.155510e+00,0,0,0,2,2,4,4,1,1,...,6.042006e-01,1.964376e+00,1.277438,3.105440e+00,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
6,-1.132747e+00,0,0,1,3,2,5,1,1,1,...,3.751483e-01,4.105207e-01,0.220443,8.773840e-02,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
7,1.155510e+00,0,0,0,8,2,4,4,1,0,...,3.222901e-01,-1.609721e-01,-0.836551,4.071860e-01,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
8,1.155510e+00,1,0,0,8,2,4,4,1,0,...,3.222901e-01,-1.609721e-01,-0.836551,4.071860e-01,0,-0.262917,-0.323505,-0.308787,-0.289619,0.0
9,-1.132747e+00,1,0,0,9,2,1,4,1,1,...,-2.422472e-02,1.439793e-01,0.743133,-8.893958e-01,0,3.824093,-0.323505,-0.308787,-0.289619,0.0


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(df_variable)
X_pca=pca.transform(df_variable) 
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
print(pca.explained_variance_ratio_) 

In [None]:
X_pca = pd.DataFrame(X_pca)

In [None]:
X_pca

In [8]:
get_target(df)

'Claim_Amount'