In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/train', './input/test']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

In [1]:
#################################### AllState Claims Severity ############################################################

# Below is function to encode categorical variables with high cardinality into numeric values such that they can 
# used in modeling exercises. The technique has been inspired from Owen Zhang's method of dealing with categorical variables
# with high cardinality


# Reading in training and test data

# import pandas as pd
exec(os.environ['IREWR_IMPORTS'])
import numpy as np
# ALEX: remove plotting
# import matplotlib as plt
# %pylab inline
df_train = pd.read_csv("./input/train.scaled.csv", index_col='id')
df_test = pd.read_csv("./input/test.scaled.csv", index_col='id')

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [2]:
# Getting all continuous features into a separate dataset

contfeatures = df_train.select_dtypes(include=["float64"])

In [3]:
# Getting all categorical features into a separate dataset
catfeatures = df_train.select_dtypes(include=["object"])

In [4]:
catfeatures_list = list(catfeatures)

In [5]:
# We can possibly feed categorical variables with less or eq 10 levels direclty into our model.
# But, cat variables with >10 levels have to be feature engineered so that their effects can be included into the model
catvarbs_10 = list((df_train[catfeatures_list].apply(pd.Series.nunique)>10))

catvarlist = []
for (i, v) in zip(catfeatures_list, catvarbs_10):
    if(v):
        catvarlist.append(i)

In [6]:
print(catvarlist)

['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat115', 'cat116']


In [7]:
# WE append 'loss' variable to the cat varb dataset to compute means and variance

catvarlist.append('loss')
df_cat_encod = df_train[catvarlist]
df_cat_encod.head(5)

Unnamed: 0_level_0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,loss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,T,B,G,A,I,E,G,J,G,BU,BC,C,AS,S,A,O,LB,2213.18
2,T,L,F,A,E,E,I,K,K,BI,CQ,A,AV,BM,A,O,DP,1283.6
5,D,L,O,B,E,F,H,F,A,AB,DK,A,C,AF,A,I,GK,3005.09
10,T,I,D,A,E,E,I,K,K,BI,CS,C,N,AE,A,O,DJ,939.85
11,P,F,J,A,D,E,K,G,B,H,C,C,Y,BM,A,K,CK,2763.85


In [8]:
#before running our function to encode, we need to ensure that the list of char variables which we pass to the function
#does not the 'loss' variable in it

catvarlist.remove('loss')
catvarlist
target=['loss']

In [9]:
df_cat_encod.head(5)

Unnamed: 0_level_0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,loss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,T,B,G,A,I,E,G,J,G,BU,BC,C,AS,S,A,O,LB,2213.18
2,T,L,F,A,E,E,I,K,K,BI,CQ,A,AV,BM,A,O,DP,1283.6
5,D,L,O,B,E,F,H,F,A,AB,DK,A,C,AF,A,I,GK,3005.09
10,T,I,D,A,E,E,I,K,K,BI,CS,C,N,AE,A,O,DJ,939.85
11,P,F,J,A,D,E,K,G,B,H,C,C,Y,BM,A,K,CK,2763.85


In [10]:
# We define a function which will flatten a multi index column names which are created after aggregation of data
# This will be useful after creating mean & standard dev of categorical variable levels


def flattenHierarchicalCol(col,sep = ','):
    if not type(col) is tuple:
        return col
    else:
        new_col = ''
        for leveli,level in enumerate(col):
            if not level == '':
                if not leveli == 0:
                    new_col += sep
                new_col += level
        return new_col

In [11]:
# The function below computes the mean and std dev of the target variable across each level of each categorical variable
# identified and creates two separate features. This can instead be used as a continuous feature in any models we build
# We add the std dev too so as to introduce some random variation/noise into the data
def cat_encoding(list, dataframe, target):
    for i in range(len(list)):
# ALEX: make notebook run
#         group_df = dataframe.groupby([list[i]], as_index=False).agg({target:{"mean"+list[i]:'mean', 
#                                                                     "stdev"+list[i]:'std'}})
        group_df = dataframe.groupby([list[i]], as_index=False).agg(_mean=(target, 'mean'), 
                                                                    _stdev=(target, 'std'))
        group_df.rename(columns={'_mean': 'mean' + list[i], '_stdev': 'stdev' + list[i]})
        dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
    
    dataframe.columns = dataframe.columns.map(flattenHierarchicalCol)
    return dataframe

In [12]:
cat_encoded = cat_encoding(catvarlist,df_cat_encod,target[0])

  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')
  dataframe = pd.merge(dataframe, group_df, on=list[i], how='left')


In [13]:
cat_encoded.head(5)

# Mean and std dev of all categorical variables identified have been computed and returned as a separate dataset which can be joined
# to our original training set. The same mean & std dev values can be used to transform the same variables in the test set

Unnamed: 0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,...,_mean_x,_stdev_x,_mean_y,_stdev_y,_mean_x.1,_stdev_x.1,_mean_y.1,_stdev_y.1,_mean,_stdev
0,T,B,G,A,I,E,G,J,G,BU,...,3409.983392,3040.851201,2744.910924,2475.689837,3259.916396,3005.511481,2948.525441,2699.909989,2917.5232,2545.417315
1,T,L,F,A,E,E,I,K,K,BI,...,2380.8509,2349.891371,2874.471697,2581.380533,3259.916396,3005.511481,2948.525441,2699.909989,3107.697517,2837.734327
2,D,L,O,B,E,F,H,F,A,AB,...,3250.374479,3124.220044,2942.861834,2506.936938,3259.916396,3005.511481,2991.019257,3208.069463,2695.767964,2131.3728
3,T,I,D,A,E,E,I,K,K,BI,...,3043.584628,2846.254422,2907.740038,2558.986877,3259.916396,3005.511481,2948.525441,2699.909989,2911.900687,2562.341762
4,P,F,J,A,D,E,K,G,B,H,...,2837.032428,2447.238321,2874.471697,2581.380533,3259.916396,3005.511481,3016.756872,2741.299127,3037.328947,2445.806699


In [14]:
names = cat_encoded.columns
names

Index(['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106',
       'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113',
       'cat114', 'cat115', 'cat116', 'loss', '_mean_x', '_stdev_x', '_mean_y',
       '_stdev_y', '_mean_x', '_stdev_x', '_mean_y', '_stdev_y', '_mean_x',
       '_stdev_x', '_mean_y', '_stdev_y', '_mean_x', '_stdev_x', '_mean_y',
       '_stdev_y', '_mean_x', '_stdev_x', '_mean_y', '_stdev_y', '_mean_x',
       '_stdev_x', '_mean_y', '_stdev_y', '_mean_x', '_stdev_x', '_mean_y',
       '_stdev_y', '_mean_x', '_stdev_x', '_mean_y', '_stdev_y', '_mean',
       '_stdev'],
      dtype='object')

In [15]:
del cat_encoded['loss']

In [16]:
# Removing the word 'loss' from the left of the newly created columns

cat_encoded.rename(columns = lambda x: x.replace('loss,',''), inplace=True)

In [17]:
cat_encoded.columns

Index(['cat99', 'cat100', 'cat101', 'cat103', 'cat104', 'cat105', 'cat106',
       'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113',
       'cat114', 'cat115', 'cat116', '_mean_x', '_stdev_x', '_mean_y',
       '_stdev_y', '_mean_x', '_stdev_x', '_mean_y', '_stdev_y', '_mean_x',
       '_stdev_x', '_mean_y', '_stdev_y', '_mean_x', '_stdev_x', '_mean_y',
       '_stdev_y', '_mean_x', '_stdev_x', '_mean_y', '_stdev_y', '_mean_x',
       '_stdev_x', '_mean_y', '_stdev_y', '_mean_x', '_stdev_x', '_mean_y',
       '_stdev_y', '_mean_x', '_stdev_x', '_mean_y', '_stdev_y', '_mean',
       '_stdev'],
      dtype='object')

In [18]:
# Taking the same categorical variables we encoded in train set from test set

cat_encod_test = df_test[catvarlist]
cat_encod_test.head(5)

Unnamed: 0_level_0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
4,T,H,G,A,G,E,I,L,K,BI,BC,A,J,AX,A,Q,HG
6,P,B,D,A,G,G,G,F,B,BI,CO,E,G,X,A,L,HK
9,D,G,Q,D,D,E,J,G,A,BI,CS,C,U,AE,A,K,CK
12,T,G,A,D,E,E,I,K,K,BI,CR,A,AY,AJ,A,P,DJ
15,P,A,A,A,F,E,G,E,B,AB,EG,A,E,I,C,J,HA


In [19]:
cat_encod_test = cat_encod_test.reset_index()

In [20]:
del cat_encod_test['id']

In [21]:
cat_encoded.head(5)

Unnamed: 0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,...,_mean_x,_stdev_x,_mean_y,_stdev_y,_mean_x.1,_stdev_x.1,_mean_y.1,_stdev_y.1,_mean,_stdev
0,T,B,G,A,I,E,G,J,G,BU,...,3409.983392,3040.851201,2744.910924,2475.689837,3259.916396,3005.511481,2948.525441,2699.909989,2917.5232,2545.417315
1,T,L,F,A,E,E,I,K,K,BI,...,2380.8509,2349.891371,2874.471697,2581.380533,3259.916396,3005.511481,2948.525441,2699.909989,3107.697517,2837.734327
2,D,L,O,B,E,F,H,F,A,AB,...,3250.374479,3124.220044,2942.861834,2506.936938,3259.916396,3005.511481,2991.019257,3208.069463,2695.767964,2131.3728
3,T,I,D,A,E,E,I,K,K,BI,...,3043.584628,2846.254422,2907.740038,2558.986877,3259.916396,3005.511481,2948.525441,2699.909989,2911.900687,2562.341762
4,P,F,J,A,D,E,K,G,B,H,...,2837.032428,2447.238321,2874.471697,2581.380533,3259.916396,3005.511481,3016.756872,2741.299127,3037.328947,2445.806699


In [22]:
cat_encoded2 = cat_encoded

In [23]:
cat_encoded2 = cat_encoded2.drop(cat_encoded2[catvarlist],axis=1)

In [24]:
cat_encoded2.head(5)

Unnamed: 0,_mean_x,_stdev_x,_mean_y,_stdev_y,_mean_x.1,_stdev_x.1,_mean_y.1,_stdev_y.1,_mean_x.2,_stdev_x.2,...,_mean_x.3,_stdev_x.3,_mean_y.2,_stdev_y.2,_mean_x.4,_stdev_x.4,_mean_y.3,_stdev_y.3,_mean,_stdev
0,3067.992359,2932.563972,3090.589334,2583.974161,3450.680947,2779.116912,2814.648335,2730.730664,3014.904158,3360.386503,...,3409.983392,3040.851201,2744.910924,2475.689837,3259.916396,3005.511481,2948.525441,2699.909989,2917.5232,2545.417315
1,3067.992359,2932.563972,4005.581714,3048.822161,3560.151861,2976.537533,2814.648335,2730.730664,2981.080942,2645.879582,...,2380.8509,2349.891371,2874.471697,2581.380533,3259.916396,3005.511481,2948.525441,2699.909989,3107.697517,2837.734327
2,3403.895737,3379.862054,4005.581714,3048.822161,6870.387172,4215.629488,3078.89028,2956.262034,2981.080942,2645.879582,...,3250.374479,3124.220044,2942.861834,2506.936938,3259.916396,3005.511481,2991.019257,3208.069463,2695.767964,2131.3728
3,3067.992359,2932.563972,1970.402509,1720.6643,2812.990306,2711.813767,2814.648335,2730.730664,2981.080942,2645.879582,...,3043.584628,2846.254422,2907.740038,2558.986877,3259.916396,3005.511481,2948.525441,2699.909989,2911.900687,2562.341762
4,2993.899862,2772.785165,3200.09894,2942.289968,4603.86379,3272.064591,2814.648335,2730.730664,2970.460095,2465.145125,...,2837.032428,2447.238321,2874.471697,2581.380533,3259.916396,3005.511481,3016.756872,2741.299127,3037.328947,2445.806699


In [25]:
onlystdev = cat_encoded2.filter(like='stdev', axis=1)
onlystdev.head(5)

Unnamed: 0,_stdev_x,_stdev_y,_stdev_x.1,_stdev_y.1,_stdev_x.2,_stdev_y.2,_stdev_x.3,_stdev_y.3,_stdev_x.4,_stdev_y.4,_stdev_x.5,_stdev_y.5,_stdev_x.6,_stdev_y.6,_stdev_x.7,_stdev_y.7,_stdev
0,2932.563972,2583.974161,2779.116912,2730.730664,3360.386503,2440.20316,2741.305845,3018.081009,2827.247746,2102.608022,2151.528924,3046.972088,3040.851201,2475.689837,3005.511481,2699.909989,2545.417315
1,2932.563972,3048.822161,2976.537533,2730.730664,2645.879582,2440.20316,2440.269065,2781.060283,2713.847698,3076.370083,3032.288518,2709.744506,2349.891371,2581.380533,3005.511481,2699.909989,2837.734327
2,3379.862054,3048.822161,4215.629488,2956.262034,2645.879582,2881.047093,2565.659203,2923.395751,2537.371903,1900.096355,2537.056515,2709.744506,3124.220044,2506.936938,3005.511481,3208.069463,2131.3728
3,2932.563972,1720.6643,2711.813767,2730.730664,2645.879582,2440.20316,2440.269065,2781.060283,2713.847698,3076.370083,2850.364645,3046.972088,2846.254422,2558.986877,3005.511481,2699.909989,2562.341762
4,2772.785165,2942.289968,3272.064591,2730.730664,2465.145125,2440.20316,2098.58023,2833.424198,2416.835931,1029.312229,1699.755843,3046.972088,2447.238321,2581.380533,3005.511481,2741.299127,2445.806699


In [26]:
stdev_names  = onlystdev.columns

In [27]:
onlymean = cat_encoded2.filter(like='mean', axis=1)
mean_names = onlymean.columns
mean_names

Index(['_mean_x', '_mean_y', '_mean_x', '_mean_y', '_mean_x', '_mean_y',
       '_mean_x', '_mean_y', '_mean_x', '_mean_y', '_mean_x', '_mean_y',
       '_mean_x', '_mean_y', '_mean_x', '_mean_y', '_mean'],
      dtype='object')

In [28]:
stdev_names.sort
mean_names.sort

<bound method Index.sort of Index(['_mean_x', '_mean_y', '_mean_x', '_mean_y', '_mean_x', '_mean_y',
       '_mean_x', '_mean_y', '_mean_x', '_mean_y', '_mean_x', '_mean_y',
       '_mean_x', '_mean_y', '_mean_x', '_mean_y', '_mean'],
      dtype='object')>

In [29]:
# Getting a dictionary based on training set encoding and mapping the same encoding to our test dataset


for i in range(len(catvarlist)):
    mydict = dict(zip(cat_encoded[catvarlist[i]], cat_encoded[mean_names[i]]))
    cat_encod_test[mean_names[i]] = cat_encod_test[catvarlist[i]].map(mydict)
    mydict2 = dict(zip(cat_encoded[catvarlist[i]], cat_encoded[stdev_names[i]]))
    cat_encod_test[stdev_names[i]] = cat_encod_test[catvarlist[i]].map(mydict2)

In [30]:
cat_encod_test.head(5)

Unnamed: 0,cat99,cat100,cat101,cat103,cat104,cat105,cat106,cat107,cat108,cat109,...,cat113,cat114,cat115,cat116,_mean_x,_stdev_x,_mean_y,_stdev_y,_mean,_stdev
0,T,H,G,A,G,E,I,L,K,BI,...,AX,A,Q,HG,_mean_x,_stdev_x,,,2709.464662,2334.144596
1,P,B,D,A,G,G,G,F,B,BI,...,X,A,L,HK,_mean_x,_stdev_x,,,3125.668896,3019.687107
2,D,G,Q,D,D,E,J,G,A,BI,...,AE,A,K,CK,_mean_x,_stdev_x,_mean_y,_stdev_y,3037.328947,2445.806699
3,T,G,A,D,E,E,I,K,K,BI,...,AJ,A,P,DJ,_mean_x,_stdev_x,_mean_y,_stdev_y,2911.900687,2562.341762
4,P,A,A,A,F,E,G,E,B,AB,...,I,C,J,HA,,,,,1995.420482,1296.275266
