In [1]:
import time

from IPython.core.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='Osaka')

In [4]:
# Load the train and test CSV files and stack them into one frame

data_dir = '../data/'
train_data = 'train.csv'
test_data = 'test.csv'

# perf_counter() is a monotonic clock intended for measuring elapsed time,
# unlike time.time(), which follows the adjustable system clock.
t0 = time.perf_counter()
df_train = pd.read_csv(data_dir + train_data)
df_test = pd.read_csv(data_dir + test_data)
print('Time = ', time.perf_counter() - t0)

print(df_train.shape)
print(df_test.shape)

# Row-wise stack of train and test. The original row labels are kept, so
# index values repeat in the combined frame.
df_train_test = pd.concat([df_train, df_test])
print(df_train_test.shape)

(188318, 132)
(125546, 131)


In [20]:
# Preview the first five rows of the training data
df_train.head(n=5)

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [22]:
# Count the categorical and numerical features.
# Columns are selected by name prefix; a substring test would also pick up
# any column that merely contains 'cat' or 'cont' somewhere in its name.
categorical_features = [col for col in df_train.columns if col.startswith('cat')]
print('# of categorical features: ', len(categorical_features))

numerical_features = [col for col in df_train.columns if col.startswith('cont')]
print('# of numerical features: ', len(numerical_features))

# of categorical features:  116
# of numerical features:  14


## Overview of the target variable `loss`

In [23]:
# Summary statistics of the target column
df_train.loc[:, 'loss'].describe()

count    188318.000000
mean       3037.337686
std        2904.086186
min           0.670000
25%        1204.460000
50%        2115.570000
75%        3864.045000
max      121012.250000
Name: loss, dtype: float64

## Overview of the categorical features

In [19]:
# Print the distinct values of every categorical feature in the combined frame
for cat_feature in categorical_features:
    print(cat_feature, pd.unique(df_train_test[cat_feature]))


# NOTE:
# cat1 ~ cat72: A, B
# cat73 ~ cat76: A, B, C
# cat77 ~ cat87: A through D
# cat88: A, B, D, E
# cat89 ~ cat108, cat111, cat114, cat115: five or more categories, single-letter codes
# cat109, cat110, cat112, cat113, cat116: two-letter codes; codes of the form *Z
#   basically do not appear (only cat109 has ZZ)

cat1 ['A' 'B']
cat2 ['B' 'A']
cat3 ['A' 'B']
cat4 ['B' 'A']
cat5 ['A' 'B']
cat6 ['A' 'B']
cat7 ['A' 'B']
cat8 ['A' 'B']
cat9 ['B' 'A']
cat10 ['A' 'B']
cat11 ['B' 'A']
cat12 ['A' 'B']
cat13 ['A' 'B']
cat14 ['A' 'B']
cat15 ['A' 'B']
cat16 ['A' 'B']
cat17 ['A' 'B']
cat18 ['A' 'B']
cat19 ['A' 'B']
cat20 ['A' 'B']
cat21 ['A' 'B']
cat22 ['A' 'B']
cat23 ['B' 'A']
cat24 ['A' 'B']
cat25 ['A' 'B']
cat26 ['A' 'B']
cat27 ['A' 'B']
cat28 ['A' 'B']
cat29 ['A' 'B']
cat30 ['A' 'B']
cat31 ['A' 'B']
cat32 ['A' 'B']
cat33 ['A' 'B']
cat34 ['A' 'B']
cat35 ['A' 'B']
cat36 ['A' 'B']
cat37 ['A' 'B']
cat38 ['A' 'B']
cat39 ['A' 'B']
cat40 ['A' 'B']
cat41 ['A' 'B']
cat42 ['A' 'B']
cat43 ['A' 'B']
cat44 ['A' 'B']
cat45 ['A' 'B']
cat46 ['A' 'B']
cat47 ['A' 'B']
cat48 ['A' 'B']
cat49 ['A' 'B']
cat50 ['A' 'B']
cat51 ['A' 'B']
cat52 ['A' 'B']
cat53 ['A' 'B']
cat54 ['A' 'B']
cat55 ['A' 'B']
cat56 ['A' 'B']
cat57 ['A' 'B']
cat58 ['A' 'B']
cat59 ['A' 'B']
cat60 ['A' 'B']
cat61 ['A' 'B']
cat62 ['A' 'B']
cat63 ['A' 'B']
c