# Allstate Claims Severity

## Initialization

In [20]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

## Data preparation

In [4]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample = pd.read_csv('../data/sample_submission.csv')

In [5]:
train.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188318 entries, 0 to 188317
Columns: 132 entries, id to loss
dtypes: float64(15), int64(1), object(116)
memory usage: 189.7+ MB


In [7]:
train.columns

Index(['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'loss'],
      dtype='object', length=132)

In [8]:
test.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,4,A,B,A,A,A,A,A,A,B,...,0.281143,0.466591,0.317681,0.61229,0.34365,0.38016,0.377724,0.369858,0.704052,0.392562
1,6,A,B,A,B,A,A,A,A,B,...,0.836443,0.482425,0.44376,0.7133,0.5189,0.60401,0.689039,0.675759,0.453468,0.208045
2,9,A,B,A,B,B,A,B,A,B,...,0.718531,0.212308,0.325779,0.29758,0.34365,0.30529,0.24541,0.241676,0.258586,0.297232
3,12,A,A,A,A,B,A,A,A,A,...,0.397069,0.36993,0.342355,0.40028,0.33237,0.3148,0.348867,0.341872,0.592264,0.555955
4,15,B,A,A,A,A,B,A,A,A,...,0.302678,0.398862,0.391833,0.23688,0.43731,0.50556,0.359572,0.352251,0.301535,0.825823


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125546 entries, 0 to 125545
Columns: 131 entries, id to cont14
dtypes: float64(14), int64(1), object(116)
memory usage: 125.5+ MB


In [10]:
test.columns

Index(['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11',
       'cont12', 'cont13', 'cont14'],
      dtype='object', length=131)

In [11]:
train.cat5.value_counts()

A    123737
B     64581
Name: cat5, dtype: int64

In [12]:
sample.head()

Unnamed: 0,id,loss
0,4,0
1,6,0
2,9,0
3,12,0
4,15,0


In [14]:
outcome = train.loss
train.drop(['id', 'loss'], axis=1, inplace=True)
testId = test['id']
test.drop(['id'], axis=1, inplace=True)
train['Test'] = False
test['Test'] = True
data = pd.concat([train, test], ignore_index=True)

In [15]:
data = pd.get_dummies(data, columns=data.columns[:116])

In [16]:
data.columns

Index(['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8',
       'cont9', 'cont10',
       ...
       'cat116_P', 'cat116_Q', 'cat116_R', 'cat116_S', 'cat116_T', 'cat116_U',
       'cat116_V', 'cat116_W', 'cat116_X', 'cat116_Y'],
      dtype='object', length=1191)

## Machine Learning

In [17]:
train = data[data.Test == 0].drop('Test', axis=1).copy()
test = data[data.Test == 1].drop('Test', axis=1).copy()
del data

In [18]:
X = train
y = outcome
Xtr, Xval, ytr, yval = train_test_split(X, y, test_size=0.15, random_state=777)

In [19]:
Xtr.shape, Xval.shape

((160070, 1190), (28248, 1190))

### Linear regression

In [None]:
lr = LinearRegression()

In [112]:
lr.fit(Xtr,ytr)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [128]:
lr_pred_val = lr.predict(Xval)

In [129]:
lr_mae = mean_absolute_error(yval, lr_pred_val, sample_weight=None, multioutput='uniform_average')

In [130]:
lr_mae

16111576285.918974

In [131]:
#print('Train correct prediction: ', lr.score(Xtr, ytr))
#print('Validation correct prediction: ', lr.score(Xval, yval))

In [134]:
lr_pred = lr.predict(test)

In [135]:
lr_submit = pd.DataFrame({'id': testId, 'loss': lr_pred})

In [136]:
lr_submit.head()

Unnamed: 0,id,loss
0,4,1156.0
1,6,2150.0
2,9,11316.0
3,12,5971.0
4,15,237.0


In [138]:
lr_submit.to_csv('../output/2016-11-11-bff-linear-regression.csv', index=False)

### Random forest

In [150]:
rf1 = RandomForestRegressor(criterion="mae")

In [None]:
rf = RandomForestRegressor(n_estimators = 100, criterion="mae")

In [None]:
rf1.fit(Xtr, ytr)

In [None]:
rf_pred_val = rf.predict(Xval)

### XGBoost

In [35]:
# read in data
dtrain = xgb.DMatrix(X, label=y)
dtest = xgb.DMatrix(test)

In [41]:
# specify parameters via map
#param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'reg:linear' }
param = { 'eval_metric':'mae', 'objective':'reg:linear' }
num_round = 2

In [42]:
bst = xgb.train(param, dtrain, num_round)

XGBoostError: b'unknown evaluation metric type: mae'

In [30]:
# make prediction
xgb_pred = bst.predict(dtest)

In [31]:
xgb_submit = pd.DataFrame({'id': testId, 'loss': xgb_pred})

In [32]:
xgb_submit.head()

Unnamed: 0,id,loss
0,4,1011.44043
1,6,1011.44043
2,9,5153.730469
3,12,3075.475342
4,15,741.963867


In [34]:
xgb_submit.to_csv('../output/2016-11-13-bff-xgboost-basic.csv', index=False)