# Overview 
From kaggle: 

Allstate is currently developing automated methods of predicting the cost, and hence severity, of claims. In this recruitment challenge, Kagglers are invited to show off their creativity and flex their technical chops by creating an algorithm which accurately predicts claims severity. Aspiring competitors will demonstrate insight into better ways to predict claims severity for the chance to be part of Allstate’s efforts to ensure a worry-free customer experience.

Evaluation:
Submissions are evaluated on the mean absolute error (MAE) between the predicted loss and the actual loss.

## Evaluation Metric: mean absolute error 

Mean absolute error calculates the expected value of the absolute error. 

$$\text{MAE}(y, \hat{y}) = \frac{1}{n}\sum_{i=0}^{n-1} | y_i - \hat{y}_i |$$

## Data 
Each row in this dataset represents an insurance claim. You must predict the value for the 'loss' column. Variables prefaced with 'cat' are categorical, while those prefaced with 'cont' are continuous.

File descriptions:

    train.csv - the training set
    test.csv - the test set. You must predict the loss value for the ids in this file.
    sample_submission.csv - a sample submission file in the correct format

In [3]:
import os 
# Create the local 'data' directory; exist_ok=True makes re-running this cell a no-op
os.makedirs('data', exist_ok=True)

Now download the zip file from [Kaggle](https://www.kaggle.com/c/allstate-claims-severity/data) and unzip it into the `data` directory, so that `train.csv`, `test.csv` and `sample_submission.csv` sit directly inside it.

In [4]:
ls data

[0m[01;31mallstate.zip[0m           [01;31msample_submission.csv.zip[0m  [01;31mtest.csv.zip[0m  [01;31mtrain.csv.zip[0m
sample_submission.csv  test.csv                   train.csv


In [46]:
# imports 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype


In [117]:
# helper functions 
def train_cats(df):
    """Turn every string-typed column of ``df`` into an ordered categorical, in place.

    Columns for which ``is_string_dtype`` is false are left untouched.
    Nothing is returned; ``df`` itself is modified.
    """
    for name, column in df.items():
        if not is_string_dtype(column):
            continue
        as_category = column.astype('category')
        df[name] = as_category.cat.as_ordered()

def encode_cat_codes(df):
    """Replace every categorical column of ``df`` with its integer category codes, in place.

    Non-categorical columns are left untouched. Nothing is returned; ``df``
    itself is modified.
    """
    for name, column in df.items():
        # isinstance on the dtype replaces is_categorical_dtype, which recent
        # pandas releases deprecate.
        if isinstance(column.dtype, pd.CategoricalDtype):
            df[name] = column.cat.codes

def split_vals(X, y, t=.7, seed=None):
    """Randomly split features and target into training and validation parts.

    Parameters
    ----------
    X : pandas DataFrame of features (indexed positionally via ``.iloc``).
    y : pandas Series of targets with the same length as ``X``.
    t : fraction of rows assigned to the training part (default 0.7).
    seed : optional int; when given, the shuffle is reproducible. When None
        the global NumPy random state is used, as before.

    Returns
    -------
    (X_train, X_valid, y_train, y_valid), where the X and y pieces of each
    part contain the same rows in the same order.
    """
    if seed is None:
        idx = np.random.permutation(len(X))
    else:
        idx = np.random.RandomState(seed).permutation(len(X))
    n = int(np.round(t * len(X)))
    train_idx, valid_idx = idx[:n], idx[n:]
    # The last element must come from y; the original returned X here, so the
    # "validation target" was actually a copy of the validation features.
    return X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]


# EDA

In [9]:
# Read in data
# The relative paths assume the Kaggle files were unzipped into ./data (see the `ls data` listing above)
train = pd.read_csv('data/train.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')
test = pd.read_csv('data/test.csv')

### Look at the head, info, and describe

In [11]:
train.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [12]:
test.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,4,A,B,A,A,A,A,A,A,B,...,0.281143,0.466591,0.317681,0.61229,0.34365,0.38016,0.377724,0.369858,0.704052,0.392562
1,6,A,B,A,B,A,A,A,A,B,...,0.836443,0.482425,0.44376,0.7133,0.5189,0.60401,0.689039,0.675759,0.453468,0.208045
2,9,A,B,A,B,B,A,B,A,B,...,0.718531,0.212308,0.325779,0.29758,0.34365,0.30529,0.24541,0.241676,0.258586,0.297232
3,12,A,A,A,A,B,A,A,A,A,...,0.397069,0.36993,0.342355,0.40028,0.33237,0.3148,0.348867,0.341872,0.592264,0.555955
4,15,B,A,A,A,A,B,A,A,A,...,0.302678,0.398862,0.391833,0.23688,0.43731,0.50556,0.359572,0.352251,0.301535,0.825823


In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188318 entries, 0 to 188317
Columns: 132 entries, id to loss
dtypes: float64(15), int64(1), object(116)
memory usage: 189.7+ MB


In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125546 entries, 0 to 125545
Columns: 131 entries, id to cont14
dtypes: float64(14), int64(1), object(116)
memory usage: 125.5+ MB


In [24]:
train.describe()

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,294135.982561,0.493861,0.507188,0.498918,0.491812,0.487428,0.490945,0.48497,0.486437,0.485506,0.498066,0.493511,0.49315,0.493138,0.495717,3037.337686
std,169336.084867,0.18764,0.207202,0.202105,0.211292,0.209027,0.205273,0.17845,0.19937,0.18166,0.185877,0.209737,0.209427,0.212777,0.222488,2904.086186
min,1.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.179722,0.67
25%,147748.25,0.34609,0.358319,0.336963,0.327354,0.281143,0.336105,0.350175,0.3128,0.35897,0.36458,0.310961,0.311661,0.315758,0.29461,1204.46
50%,294539.5,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.438285,0.44106,0.44145,0.46119,0.457203,0.462286,0.363547,0.407403,2115.57
75%,440680.5,0.623912,0.681761,0.634224,0.652072,0.643315,0.655021,0.591045,0.62358,0.56682,0.61459,0.678924,0.675759,0.689974,0.724623,3864.045
max,587633.0,0.984975,0.862654,0.944251,0.954297,0.983674,0.997162,1.0,0.9802,0.9954,0.99498,0.998742,0.998484,0.988494,0.844848,121012.25


### Check for missing values

In [31]:
 train.isnull().sum().max()

0

In [32]:
test.isnull().sum().max()

0

### Look at train.describe() for categorical columns

In [48]:
# Names of the columns stored with object dtype (the string-valued 'cat*' features).
# Iterating train.dtypes once avoids rebuilding the dtype dict for every column.
cat_cols = [col for col, dtype in train.dtypes.items() if dtype == 'O']

In [52]:
# Numeric columns; as the output below shows, this also includes 'id' and the 'loss' target
num_cols = train.select_dtypes(include='number').columns

In [54]:
num_cols

Index(['id', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
       'loss'],
      dtype='object')

In [57]:
train[cat_cols].describe().T

Unnamed: 0,count,unique,top,freq
cat1,188318,2,A,141550
cat2,188318,2,A,106721
cat3,188318,2,A,177993
cat4,188318,2,A,128395
cat5,188318,2,A,123737
...,...,...,...,...
cat112,188318,51,E,25148
cat113,188318,61,BM,26191
cat114,188318,19,A,131693
cat115,188318,23,K,43866


# Preprocessing
Since there are no missing values, no imputation is needed. We simply convert the string columns to pandas categorical columns and then replace each category with its integer code.


In [71]:
def train_cats(df):
    """Turn every string-typed column of ``df`` into an ordered categorical, in place.

    Columns for which ``is_string_dtype`` is false are left untouched.
    Nothing is returned; ``df`` itself is modified.

    Note: this function is also defined in the helper-functions cell near the
    top of the notebook; the later definition wins when cells run in order.
    """
    for name, column in df.items():
        if not is_string_dtype(column):
            continue
        as_category = column.astype('category')
        df[name] = as_category.cat.as_ordered()

In [77]:
def encode_cat_codes(df):
    """Replace every categorical column of ``df`` with its integer category codes, in place.

    Non-categorical columns are left untouched. Nothing is returned; ``df``
    itself is modified.

    Note: this function is also defined in the helper-functions cell near the
    top of the notebook; the later definition wins when cells run in order.
    """
    for name, column in df.items():
        # isinstance on the dtype replaces is_categorical_dtype, which recent
        # pandas releases deprecate.
        if isinstance(column.dtype, pd.CategoricalDtype):
            df[name] = column.cat.codes

In [78]:
train_cats(train)

NameError: name 'df' is not defined

In [79]:
encode_cat_codes(train)

In [81]:
train.dtypes

id          int64
cat1         int8
cat2         int8
cat3         int8
cat4         int8
           ...   
cont11    float64
cont12    float64
cont13    float64
cont14    float64
loss      float64
Length: 132, dtype: object

#  Modeling

## Get a train and validation set 

In [123]:
def split_vals(X, y, t=.7, seed=None):
    """Randomly split features and target into training and validation parts.

    Parameters
    ----------
    X : pandas DataFrame of features (indexed positionally via ``.iloc``).
    y : pandas Series of targets with the same length as ``X``.
    t : fraction of rows assigned to the training part (default 0.7).
    seed : optional int; when given, the shuffle is reproducible. When None
        the global NumPy random state is used, as before.

    Returns
    -------
    (X_train, X_valid, y_train, y_valid), where the X and y pieces of each
    part contain the same rows in the same order.

    Note: this function is also defined in the helper-functions cell near the
    top of the notebook; the later definition wins when cells run in order.
    """
    if seed is None:
        idx = np.random.permutation(len(X))
    else:
        idx = np.random.RandomState(seed).permutation(len(X))
    n = int(np.round(t * len(X)))
    train_idx, valid_idx = idx[:n], idx[n:]
    # The last element must come from y; the original returned X here, so the
    # "validation target" was actually a copy of the validation features.
    return X.iloc[train_idx], X.iloc[valid_idx], y.iloc[train_idx], y.iloc[valid_idx]

In [96]:
# Separate the predictors from the target column.
# NOTE(review): 'id' is kept in X as a feature — confirm that this is intended.
y = train['loss']
X = train.drop(columns='loss')

In [97]:
X.shape

(188318, 131)

In [98]:
y.shape

(188318,)

In [124]:
X_train, X_valid, y_train, y_valid = split_vals(X, y)

In [125]:
# Sanity check of the split sizes. The names must match the ones assigned by the
# split_vals call (X_valid / y_valid); X_val / y_val are never defined in this notebook.
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(131823, 131) (56495, 131) (131823,) (56495,)


### Define `mae(y, preds)` and `print_score(m)`

In [115]:
def mae(y, preds):
    """Mean absolute error between targets ``y`` and predictions ``preds``.

    Computed as ``(1/n) * sum(|y - preds|)`` with ``n = len(y)``, which is the
    competition's evaluation metric.
    """
    weight = 1 / len(y)
    abs_errors = np.abs(y - preds)
    return weight * np.sum(abs_errors)

In [None]:
def print_score(m):
    """Collect train/validation scores for a fitted model ``m``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` and uses the notebook's ``mae`` helper.

    Returns a list:
        [train MAE, validation MAE, m.score on train, m.score on validation]
    followed by ``m.oob_score_`` when the model exposes that attribute.
    """
    res = [mae(y_train, m.predict(X_train)), mae(y_valid, m.predict(X_valid)),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    # hasattr needs the attribute name as a string. The value to report is the
    # fitted `oob_score_`, not the boolean `oob_score` constructor flag.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    return res

### Train a few models to find good hyperparameters

In [137]:
# Small forest trained on absolute error, which matches the competition's MAE metric.
# NOTE(review): the fit cell below ends in KeyboardInterrupt, so this configuration did not finish training here.
# NOTE(review): scikit-learn 1.0 renamed criterion 'mae' to 'absolute_error' and 1.2 removed 'mae' — confirm the installed version.
# NOTE(review): oob_score=True with n_estimators=1 bases the out-of-bag estimate on a single tree — confirm this is intended.
m = RandomForestRegressor(n_estimators=1,max_depth=3, min_samples_split=1000, criterion='mae', oob_score=True, n_jobs=-1)

In [136]:
RandomForestRegressor??

In [138]:
m.fit(X_train, y_train)

KeyboardInterrupt: 