### Load the dataset

In [303]:
import numpy as np
import pandas as pd

In [304]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.columns = [_.lower() for _ in train.columns]
test.columns = [_.lower() for _ in test.columns]
print('train.shape: {}'.format(train.shape))
print('test.shape: {}'.format(test.shape))

train.shape: (26729, 10)
test.shape: (11456, 8)


### Fill in missing values & Feature engineering

#### Drop the useless attribute outcomesubtype

In [305]:
train = train.drop('outcomesubtype', axis=1)

#### Combine the training and testing data for easy processing

In [306]:
tmp_train = train.drop(['animalid', 'outcometype'], axis=1)
tmp_test = test.drop('id', axis=1)
sep = tmp_train.shape[0]
combine = pd.concat([tmp_train, tmp_test], axis=0)
print('combine.shape: {}'.format(combine.shape))

combine.shape: (38185, 7)


In [307]:
combine.head()

Unnamed: 0,name,datetime,animaltype,sexuponoutcome,ageuponoutcome,breed,color
0,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


#### Create the hasname attribute and drop the original name

In [308]:
combine['hasname'] = combine.name.notnull().astype(int)
combine = combine.drop('name', axis=1)

#### Get the time attributes and drop the original datetime

In [309]:
combine['datetime'] = pd.to_datetime(combine.datetime)
combine['year'] = combine.datetime.dt.year.astype('category')
combine['month'] = combine.datetime.dt.month.astype('category')
combine['day'] = combine.datetime.dt.day.astype('category')
combine['hour'] = combine.datetime.dt.hour.astype('category')
combine = combine.drop('datetime', axis=1)

#### Use 'intact' to replace 'sexuponoutcome'

In [310]:
combine['intact'] = combine.sexuponoutcome.fillna('Unknown').map({
    'Neutered Male': 0,
    'Spayed Female': 0,
    'Intact Male': 1,
    'Intact Female': 1,
    'Unknown': 1
})
combine = combine.drop('sexuponoutcome', axis=1)

#### Transfer animaltype into a binary attribute, use 1 to represent dog

In [311]:
combine['animaltype'] = combine.animaltype.apply(lambda x: int(x=='Dog'))

#### Transfer sexuponoutcome into days and fill in the missing values

In [312]:
def age_transfer(age_str):
    num = age_str.split(' ')[0]
    num = int(num)
    if 'day' in age_str:
        return num
    elif 'week' in age_str:
        return num * 7
    elif 'month' in age_str:
        return num * 30
    elif 'year' in age_str:
        return num * 365
    else:
        return np.nan
    
combine['ageuponoutcome'] = combine.ageuponoutcome.fillna('0').apply(age_transfer)
combine['ageuponoutcome'] = combine.groupby(['animaltype', 'intact']).ageuponoutcome.transform(lambda x: x.fillna(x.median()))

#### Extract some attributes from breed

In [313]:
# Hair
combine['hair'] = combine.breed.str.extract(r'(\w+)\s?[hH]air', expand=False).fillna('Unclear').astype('category')

# Mix
combine['mix'] = combine.breed.str.contains('Mix').astype(int)

# Domestic
combine['domestic'] = combine.breed.str.contains('Domestic').astype(int)
    
# Drop the original
combine = combine.drop('breed', axis=1)

#### Create attribute 'morethanonecolor' from color and drop color attribute

In [314]:
combine['morethanonecolor'] = combine.color.str.contains('/').astype(int)

combine = combine.drop('color', axis=1)

#### Turn categorical attributes into dummy variables

In [315]:
combine = pd.get_dummies(combine)

In [316]:
combine.head()

Unnamed: 0,animaltype,ageuponoutcome,hasname,intact,mix,domestic,morethanonecolor,year_2013,year_2014,year_2015,...,hour_20,hour_21,hour_22,hour_23,hair_Long,hair_Medium,hair_Mexican,hair_Short,hair_Unclear,hair_Wire
0,1,365.0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,365.0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,730.0,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,21.0,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1,730.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


### Prepare the data for modeling

In [317]:
X_train = combine[:sep]
y_train = train['outcometype']
X_test = combine[sep:]

In [318]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

### Set up the metric

In [319]:
mt = 'neg_log_loss'
kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)

In [320]:
def performance(model):
    scores = - cross_val_score(model, X_train, y_train, cv=kf, scoring=mt, n_jobs=4)
    print('score mean: {:.4f}'.format(scores.mean()))
    print('score std: {:.4f}'.format(scores.std()))
    
def grid_search(model, params):
    grid = GridSearchCV(model, params, cv=kf, scoring=mt, verbose=True, n_jobs=4).fit(X_train, y_train)
    print('grid.best_score_:{:.4f}'.format(- grid.best_score_))
    print('grid.best_params_:\n{}'.format(grid.best_params_))
    return grid.best_estimator_

### Get the baseline

In [335]:
from lightgbm.sklearn import LGBMClassifier
from xgboost.sklearn import XGBClassifier

In [333]:
%%time
performance(LGBMClassifier())

score mean: 0.7654
score std: 0.0152
Wall time: 19.1 s


In [334]:
%%time
performance(RandomForestClassifier(n_estimators=100))

score mean: 0.9150
score std: 0.0444
Wall time: 26.4 s


In [336]:
%%time
performance(XGBClassifier())

score mean: 0.8021
score std: 0.0147
Wall time: 3min 54s


### Search the best parameters

In [326]:
params = {
    'n_estimators': [100],
    'max_depth': np.arange(2, 10, 1),
    'random_state': [0]
}

clf = grid_search(RandomForestClassifier(), params)

Fitting 8 folds for each of 8 candidates, totalling 64 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   38.8s
[Parallel(n_jobs=4)]: Done  64 out of  64 | elapsed:  1.1min finished


grid.best_score_:0.8737
grid.best_params_:
{'max_depth': 9, 'n_estimators': 100, 'random_state': 0}


### Feature importance

In [323]:
# feature_importance = clf.fit(X_train, y_train).feature_importances_
# feature_importance = pd.Series(feature_importance, index=X_train.columns).reset_index()
# feature_importance.columns = ['feature', 'importance']
# feature_importance = feature_importance.sort_values('importance', ascending=False).reset_index(drop=True)

### Make submission

In [324]:
# test_id = pd.Series(test.id.values, name='ID')
# y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
# y_pred = pd.DataFrame(y_pred, columns=clf.classes_)
# sub = pd.concat([test_id, y_pred], axis=1)

In [325]:
# sub.to_csv('sub.csv', index=False)