### Load the dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.columns = [_.lower() for _ in train.columns]
test.columns = [_.lower() for _ in test.columns]
print('train.shape: {}'.format(train.shape))
print('test.shape: {}'.format(test.shape))

train.shape: (26729, 10)
test.shape: (11456, 8)


### Fill in missing values & Feature engineering

#### Drop the useless attribute outcomesubtype

In [3]:
train = train.drop('outcomesubtype', axis=1)

#### Combine the training and testing data for easy processing

In [4]:
tmp_train = train.drop(['animalid', 'outcometype'], axis=1)
tmp_test = test.drop('id', axis=1)
sep = tmp_train.shape[0]
combine = pd.concat([tmp_train, tmp_test], axis=0)
print('combine.shape: {}'.format(combine.shape))

combine.shape: (38185, 7)


In [5]:
combine.head()

Unnamed: 0,name,datetime,animaltype,sexuponoutcome,ageuponoutcome,breed,color
0,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


#### Create the hasname attribute and drop the original name

In [6]:
combine['hasname'] = combine.name.notnull().astype(int)
combine = combine.drop('name', axis=1)

#### Get the time attributes and drop the original datetime

In [7]:
combine['datetime'] = pd.to_datetime(combine.datetime)
combine['year'] = combine.datetime.dt.year.astype('category')
combine['month'] = combine.datetime.dt.month.astype('category')
combine['day'] = combine.datetime.dt.day.astype('category')
combine['hour'] = combine.datetime.dt.hour.astype('category')
combine = combine.drop('datetime', axis=1)

#### Use 'intact' to replace 'sexuponoutcome'

In [8]:
combine['intact'] = combine.sexuponoutcome.fillna('Unknown').map({
    'Neutered Male': 0,
    'Spayed Female': 0,
    'Intact Male': 1,
    'Intact Female': 1,
    'Unknown': 1
})
combine = combine.drop('sexuponoutcome', axis=1)

#### Transfer animaltype into a binary attribute, use 1 to represent dog

In [9]:
combine['animaltype'] = combine.animaltype.apply(lambda x: int(x=='Dog'))

#### Transfer sexuponoutcome into days and fill in the missing values

In [10]:
def age_transfer(age_str):
    num = age_str.split(' ')[0]
    num = int(num)
    if 'day' in age_str:
        return num
    elif 'week' in age_str:
        return num * 7
    elif 'month' in age_str:
        return num * 30
    elif 'year' in age_str:
        return num * 365
    else:
        return np.nan
    
combine['ageuponoutcome'] = combine.ageuponoutcome.fillna('0').apply(age_transfer)
combine['ageuponoutcome'] = combine.groupby(['animaltype', 'intact']).ageuponoutcome.transform(lambda x: x.fillna(x.median()))

#### Extract some attributes from breed

In [11]:
# Hair
combine['hair'] = combine.breed.str.extract(r'(\w+)\s?[hH]air', expand=False).fillna('Unclear').astype('category')

# Mix
combine['mix'] = combine.breed.str.contains('Mix').astype(int)

# Domestic
combine['domestic'] = combine.breed.str.contains('Domestic').astype(int)
    
# Drop the original
combine = combine.drop('breed', axis=1)

#### Create attribute 'morethanonecolor' from color and drop color attribute

In [12]:
combine['morethanonecolor'] = combine.color.str.contains('/').astype(int)

combine = combine.drop('color', axis=1)

#### Turn categorical attributes into dummy variables

In [13]:
combine = pd.get_dummies(combine)

In [14]:
combine.head()

Unnamed: 0,animaltype,ageuponoutcome,hasname,intact,mix,domestic,morethanonecolor,year_2013,year_2014,year_2015,...,hour_20,hour_21,hour_22,hour_23,hair_Long,hair_Medium,hair_Mexican,hair_Short,hair_Unclear,hair_Wire
0,1,365.0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,365.0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,730.0,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,21.0,0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1,730.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


### Prepare the data for modeling

In [15]:
X_train = combine[:sep]
y_train = train['outcometype']
X_test = combine[sep:]

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

### Set up the metric

In [17]:
mt = 'neg_log_loss'
kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)

In [18]:
def performance(model):
    scores = - cross_val_score(model, X_train, y_train, cv=kf, scoring=mt, n_jobs=4)
    print('score mean: {:.4f}'.format(scores.mean()))
    print('score std: {:.4f}'.format(scores.std()))
    
def grid_search(model, params):
    grid = GridSearchCV(model, params, cv=kf, scoring=mt, verbose=True, n_jobs=4).fit(X_train, y_train)
    print('grid.best_score_:{:.4f}'.format(- grid.best_score_))
    print('grid.best_params_:\n{}'.format(grid.best_params_))
    return grid.best_estimator_

### Get the baselines

In [19]:
%%time
performance(LGBMClassifier())

score mean: 0.7654
score std: 0.0152
Wall time: 33.9 s


In [20]:
%%time
performance(XGBClassifier())

score mean: 0.8021
score std: 0.0147
Wall time: 4min 53s


In [21]:
%%time
performance

Wall time: 0 ns


<function __main__.performance(model)>

### Search the best parameters

In [22]:
params = {
    'random_state': [0],
    'n_estimators': [500],
    'learning_rate': [0.06],
    'num_leaves': [42],
    'colsample_bytree': [0.8],
    'reg_lambda': [0.01],
    'reg_alpha': [1.0]
}

clf = grid_search(LGBMClassifier(), params)

Fitting 8 folds for each of 9 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  9.4min
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed: 14.9min finished


grid.best_score_:0.7589
grid.best_params_:
{'colsample_bytree': 0.8, 'learning_rate': 0.060000000000000005, 'n_estimators': 500, 'num_leaves': 42, 'random_state': 0, 'reg_alpha': 1.0, 'reg_lambda': 0.01}


### Feature importance

In [26]:
feature_importance = clf.fit(X_train, y_train).feature_importances_
feature_importance = pd.Series(feature_importance, index=X_train.columns).reset_index()
feature_importance.columns = ['feature', 'importance']
feature_importance = feature_importance.sort_values('importance', ascending=False).reset_index(drop=True)

In [29]:
feature_importance.head(10)

Unnamed: 0,feature,importance
0,ageuponoutcome,30482
1,hasname,2129
2,animaltype,2112
3,year_2015,2110
4,year_2014,1935
5,intact,1888
6,hair_Unclear,1782
7,morethanonecolor,1689
8,mix,1497
9,hour_18,1489


### Make submission

In [30]:
test_id = pd.Series(test.id.values, name='ID')
y_pred = clf.fit(X_train, y_train).predict_proba(X_test)
y_pred = pd.DataFrame(y_pred, columns=clf.classes_)
sub = pd.concat([test_id, y_pred], axis=1)

In [31]:
# sub.to_csv('sub.csv', index=False)