In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
le = LabelEncoder()

In [2]:
def label_encode_dataset(dataset):
    for column in dataset.columns.values:
        dataset[column] = dataset[column].astype(str)
        le.fit(y=dataset[column])
        dataset[column] = le.transform(dataset[column])
    return dataset

In [3]:
#import the data I need
test_data = pd.read_csv('csv_files/test.csv')
animal_data = pd.read_csv('csv_files/train.csv')

#Drop columns that I don't need
animal_data = animal_data.drop(labels=['AnimalID', 'Name', 'DateTime', 'OutcomeSubtype'], axis=1)

#setting up the test data
test_index = test_data.ID
test_data = test_data.drop(labels=['ID', 'Name', 'DateTime'], axis=1)
test_data = label_encode_dataset(test_data)

#splitting up the training data so I can use it to test how well my predictions are coming along
train_animal_data, test_animal_data = train_test_split(animal_data, test_size= .25, random_state=53)

#since the evaluation for this kaggle competition will be in multi-class logloss I've popped the outcomes off of the
#dataset and changed the results into dummy variables
outcome_train_animal_data = train_animal_data.pop('OutcomeType')
le.fit(outcome_train_animal_data)
outcome_train_animal_data = le.transform(outcome_train_animal_data)

#repeat for the test data
outcome_test_animal_data = test_animal_data.pop('OutcomeType')
le.fit(outcome_test_animal_data)
outcome_test_animal_data = le.transform(outcome_test_animal_data)

train_animal_data = label_encode_dataset(train_animal_data)
test_animal_data = label_encode_dataset(test_animal_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
dtrain = xgb.DMatrix(train_animal_data, label=outcome_train_animal_data)
dtest = xgb.DMatrix(test_animal_data, label=outcome_test_animal_data)

In [5]:
parm = {'bst:max_depth':1, 'bst:eta':.1, 'silent':5, 'objective':'multi:softprob', 'num_class':5, 'max_delta_step':8}
parm['nthread'] = 10
parm['eval_metric'] = 'mlogloss'

In [6]:
evallist = [(dtrain,'train'), (dtest,'eval')]

In [7]:
num_round = 1000
bst = xgb.train(parm, dtrain, num_round, evallist, early_stopping_rounds=3)

Will train until eval error hasn't decreased in 3 rounds.
[0]	train-mlogloss:1.541485	eval-mlogloss:1.542814
[1]	train-mlogloss:1.484621	eval-mlogloss:1.486820
[2]	train-mlogloss:1.435884	eval-mlogloss:1.439115
[3]	train-mlogloss:1.393712	eval-mlogloss:1.397406
[4]	train-mlogloss:1.356622	eval-mlogloss:1.361093
[5]	train-mlogloss:1.323951	eval-mlogloss:1.329154
[6]	train-mlogloss:1.295218	eval-mlogloss:1.300867
[7]	train-mlogloss:1.269649	eval-mlogloss:1.275866
[8]	train-mlogloss:1.246802	eval-mlogloss:1.253438
[9]	train-mlogloss:1.226371	eval-mlogloss:1.233467
[10]	train-mlogloss:1.207724	eval-mlogloss:1.215178
[11]	train-mlogloss:1.191138	eval-mlogloss:1.198808
[12]	train-mlogloss:1.175664	eval-mlogloss:1.183908
[13]	train-mlogloss:1.162032	eval-mlogloss:1.170411
[14]	train-mlogloss:1.149248	eval-mlogloss:1.158058
[15]	train-mlogloss:1.137991	eval-mlogloss:1.146963
[16]	train-mlogloss:1.127539	eval-mlogloss:1.136628
[17]	train-mlogloss:1.117904	eval-mlogloss:1.127468
[18]	train-mlogl

In [8]:
dtest = xgb.DMatrix(test_data)
ypred = bst.predict(dtest, output_margin=False)

  preds = preds.reshape(nrow, preds.size / nrow)


In [9]:
ypred = pd.DataFrame(ypred)
ypred = ypred.rename(columns={0: 'Adoption', 1:'Died', 2:'Euthanasia', 3:'Return_to_owner', 4:'Transfer'})
ypred.index = test_index

In [10]:
ypred.to_csv('Predictions/XGBoost.csv')