In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
le = LabelEncoder()

In [2]:
def label_encode_dataset(dataset):
    for column in dataset.columns.values:
        dataset[column] = dataset[column].astype(str)
        le.fit(y=dataset[column])
        dataset[column] = le.transform(dataset[column])
    return dataset

In [3]:
#import the data I need
test_data = pd.read_csv('csv_files/test.csv')
animal_data = pd.read_csv('csv_files/train.csv')

#Drop columns that I don't need
animal_data = animal_data.drop(labels=['AnimalID', 'Name', 'DateTime', 'OutcomeSubtype'], axis=1)

#setting up the test data
test_index = test_data.ID
test_data = test_data.drop(labels=['ID', 'Name', 'DateTime'], axis=1)
test_data = label_encode_dataset(test_data)

#splitting up the training data so I can use it to test how well my predictions are coming along
train_animal_data, test_animal_data = train_test_split(animal_data, test_size= .25, random_state=53)

#since the evaluation for this kaggle competition will be in multi-class logloss I've popped the outcomes off of the
#dataset and changed the results into dummy variables
outcome_train_animal_data = train_animal_data.pop('OutcomeType')
le.fit(outcome_train_animal_data)
outcome_train_animal_data = le.transform(outcome_train_animal_data)

#repeat for the test data
outcome_test_animal_data = test_animal_data.pop('OutcomeType')
le.fit(outcome_test_animal_data)
outcome_test_animal_data = le.transform(outcome_test_animal_data)

train_animal_data = label_encode_dataset(train_animal_data)
test_animal_data = label_encode_dataset(test_animal_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
dtrain = xgb.DMatrix(train_animal_data, label=outcome_train_animal_data)
dtest = xgb.DMatrix(test_animal_data, label=outcome_test_animal_data)

In [5]:
parm = {'bst:max_depth':1, 'bst:eta':1, 'silent':3, 'objective':'multi:softprob', 'num_class':5, 'max_delta_step':2}
parm['nthread'] = 4
parm['eval_metric'] = 'mlogloss'

In [6]:
evallist = [(dtrain,'train'), (dtest,'eval')]

In [7]:
num_round = 100
bst = xgb.train(parm, dtrain, num_round, evallist, early_stopping_rounds=3)

Will train until eval error hasn't decreased in 3 rounds.
[0]	train-mlogloss:1.160471	eval-mlogloss:1.172114
[1]	train-mlogloss:1.051350	eval-mlogloss:1.064836
[2]	train-mlogloss:1.017548	eval-mlogloss:1.032481
[3]	train-mlogloss:1.000159	eval-mlogloss:1.016694
[4]	train-mlogloss:0.987283	eval-mlogloss:1.006241
[5]	train-mlogloss:0.978141	eval-mlogloss:0.996914
[6]	train-mlogloss:0.972597	eval-mlogloss:0.992939
[7]	train-mlogloss:0.968765	eval-mlogloss:0.989346
[8]	train-mlogloss:0.965750	eval-mlogloss:0.987218
[9]	train-mlogloss:0.962387	eval-mlogloss:0.983450
[10]	train-mlogloss:0.959825	eval-mlogloss:0.981719
[11]	train-mlogloss:0.956983	eval-mlogloss:0.978535
[12]	train-mlogloss:0.954717	eval-mlogloss:0.978012
[13]	train-mlogloss:0.952774	eval-mlogloss:0.977539
[14]	train-mlogloss:0.951135	eval-mlogloss:0.976389
[15]	train-mlogloss:0.949459	eval-mlogloss:0.975701
[16]	train-mlogloss:0.947103	eval-mlogloss:0.972953
[17]	train-mlogloss:0.945513	eval-mlogloss:0.972225
[18]	train-mlogl

In [8]:
dtest = xgb.DMatrix(test_data)
ypred = bst.predict(dtest, output_margin=False)

  preds = preds.reshape(nrow, preds.size / nrow)


In [9]:
ypred = pd.DataFrame(ypred)
ypred = ypred.rename(columns={0: 'Adoption', 1:'Died', 2:'Euthanasia', 3:'Return_to_owner', 4:'Transfer'})
ypred.index = test_index

In [10]:
ypred.to_csv('Predictions/XGBoost.csv')