# Kaggle Animal Shelter Data Exploration

In [2]:
# Import modules

import pandas as pd
import numpy as np
import os
import sys
import random
import copy
import cPickle as pickle
import datetime

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import colorlover as cl
import matplotlib.pyplot as plt 

src_dir = os.path.join(os.getcwd(), os.pardir)
sys.path.append(src_dir)

from sklearn.model_selection import ShuffleSplit
from scipy.stats import spearmanr
from scipy import stats
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from scipy.stats import boxcox
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import src.plotting_methods as pm
#import imp
#pm = imp.load_source('pm', '/Volumes/Work/Projects/Kaggle_Animal_Shelter/src/plotting_methods.py')

init_notebook_mode(connected=True)

%reload_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.4f}'.format

In [3]:
# Load data

#raw_data_dir = 'C:\Users\Colleen\Documents\Kaggle Animal Shelter\data'
raw_data_dir = '/Volumes/Work/Projects/Kaggle_Animal_Shelter/data'

f = open(os.path.join(raw_data_dir, 'train.csv'), 'r')
train_data = pd.read_csv(f)
f.close()


## Data Cleaning and Basic Feature Engineering

In [14]:
train_data.columns

Index([u'AnimalID', u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype',
       u'AnimalType', u'SexuponOutcome', u'AgeuponOutcome', u'Breed',
       u'Color'],
      dtype='object')

In [16]:
'Number of samples: ' + str(train_data.shape[0])

'Number of samples: 26729'

In [17]:
train_data.dtypes

AnimalID          object
Name              object
DateTime          object
OutcomeType       object
OutcomeSubtype    object
AnimalType        object
SexuponOutcome    object
AgeuponOutcome    object
Breed             object
Color             object
dtype: object

Every feature is likely a string type, but based on the column names we can convert some into other types.

Number of unique values for each feature:

In [19]:
train_data.apply(lambda x: len(np.unique(x)))

AnimalID          26729
Name              14065
DateTime          22918
OutcomeType           5
OutcomeSubtype    13628
AnimalType            2
SexuponOutcome        6
AgeuponOutcome       62
Breed              1380
Color               366
dtype: int64

Now we go through each feature to clean and convert to other types

## DateTime

We can convert these into datetime object and look at the distribution.

In [4]:
train_data['DateTime'].head()

0    2014-02-12 18:22:00
1    2013-10-13 12:44:00
2    2015-01-31 12:28:00
3    2014-07-11 19:09:00
4    2013-11-15 12:52:00
Name: DateTime, dtype: object

In [4]:
times = [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in train_data['DateTime']]

In [6]:
iplot([go.Histogram(x = times)])

There doesn't seem to be any outliers or weird values, and the date range agrees with what's given in the competition description. In addition there appears to be spikes in events in the summer, which we'll investigate later.

## OutcomeType

This should be the target. We print the unique values to check if they fit the data description. Right away we can see the target variable is unbalanced.

In [32]:
np.unique(train_data['OutcomeType'])

array(['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'], dtype=object)

In [41]:
train_data['OutcomeType'].groupby(train_data['OutcomeType']).count().append(pd.Series({'nan': len(train_data['OutcomeType']) - train_data['OutcomeType'].count()}))

Adoption           10769
Died                 197
Euthanasia          1555
Return_to_owner     4786
Transfer            9422
nan                    0
dtype: int64

## OutcomeSubtype

Printing the unique values shows there are missing values here.

Re-printing them, remove missing values gives a much more mangeable number.  This variable is clearly categorical.  We may be able to merge some of these categories, but we need to be careful to not remove any information. We'll leave it for now.

In [40]:
train_data['OutcomeSubtype'].groupby(train_data['OutcomeSubtype']).count().append(pd.Series({'nan': len(train_data['OutcomeSubtype']) - train_data['OutcomeSubtype'].count()}))

Aggressive               320
At Vet                     4
Barn                       2
Behavior                  86
Court/Investigation        6
Enroute                    8
Foster                  1800
In Foster                 52
In Kennel                114
In Surgery                 3
Medical                   66
Offsite                  165
Partner                 7816
Rabies Risk               74
SCRP                    1599
Suffering               1002
nan                    13612
dtype: int64

In [5]:
def plot_cnts(feats1, levels1 = None, feats2, levels2 = None):
    
    if levels1 == None:
        f1 = feats1
    else:
        f1 = feats1.loc[np.array([x in levels1 for x in feats1])]
    cnts1 = f1.groupby(f1).count()
        
    if levels2 == None:
        f2 = feats2
    else:
        f2 = feats2.loc[np.array([x in levels2 for x in feats2])]     
    cnts2 = f2.groupby(f2).count()
    
    return [go.Bar(x = cnts1)]

SyntaxError: non-default argument follows default argument (<ipython-input-5-e7f7e71c0c0b>, line 1)

## AnimalType

This one is categorical and simple - cat or dog

In [35]:
np.unique(train_data['AnimalType'])

array(['Cat', 'Dog'], dtype=object)

In [42]:
train_data['AnimalType'].groupby(train_data['AnimalType']).count().append(pd.Series({'nan': len(train_data['AnimalType']) - train_data['AnimalType'].count()}))

Cat    11134
Dog    15595
nan        0
dtype: int64

In [86]:
pd.crosstab(index=train_data['AnimalType'], columns=train_data['OutcomeType'])

OutcomeType,Adoption,Died,Euthanasia,Return_to_owner,Transfer
AnimalType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cat,4272,147,710,500,5505
Dog,6497,50,845,4286,3917


## SexuponOutcome

This one is also categorical, indicating spayed/neutered status.  The missing value and 'Unknown' could likely be merged. We can also separate in two main categories: Sex and Is fixed

In [36]:
np.unique(train_data['SexuponOutcome'])

array([nan, 'Intact Female', 'Intact Male', 'Neutered Male',
       'Spayed Female', 'Unknown'], dtype=object)

In [43]:
train_data['SexuponOutcome'].groupby(train_data['SexuponOutcome']).count().append(pd.Series({'nan': len(train_data['SexuponOutcome']) - train_data['SexuponOutcome'].count()}))

Intact Female    3511
Intact Male      3525
Neutered Male    9779
Spayed Female    8820
Unknown          1093
nan                 1
dtype: int64

In [6]:
sex = [int('Female' in x) if isinstance(x, str) else np.nan for x in train_data['SexuponOutcome']]
is_fixed = [int('Intact' not in x) if isinstance(x, str) else np.nan for x in train_data['SexuponOutcome']]

## AgeuponOutcome 

This variable can be converted to nominal, using the number of days as the measure of age

In [8]:
np.unique(train_data['AgeuponOutcome'])

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, '0 years', '1 day', '1 month', '1 week',
       '1 weeks', '1 year', '10 months', '10 years', '11 months',
       '11 years', '12 years', '13 years', '14 years', '15 years',
       '16 years', '17 years', '18 years', '19 years', '2 days',
       '2 months', '2 weeks', '2 years', '20 years', '3 days', '3 months',
       '3 weeks', '3 years', '4 days', '4 months', '4 weeks', '4 years',
       '5 days', '5 months', '5 weeks', '5 years', '6 days', '6 months',
       '6 years', '7 months', '7 years', '8 months', '8 years',
       '9 months', '9 years'], dtype=object)

In [7]:
def get_age(s):
    
    if (s == 'nan') or (isinstance(s, float) and np.isnan(s)):
        return s
    
    n = s.split(' ')
    num = int(n[0])
    time = n[1]
    if num == 0:
        return 0
    elif 'year' in time:
        return num * 365
    elif 'month' in time:
        return num * 30
    else:
        return num

In [8]:
ages = [get_age(s) for s in train_data['AgeuponOutcome']]

In [11]:
iplot([go.Histogram(x = ages)])

There are a range of ages but its quite skewed.

## Breed

There are a lot of categories here. We definitely need to compress these or come up with other features based on it.

In [62]:
x = 'Breed'
cnts = train_data[x].groupby(train_data[x]).count().append(pd.Series({'nan': len(train_data[x]) - train_data[x].count()}))

In [70]:
len(cnts)

1381

In [68]:
cnts.sort_values()

nan                                                   0
Newfoundland/Queensland Heeler                        1
Newfoundland/Great Pyrenees                           1
Newfoundland/Border Collie                            1
Newfoundland/Australian Cattle Dog                    1
Chinese Crested/Chihuahua Longhair                    1
Chinese Crested/Papillon                              1
Munchkin Longhair Mix                                 1
Miniature Schnauzer/Whippet                           1
Chihuahua Shorthair/Smooth Fox Terrier                1
Chinese Sharpei/Airedale Terrier                      1
Chinese Sharpei/Basset Hound                          1
Miniature Schnauzer/West Highland                     1
Chinese Sharpei/Great Dane                            1
Miniature Schnauzer/Soft Coated Wheaten Terrier       1
Chinese Sharpei/Pit Bull                              1
Miniature Schnauzer/Shih Tzu                          1
Miniature Schnauzer/Scottish Terrier            

In [9]:
from itertools import chain
breeds = np.unique(list(chain.from_iterable([x.split('/') for x in train_data['Breed']])))
breed_cnts = dict([(x, sum([x in y for y in train_data['Breed']])) for x in breeds])

In [96]:
len(breed_cnts)

397

In [97]:
breed_cnts

{'Abyssinian Mix': 2,
 'Affenpinscher': 10,
 'Affenpinscher Mix': 6,
 'Afghan Hound Mix': 1,
 'Airedale Terrier': 9,
 'Airedale Terrier Mix': 5,
 'Akita': 29,
 'Akita Mix': 11,
 'Alaskan Husky': 26,
 'Alaskan Husky Mix': 10,
 'Alaskan Malamute': 17,
 'Alaskan Malamute Mix': 5,
 'American Bulldog': 147,
 'American Bulldog Mix': 109,
 'American Eskimo': 14,
 'American Eskimo Mix': 9,
 'American Foxhound': 7,
 'American Foxhound Mix': 2,
 'American Pit Bull Terrier': 90,
 'American Pit Bull Terrier Mix': 68,
 'American Shorthair Mix': 9,
 'American Staffordshire Terrier': 121,
 'American Staffordshire Terrier Mix': 92,
 'Anatol Shepherd': 152,
 'Anatol Shepherd Mix': 76,
 'Angora Mix': 7,
 'Australian Cattle Dog': 640,
 'Australian Cattle Dog Mix': 367,
 'Australian Kelpie': 133,
 'Australian Kelpie Mix': 95,
 'Australian Shepherd': 272,
 'Australian Shepherd Mix': 163,
 'Australian Terrier Mix': 4,
 'Balinese Mix': 5,
 'Basenji': 24,
 'Basenji Mix': 11,
 'Basset Hound': 101,
 'Basset Hou

Looking at these, we could try pulling out the 'mix' identifier.

In [101]:
new_breeds = list(np.unique([x.split(' Mix')[0] for x in breeds]))
new_breeds.append('Mix')
new_breed_cnts = dict([(x, sum([x in y for y in train_data['Breed']])) for x in new_breeds])

In [104]:
len(new_breed_cnts)

226

In [13]:
new_breed_cnts

{'Abyssinian': 2,
 'Affenpinscher': 10,
 'Afghan Hound': 1,
 'Airedale Terrier': 9,
 'Akita': 29,
 'Alaskan Husky': 26,
 'Alaskan Malamute': 17,
 'American Bulldog': 147,
 'American Eskimo': 14,
 'American Foxhound': 7,
 'American Pit Bull Terrier': 90,
 'American Shorthair': 9,
 'American Staffordshire Terrier': 121,
 'Anatol Shepherd': 152,
 'Angora': 7,
 'Australian Cattle Dog': 640,
 'Australian Kelpie': 133,
 'Australian Shepherd': 272,
 'Australian Terrier': 4,
 'Balinese': 5,
 'Basenji': 24,
 'Basset Hound': 101,
 'Beagle': 311,
 'Bearded Collie': 2,
 'Beauceron': 14,
 'Bedlington Terr': 3,
 'Belgian Malinois': 20,
 'Belgian Sheepdog': 3,
 'Belgian Tervuren': 2,
 'Bengal': 5,
 'Bernese Mountain Dog': 9,
 'Bichon Frise': 22,
 'Black': 130,
 'Black Mouth Cur': 101,
 'Bloodhound': 12,
 'Blue Lacy': 74,
 'Bluetick Hound': 7,
 'Boerboel': 4,
 'Bombay': 5,
 'Border Collie': 432,
 'Border Terrier': 74,
 'Borzoi': 1,
 'Boston Terrier': 86,
 'Boxer': 437,
 'Boykin Span': 4,
 'British Sho

In [11]:
def get_breed_cat(x):
    n = x.split('/')
    
    cats = []
    for y in n:
        n2 = y.split(' Mix')
        if len(n2) == 2:
            n2[1] = 'Mix'
        cats.extend(n2)
    return cats

In [12]:
get_breed_cat( 'Manchester Terrier/Black cat')

['Manchester Terrier', 'Black cat']

In [17]:
breed_cats = [get_breed_cat(x) for x in train_data['Breed']]

## Color

There are a lot of these.  They also need to be compressed.

In [74]:
x = 'Color'
cnts = train_data[x].groupby(train_data[x]).count().append(pd.Series({'nan': len(train_data[x]) - train_data[x].count()}))
cnts.sort_values()

nan                               0
Chocolate/Gold                    1
Chocolate/Gray                    1
Chocolate/Red Tick                1
Cream/Orange                      1
Cream/Red                         1
Cream/Red Tick                    1
Cream/Seal Point                  1
Fawn/Brown                        1
Chocolate/Cream                   1
Fawn/Brown Brindle                1
Fawn/Tricolor                     1
Gold/Black                        1
Gold/Buff                         1
Gold/Tan                          1
Gold/Yellow                       1
Gray Tabby/Black                  1
Gray/Red                          1
Liver Tick/White                  1
Yellow/Yellow                     1
Chocolate/Brown Merle             1
Chocolate/Brown Brindle           1
Calico/Orange Tabby               1
Brown Brindle/Blue Tick           1
Brown Brindle/Brown Brindle       1
Brown Brindle/Brown Merle         1
Brown Merle/Blue Merle            1
Brown Merle/Tan             

First we'll get a list of all the colors, by separating the combos

In [14]:
from itertools import chain
colors = np.unique(list(chain.from_iterable([x.split('/') for x in train_data['Color']])))

In [91]:
color_cnts = dict([(x, sum([x in y for y in train_data['Color']])) for x in colors])

In [106]:
len(color_cnts)

57

In [92]:
color_cnts

{'Agouti': 2,
 'Apricot': 30,
 'Black': 8024,
 'Black Brindle': 114,
 'Black Smoke': 69,
 'Black Tabby': 67,
 'Black Tiger': 2,
 'Blue': 2354,
 'Blue Cream': 34,
 'Blue Merle': 181,
 'Blue Point': 31,
 'Blue Smoke': 6,
 'Blue Tabby': 696,
 'Blue Tick': 54,
 'Blue Tiger': 9,
 'Brown': 6693,
 'Brown Brindle': 867,
 'Brown Merle': 84,
 'Brown Tabby': 2696,
 'Brown Tiger': 5,
 'Buff': 328,
 'Calico': 583,
 'Calico Point': 27,
 'Chocolate': 519,
 'Chocolate Point': 24,
 'Cream': 612,
 'Cream Tabby': 282,
 'Fawn': 209,
 'Flame Point': 86,
 'Gold': 84,
 'Gray': 407,
 'Gray Tabby': 55,
 'Lilac Point': 39,
 'Liver': 31,
 'Liver Tick': 4,
 'Lynx Point': 188,
 'Orange': 1438,
 'Orange Tabby': 1353,
 'Orange Tiger': 1,
 'Pink': 4,
 'Red': 985,
 'Red Merle': 59,
 'Red Tick': 59,
 'Ruddy': 1,
 'Sable': 324,
 'Seal Point': 159,
 'Silver': 122,
 'Silver Lynx Point': 2,
 'Silver Tabby': 42,
 'Tan': 3025,
 'Torbie': 398,
 'Tortie': 618,
 'Tortie Point': 34,
 'Tricolor': 912,
 'White': 12186,
 'Yellow': 

In [15]:
color_cats = [x.split('/') for x in train_data['Color']]

## Feature Exploration

In [122]:
def convert_multi_cat_dummy(vals):
    dummies = {}
    df = pd.DataFrame(vals)
    cols = df.columns
    dummies = pd.get_dummies(df[cols[0]], columns = [cols[0]], prefix_sep='', prefix='')
    
    if len(cols) > 1:
        for c in cols[1:]:
            new_dummies = pd.get_dummies(df[c], columns = [c], prefix_sep='', prefix='')
            for x in new_dummies.columns:
                try:
                    v = dummies[x].copy()
                    dummies[x] = v + new_dummies[x]
                except:
                    dummies[x] = new_dummies[x]
                    
    return dummies

In [116]:
[x for x in train_data['Breed'] if len(x.split('/')) > 2]

['Plott Hound/Black/Tan Hound',
 'Labrador Retriever/Black/Tan Hound',
 'German Shepherd/Black/Tan Hound',
 'Labrador Retriever/Black/Tan Hound',
 'Black/Tan Hound/Black Mouth Cur',
 'Labrador Retriever/Black/Tan Hound',
 'Labrador Retriever/Black/Tan Hound',
 'Black/Tan Hound/Black Mouth Cur',
 'Black/Tan Hound/Labrador Retriever',
 'Black/Tan Hound/German Shepherd']

In [123]:
n = convert_multi_cat_dummy(breed_cats)

In [119]:
df = pd.DataFrame(breed_cats)
cols = df.columns
print df.loc[df[2] == 'Black Mouth Cur',:]
dummies = pd.get_dummies(df[0], columns = [cols[0]], prefix_sep='', prefix='')

           0          1                2
10488  Black  Tan Hound  Black Mouth Cur
18126  Black  Tan Hound  Black Mouth Cur


In [127]:
n.sum(1)

0        2
1        2
2        2
3        2
4        2
5        2
6        2
7        2
8        2
9        1
10       2
11       2
12       2
13       2
14       2
15       2
16       2
17       2
18       2
19       2
20       2
21       2
22       2
23       2
24       2
25       2
26       2
27       2
28       2
29       2
        ..
26699    2
26700    2
26701    2
26702    2
26703    2
26704    2
26705    2
26706    2
26707    2
26708    2
26709    2
26710    2
26711    2
26712    1
26713    2
26714    2
26715    2
26716    2
26717    2
26718    2
26719    2
26720    2
26721    2
26722    2
26723    2
26724    2
26725    2
26726    2
26727    2
26728    2
Length: 26729, dtype: int64

In [102]:
len(new_breeds)

226

In [84]:
# First we create the feature set using one-hot encoding for the categorical features
all_feats = train_data.loc[:, ['Name', 'DateTime', 'AgeuponOutcome']]
all_feats.index = range(all_feats.shape[0])
all_feats['DateTime'] = times
all_feats['AgeuponOutcome'] = ages
all_feats['AnimalType'] = train_data['AnimalType'].astype('category').cat.codes

all_feats['Sex'] = pd.Series(sex).astype('category')
all_feats['IsFixed'] = pd.Series(is_fixed).astype('category')

all_feats['OutcomeType'] = train_data['OutcomeType'].copy().astype('category')
all_feats['OutcomeSubtype'] = train_data['OutcomeSubtype'].copy().astype('category')

color_dummies = pd.get_dummies(pd.DataFrame(pd.Series(color_cats).values.tolist()), prefix_sep='', prefix='')
breed_dummies = pd.get_dummies(pd.DataFrame(pd.Series(breed_cats).values.tolist()), prefix_sep='', prefix='')

color_cols = list(color_dummies.columns)
breed_cols = list(breed_dummies.columns)
otsub_cols = list(all_feats['OutcomeSubtype'].cat.categories)
at_keys = dict(zip(range(2), train_data['AnimalType'].astype('category').cat.categories))

In [85]:
all_feats = pd.get_dummies(all_feats, columns=['OutcomeSubtype'], prefix = '', prefix_sep = '')
all_feats = pd.concat([all_feats, color_dummies, breed_dummies], 1)

In [86]:
all_feats.shape

(26729, 492)

In [95]:
color_cats

[['Brown', 'White'],
 ['Cream Tabby'],
 ['Blue', 'White'],
 ['Blue Cream'],
 ['Tan'],
 ['Black', 'Tan'],
 ['Blue Tabby'],
 ['Brown Tabby'],
 ['Red', 'White'],
 ['White'],
 ['Black'],
 ['Silver'],
 ['Brown'],
 ['Black', 'Red'],
 ['White', 'Cream'],
 ['Orange Tabby', 'White'],
 ['Brown Tabby'],
 ['Brown', 'White'],
 ['White'],
 ['Black'],
 ['Black', 'White'],
 ['White'],
 ['Brown Brindle', 'White'],
 ['Brown', 'White'],
 ['Black', 'Brown'],
 ['Orange Tabby'],
 ['Chocolate', 'White'],
 ['White', 'Tan'],
 ['Cream Tabby', 'White'],
 ['Blue'],
 ['Calico'],
 ['Black', 'White'],
 ['Torbie'],
 ['Brown', 'Black'],
 ['Brown'],
 ['Yellow'],
 ['Brown', 'White'],
 ['Blue', 'White'],
 ['Black', 'White'],
 ['Brown Tabby'],
 ['Blue Tabby'],
 ['Orange Tabby'],
 ['Brown Tabby'],
 ['Black', 'White'],
 ['Black', 'White'],
 ['Tricolor'],
 ['White', 'Black'],
 ['Black'],
 ['Black', 'Brown'],
 ['Chocolate', 'White'],
 ['Tortie'],
 ['Brown Tabby'],
 ['Blue Tabby', 'White'],
 ['Gray', 'White'],
 ['Orange Tabby'

In [87]:
all_feats.head()

Unnamed: 0,Name,DateTime,AgeuponOutcome,AnimalType,Sex,IsFixed,OutcomeType,Aggressive,At Vet,Barn,...,West Highland,Whippet,Wire Hair Fox Terrier,Yorkshire,Yorkshire Terrier,Black Mouth Cur,German Shepherd,Labrador Retriever,Mix,Tan Hound
0,Hambone,2014-02-12 18:22:00,365.0,1,0.0,1.0,Return_to_owner,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Emily,2013-10-13 12:44:00,365.0,0,1.0,1.0,Euthanasia,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pearce,2015-01-31 12:28:00,730.0,1,0.0,1.0,Adoption,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,2014-07-11 19:09:00,3.0,0,0.0,0.0,Transfer,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,2013-11-15 12:52:00,730.0,1,0.0,1.0,Transfer,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
# Split data
# Given the target in unbalanced, we'll use a stratified sampling with proportions 80-20

from sklearn.model_selection import train_test_split

split_seed = 5

df = all_feats.drop(['Name', 'OutcomeType'], 1).copy()

X_train, X_test, y_train, y_test = train_test_split(
    df, all_feats['OutcomeType'], test_size=0.2, random_state=split_seed,
    stratify = all_feats['OutcomeType'])

### Feature Plots

Below is the normalied distribution of age, divided by class. There is clearly a relation between the age and the outcome but it is likely not linear.

1. The vast majority of animals who died in the shelter were young - probably babies that were discovered or abandoned by their mother.
2. Most of the adoptions were for young animals, whith some for older animals.
3. Only older animals were returned to owner - makes sense for lost animals.
4. The ages have a pseudo continuous distribution for young animals, but becomes more categorical for older animals - which makes sense since we measure the age of babies in terms of days and weeks, but older animals in years. 

In [140]:
iplot(go.Figure(data = [go.Histogram(x = g, name = ind, histnorm='probability') for ind,g in X_train['AgeuponOutcome'].groupby(y_train)],
               layout = go.Layout(xaxis = dict(range = [0, 1000]),
                                 title = 'Distribution of Age with Outcome')))

Below is the distribution of time with outcome.

1. For all outcomes except for return to owner, there is a peak in the summer of 2015 and a smaller peak in the summer of 2014
2. Return to owner is fairly evenly distributed, except for a small peak around fall 2016.
3. There's a small peak in adoption around xmas time in 2014 and 2015, which makes sense.
4. Euthanasia drops off significantly after summer 2015 - possible due to different policies on euthanizing unwanted animals.

In [24]:
figs = [go.Figure(data = [go.Histogram(x = g, name = ind, histnorm='probability')], 
                  layout = go.Layout(title = ind)) for ind,g in X_train['DateTime'].groupby(y_train)]
fig = pm.subplot_helper_fig(3, 2, figs)
fig['layout'].update(height = 1000)
fig['layout'].update(title = 'Distribution of Date with Outcome')
iplot(fig)

The following plot shows the following:

1. About 40% of cats and 40% of dogs are adopted
2. A greater percentage of dogs are returned to owner than cats.  Possibly because cats are harder to find when they get lost and many are outdoor cats
3. A greater percentage of cats are transfered.

In [46]:
cnts = pd.crosstab(X_train['AnimalType'], y_train)
cnts.index = [at_keys[x] for x in cnts.index]
cnts = cnts.apply(lambda x: x/float(sum(x)), 1)
iplot(go.Figure(data = [go.Bar(x = cnts.columns, y = cnts.loc[x,:], name = x) for x in cnts.index],
               layout = go.Layout(title = 'Distribution of Animal Type with Outcome')))

The sex of the animals appears to have no effect on outcome.

In [49]:
cnts = pd.crosstab(X_train['Sex'], y_train)
cnts.index = ['Male', 'Female']
cnts = cnts.apply(lambda x: x/float(sum(x)), 1)
iplot(go.Figure(data = [go.Bar(x = cnts.columns, y = cnts.loc[x,:], name = x) for x in cnts.index],
               layout = go.Layout(title = 'Distribution of Sex with Outcome')))

It appears that fixed animals are much more likely to be adopted, but non-fixed animals are much more likely to be transfered. (Possibly for fixing procedure)

In [51]:
cnts = pd.crosstab(X_train['IsFixed'], y_train)
cnts.index = ['Not Fixed', 'Fixed']
cnts = cnts.apply(lambda x: x/float(sum(x)), 1)
iplot(go.Figure(data = [go.Bar(x = cnts.columns, y = cnts.loc[x,:], name = x) for x in cnts.index],
               layout = go.Layout(title = 'Distribution of Fixed Status with Outcome')))

In [92]:
any([np.unique(x) < len(x) for x in breed_cats])


unorderable dtypes; returning scalar but in the future this will be an error



False

In [91]:
breed_dummies['Yorkshire Terrier']

Unnamed: 0,Yorkshire Terrier,Yorkshire Terrier.1
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [89]:
X_train.sum()

AgeuponOutcome                17,004,667.0000
AnimalType                        12,473.0000
Aggressive                           252.0000
At Vet                                 3.0000
Barn                                   2.0000
Behavior                              67.0000
Court/Investigation                    2.0000
Enroute                                6.0000
Foster                             1,433.0000
In Foster                             40.0000
In Kennel                             94.0000
In Surgery                             2.0000
Medical                               53.0000
Offsite                              128.0000
Partner                            6,263.0000
Rabies Risk                           56.0000
SCRP                               1,268.0000
Suffering                            813.0000
Agouti                                 1.0000
Apricot                               18.0000
Black                              5,176.0000
Black Brindle                     

In [90]:
X_train['Yorkshire Terrier']

Unnamed: 0,Yorkshire Terrier,Yorkshire Terrier.1
12467,0,0
15506,0,0
7251,0,0
23360,0,0
7993,0,0
6196,0,0
18876,0,0
16192,0,0
8842,0,0
16361,0,0


In [135]:
n = y_train[X_train['AgeuponOutcome'] < 49]
n.groupby(n).count()

OutcomeType
Adoption            374
Died                 85
Euthanasia          171
Return_to_owner      33
Transfer           2210
Name: OutcomeType, dtype: int64

In [137]:
2210.0 / len(y_train[y_train == 'Transfer'])

0.2932201141037548