# Kaggle Animal Shelter Data Exploration

In [23]:
# Import modules

import pandas as pd
import numpy as np
import os
import sys
import random
import copy
import cPickle as pickle
import datetime

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import colorlover as cl
import matplotlib.pyplot as plt 

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from sklearn.model_selection import ShuffleSplit
from scipy.stats import spearmanr
from scipy import stats
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from scipy.stats import boxcox
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

#import plotting_methods as pm

init_notebook_mode(connected=True)

%reload_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.4f}'.format

In [13]:
# Load data

# Raw string literal: in a plain literal the '\U' in this Windows path is parsed
# as a unicode escape on Python 3 (a SyntaxError). The raw form denotes the same
# path on Python 2 and also parses on Python 3.
# NOTE(review): this is a machine-specific absolute path; adjust it to your setup.
raw_data_dir = r'C:\Users\Colleen\Documents\Kaggle Animal Shelter\data'

# The context manager closes the file even if read_csv raises.
with open(os.path.join(raw_data_dir, 'train.csv'), 'r') as f:
    train_data = pd.read_csv(f)


## Data Cleaning and Basic Feature Engineering

In [14]:
train_data.columns

Index([u'AnimalID', u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype',
       u'AnimalType', u'SexuponOutcome', u'AgeuponOutcome', u'Breed',
       u'Color'],
      dtype='object')

In [16]:
'Number of samples: ' + str(train_data.shape[0])

'Number of samples: 26729'

In [17]:
train_data.dtypes

AnimalID          object
Name              object
DateTime          object
OutcomeType       object
OutcomeSubtype    object
AnimalType        object
SexuponOutcome    object
AgeuponOutcome    object
Breed             object
Color             object
dtype: object

Every feature is likely a string type, but based on the column names we can convert some into other types.

Number of unique values for each feature:

In [19]:
# Count the distinct values in every column.
# Caveat: as the recorded outputs show, np.unique does not collapse the NaNs
# of an object column (each missing entry is listed separately), so columns
# with missing values report inflated counts here.
train_data.apply(lambda x: len(np.unique(x)))

AnimalID          26729
Name              14065
DateTime          22918
OutcomeType           5
OutcomeSubtype    13628
AnimalType            2
SexuponOutcome        6
AgeuponOutcome       62
Breed              1380
Color               366
dtype: int64

Now we go through each feature to clean and convert to other types

## DateTime

We can convert these into datetime objects and look at their distribution.

In [24]:
train_data['DateTime'].head()

0    2014-02-12 18:22:00
1    2013-10-13 12:44:00
2    2015-01-31 12:28:00
3    2014-07-11 19:09:00
4    2013-11-15 12:52:00
Name: DateTime, dtype: object

In [26]:
# Parse every 'DateTime' string into a datetime.datetime object.
times = []
for stamp in train_data['DateTime']:
    times.append(datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S'))

In [30]:
iplot([go.Histogram(x = times)])

There don't seem to be any outliers or weird values, and the date range agrees with what's given in the competition description.

## OutcomeType

This should be the target. We print the unique values to check if they fit the data description.

In [32]:
np.unique(train_data['OutcomeType'])

array(['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'], dtype=object)

In [41]:
# Tally each outcome class and add a 'nan' row holding the number of missing values.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the result is the same.
pd.concat([
    train_data['OutcomeType'].groupby(train_data['OutcomeType']).count(),
    pd.Series({'nan': train_data['OutcomeType'].isnull().sum()}),
])

Adoption           10769
Died                 197
Euthanasia          1555
Return_to_owner     4786
Transfer            9422
nan                    0
dtype: int64

## OutcomeSubtype

Printing the unique values shows there are missing values here.

Re-printing them with the missing values removed gives a much more manageable number.  This variable is clearly categorical.  We may be able to merge some of these categories, but we need to be careful not to remove any information. We'll leave it for now.

In [40]:
# Tally each outcome subtype and add a 'nan' row holding the number of missing values.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the result is the same.
pd.concat([
    train_data['OutcomeSubtype'].groupby(train_data['OutcomeSubtype']).count(),
    pd.Series({'nan': train_data['OutcomeSubtype'].isnull().sum()}),
])

Aggressive               320
At Vet                     4
Barn                       2
Behavior                  86
Court/Investigation        6
Enroute                    8
Foster                  1800
In Foster                 52
In Kennel                114
In Surgery                 3
Medical                   66
Offsite                  165
Partner                 7816
Rabies Risk               74
SCRP                    1599
Suffering               1002
nan                    13612
dtype: int64

In [None]:
def _level_counts(feats, levels=None):
    """Count occurrences of each value in `feats`, optionally keeping only values in `levels`."""
    if levels is None:
        kept = feats
    else:
        # dtype=bool keeps the mask a valid boolean indexer even when `feats` is empty.
        mask = np.array([value in levels for value in feats], dtype=bool)
        kept = feats.loc[mask]
    return kept.groupby(kept).count()


def plot_cnts(feats1, levels1=None, feats2=None, levels2=None):
    """Build bar traces of value counts for one or two categorical Series.

    The original signature placed the required `feats2` after the defaulted
    `levels1`, which is a SyntaxError; `feats2` is now optional so the
    positional order (feats1, levels1, feats2, levels2) is preserved.

    Parameters
    ----------
    feats1 : pandas.Series
        First categorical feature.
    levels1 : container, optional
        If given, only values of `feats1` contained in it are counted.
    feats2 : pandas.Series, optional
        Second categorical feature; skipped when None.
    levels2 : container, optional
        If given, only values of `feats2` contained in it are counted.

    Returns
    -------
    list of plotly.graph_objs.Bar
        One trace per supplied feature, with the category labels on x and
        their counts on y.
    """
    traces = []
    for feats, levels in ((feats1, levels1), (feats2, levels2)):
        if feats is None:
            continue
        cnts = _level_counts(feats, levels)
        # Labels go on x and counts on y; the original passed the counts as x
        # with no labels and silently dropped the second feature's counts.
        traces.append(go.Bar(x=list(cnts.index), y=list(cnts.values)))
    return traces

## AnimalType

This one is categorical and simple - cat or dog

In [35]:
np.unique(train_data['AnimalType'])

array(['Cat', 'Dog'], dtype=object)

In [42]:
# Tally each animal type and add a 'nan' row holding the number of missing values.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the result is the same.
pd.concat([
    train_data['AnimalType'].groupby(train_data['AnimalType']).count(),
    pd.Series({'nan': train_data['AnimalType'].isnull().sum()}),
])

Cat    11134
Dog    15595
nan        0
dtype: int64

In [86]:
pd.crosstab(index=train_data['AnimalType'], columns=train_data['OutcomeType'])

OutcomeType,Adoption,Died,Euthanasia,Return_to_owner,Transfer
AnimalType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cat,4272,147,710,500,5505
Dog,6497,50,845,4286,3917


## SexuponOutcome

This one is also categorical, indicating spayed/neutered status.  The missing value and 'Unknown' could likely be merged.

In [36]:
np.unique(train_data['SexuponOutcome'])

array([nan, 'Intact Female', 'Intact Male', 'Neutered Male',
       'Spayed Female', 'Unknown'], dtype=object)

In [43]:
# Tally each sex/neuter status and add a 'nan' row holding the number of missing values.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the result is the same.
pd.concat([
    train_data['SexuponOutcome'].groupby(train_data['SexuponOutcome']).count(),
    pd.Series({'nan': train_data['SexuponOutcome'].isnull().sum()}),
])

Intact Female    3511
Intact Male      3525
Neutered Male    9779
Spayed Female    8820
Unknown          1093
nan                 1
dtype: int64

## AgeuponOutcome 

This variable can be converted to a numeric one, using the number of days as the measure of age.

In [46]:
np.unique(train_data['AgeuponOutcome'])

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, '0 years', '1 day', '1 month', '1 week',
       '1 weeks', '1 year', '10 months', '10 years', '11 months',
       '11 years', '12 years', '13 years', '14 years', '15 years',
       '16 years', '17 years', '18 years', '19 years', '2 days',
       '2 months', '2 weeks', '2 years', '20 years', '3 days', '3 months',
       '3 weeks', '3 years', '4 days', '4 months', '4 weeks', '4 years',
       '5 days', '5 months', '5 weeks', '5 years', '6 days', '6 months',
       '6 years', '7 months', '7 years', '8 months', '8 years', '9 months',
       '9 years'], dtype=object)

In [55]:
def get_age(s):
    """Convert an age string such as '2 years' into an approximate age in days.

    Parameters
    ----------
    s : str or float
        A '<number> <unit>' string where the unit contains 'year', 'month',
        'week' or 'day', or a missing value (float NaN or the string 'nan').

    Returns
    -------
    int or float
        Age in days, using 365 days per year, 30 per month and 7 per week;
        NaN when the input is missing.
    """
    # Missing values always come back as a numeric NaN so the result stays
    # numeric (the original handed the string 'nan' back unchanged).
    if s == 'nan' or (isinstance(s, float) and np.isnan(s)):
        return np.nan

    parts = s.split(' ')
    num = int(parts[0])
    unit = parts[1]

    if 'year' in unit:
        return num * 365
    if 'month' in unit:
        return num * 30
    if 'week' in unit:
        # Previously weeks fell through to the day branch, so '2 weeks' gave 2.
        return num * 7
    # Remaining units ('day' / 'days') are already expressed in days.
    return num

In [56]:
# Apply get_age to every 'AgeuponOutcome' entry, keeping the result as a plain list.
ages = list(map(get_age, train_data['AgeuponOutcome']))

In [59]:
iplot([go.Histogram(x = ages)])

There is a range of ages, but the distribution is quite skewed.

## Breed

There are a lot of categories here. We definitely need to compress these or come up with other features based on it.

In [62]:
x = 'Breed'
# Tally each distinct value and add a 'nan' row holding the number of missing values.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the result is the same.
cnts = pd.concat([
    train_data[x].groupby(train_data[x]).count(),
    pd.Series({'nan': train_data[x].isnull().sum()}),
])

In [70]:
len(cnts)

1381

In [68]:
cnts.sort_values()

nan                                                   0
Newfoundland/Queensland Heeler                        1
Newfoundland/Great Pyrenees                           1
Newfoundland/Border Collie                            1
Newfoundland/Australian Cattle Dog                    1
Chinese Crested/Chihuahua Longhair                    1
Chinese Crested/Papillon                              1
Munchkin Longhair Mix                                 1
Miniature Schnauzer/Whippet                           1
Chihuahua Shorthair/Smooth Fox Terrier                1
Chinese Sharpei/Airedale Terrier                      1
Chinese Sharpei/Basset Hound                          1
Miniature Schnauzer/West Highland                     1
Chinese Sharpei/Great Dane                            1
Miniature Schnauzer/Soft Coated Wheaten Terrier       1
Chinese Sharpei/Pit Bull                              1
Miniature Schnauzer/Shih Tzu                          1
Miniature Schnauzer/Scottish Terrier            

In [94]:
from itertools import chain

# Every distinct component of the '/'-separated 'Breed' strings.
breeds = np.unique(list(chain.from_iterable(entry.split('/') for entry in train_data['Breed'])))

# `name in entry` is a substring test on the full 'Breed' string, so a name is
# also counted for rows where it merely appears inside a longer one
# (e.g. followed by ' Mix').
breed_cnts = {
    name: sum(name in entry for entry in train_data['Breed'])
    for name in breeds
}

In [96]:
len(breed_cnts)

397

In [97]:
breed_cnts

{'Abyssinian Mix': 2,
 'Affenpinscher': 10,
 'Affenpinscher Mix': 6,
 'Afghan Hound Mix': 1,
 'Airedale Terrier': 9,
 'Airedale Terrier Mix': 5,
 'Akita': 29,
 'Akita Mix': 11,
 'Alaskan Husky': 26,
 'Alaskan Husky Mix': 10,
 'Alaskan Malamute': 17,
 'Alaskan Malamute Mix': 5,
 'American Bulldog': 147,
 'American Bulldog Mix': 109,
 'American Eskimo': 14,
 'American Eskimo Mix': 9,
 'American Foxhound': 7,
 'American Foxhound Mix': 2,
 'American Pit Bull Terrier': 90,
 'American Pit Bull Terrier Mix': 68,
 'American Shorthair Mix': 9,
 'American Staffordshire Terrier': 121,
 'American Staffordshire Terrier Mix': 92,
 'Anatol Shepherd': 152,
 'Anatol Shepherd Mix': 76,
 'Angora Mix': 7,
 'Australian Cattle Dog': 640,
 'Australian Cattle Dog Mix': 367,
 'Australian Kelpie': 133,
 'Australian Kelpie Mix': 95,
 'Australian Shepherd': 272,
 'Australian Shepherd Mix': 163,
 'Australian Terrier Mix': 4,
 'Balinese Mix': 5,
 'Basenji': 24,
 'Basenji Mix': 11,
 'Basset Hound': 101,
 'Basset Hou

Looking at these, we could try pulling out the 'mix' identifier.

In [103]:
# Keep the text before the first ' Mix' in each breed name, de-duplicate,
# then track 'Mix' itself as a separate entry.
new_breeds = list(np.unique([name.split(' Mix')[0] for name in breeds])) + ['Mix']

# Counts are substring matches against the full 'Breed' string.
new_breed_cnts = {
    name: sum(name in entry for entry in train_data['Breed'])
    for name in new_breeds
}

In [104]:
len(new_breed_cnts)

226

In [105]:
new_breed_cnts

{'Abyssinian': 2,
 'Affenpinscher': 10,
 'Afghan Hound': 1,
 'Airedale Terrier': 9,
 'Akita': 29,
 'Alaskan Husky': 26,
 'Alaskan Malamute': 17,
 'American Bulldog': 147,
 'American Eskimo': 14,
 'American Foxhound': 7,
 'American Pit Bull Terrier': 90,
 'American Shorthair': 9,
 'American Staffordshire Terrier': 121,
 'Anatol Shepherd': 152,
 'Angora': 7,
 'Australian Cattle Dog': 640,
 'Australian Kelpie': 133,
 'Australian Shepherd': 272,
 'Australian Terrier': 4,
 'Balinese': 5,
 'Basenji': 24,
 'Basset Hound': 101,
 'Beagle': 311,
 'Bearded Collie': 2,
 'Beauceron': 14,
 'Bedlington Terr': 3,
 'Belgian Malinois': 20,
 'Belgian Sheepdog': 3,
 'Belgian Tervuren': 2,
 'Bengal': 5,
 'Bernese Mountain Dog': 9,
 'Bichon Frise': 22,
 'Black': 130,
 'Black Mouth Cur': 101,
 'Bloodhound': 12,
 'Blue Lacy': 74,
 'Bluetick Hound': 7,
 'Boerboel': 4,
 'Bombay': 5,
 'Border Collie': 432,
 'Border Terrier': 74,
 'Borzoi': 1,
 'Boston Terrier': 86,
 'Boxer': 437,
 'Boykin Span': 4,
 'British Sho

In [100]:
'Manchester Terrier'.split(' Mix')

['Manchester Terrier']

## Color

There are a lot of these.  They also need to be compressed.

In [74]:
x = 'Color'
# Tally each distinct value and add a 'nan' row holding the number of missing values.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the result is the same.
cnts = pd.concat([
    train_data[x].groupby(train_data[x]).count(),
    pd.Series({'nan': train_data[x].isnull().sum()}),
])
cnts.sort_values()

nan                               0
Chocolate/Gold                    1
Chocolate/Gray                    1
Chocolate/Red Tick                1
Cream/Orange                      1
Cream/Red                         1
Cream/Red Tick                    1
Cream/Seal Point                  1
Fawn/Brown                        1
Chocolate/Cream                   1
Fawn/Brown Brindle                1
Fawn/Tricolor                     1
Gold/Black                        1
Gold/Buff                         1
Gold/Tan                          1
Gold/Yellow                       1
Gray Tabby/Black                  1
Gray/Red                          1
Liver Tick/White                  1
Yellow/Yellow                     1
Chocolate/Brown Merle             1
Chocolate/Brown Brindle           1
Calico/Orange Tabby               1
Brown Brindle/Blue Tick           1
Brown Brindle/Brown Brindle       1
Brown Brindle/Brown Merle         1
Brown Merle/Blue Merle            1
Brown Merle/Tan             

First we'll get a list of all the colors, by separating the combos

In [90]:
from itertools import chain

# Every distinct component of the '/'-separated 'Color' strings.
colors = np.unique(list(chain.from_iterable(entry.split('/') for entry in train_data['Color'])))

In [91]:
# `name in entry` is a substring test on the full 'Color' string, so a color
# is also counted for rows where it only appears inside a longer color name.
color_cnts = {
    name: sum(name in entry for entry in train_data['Color'])
    for name in colors
}

In [106]:
len(color_cnts)

57

In [92]:
color_cnts

{'Agouti': 2,
 'Apricot': 30,
 'Black': 8024,
 'Black Brindle': 114,
 'Black Smoke': 69,
 'Black Tabby': 67,
 'Black Tiger': 2,
 'Blue': 2354,
 'Blue Cream': 34,
 'Blue Merle': 181,
 'Blue Point': 31,
 'Blue Smoke': 6,
 'Blue Tabby': 696,
 'Blue Tick': 54,
 'Blue Tiger': 9,
 'Brown': 6693,
 'Brown Brindle': 867,
 'Brown Merle': 84,
 'Brown Tabby': 2696,
 'Brown Tiger': 5,
 'Buff': 328,
 'Calico': 583,
 'Calico Point': 27,
 'Chocolate': 519,
 'Chocolate Point': 24,
 'Cream': 612,
 'Cream Tabby': 282,
 'Fawn': 209,
 'Flame Point': 86,
 'Gold': 84,
 'Gray': 407,
 'Gray Tabby': 55,
 'Lilac Point': 39,
 'Liver': 31,
 'Liver Tick': 4,
 'Lynx Point': 188,
 'Orange': 1438,
 'Orange Tabby': 1353,
 'Orange Tiger': 1,
 'Pink': 4,
 'Red': 985,
 'Red Merle': 59,
 'Red Tick': 59,
 'Ruddy': 1,
 'Sable': 324,
 'Seal Point': 159,
 'Silver': 122,
 'Silver Lynx Point': 2,
 'Silver Tabby': 42,
 'Tan': 3025,
 'Torbie': 398,
 'Tortie': 618,
 'Tortie Point': 34,
 'Tricolor': 912,
 'White': 12186,
 'Yellow': 

## Feature Exploration