# INTRODUCTION

The data comes from Kaggle and consists of animal-shelter records. Each outcome represents the status of an animal as it leaves the Animal Center. Every animal receives a unique Animal ID during intake.

In this competition, we predict the outcome of each animal as it leaves the Animal Center. The possible outcomes are: Adoption, Died, Euthanasia, Return to owner, and Transfer.

Dataset can be found at - https://www.kaggle.com/c/shelter-animal-outcomes/data

# Using H2O

### IMPORTING LIBRARIES

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time, warnings, h2o, logging, os, sys, psutil, random
import numpy as np
from h2o.automl import H2OAutoML

In [4]:
# Size the H2O heap as 95% of the memory currently available, rounded to whole GiB.
pct_memory = 0.95
virtual_memory = psutil.virtual_memory()
min_mem_size = int(round(int(pct_memory * virtual_memory.available) / 1024 ** 3, 0))
print(min_mem_size)

11


In [5]:
# Connect to a cluster
# The port is drawn at random for every run of this cell.
port_no=random.randint(5555,55555)

try:
  h2o.init(strict_version_check=False,min_mem_size_GB=min_mem_size,port=port_no) # start h2o
except Exception:
  # Keep the traceback: the bare message alone does not say why start-up failed.
  logging.critical('h2o.init', exc_info=True)
  # logs_path / logfile are only assigned by a later cell, so fall back to the
  # current directory and a default file name when they do not exist yet.
  try:
    h2o.download_all_logs(dirname=globals().get('logs_path', os.getcwd()),
                          filename=globals().get('logfile', 'logs.txt'))
    h2o.cluster().shutdown()
  except Exception:
    # Best-effort cleanup: a failure here must not hide the original start-up error.
    logging.exception('h2o cleanup after failed h2o.init')
  sys.exit(2)

Checking whether there is an H2O instance running at http://localhost:34077..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-linux64) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-linux64) (build 25.121-b15, mixed mode)
  Starting server from /home/nikunj/miniconda3/envs/py3.6/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpdmyqzij2
  JVM stdout: /tmp/tmpdmyqzij2/h2o_nikunj_started_from_python.out
  JVM stderr: /tmp/tmpdmyqzij2/h2o_nikunj_started_from_python.err
  Server is running at http://127.0.0.1:34077
Connecting to H2O server at http://127.0.0.1:34077... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.2
H2O cluster version age:,1 month and 8 days
H2O cluster name:,H2O_from_python_nikunj_tf1b8h
H2O cluster total nodes:,1
H2O cluster free memory:,10.54 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


### READING DATA AND PRE-PROCESSING

In [6]:
#Setting the path
# Directory that would contain "Kaggle Competition.ipynb" under the current working directory.
# NOTE(review): current_dir is not read again in the visible cells — confirm it is still needed.
current_dir = os.path.dirname(os.path.abspath(os.getcwd() + "/Kaggle Competition.ipynb"))
# Changes the process working directory; the relative paths used afterwards depend on it.
os.chdir('../data')
# Absolute path of the data directory after the change above.
data_dir = os.getcwd()
data_path = data_dir + '/train.csv'

In [None]:
#Ingest data
train_data = h2o.import_file(path = data_path, destination_frame = "train_data")

In [None]:
#Peeking inside the data
train_data.show()

In [None]:
# used to gain statistical information of the columns present in the dataset
train_data.describe()

In [None]:
target = 'OutcomeType'

def get_independent_variables(train_data, targ, exclude=()):
    """Return the predictor column names of ``train_data``.

    Args:
        train_data: Object exposing ``columns`` (iterable of column names) and
            ``types`` (mapping of column name to type name).
        targ: Name of the target column; it is never returned.
        exclude: Optional column name, or iterable of column names, to leave
            out as well (for example identifier columns). Defaults to nothing,
            which reproduces the previous behaviour.

    Returns:
        A list of column names grouped as 'int' columns first, then 'enum'
        columns, then every remaining type, each group in the iteration order
        of ``train_data.types``.
    """
    if isinstance(exclude, str):
        # A bare string would otherwise be split into single characters.
        exclude = (exclude,)
    skipped = set(exclude)
    skipped.add(targ)
    # Only names that are actual frame columns are eligible.
    candidates = {name for name in train_data.columns if name not in skipped}

    ints, enums, others = [], [], []
    for name, type_name in train_data.types.items():
        if name not in candidates:
            continue
        if type_name == 'enum':
            enums.append(name)
        elif type_name == 'int':
            ints.append(name)
        else:
            # Everything that is neither 'enum' nor 'int' lands here.
            others.append(name)
    return ints + enums + others

# Every column except the target is used as a predictor.
# NOTE(review): this presumably keeps AnimalID (an identifier) and OutcomeSubtype;
# OutcomeSubtype looks like it is recorded together with the outcome and could
# leak the target — confirm and drop such columns from X if so.
X = get_independent_variables(train_data, target) 
print(X)
y = target

In [None]:
# Convert the target column to a factor (categorical) column.
train_data[y] = train_data[y].asfactor()

In [None]:
train_data.describe()

In [None]:
# Configure AutoML with a time budget, given in seconds, for the whole run.
run_time = 333
aml = H2OAutoML(max_runtime_secs=run_time)

In [None]:
# NOTE(review): the value returned by this call is discarded, so it has no effect here.
os.getcwd()
# Relative to whatever the working directory is when this cell runs.
os.chdir('../logs')
# Directory and file name handed to h2o.download_all_logs when training fails.
logs_path = os.getcwd()
logfile = 'logs.txt'

In [None]:
# Reference point for the execution-time measurement taken after training.
model_start_time = time.time()

try:
  aml.train(x=X,y=y,training_frame=train_data)  # Change training_frame=train
except Exception:
  # Keep the traceback so the log shows why training failed, not only where.
  logging.critical('aml.train', exc_info=True)
  # Save the cluster logs before the cluster is shut down.
  h2o.download_all_logs(dirname=logs_path, filename=logfile)
  h2o.cluster().shutdown()
  sys.exit(4)

In [None]:
# Seconds elapsed between model_start_time and the moment this cell runs.
meta_data = {
    'model_execution_time': {"classification": time.time() - model_start_time},
}
meta_data

In [None]:
print(aml.leaderboard)

## Save the leaderboard model

There are two ways to save the leader model -- binary format and MOJO format. If you're taking your leader model to production, then we'd suggest the MOJO format since it's optimized for production use.

In [None]:
best_model = h2o.get_model(aml.leaderboard[0,'model_id'])

In [None]:
best_model.algo

In [None]:
# Log loss measured on the training frame (train=True), not on held-out data.
# NOTE(review): a cross-validated or validation-frame metric is presumably the
# fairer figure to report — confirm.
print(best_model.logloss(train = True))

## RESULTS

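Our evaluation metric for this dataset is log loss. The best score on the Kaggle leaderboard is logloss = 0.0000, whereas the first model on the H2O AutoML leaderboard reaches logloss = 0.1485. Note that this figure is computed on the training data (`logloss(train=True)`), while the Kaggle leaderboard is scored on a held-out test set, so the two numbers are not directly comparable. Taken at face value, 0.1485 would place us 3rd on the Kaggle public leaderboard, i.e. in the top 1% of this competition. The leaderboard for this competition is linked below: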

[Kaggle Leaderboard](https://www.kaggle.com/c/shelter-animal-outcomes/leaderboard)


# Optimizing existing Kaggle Kernel

In [7]:
animals = pd.read_csv(data_path)
animals.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [8]:
animals.describe()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,26729,19038,26729,26729,13117,26729,26728,26711,26729,26729
unique,26729,6374,22918,5,16,2,5,44,1380,366
top,A679707,Max,2015-08-11 00:00:00,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,136,19,10769,7816,15595,9779,3969,8810,2824


In [9]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.0+ MB


In [10]:
# Columns stored with the pandas 'category' dtype.
cat_columns = [
    'OutcomeType',
    'OutcomeSubtype',
    'AnimalType',
    'SexuponOutcome',
    'AgeuponOutcome',
    'Breed',
    'Color',
]
for col in cat_columns:
    animals[col] = animals[col].astype('category')

# Identifier and name are cast to plain strings.
for col in ('AnimalID', 'Name'):
    animals[col] = animals[col].astype(str)

animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              26729 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null category
OutcomeSubtype    13117 non-null category
AnimalType        26729 non-null category
SexuponOutcome    26728 non-null category
AgeuponOutcome    26711 non-null category
Breed             26729 non-null category
Color             26729 non-null category
dtypes: category(7), object(3)
memory usage: 927.9+ KB


In [11]:
# Shows the frame without DateTime; the result is not assigned, so `animals`
# itself still has the column.
# NOTE(review): if the column was meant to be removed, assign the result back — confirm.
animals.drop('DateTime', axis=1)

Unnamed: 0,AnimalID,Name,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,A677334,Elsa,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,A701489,,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
8,A671784,Lucy,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
9,A677747,,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White


In [15]:
# Print the distinct values of each categorical column.
cat_columns = [
    'OutcomeType',
    'OutcomeSubtype',
    'AnimalType',
    'SexuponOutcome',
    'AgeuponOutcome',
    'Breed',
    'Color',
]
for col in cat_columns:
    print(animals[col].unique())
    

[Return_to_owner, Euthanasia, Adoption, Transfer, Died]
Categories (5, object): [Return_to_owner, Euthanasia, Adoption, Transfer, Died]
[NaN, Suffering, Foster, Partner, Offsite, ..., Barn, Court/Investigation, Enroute, At Vet, In Surgery]
Length: 17
Categories (16, object): [Suffering, Foster, Partner, Offsite, ..., Court/Investigation, Enroute, At Vet, In Surgery]
[Dog, Cat]
Categories (2, object): [Dog, Cat]
[Neutered Male, Spayed Female, Intact Male, Intact Female, Unknown, NaN]
Categories (5, object): [Neutered Male, Spayed Female, Intact Male, Intact Female, Unknown]
[1 year, 2 years, 3 weeks, 1 month, 5 months, ..., NaN, 17 years, 18 years, 19 years, 20 years]
Length: 45
Categories (44, object): [1 year, 2 years, 3 weeks, 1 month, ..., 17 years, 18 years, 19 years, 20 years]
[Shetland Sheepdog Mix, Domestic Shorthair Mix, Pit Bull Mix, Lhasa Apso/Miniature Poodle, Cairn Terrier/Chihuahua Shorthair, ..., Boxer/Neapolitan Mastiff, French Bulldog/English Bulldog, Vizsla/Boxer, Germ

In [None]:
# Number of records per animal type. The column is passed as x= because newer
# seaborn releases treat the first positional argument as `data`.
sns.countplot(x=animals.AnimalType, palette='Set3')

In [None]:
# Number of records per outcome. The column is passed as x= because newer
# seaborn releases treat the first positional argument as `data`.
sns.countplot(x=animals.OutcomeType, palette='Set3')

In [None]:
var = 'AnimalType'
# NOTE(review): `data` is built here, but the plot below reads from `animals` —
# confirm whether it is still needed.
data = pd.concat([animals['OutcomeType'], animals[var]], axis=1)
f, ax = plt.subplots(figsize=(7, 7))
# NOTE(review): OutcomeType is categorical, while a box plot summarises a
# numeric variable — confirm this is the intended chart type.
fig = sns.boxplot(x=var, y="OutcomeType", data=animals)

In [None]:
# Number of records per SexuponOutcome label. The column is passed as x= because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=animals.SexuponOutcome, palette='Set3')

In [None]:
# functions to get new parameters from the column
def get_sex(x):
    """Return 'male', 'female' or 'unknown' for a SexuponOutcome-style label.

    The value is converted with ``str`` first. 'Male' is looked for before
    'Female', and the match is case-sensitive.
    """
    label = str(x)
    for marker, sex in (('Male', 'male'), ('Female', 'female')):
        if marker in label:
            return sex
    return 'unknown'
def get_neutered(x):
    """Return 'neutered', 'intact' or 'unknown' for a SexuponOutcome-style label.

    Both 'Spayed' and 'Neutered' count as neutered and take precedence over
    'Intact'. The value is converted with ``str`` first.
    """
    label = str(x)
    if 'Spayed' in label or 'Neutered' in label:
        return 'neutered'
    return 'intact' if 'Intact' in label else 'unknown'

In [None]:
# Split the combined SexuponOutcome label into two separate features.
animals['Sex'] = animals.SexuponOutcome.apply(get_sex)
animals['Neutered'] = animals.SexuponOutcome.apply(get_neutered)
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))
# The columns are passed as x= because newer seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x=animals.Sex, palette='Set3', ax=ax1)
sns.countplot(x=animals.Neutered, palette='Set3', ax=ax2)

In [None]:
def get_mix(x):
    """Return 'mix' when the breed label contains 'Mix', otherwise 'not'.

    The value is converted with ``str`` first and the match is case-sensitive;
    only the literal word is checked, so labels without it are reported as 'not'.
    """
    return 'mix' if 'Mix' in str(x) else 'not'
# Flag each record as 'mix' / 'not' from its breed label and plot the counts.
animals['Mix'] = animals.Breed.apply(get_mix)
# The column is passed as x= because newer seaborn releases treat the first
# positional argument as `data`.
sns.countplot(x=animals.Mix, palette='Set3')

In [None]:
# Left: outcomes broken down by sex. Right: sexes broken down by outcome.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
sns.countplot(x='OutcomeType', hue='Sex', data=animals, ax=ax1)
sns.countplot(x='Sex', hue='OutcomeType', data=animals, ax=ax2)

In [None]:
# Left: outcomes broken down by animal type. Right: animal types broken down by outcome.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
sns.countplot(x='OutcomeType', hue='AnimalType', data=animals, ax=ax1)
sns.countplot(x='AnimalType', hue='OutcomeType', data=animals, ax=ax2)

In [None]:
# Left: outcomes broken down by neuter status. Right: neuter status broken down by outcome.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
sns.countplot(x='OutcomeType', hue='Neutered', data=animals, ax=ax1)
sns.countplot(x='Neutered', hue='OutcomeType', data=animals, ax=ax2)

In [None]:
# Left: outcomes broken down by the mix flag. Right: mix flag broken down by outcome.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
sns.countplot(x='OutcomeType', hue='Mix', data=animals, ax=ax1)
sns.countplot(x='Mix', hue='OutcomeType', data=animals, ax=ax2)

In [None]:
def calc_age_in_years(x):
    """Convert an age label such as '3 weeks' or '2 years' into years.

    Args:
        x: Age label made of a whole number followed by a unit (year, month,
            week or day, singular or plural). Any value is accepted; it is
            converted with ``str`` first.

    Returns:
        The age in years: the number itself for years, otherwise the number
        divided by 12, 52 or 365 for months, weeks or days. Returns 0 for a
        missing value ('nan'), for an empty or non-numeric label, and for an
        unrecognised unit, so a 0 result does not distinguish "unknown" from
        an age of zero.
    """
    label = str(x)
    if label == 'nan':
        return 0
    try:
        age = int(label.split()[0])
    except (IndexError, ValueError):
        # Empty or non-numeric labels are handled like an unrecognised unit.
        return 0
    if 'year' in label:
        return age
    # Units are checked in this order; the first one found in the label wins.
    for unit, units_per_year in (('month', 12.), ('week', 52.), ('day', 365.)):
        if unit in label:
            return age / units_per_year
    return 0

In [None]:
# Numeric age in years derived from the AgeuponOutcome label.
animals['AgeInYears'] = animals.AgeuponOutcome.apply(calc_age_in_years)
# Histogram of the ages with 20 bins and no density curve.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version before upgrading.
sns.distplot(animals.AgeInYears, bins = 20, kde=False)

In [None]:
def calc_age_category(x):
    """Bucket an age in years into 'young', 'young adult', 'adult' or 'old'.

    The upper bounds are exclusive: below 3 is 'young', below 5 is
    'young adult', below 10 is 'adult', and everything else is 'old'.
    """
    for upper_bound, label in ((3, 'young'), (5, 'young adult'), (10, 'adult')):
        if x < upper_bound:
            return label
    return 'old'
animals['AgeCategory'] = animals.AgeInYears.apply(calc_age_category)

In [None]:
# Left: outcomes broken down by age category. Right: age categories broken down by outcome.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
sns.countplot(x='OutcomeType', hue='AgeCategory', data=animals, ax=ax1)
sns.countplot(x='AgeCategory', hue='OutcomeType', data=animals, ax=ax2)