Link to dataset: [Here](https://www.kaggle.com/aaronschlegel/austin-animal-center-shelter-intakes-and-outcomes)

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import helper as hlp
import numpy as np

# Pretty display for notebooks
%matplotlib inline

# Load data and cleanup

In [13]:
df_intakes = pd.read_csv('./data/aac_intakes.csv')
df_intakes_outcomes = pd.read_csv('./data/aac_intakes_outcomes.csv')
df_outcomes = pd.read_csv('./data/aac_outcomes.csv')

## Check dataframe shapes

In [14]:
print("Intakes:\n\tLines: {}\n\tColumns: {}".format(df_intakes.shape[0], df_intakes.shape[1]))
print("Intakes_outcomes:\n\tLines: {}\n\tColumns: {}".format(df_intakes_outcomes.shape[0], df_intakes_outcomes.shape[1]))
print("Outcomes:\n\tLines: {}\n\tColumns: {}".format(df_outcomes.shape[0], df_outcomes.shape[1]))

Intakes:
	Lines: 80187
	Columns: 12
Intakes_outcomes:
	Lines: 79672
	Columns: 41
Outcomes:
	Lines: 80681
	Columns: 12


## Check head content

### Intakes

In [15]:
df_intakes.head(5)

Unnamed: 0,age_upon_intake,animal_id,animal_type,breed,color,datetime,datetime2,found_location,intake_condition,intake_type,name,sex_upon_intake
0,8 years,A706918,Dog,English Springer Spaniel,White/Liver,2015-07-05T12:59:00.000,2015-07-05T12:59:00.000,9409 Bluegrass Dr in Austin (TX),Normal,Stray,Belle,Spayed Female
1,11 months,A724273,Dog,Basenji Mix,Sable/White,2016-04-14T18:43:00.000,2016-04-14T18:43:00.000,2818 Palomino Trail in Austin (TX),Normal,Stray,Runster,Intact Male
2,4 weeks,A665644,Cat,Domestic Shorthair Mix,Calico,2013-10-21T07:59:00.000,2013-10-21T07:59:00.000,Austin (TX),Sick,Stray,,Intact Female
3,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29T10:38:00.000,2014-06-29T10:38:00.000,800 Grove Blvd in Austin (TX),Normal,Stray,Rio,Neutered Male
4,2 years,A743852,Dog,Labrador Retriever Mix,Chocolate,2017-02-18T12:46:00.000,2017-02-18T12:46:00.000,Austin (TX),Normal,Owner Surrender,Odin,Neutered Male


## Intakes_outcomes

In [16]:
df_intakes_outcomes.head(4)

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
0,10 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,3650,10.0,"(7.5, 10.0]",2017-12-07 14:07:00,...,"(7.5, 10.0]",2017-12-07 00:00:00,12,2017,2017-12,Thursday,14,1.0,0 days 14:07:00.000000000,0.588194
1,7 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2555,7.0,"(5.0, 7.5]",2014-12-20 16:35:00,...,"(5.0, 7.5]",2014-12-19 10:21:00,12,2014,2014-12,Friday,10,2.0,1 days 06:14:00.000000000,1.259722
2,6 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2190,6.0,"(5.0, 7.5]",2014-03-08 17:10:00,...,"(5.0, 7.5]",2014-03-07 14:26:00,3,2014,2014-03,Friday,14,3.0,1 days 02:44:00.000000000,1.113889
3,10 years,A047759,2004-04-02 00:00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",2014-04-07 15:12:00,...,"(7.5, 10.0]",2014-04-02 15:55:00,4,2014,2014-04,Wednesday,15,1.0,4 days 23:17:00.000000000,4.970139


### Outcomes

In [17]:
df_outcomes.head(4)

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male


### Clean the data

In [23]:
df_intakes = hlp.transform_age(df_intakes, 'age_upon_intake')
df_intakes = hlp.transform_date(df_intakes, 'intake')
print(df_intakes.shape)
df_intakes.head()

(80187, 23)


Unnamed: 0,age_upon_intake,animal_id,animal_type,breed,color,datetime,datetime2,found_location,intake_condition,intake_type,...,age_upon_intake_Period Range,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour
0,8 years,A706918,Dog,English Springer Spaniel,White/Liver,2015-07-05T12:59:00.000,2015-07-05T12:59:00.000,9409 Bluegrass Dr in Austin (TX),Normal,Stray,...,365,2920,8.0,"(7.5, 10.0]",2015-07-05 12:59:00,7,2015,2015-07,Sunday,12
1,11 months,A724273,Dog,Basenji Mix,Sable/White,2016-04-14T18:43:00.000,2016-04-14T18:43:00.000,2818 Palomino Trail in Austin (TX),Normal,Stray,...,30,330,0.90411,"(-0.025, 2.5]",2016-04-14 18:43:00,4,2016,2016-04,Thursday,18
2,4 weeks,A665644,Cat,Domestic Shorthair Mix,Calico,2013-10-21T07:59:00.000,2013-10-21T07:59:00.000,Austin (TX),Sick,Stray,...,7,28,0.076712,"(-0.025, 2.5]",2013-10-21 07:59:00,10,2013,2013-10,Monday,7
3,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29T10:38:00.000,2014-06-29T10:38:00.000,800 Grove Blvd in Austin (TX),Normal,Stray,...,365,1460,4.0,"(2.5, 5.0]",2014-06-29 10:38:00,6,2014,2014-06,Sunday,10
4,2 years,A743852,Dog,Labrador Retriever Mix,Chocolate,2017-02-18T12:46:00.000,2017-02-18T12:46:00.000,Austin (TX),Normal,Owner Surrender,...,365,730,2.0,"(-0.025, 2.5]",2017-02-18 12:46:00,2,2017,2017-02,Saturday,12


In [25]:
df_outcomes = hlp.transform_age(df_outcomes, 'age_upon_outcome')
df_outcomes = hlp.transform_date(df_outcomes, 'outcome')
print(df_outcomes.shape)
df_outcomes.head()

(80681, 23)


Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,...,age_upon_outcome_Period Range,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,outcome_month,outcome_year,outcome_monthyear,outcome_weekday,outcome_hour
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,...,7,14,0.038356,"(-0.025, 2.5]",2014-07-22 16:04:00,7,2014,2014-07,Tuesday,16
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,...,365,365,1.0,"(-0.025, 2.5]",2013-11-07 11:47:00,11,2013,2013-11,Thursday,11
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,...,365,365,1.0,"(-0.025, 2.5]",2014-06-03 14:20:00,6,2014,2014-06,Tuesday,14
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,...,365,3285,9.0,"(7.5, 10.0]",2014-06-15 15:50:00,6,2014,2014-06,Sunday,15
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,2014-07-07T14:04:00,,Rabies Risk,...,30,150,0.410959,"(-0.025, 2.5]",2014-07-07 14:04:00,7,2014,2014-07,Monday,14


### Combine intake and outakes

In [9]:
set(df_intakes.columns).intersection(df_outcomes.columns)

{'animal_id', 'animal_type', 'breed', 'color', 'datetime', 'name'}

In [10]:
# Remove duplicated columns before merging
df_outcomes.drop(['animal_type', 'breed', 'color', 'name', 'datetime', 'animal_type'], axis=1, inplace=True)

In [11]:
# Merge datasets
df_outcomes.set_index('animal_id_new', inplace=True)
df_intakes.set_index('animal_id_new', inplace=True)

in_out = pd.merge(intakes_df, outcomes_df, how='inner', right_index=True, left_index=True, suffixes=['_intake','_outcome'])

KeyError: "None of ['animal_id_new'] are in the columns"