In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

import helper as hlp

# Pretty display for notebooks: draw matplotlib figures inline, in the output of the cell
%matplotlib inline

# NOTE(review): the 'ignore' filter below silences every warning for the rest of the
# notebook (pandas deprecation and copy warnings included) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

## Fetch dataset and create in & out dataframes

In [None]:
incomes_endpoint = 'https://data.austintexas.gov/resource/wter-evkm.json'
outcomes_endpoint = 'https://data.austintexas.gov/resource/9t4d-g238.json'

# If '$limit' is not specified the API only returns 1000 records
params = {'$limit': 10000000}


def fetch_dataframe(endpoint, query_params, timeout=300):
    """Download a JSON endpoint and return its records as a DataFrame.

    Parameters
    ----------
    endpoint : str
        URL of the JSON resource.
    query_params : dict
        Query-string parameters sent with the request.
    timeout : float
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error payload
        is never turned into a DataFrame by mistake.
    """
    response = requests.get(endpoint, params=query_params, timeout=timeout)
    response.raise_for_status()
    return pd.DataFrame(response.json())


in_df = fetch_dataframe(incomes_endpoint, params)
out_df = fetch_dataframe(outcomes_endpoint, params)

# Parse the timestamps into dedicated, explicitly named columns
in_df['income_time'] = pd.to_datetime(in_df['datetime'])
out_df['outcome_time'] = pd.to_datetime(out_df['datetime'])
out_df['date_of_birth'] = pd.to_datetime(out_df['date_of_birth'])

# The raw timestamp columns are no longer needed once parsed
in_df = in_df.drop(columns=['datetime', 'datetime2'])
out_df = out_df.drop(columns=['monthyear', 'datetime'])

print('Incomes shape', in_df.shape)
print('Outcomes shape', out_df.shape)


In [None]:
# Preview the first rows of the intake records (head() shows 5 rows by default)
in_df.head()

In [None]:
# Preview the first rows of the outcome records (head() shows 5 rows by default)
out_df.head()

## Create unique id to merge in & out datasets

In [None]:
# Number of intake records per animal_id, most frequent first
in_df.loc[:, 'animal_id'].value_counts()

In [None]:
# To merge the two datasets we need a unique key that links each intake to its outcome. animal_id alone can't be used
# because one animal may stay in the shelter more than once. Thus, a new id must be created.

In [None]:
def create_income_id(df, dataframe_type):
    """Add an ``income_id`` column that identifies each shelter stay of an animal.

    The id is ``<animal_id>_<n>``, where ``n`` is the chronological position of the
    record among the records of the same animal (1 = earliest). Numbering both the
    intakes and the outcomes from the earliest record makes the n-th intake share
    its id with the n-th outcome, even when the latest stay has no outcome yet.
    (Numbering from the most recent record would pair the latest intake of an animal
    still in the shelter with the outcome of a previous stay.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``animal_id`` column and a ``<dataframe_type>_time`` column.
        It is modified in place.
    dataframe_type : str
        Prefix of the time column used for the ordering, e.g. ``'income'`` or
        ``'outcome'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``income_id`` column added.
    """
    time_column = dataframe_type + '_time'

    # Dense ranking: records of one animal sharing a timestamp get the same number,
    # and the numbering has no gaps
    occurrence = df.groupby('animal_id')[time_column].rank(method='dense', ascending=True).astype(int)

    # The new id is animal_id + '_' + occurrence number of the animal in the shelter
    df['income_id'] = df['animal_id'] + '_' + occurrence.astype(str)
    return df

In [None]:
# Tag every intake and every outcome record with the id of the stay it belongs to
in_df, out_df = (
    create_income_id(in_df, 'income'),
    create_income_id(out_df, 'outcome'),
)

## Check columns in common

In [None]:
# List the columns of both frames, then display the names they have in common
print('Columns of the income ->', in_df.columns)
print('Columns of the outcome ->', out_df.columns)
set(in_df.columns).intersection(out_df.columns)

## Remove columns to avoid duplicates in merge

In [None]:
# Drop from the outcomes the columns that the intakes also provide, so the merge
# does not produce duplicated columns. errors='ignore' keeps the cell re-runnable:
# columns already dropped by a previous run are skipped instead of raising KeyError.
out_df = out_df.drop(columns=['animal_type', 'breed', 'color', 'name', 'animal_id'], errors='ignore')

#### Merge

In [None]:
# Index both frames by the shared stay identifier so they can be joined on it
in_df.set_index('income_id', inplace=True)
out_df.set_index('income_id', inplace=True)

# An inner join only keeps the stays that have both an income and an outcome record
in_out_df = out_df.merge(
    in_df,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=['_outcome', '_income'],
)

In [None]:
# Compare the size of the merged frame against both of its inputs
print(in_out_df.columns)
for label, frame in (('Income shape', in_df), ('Outcome shape', out_df), ('In/out shape', in_out_df)):
    print(label, frame.shape)
# TODO: see why the row counts differ (animals with an income but not yet an outcome?)
in_out_df.head()

### Data cleaning

In [None]:
# Number of merged records for each animal type
animal_types = in_out_df.loc[:, 'animal_type'].value_counts()
animal_types

In [None]:
# Pie chart with the share of every animal type
labels = animal_types.keys()
wedge_offsets = [0.05] * len(labels)  # detach every wedge slightly from the centre
fig, ax = plt.subplots(figsize=[10, 7])
ax.pie(x=animal_types, labels=labels, explode=wedge_offsets, autopct="%.1f%%", pctdistance=0.5)
ax.set_title("Types of animals", fontsize=14);

In [None]:
# Remove animals of type 'Livestock' and 'Other' because they are of no use for the recommendation system
excluded_animal_types = ['Livestock', 'Other']
in_out_df = in_out_df.loc[~in_out_df['animal_type'].isin(excluded_animal_types)]

In [None]:
# Summarise the outcome types: distinct labels, number of records and frequency of each label
outcome_column = in_out_df['outcome_type']
total_records = len(in_out_df)
outcome_labels = outcome_column.unique()
outcome_types = outcome_column.value_counts()

print("Outcome labels: ", outcome_labels)
print("Records total: ", total_records)
print("Values: \n", outcome_types)

In [None]:
# Bar chart of the outcome types; every bar is labelled with its share of all records
ax = outcome_types.plot.bar(figsize=(10, 7), width=0.8, edgecolor=None)
ax.set_title("Outcome types", fontsize=16)

for bar in ax.patches:
    bar_height = bar.get_height()
    left, bottom = bar.get_xy()
    # Centre the label horizontally and place it just above the top of the bar
    label_position = (left + bar.get_width() / 2, bottom + bar_height * 1.02)
    ax.annotate(f'{(bar_height / total_records):.0%}', label_position, ha='center')


In [None]:
# Number of merged records for each intake type
intake_types = in_out_df.loc[:, 'intake_type'].value_counts()
intake_types

In [None]:
# Bar chart of the intake types; every bar is labelled with its share of all records
ax = intake_types.plot.bar(figsize=(10, 7), width=0.8, edgecolor=None)
ax.set_title("Intake types", fontsize=16)

for bar in ax.patches:
    bar_height = bar.get_height()
    left, bottom = bar.get_xy()
    # Centre the label horizontally and place it just above the top of the bar
    label_position = (left + bar.get_width() / 2, bottom + bar_height * 1.02)
    ax.annotate(f'{(bar_height / total_records):.0%}', label_position, ha='center')

In [None]:
# Discard the records whose outcome_type is 'Return to Owner': cases of lost animals
# should not be used to train the recommendation system
returned_to_owner = in_out_df['outcome_type'].eq('Return to Owner')
in_out_df = in_out_df.loc[~returned_to_owner]

## Create column time in shelter using outcome and income times

In [None]:
# Length of every stay: time elapsed between the income and the outcome
in_out_df = in_out_df.assign(
    time_in_shelter=in_out_df['outcome_time'] - in_out_df['income_time']
)
in_out_df['time_in_shelter'].describe()

In [None]:
# Keep only the rows whose outcome_time is strictly later than their income_time
outcome_after_income = in_out_df['outcome_time'] > in_out_df['income_time']
in_out_df = in_out_df.loc[outcome_after_income]

# Export dataframe

In [None]:
# Write the cleaned dataset to disk, creating the target directory first so the
# export does not fail when './data' does not exist yet
output_path = Path('./data/in_out_shelter.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
in_out_df.to_csv(output_path, encoding='utf-8')