## Import packages

In [21]:
import pandas as pd
import numpy as np
import re
# Display Setting
from IPython.display import display
pd.options.display.max_colwidth=100
pd.options.display.float_format="{:.2f}".format
pd.set_option("display.max_columns", None)
import warnings
warnings.simplefilter('ignore')

# Exploratory data analysis
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as pl
import chart_studio.plotly as py
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
sns.set_theme(style='darkgrid')  # default style
import tensorflow as tf
np.set_printoptions(precision=3, suppress=True)  # improve float readability
from sklearn import datasets


# Data preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc,roc_curve,classification_report,confusion_matrix,mean_absolute_error,mean_squared_error,root_mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
%matplotlib inline

## Helper Function

In [22]:
def calculate_days(col):
    if pd.isna(col):
        return np.nan
    
    col = str(col).strip().lower()
    units = re.match(r"^\s*(\d+)\s*(year|years|yr|yrs|month|months|mo|mos|week|weeks|wk|wks|day|days)\s*$",col)
    if not units:
        return np.nan
    val = float(units.group(1)); 
    unit = units.group(2)
    if unit in {"year","years","yr","yrs"}:
        return val * 365
    if unit in {"month","months","mo","mos"}:
        return val * 30
    if unit in {"week","weeks","wk","wks"}:
        return val * 7
    if unit in {"day","days"}:
        return val
    return np.nan
    

## Read data

Import raw data

In [23]:
animal_intakes_raw_data = pd.read_csv("Austin_Animal_Center_Intakes.csv")
animal_outcomes_raw_data = pd.read_csv("Austin_Animal_Center_Outcomes.csv")

In [24]:
print("In the intake dataset, we have {} records with {} variables".format(*animal_intakes_raw_data.shape))
print("In the outcome dataset, we have {} records with {} variables".format(*animal_outcomes_raw_data.shape))

In the intake dataset, we have 173812 records with 12 variables
In the outcome dataset, we have 173775 records with 12 variables


In [25]:
animal_intakes_raw_data.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A521520,Nina,10/01/2013 07:51:00 AM,October 2013,Norht Ec in Austin (TX),Stray,Normal,Dog,Spayed Female,7 years,Border Terrier/Border Collie,White/Tan
1,A664235,,10/01/2013 08:33:00 AM,October 2013,Abia in Austin (TX),Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White
2,A664236,,10/01/2013 08:33:00 AM,October 2013,Abia in Austin (TX),Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White
3,A664237,,10/01/2013 08:33:00 AM,October 2013,Abia in Austin (TX),Stray,Normal,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White
4,A664233,Stevie,10/01/2013 08:53:00 AM,October 2013,7405 Springtime in Austin (TX),Stray,Injured,Dog,Intact Female,3 years,Pit Bull Mix,Blue/White


In [26]:
animal_intakes_raw_data['Intake Type'].value_counts()

Intake Type
Stray                 119160
Owner Surrender        35563
Public Assist          10432
Wildlife                6483
Abandoned               1910
Euthanasia Request       264
Name: count, dtype: int64

In [27]:
animal_intakes_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173812 entries, 0 to 173811
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         173812 non-null  object
 1   Name              123821 non-null  object
 2   DateTime          173812 non-null  object
 3   MonthYear         173812 non-null  object
 4   Found Location    173812 non-null  object
 5   Intake Type       173812 non-null  object
 6   Intake Condition  173812 non-null  object
 7   Animal Type       173812 non-null  object
 8   Sex upon Intake   173811 non-null  object
 9   Age upon Intake   173812 non-null  object
 10  Breed             173812 non-null  object
 11  Color             173812 non-null  object
dtypes: object(12)
memory usage: 15.9+ MB


In [28]:
animal_intakes_raw_data.describe()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
count,173812,123821,173812,173812,173812,173812,173812,173812,173811,173812,173812,173812
unique,156287,29774,119722,140,70183,6,20,5,5,55,3006,661
top,A721033,Luna,09/23/2016 12:00:00 PM,June 2015,Austin (TX),Stray,Normal,Dog,Intact Male,1 year,Domestic Shorthair Mix,Black/White
freq,33,761,64,2189,31541,119160,147141,94608,58996,28294,33665,17976


In [29]:
animal_outcomes_raw_data.head()


Unnamed: 0,Animal ID,Date of Birth,Name,DateTime,MonthYear,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A668305,2012-12-01,,2013-12-02T00:00:00-05:00,12-2013,Transfer,Partner,Other,Unknown,1 year,Turtle Mix,Brown/Yellow
1,A673335,2012-02-22,,2014-02-22T00:00:00-05:00,02-2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon,Black/Gray
2,A675999,2013-04-03,,2014-04-07T00:00:00-05:00,04-2014,Transfer,Partner,Other,Unknown,1 year,Turtle Mix,Green
3,A679066,2014-04-16,,2014-05-16T00:00:00-05:00,05-2014,,,Other,Unknown,4 weeks,Rabbit Sh,Brown
4,A680855,2014-05-25,,2014-06-10T00:00:00-05:00,06-2014,Transfer,Partner,Bird,Unknown,2 weeks,Duck,Yellow/Black


In [30]:
animal_outcomes_raw_data['Outcome Type'].value_counts()

Outcome Type
Adoption           84598
Transfer           48689
Return to Owner    25691
Euthanasia         10833
Died                1672
Rto-Adopt           1241
Disposal             877
Missing               92
Relocate              29
Stolen                 5
Lost                   2
Name: count, dtype: int64

In [31]:
animal_outcomes_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173775 entries, 0 to 173774
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         173775 non-null  object
 1   Date of Birth     173775 non-null  object
 2   Name              123991 non-null  object
 3   DateTime          173775 non-null  object
 4   MonthYear         173775 non-null  object
 5   Outcome Type      173729 non-null  object
 6   Outcome Subtype   79660 non-null   object
 7   Animal Type       173775 non-null  object
 8   Sex upon Outcome  173774 non-null  object
 9   Age upon Outcome  173766 non-null  object
 10  Breed             173775 non-null  object
 11  Color             173775 non-null  object
dtypes: object(12)
memory usage: 15.9+ MB


## EDA 

## Data Preprocessing 

In [32]:
animal_intakes_data = animal_intakes_raw_data.copy()

# Check Intakes ID uniqueness
animal_intakes_data['Readoption'] = animal_intakes_data.groupby(['Animal ID']).cumcount()
animal_intakes_data['Animal ID'] = animal_intakes_data.apply(lambda x : f"{x['Animal ID']}_{x['Readoption']}" if x['Readoption'] > 0 else x['Animal ID'],axis=1)

# Fill name as N/A if no name
# animal_intakes_data['Intake Name'] = animal_intakes_data['Name'].fillna('Unknown')

animal_intakes_data.insert(1,'Intake Name',animal_intakes_data['Name'].fillna('Unknown'))
# Format intake date
animal_intakes_data['DateTime'] = animal_intakes_data['DateTime'].apply(lambda x : x[0:10])
animal_intakes_data.insert(2,'Intake Date',pd.to_datetime(animal_intakes_data['DateTime'],errors = "coerce").dt.date)
# animal_intakes_data['Intake Date'] = pd.to_datetime(animal_intakes_data['DateTime'],errors = "coerce").dt.date

# Transfer age to days
# animal_intakes_data['Age upon Intake (days)'] = animal_intakes_data['Age upon Intake'].apply(calculate_days)

# Drop duplicated columns
animal_intakes_data.drop(['Name','DateTime','MonthYear','Age upon Intake'],axis=1,inplace=True)

In [33]:
animal_intakes_data

Unnamed: 0,Animal ID,Intake Name,Intake Date,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color,Readoption
0,A521520,Nina,2013-10-01,Norht Ec in Austin (TX),Stray,Normal,Dog,Spayed Female,Border Terrier/Border Collie,White/Tan,0
1,A664235,Unknown,2013-10-01,Abia in Austin (TX),Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,0
2,A664236,Unknown,2013-10-01,Abia in Austin (TX),Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,0
3,A664237,Unknown,2013-10-01,Abia in Austin (TX),Stray,Normal,Cat,Unknown,Domestic Shorthair Mix,Orange/White,0
4,A664233,Stevie,2013-10-01,7405 Springtime in Austin (TX),Stray,Injured,Dog,Intact Female,Pit Bull Mix,Blue/White,0
...,...,...,...,...,...,...,...,...,...,...,...
173807,A929690,Unknown,2025-05-03,8038 Exchange Dr in Austin (TX),Stray,Injured,Dog,Intact Male,Belgian Malinois,Brown/Black,0
173808,A929717,Unknown,2025-05-04,Austin (TX),Public Assist,Normal,Dog,Intact Male,Shih Tzu Mix,White/Blue,0
173809,A929724,Unknown,2025-05-04,7105 Providence Ave Apt 3 in Austin (TX),Stray,Normal,Other,Unknown,Rabbit Sh,Tan/White,0
173810,A929725,Oswold,2025-05-04,1501 Red River St in Austin (TX),Public Assist,Normal,Dog,Intact Male,Boxer Mix,Tan/White,0


In [34]:
animal_outcomes_data = animal_outcomes_raw_data.copy()

# Check ID uniqueness
animal_outcomes_data['Readoption'] = animal_outcomes_data.groupby(['Animal ID']).cumcount()
animal_outcomes_data['Animal ID'] = animal_outcomes_data.apply(lambda x : f"{x['Animal ID']}_{x['Readoption']}" if x['Readoption'] > 0 else x['Animal ID'],axis=1)

# change DOB dtype
animal_outcomes_data['Date of Birth'] = pd.to_datetime(animal_outcomes_data['Date of Birth'],errors = "coerce").dt.date

# Fill name as N/A if no name

animal_outcomes_data.insert(1,'Outcome Name',animal_outcomes_data['Name'].fillna('Unknown'))
#  Format outcome date
animal_outcomes_data['DateTime'] = animal_outcomes_data['DateTime'].apply(lambda x : x[0:10])
animal_outcomes_data.insert(3,'Outcome Date',pd.to_datetime(animal_outcomes_data['DateTime'],errors = "coerce").dt.date)

# Transfer age to days
# animal_outcomes_data['Age upon Outcome (days)'] = animal_outcomes_data['Age upon Outcome'].apply(calculate_days)

# Drop duplicated columns
animal_outcomes_data = animal_outcomes_data[['Animal ID','Outcome Date', 'Outcome Name', 'Date of Birth', 'Outcome Type', 'Outcome Subtype']]

In [35]:
# merge two dataset by animal ID
animal_data = pd.merge(animal_intakes_data,animal_outcomes_data,on='Animal ID',how='inner')

In [36]:
# Calcuate Length of Stay
animal_data["Length of Stay (days)"] = (animal_data['Outcome Date'] - animal_data['Intake Date']).apply(lambda x: x.days)

animal_data.insert(8,'Age upon Intake (days)',((animal_data['Intake Date'] - animal_data['Date of Birth']).apply(lambda x: x.days))/30)

In [37]:
animal_data = animal_data[animal_data['Animal Type'].isin(['Dog','Cat'])]

In [51]:
# animal_data[animal_data['Age upon Intake (days)'] < 0]

In [39]:
animal_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162987 entries, 0 to 172849
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Animal ID               162987 non-null  object 
 1   Intake Name             162987 non-null  object 
 2   Intake Date             162987 non-null  object 
 3   Found Location          162987 non-null  object 
 4   Intake Type             162987 non-null  object 
 5   Intake Condition        162987 non-null  object 
 6   Animal Type             162987 non-null  object 
 7   Sex upon Intake         162986 non-null  object 
 8   Age upon Intake (days)  162987 non-null  float64
 9   Breed                   162987 non-null  object 
 10  Color                   162987 non-null  object 
 11  Readoption              162987 non-null  int64  
 12  Outcome Date            162987 non-null  object 
 13  Outcome Name            162987 non-null  object 
 14  Date of Birth           1

In [40]:
animal_data.describe()

Unnamed: 0,Age upon Intake (days),Readoption,Length of Stay (days)
count,162987.0,162987.0,162987.0
mean,25.17,0.15,21.53
std,35.43,0.56,58.57
min,-51.83,0.0,-2746.0
25%,2.03,0.0,2.0
50%,12.17,0.0,6.0
75%,26.97,0.0,21.0
max,292.2,32.0,1913.0


In [41]:
print(f"Data usage:{162987/173812}")

Data usage:0.937720065357973


In [42]:
animal_data


Unnamed: 0,Animal ID,Intake Name,Intake Date,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake (days),Breed,Color,Readoption,Outcome Date,Outcome Name,Date of Birth,Outcome Type,Outcome Subtype,Length of Stay (days)
0,A521520,Nina,2013-10-01,Norht Ec in Austin (TX),Stray,Normal,Dog,Spayed Female,86.03,Border Terrier/Border Collie,White/Tan,0,2013-10-01,Nina,2006-09-07,Return to Owner,,0
1,A664235,Unknown,2013-10-01,Abia in Austin (TX),Stray,Normal,Cat,Unknown,0.23,Domestic Shorthair Mix,Orange/White,0,2013-10-01,Unknown,2013-09-24,Transfer,Partner,0
2,A664236,Unknown,2013-10-01,Abia in Austin (TX),Stray,Normal,Cat,Unknown,0.23,Domestic Shorthair Mix,Orange/White,0,2013-10-01,Unknown,2013-09-24,Transfer,Partner,0
3,A664237,Unknown,2013-10-01,Abia in Austin (TX),Stray,Normal,Cat,Unknown,0.23,Domestic Shorthair Mix,Orange/White,0,2013-10-01,Unknown,2013-09-24,Transfer,Partner,0
4,A664233,Stevie,2013-10-01,7405 Springtime in Austin (TX),Stray,Injured,Dog,Intact Female,36.57,Pit Bull Mix,Blue/White,0,2013-10-01,Stevie,2010-09-30,Euthanasia,Suffering,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172845,A929600,Unknown,2025-05-02,25204 Fawn Drive in Travis (TX),Stray,Neonatal,Cat,Unknown,0.13,Domestic Shorthair,Brown Tabby,0,2025-05-02,Unknown,2025-04-28,Transfer,Partner,0
172846,A929602,Unknown,2025-05-02,25204 Fawn Drive in Travis (TX),Stray,Normal,Cat,Unknown,12.17,Domestic Shorthair,Black,0,2025-05-03,Unknown,2024-05-02,Transfer,Partner,1
172847,A848454_1,Olli,2025-05-02,Austin (TX),Owner Surrender,Normal,Cat,Spayed Female,134.33,Domestic Shorthair Mix,Orange Tabby,1,2025-05-04,Olli,2014-04-20,Adoption,,2
172848,A929631,Fiona,2025-05-02,Fm 1327 And Bradshaw in Austin (TX),Stray,Injured,Dog,Spayed Female,203.47,Chihuahua Shorthair/Dachshund,Tan,0,2025-05-03,Fiona,2008-08-15,Euthanasia,At Vet,1


In [68]:
(((pd.to_datetime(animal_data['Intake Date']).dt.year).value_counts(sort='Intake Date'))/162987)


Intake Date
2019   0.12
2015   0.11
2014   0.11
2016   0.10
2017   0.10
2018   0.10
2021   0.07
2022   0.07
2024   0.07
2023   0.07
2020   0.05
2013   0.02
2025   0.02
Name: count, dtype: float64

In [43]:
""" 
Set up a for loop to iterate over each feature
Print the name of the feature (excel header)
Plot how many are empty/null/na/0
Print data type
If categorical:
Print top values, 
print unique values [na, 'missing', 'did not provide']
If numerical:
Print max, mean, plot distributions [ages may be off, duration may not make sense]
Put together a rough data dictionary, provide to stakeholders for feedback (share the cleanup load!)
Put together a correlation matrix
Spearman
Pearson
"""

" \nSet up a for loop to iterate over each feature\nPrint the name of the feature (excel header)\nPlot how many are empty/null/na/0\nPrint data type\nIf categorical:\nPrint top values, \nprint unique values [na, 'missing', 'did not provide']\nIf numerical:\nPrint max, mean, plot distributions [ages may be off, duration may not make sense]\nPut together a rough data dictionary, provide to stakeholders for feedback (share the cleanup load!)\nPut together a correlation matrix\nSpearman\nPearson\n"

In [44]:
col_dict = []
def col_info(df):
    for col in df:
        entry = {
            "feature":col,
            "dtype":str(df[col].dtype),
            "N/A values": df[col].isnull().sum(),
            "N/A values %": (df[col].isnull().mean()* 100).round(),
            "unique values":df[col].nunique(),
        }
        
        col_dict.append(entry)
    return col_dict

In [46]:
intake_col_info = col_info(animal_data)

#### Feature Engineering

In [24]:
# Drop duplicated columns : DateTime,MonthYear

### Missing value:

# only one Sex Type is missing 
display(animal_data[animal_data['Sex upon Intake'].isnull()])
display(animal_data[animal_data['Intake Name'] == 'Diego'][['Intake Name','Sex upon Intake']].value_counts())
# Check all other dog named 'Diego' and all of them are Male. So we full the unknown sex to Intact Male
animal_data.loc[2375,'Sex upon Intake'] = "Intact Male"

### outliers: remove - age
animal_data['Age upon Intake'].value_counts() 
# Feature selection :
# Drop OutcomeSubtype
# drop Animal Type ! = "Dog" or "Cat"
animal_data['Animal Type'].value_counts()
# do not include covid period 

Unnamed: 0,Animal ID,Intake Name,Intake Date,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake (days),Breed,Color,Readoption,Outcome Date,Outcome Name,Date of Birth,Outcome Type,Outcome Subtype,Length of Stay (days)
2375,A667395,Diego,2013-11-17,Pflugerville (TX),Owner Surrender,Normal,Dog,,2557,Dachshund,Brown Merle,0,2013-11-27,Diego,2006-11-17,Return to Owner,,10


Intake Name  Sex upon Intake
Diego        Neutered Male      23
             Intact Male        17
Name: count, dtype: int64

KeyError: 'Age upon Intake'