In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw outcomes export and print its column dtypes / non-null counts.
data = pd.read_csv(
    filepath_or_buffer="RAW_Austin-Texas_AnimalCenter-Outcomes.csv",
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128740 entries, 0 to 128739
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Animal ID         128740 non-null  object
 1   Name              88719 non-null   object
 2   DateTime          128740 non-null  object
 3   MonthYear         128740 non-null  object
 4   Date of Birth     128740 non-null  object
 5   Outcome Type      128716 non-null  object
 6   Outcome Subtype   59349 non-null   object
 7   Animal Type       128740 non-null  object
 8   Sex upon Outcome  128739 non-null  object
 9   Age upon Outcome  128737 non-null  object
 10  Breed             128740 non-null  object
 11  Color             128740 non-null  object
dtypes: object(12)
memory usage: 11.8+ MB


<b>REMOVE duplicate rows, FILL null values</b>

In [3]:
# Remove rows that are exact duplicates across every column (first occurrence is kept).
data.drop_duplicates(subset=None, keep="first", inplace=True)

In [4]:
# Discard the "MonthYear" column; it is not used by any later cell.
data.drop(columns=["MonthYear"], inplace=True)

In [5]:
# Normalise the "Name" column:
#   1. missing names become "Unknown",
#   2. literal asterisks are stripped from the name,
#   3. names left empty after stripping also become "Unknown".
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection: the in-place form acts on a
# temporary Series and does not update `data` under pandas Copy-on-Write.
data["Name"] = (
    data["Name"]
    .fillna("Unknown")
    .str.replace("*", "", regex=False)
)
data.loc[data["Name"] == "", "Name"] = "Unknown"

In [6]:
# Replace missing outcome types with "Unknown".
# Assign the result back rather than using fillna(..., inplace=True) on the
# column selection, which does not update `data` under pandas Copy-on-Write.
data["Outcome Type"] = data["Outcome Type"].fillna("Unknown")

In [7]:
# Replace missing outcome subtypes with "No Subtype".
# Assign the result back rather than using fillna(..., inplace=True) on the
# column selection, which does not update `data` under pandas Copy-on-Write.
data["Outcome Subtype"] = data["Outcome Subtype"].fillna("No Subtype")

In [8]:
# Replace missing sex values with "Unknown".
# Assign the result back rather than using fillna(..., inplace=True) on the
# column selection, which does not update `data` under pandas Copy-on-Write.
data["Sex upon Outcome"] = data["Sex upon Outcome"].fillna("Unknown")

In [9]:
# Manually fill the records whose "Age upon Outcome" is missing.
# The affected rows can be listed with:
#data[data["Age upon Outcome"].isnull()]
# Only rows where the age is actually null are written, so any other record
# of the same animal that already carries an age is left untouched.
manual_ages = {
    "A834932": "1 year",
    "A839109": "4 years",
    "A837996": "3 years",
}
age_is_missing = data["Age upon Outcome"].isnull()
for animal_id, age_text in manual_ages.items():
    data.loc[age_is_missing & (data["Animal ID"] == animal_id), "Age upon Outcome"] = age_text

<b>ADJUST & CREATE additional COLUMNS</b>

In [10]:
# Rows whose "Name" is just a copy of the "Animal ID" are treated as unnamed.
name_is_id = data["Animal ID"].eq(data["Name"])
data.loc[name_is_id, "Name"] = "Unknown"

In [11]:
# Split "DateTime" on spaces (at most two splits) into three text columns:
# out_date, out_time1 and out_time2.
data[["out_date", "out_time1", "out_time2"]] = (
    data["DateTime"]
    .str.split(pat=" ", n=2, expand=True)
)

In [12]:
# Parse the month/day/year text in "out_date" into a datetime64 column.
# A single to_datetime call is enough: formatting the parsed dates back to
# day/month/year strings and parsing them again yields the same timestamps.
data["out_date"] = pd.to_datetime(data["out_date"], format="%m/%d/%Y")
# The original combined column is no longer needed once it has been split.
data.drop("DateTime", axis=1, inplace=True)

In [13]:
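# Spot check of one animal's records in date order. It uses the renamed
# `animal_id` column, so it only works after the rename cell has run:
#data[data["animal_id"]=="A761266"].sort_values(by="out_date")
# Dates noted during the check (dd/mm/yyyy) -- presumably the earliest and latest out_date; TODO confirm:
# 01/10/2013
# 17/07/2021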

In [14]:
# Rename the remaining source columns to short snake_case names.
# Each source column appears exactly once in the mapping ("Color" was
# previously listed twice in the dict literal).
data.rename(columns={"Animal ID":"animal_id",
                     "Name":"name",
                     "Date of Birth":"date_of_birth",
                     "Outcome Type":"out_type",
                     "Outcome Subtype":"out_subtype",
                     "Animal Type":"animal_type",
                     "Sex upon Outcome":"out_sex",
                     "Age upon Outcome":"out_age",
                     "Breed":"breed",
                     "Color":"color"}, inplace=True)

In [15]:
# Break "out_age" at the first whitespace into a count (auo1) and a unit label (auo2).
data[["auo1", "auo2"]] = data["out_age"].str.split(n=1, expand=True)
# Convert the count to an integer and floor negative counts at zero.
data["auo1"] = data["auo1"].astype("int").clip(lower=0)

In [16]:
# Build a per-row multiplier (auo3) that converts the age count into months,
# based on the unit label in auo2. Rows whose label matches none of the
# conditions receive np.select's default of 0.
conditions = [
    data["auo2"].isin(["day", "days"]),
    data["auo2"].isin(["week", "weeks"]),
    data["auo2"].isin(["month", "months"]),
    data["auo2"].isin(["year", "years"]),
]
# Multipliers in the same order as the conditions: day, week, month, year.
values = [1/30, 1/(30/7), 1, 12]
data["auo3"] = np.select(conditions, values)

In [17]:
# out_age2 = age count multiplied by its unit multiplier; the helper columns are then removed.
data["out_age2"] = data["auo1"].mul(data["auo3"])
data.drop(columns=["auo1", "auo2", "auo3"], inplace=True)

In [18]:
# Order the records by outcome date, ascending; the original row index is kept.
data.sort_values(by=["out_date"], ascending=True, inplace=True)

<b>RECHECK data</b>

In [19]:
# Print column dtypes and non-null counts again to verify the cleaning steps.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128724 entries, 67181 to 128739
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   animal_id      128724 non-null  object        
 1   name           128724 non-null  object        
 2   date_of_birth  128724 non-null  object        
 3   out_type       128724 non-null  object        
 4   out_subtype    128724 non-null  object        
 5   animal_type    128724 non-null  object        
 6   out_sex        128724 non-null  object        
 7   out_age        128724 non-null  object        
 8   breed          128724 non-null  object        
 9   color          128724 non-null  object        
 10  out_date       128724 non-null  datetime64[ns]
 11  out_time1      128724 non-null  object        
 12  out_time2      128724 non-null  object        
 13  out_age2       128724 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(12)
mem

In [20]:
# Display the cleaned frame for a visual check.
data

Unnamed: 0,animal_id,name,date_of_birth,out_type,out_subtype,animal_type,out_sex,out_age,breed,color,out_date,out_time1,out_time2,out_age2
67181,A664261,Unknown,09/10/2013,Transfer,Partner,Cat,Intact Female,3 weeks,Domestic Shorthair Mix,Tortie,2013-10-01,01:00:00,PM,0.700000
48547,A663342,Deco,09/17/2012,Euthanasia,Behavior,Dog,Neutered Male,1 year,Chesa Bay Retr Mix,Red/White,2013-10-01,12:05:00,PM,12.000000
48441,A656894,Jake,04/22/2013,Adoption,No Subtype,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Black,2013-10-01,11:53:00,AM,5.000000
46141,A664235,Unknown,09/24/2013,Transfer,Partner,Cat,Unknown,1 week,Domestic Shorthair Mix,Orange/White,2013-10-01,10:39:00,AM,0.233333
107465,A663888,Unknown,09/25/2011,Transfer,Partner,Dog,Spayed Female,2 years,Boxer Mix,Red/White,2013-10-01,11:13:00,AM,24.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128708,A831547,Leo,03/01/2021,Adoption,Foster,Cat,Neutered Male,4 months,Domestic Shorthair,Black/White,2021-07-17,09:48:00,AM,4.000000
128707,A831545,April,03/01/2021,Adoption,Foster,Cat,Spayed Female,4 months,Domestic Shorthair,Black/White,2021-07-17,09:48:00,AM,4.000000
128738,A835526,Lily,05/28/2019,Adoption,No Subtype,Dog,Spayed Female,2 years,German Shepherd,Black/Tan,2021-07-17,04:38:00,PM,24.000000
128723,A837464,Unknown,04/18/2021,Adoption,Foster,Cat,Spayed Female,2 months,Domestic Shorthair,Tortie,2021-07-17,01:56:00,PM,2.000000


<b>EXPORT data</b>

In [21]:
# Write the cleaned frame to CSV, leaving out the row index.
data.to_csv(
    path_or_buf="preprocessed_AnimalCenter-Outcomes.csv",
    index=False,
)