In [32]:
import pathlib
import pandas as pd
import numpy as np
import tempfile
import zipfile

In [2]:
# Load the training set from the zipped CSV.
load_path = pathlib.Path("../data/train.csv.zip")

# Read the "train.csv" member straight from the archive instead of extracting
# it to a temporary directory. Both the archive and the member stream are
# context-managed, so their handles are closed once the frame is in memory
# (previously the ZipFile object was never closed).
with zipfile.ZipFile(load_path) as zf, zf.open("train.csv") as csv_file:
    data = pd.read_csv(csv_file)

In [4]:
# Column-name constants used to address the frame.
TIME = "DateTime"  # date/time column
FEATURES = [
    "Name",
    "SexuponOutcome",
    "AnimalType",
    "AgeuponOutcome",
    "Breed",
    "Color",
]  # raw input columns
TARGET = "Outcome"  # label column

In [6]:
# Peek at the first ten rows of the raw feature columns.
data.loc[:, FEATURES].head(10)

Unnamed: 0,Name,SexuponOutcome,AnimalType,AgeuponOutcome,Breed,Color
0,Socks,Neutered Male,Cat,2 months,Domestic Shorthair Mix,Black/White
1,Vera,Intact Female,Cat,1 month,Domestic Shorthair Mix,Tortie/White
2,Biscuit,Neutered Male,Dog,3 months,Chihuahua Shorthair Mix,Yellow
3,Kitten,Spayed Female,Cat,2 years,Domestic Shorthair Mix,Calico
4,,Neutered Male,Cat,2 months,Domestic Shorthair Mix,Orange Tabby
5,London,Spayed Female,Dog,1 month,Rottweiler Mix,Black/Tan
6,Dixie,Spayed Female,Dog,4 months,Chihuahua Shorthair Mix,Buff
7,,Neutered Male,Dog,2 years,Chihuahua Shorthair Mix,Brown/White
8,Ruby,Spayed Female,Dog,3 years,English Coonhound/Italian Greyhound,Brown Brindle/White
9,Magnum,Neutered Male,Cat,2 years,Domestic Shorthair Mix,White/Black


In [11]:
# Peek at the first ten labels.
data.loc[:, TARGET].head(10)

0    0
1    3
2    2
3    0
4    0
5    0
6    0
7    3
8    0
9    1
Name: Outcome, dtype: int64

In [17]:
# Number of rows in the training set.
data.shape[0]

18710

In [18]:
# Non-null value count of every column, broken down by outcome class.
(
    data
    .groupby(by=TARGET, as_index=False)
    .count()
)

Unnamed: 0,Outcome,Name,SexuponOutcome,AnimalType,AgeuponOutcome,Breed,Color,DateTime,ID
0,0,6342,7538,7538,7538,7538,7538,7538,7538
1,1,3133,6595,6595,6583,6595,6595,6595,6595
2,2,3244,3350,3350,3350,3350,3350,3350,3350
3,3,515,1089,1089,1087,1089,1089,1089,1089
4,4,51,138,138,138,138,138,138,138


In [12]:
# Distinct animal types present in the data.
pd.unique(data["AnimalType"])

array(['Cat', 'Dog'], dtype=object)

In [14]:
# Categories before the fix-up (includes NaN for missing entries).
print(data["SexuponOutcome"].unique())

# Label missing entries with the "Unknown" category.
data.loc[data["SexuponOutcome"].isna(), "SexuponOutcome"] = "Unknown"

# Categories after the fix-up.
print(data["SexuponOutcome"].unique())


['Neutered Male' 'Intact Female' 'Spayed Female' 'Intact Male' 'Unknown'
 nan]
['Neutered Male' 'Intact Female' 'Spayed Female' 'Intact Male' 'Unknown']


In [20]:
# Split e.g. "Neutered Male" on whitespace into separate columns.
data["SexuponOutcome"].str.split(pat=None, expand=True)

Unnamed: 0,0,1
0,Neutered,Male
1,Intact,Female
2,Neutered,Male
3,Spayed,Female
4,Neutered,Male
...,...,...
18705,Neutered,Male
18706,Intact,Male
18707,Spayed,Female
18708,Intact,Male


In [47]:
# Turn the free-text "AgeuponOutcome" column (e.g. "2 years", "3 weeks") into
#   * num_age       - log of the age in approximate days, centred on its mean
#   * magnitude_age - the unit the age was reported in ("Unknown" when missing)
print(data["AgeuponOutcome"].unique())

# Approximate number of days per reporting unit.
DAYS_PER_UNIT = {"year": 365, "month": 30, "week": 7, "day": 1}

# Collapse plural units to their singular form ("2 years" -> "2 year").
# NOTE: this intentionally keeps overwriting data["AgeuponOutcome"], as before.
# regex=False pins literal matching; the default of `regex` differs across
# pandas versions.
for unit in DAYS_PER_UNIT:
    data["AgeuponOutcome"] = data["AgeuponOutcome"].str.replace(unit + "s", unit, regex=False)

splitted_age = data["AgeuponOutcome"].str.split(expand=True)
splitted_age.columns = ["num_age", "magnitude_age"]

# Convert every age to days. Units missing from the table (and missing units)
# get a factor of 1, which matches the previous behaviour of leaving them unscaled.
days_per_unit = splitted_age["magnitude_age"].map(DAYS_PER_UNIT).fillna(1)
splitted_age["num_age"] = splitted_age["num_age"].astype("float") * days_per_unit

# An age of zero (e.g. "0 year") is handled like a missing age: log(0) is -inf.
# Blank zeros out *before* computing the mean, so these placeholder values no
# longer drag down the number used to impute them.
splitted_age["num_age"] = splitted_age["num_age"].mask(splitted_age["num_age"] == 0.0)

mean_num_age = splitted_age["num_age"].mean(skipna=True)
splitted_age["num_age"] = splitted_age["num_age"].fillna(mean_num_age)

# Log-transform, then centre on the mean of the logged values.
splitted_age["num_age"] = np.log(splitted_age["num_age"])
splitted_age["num_age"] = splitted_age["num_age"] - splitted_age["num_age"].mean()

splitted_age["magnitude_age"] = splitted_age["magnitude_age"].fillna("Unknown")

# Attach the engineered columns and drop the raw text column they replace.
processed = pd.concat([data, splitted_age], axis=1).drop(columns="AgeuponOutcome")

splitted_age

['2 month' '1 month' '3 month' '2 year' '4 month' '3 year' '5 year'
 '8 month' '7 year' '1 week' '4 year' '1 year' '4 week' '6 year' '13 year'
 '8 year' '9 month' '6 month' '10 month' '9 year' '3 week' '2 week'
 '10 year' '3 day' '5 month' '2 day' '11 year' '12 year' '15 year'
 '14 year' '1 day' '11 month' '16 year' '7 month' '18 year' '6 day' nan
 '5 week' '5 day' '17 year' '4 day' '0 year' '19 year' '20 year']


Unnamed: 0,num_age,magnitude_age
0,-1.550950,month
1,-2.244098,month
2,-1.145485,month
3,0.947749,year
4,-1.550950,month
...,...,...
18705,-0.298188,month
18706,-1.145485,month
18707,0.947749,year
18708,0.947749,year


In [64]:
# Remove spaces from names. regex=False makes the single-character pattern an
# explicit literal: without it the call depends on the pandas version's default
# for `regex` and triggers a FutureWarning on pandas 1.x.
data["Name"] = data["Name"].str.replace(" ", "", regex=False)

# Number of rows per distinct name.
data.groupby("Name")["ID"].count()

Name
007           1
3Buster       1
Aaron         1
AaronElvis    1
Abbadon       1
             ..
Zuko          1
Zulma         1
Zulu          1
Zuzu          1
Zz            1
Name: ID, Length: 5061, dtype: int64

In [81]:
# Split "Color" on "/" into a first and a second colour column.
colors = data["Color"].str.split(pat="/", expand=True)
colors.columns = ["First_color", "Second_color"]

# Entries without a value in a column (e.g. no second colour) become "Unknown".
colors = colors.fillna("Unknown")
colors

Unnamed: 0,First_color,Second_color
0,Black,White
1,Tortie,White
2,Yellow,Unknown
3,Calico,Unknown
4,Orange Tabby,Unknown
...,...,...
18705,Chocolate,White
18706,Tan,Unknown
18707,White,Brown Brindle
18708,Black,Unknown


In [85]:
# Sort the two colours of each row alphabetically so that e.g. "White"/"Brown Brindle"
# and "Brown Brindle"/"White" end up as the same pair. The result gets a fresh default index.
pd.DataFrame(
    np.sort(colors.to_numpy(), axis=1),
    columns=colors.columns,
)

Unnamed: 0,First_color,Second_color
0,Black,White
1,Tortie,White
2,Unknown,Yellow
3,Calico,Unknown
4,Orange Tabby,Unknown
...,...,...
18705,Chocolate,White
18706,Tan,Unknown
18707,Brown Brindle,White
18708,Black,Unknown


In [90]:
# Show the rows whose "Breed" splits into three "/"-separated parts.
# The previous line ended in `.isna(])`, which is a syntax error; the listed rows
# all HAVE a third part, so the mask must select non-null values.
# In the rows displayed below, the extra part comes from the breed name
# "Black/Tan Hound", which itself contains a "/".
breed_parts = data["Breed"].str.split(pat="/", expand=True)
data[breed_parts[2].notna()]

Unnamed: 0,Name,SexuponOutcome,AnimalType,AgeuponOutcome,Breed,Color,DateTime,Outcome,ID
703,Jade,Spayed Female,Dog,6 month,Labrador Retriever/Black/Tan Hound,Black,2013-10-21 11:20:00,0,703
2447,Parker,Neutered Male,Dog,8 month,Black/Tan Hound/German Shepherd,Black/Tan,2013-12-19 15:05:00,1,2447
4004,Wyatt,Intact Male,Dog,4 month,Black/Tan Hound/Black Mouth Cur,Brown,2014-11-08 16:14:00,1,4004
5651,,Intact Female,Dog,7 month,Plott Hound/Black/Tan Hound,Brown/Tan,2014-01-01 13:00:00,1,5651
9778,Lucille,Intact Female,Dog,6 month,Labrador Retriever/Black/Tan Hound,Yellow,2013-10-16 17:05:00,1,9778
13403,,Neutered Male,Dog,2 month,Labrador Retriever/Black/Tan Hound,Black,2015-01-31 11:08:00,1,13403
18070,Cooper,Neutered Male,Dog,1 year,German Shepherd/Black/Tan Hound,Black/Tan,2014-10-25 12:23:00,0,18070
