In [1]:
import datetime as dt

import numpy as np
import pandas as pd

In [2]:
#make pandas display large datasets without '...'s
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


In [3]:
data = pd.read_csv('data/raw_data.csv', index_col=0)

In [4]:
data.sample(10)  #what does the data look like?

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A705931,Gideon,2015-06-29 17:31:00,Adoption,,Dog,Neutered Male,8 years,Miniature Pinscher Mix,Black/Brown
A698194,Jackson,2015-03-17 17:13:00,Return_to_owner,,Dog,Neutered Male,2 years,Australian Cattle Dog,Blue Merle/White
A666546,Ami,2013-11-12 10:39:00,Euthanasia,Suffering,Cat,Intact Female,2 months,Domestic Shorthair Mix,Brown Tabby/White
A697180,Mr. Pickles,2015-02-20 14:00:00,Return_to_owner,,Dog,Neutered Male,7 years,Shih Tzu Mix,Cream
A684786,Reno,2014-08-03 15:47:00,Transfer,Partner,Dog,Intact Male,10 months,Border Collie/Catahoula,Brown Merle/White
A717008,Taz,2015-12-06 15:12:00,Transfer,Partner,Dog,Neutered Male,7 months,Border Collie Mix,Black/White
A668317,Scottie,2014-02-11 13:48:00,Transfer,Partner,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Black
A684471,Captain,2014-09-21 15:10:00,Adoption,,Dog,Neutered Male,2 years,Border Collie Mix,Sable
A718034,,2015-12-25 19:19:00,Transfer,SCRP,Cat,Neutered Male,6 years,Domestic Shorthair Mix,Brown Tabby
A563492,Sam,2015-03-10 08:26:00,Adoption,Foster,Dog,Neutered Male,5 years,Chihuahua Shorthair Mix,Tan


## standardize AgeuponOutcome to years

In [5]:
data.AgeuponOutcome.value_counts(dropna=False)  #what are the old values? for comparison with the result

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
12 years      234
9 months      224
1 weeks       171
11 months     166
1 week        146
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
NaN            18
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [6]:
def transform_to_years(age_string):
    '''takes as input the AgeuponOutcome feature of the raw data and outputs the corresponding number of years'''

    if age_string is np.nan:
        return np.nan

    split_string = age_string.split()

    if split_string[1].strip('s') == 'year':
        return float(split_string[0])

    elif split_string[1].strip('s') == 'month':
        return float(split_string[0]) / 12

    elif split_string[1].strip('s') == 'week':
        return float(split_string[0]) / 52

    elif split_string[1].strip('s') == 'day':
        return float(split_string[0]) / 365

In [7]:
data['AgeuponOutcome'] = data.AgeuponOutcome.apply(transform_to_years)  #apply the function

In [8]:
data.AgeuponOutcome.value_counts(dropna=False)  #looks good

1.000000     3969
2.000000     3742
0.166667     3397
3.000000     1823
0.083333     1281
0.250000     1277
4.000000     1071
5.000000      992
0.333333      888
6.000000      670
0.057692      659
0.416667      652
0.500000      588
8.000000      536
7.000000      531
0.038462      529
0.833333      457
10.000000     446
0.666667      402
0.076923      334
0.019231      317
9.000000      288
0.583333      288
12.000000     234
0.750000      224
0.916667      166
13.000000     143
11.000000     126
0.008219      109
0.005479       99
14.000000      97
15.000000      85
0.002740       66
0.010959       50
0.016438       50
16.000000      36
0.013699       24
0.000000       22
NaN            18
17.000000      17
0.096154       11
18.000000      10
19.000000       3
20.000000       2
Name: AgeuponOutcome, dtype: int64

## transform date column

In [9]:
#for now i just extract the year and month since these are the features we need for data understanding

data['year'] = data.DateTime.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').year)
data['month'] = data.DateTime.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').month)


## transform SexuponOutcome

In [10]:
data.SexuponOutcome.value_counts(dropna=False)  #what are the old values?

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
NaN                 1
Name: SexuponOutcome, dtype: int64

In [11]:
def transform_sex(sex_string, neutralized=False):
    '''takes as input the SexuponOutcome feature and returns just the sex or if the animal was neutralized'''

    if (sex_string is np.nan) or (sex_string == 'Unknown'):
        return np.nan

    split_string = sex_string.split()

    if neutralized:
        if split_string[0] == 'Intact':
            return False
        else:
            return True
    else:
        return split_string[1]

In [12]:
#apply the function
data['neutralized'] = data.SexuponOutcome.apply(lambda x: transform_sex(x, True))
data['SexuponOutcome'] = data.SexuponOutcome.apply(transform_sex)

In [13]:
data.SexuponOutcome.value_counts(dropna=False)  #looks good

Male      13304
Female    12331
NaN        1094
Name: SexuponOutcome, dtype: int64

In [14]:
data.neutralized.value_counts(dropna=False)  #looks good

True     18599
False     7036
NaN       1094
Name: neutralized, dtype: int64

Since these are a bit too many to handle we will try and reduce the number of colors.
Let's look at what values we are dealing with and how often they occur.

In [15]:
data.Color.value_counts()

Black/White                    2824
Black                          2292
Brown Tabby                    1635
Brown Tabby/White               940
White                           931
Brown/White                     884
Orange Tabby                    841
Tan/White                       773
Tricolor                        752
Blue/White                      702
Black/Tan                       672
White/Black                     643
Brown                           639
Tan                             628
White/Brown                     569
Tortie                          530
Calico                          517
Orange Tabby/White              455
Blue                            450
Brown Brindle/White             450
Black/Brown                     436
Blue Tabby                      433
White/Tan                       389
Red                             337
Torbie                          335
Brown/Black                     333
Red/White                       331
Blue Tabby/White            

In [16]:
data["ColorMix"] = data.Color.str.contains('/')
data.Color=data.Color.str.split('/', expand=True)[0]
data.Color.value_counts()

Black                6422
White                3344
Brown Tabby          2592
Brown                1951
Tan                  1674
Orange Tabby         1299
Blue                 1199
Tricolor              800
Red                   779
Brown Brindle         699
Blue Tabby            678
Tortie                580
Calico                552
Chocolate             448
Torbie                398
Sable                 324
Cream Tabby           277
Buff                  267
Yellow                230
Gray                  227
Cream                 221
Fawn                  209
Lynx Point            183
Blue Merle            165
Seal Point            158
Black Brindle          99
Flame Point            85
Gold                   77
Brown Merle            72
Black Smoke            62
Black Tabby            61
Silver                 53
Red Merle              52
Gray Tabby             51
Blue Tick              44
Orange                 43
Silver Tabby           41
Red Tick               40
Lilac Point 

In [17]:
data.ColorMix.value_counts()

True     13924
False    12805
Name: ColorMix, dtype: int64

In [18]:
print(f"We were able to reduce the number of unique colors to: {data.Color.nunique()}")

We were able to reduce the number of unique colors to: 57


We were able to reduce the number of colors quite a bit while still retaining the information that an animal has a pure 
or mixed color (which is actually almost half of the animals). This will probably suffice for this use case.

## save data with these features

In [19]:
data.sample(10)  #check another time

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,year,month,neutralized,ColorMix
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A685694,Little One,2015-03-06 08:00:00,Adoption,Foster,Cat,Female,0.583333,Domestic Shorthair Mix,Orange Tabby,2015,3,True,False
A427782,Midas,2014-04-10 16:29:00,Return_to_owner,,Dog,Male,9.0,Pit Bull,Tan,2014,4,True,True
A716218,Ladybug,2015-11-17 18:28:00,Adoption,,Dog,Female,4.0,Australian Shepherd Mix,Chocolate,2015,11,True,True
A718853,Sandy,2016-01-10 14:00:00,Return_to_owner,,Dog,Female,0.25,Dachshund/Rat Terrier,Brown,2016,1,True,True
A690804,,2014-10-27 09:00:00,Transfer,SCRP,Cat,Male,1.0,Domestic Shorthair Mix,Orange Tabby,2014,10,False,False
A703371,Nina,2015-05-31 17:29:00,Adoption,,Dog,Female,2.0,Cairn Terrier Mix,Tan,2015,5,True,True
A707763,Tinkerbell,2015-07-19 13:48:00,Adoption,Foster,Cat,Female,0.166667,Domestic Shorthair Mix,Cream Tabby,2015,7,True,False
A690533,Armani,2015-10-13 13:12:00,Transfer,Partner,Cat,Male,1.0,Domestic Shorthair Mix,Cream Tabby,2015,10,True,True
A712912,Goofy,2015-10-04 18:03:00,Adoption,,Dog,Male,0.916667,Australian Cattle Dog Mix,Tan,2015,10,True,True
A699574,Jupiter,2015-03-31 16:50:00,Return_to_owner,,Dog,Male,2.0,Basset Hound/English Cocker Spaniel,Black,2015,3,True,False


In [20]:
#save data
#data.to_csv('data/transformed_data.csv',index_label='AnimalID')

## transform to numeric features

transform non-numeric features for modeling

In [21]:
data_num = data.drop(labels=['Name','DateTime','OutcomeSubtype','Breed','Color'],axis=1)

In [22]:
data_num.head()

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,year,month,neutralized,ColorMix
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A671945,Return_to_owner,Dog,Male,1.0,2014,2,True,True
A656520,Euthanasia,Cat,Female,1.0,2013,10,True,False
A686464,Adoption,Dog,Male,2.0,2015,1,True,True
A683430,Transfer,Cat,Male,0.057692,2014,7,False,False
A667013,Transfer,Dog,Male,2.0,2013,11,True,False


In [23]:
data_num['AnimalType'] = data_num.AnimalType.apply(lambda x: 1 if x == 'Dog' else 0)

data_num['SexuponOutcome'] = data_num.SexuponOutcome.apply(lambda x: 1 if x == 'Female' else 0)

data_num['neutralized'] = data_num.neutralized.apply(float)

In [24]:
data_num.head()

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,year,month,neutralized,ColorMix
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A671945,Return_to_owner,1,0,1.0,2014,2,1.0,True
A656520,Euthanasia,0,1,1.0,2013,10,1.0,False
A686464,Adoption,1,0,2.0,2015,1,1.0,True
A683430,Transfer,0,0,0.057692,2014,7,0.0,False
A667013,Transfer,1,0,2.0,2013,11,1.0,False


In [25]:
#save data
#data_num.to_csv('data/transformed_data_num.csv',index_label='AnimalID')