In [1]:
import datetime as dt

import numpy as np
import pandas as pd

In [2]:
# Lift every pandas display cap so that large frames and long value_counts()
# listings are printed in full instead of being elided with '...'.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)


In [3]:
# Load the raw records from disk.
data = pd.read_csv(
    'data/raw_data.csv',
    index_col=0,  # the first CSV column supplies the row labels
)

In [4]:
# Eyeball ten randomly drawn rows to get a feel for the raw columns.
data.sample(n=10)

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A680765,Dino,2014-06-07 16:51:00,Return_to_owner,,Dog,Neutered Male,10 years,Labrador Retriever Mix,Black
A674786,Jake,2014-03-18 17:29:00,Return_to_owner,,Dog,Neutered Male,6 years,Jindo Mix,Red
A694104,Max,2014-12-26 15:47:00,Transfer,Partner,Cat,Neutered Male,7 years,Persian Mix,Brown Tabby
A701589,,2015-05-02 16:25:00,Return_to_owner,,Dog,Intact Female,2 years,Boston Terrier Mix,Black/White
A686435,Pancho,2014-09-04 15:31:00,Transfer,Partner,Dog,Neutered Male,4 months,Dachshund Mix,Brown/White
A686149,,2014-08-21 11:59:00,Transfer,Partner,Dog,Neutered Male,5 years,Shih Tzu Mix,Black/White
A720338,Jessie,2016-02-08 13:54:00,Return_to_owner,,Dog,Spayed Female,3 years,Labrador Retriever Mix,Tan
A668615,Cassiel,2013-12-17 12:22:00,Adoption,,Dog,Neutered Male,11 months,Dachshund Mix,White/Red
A663987,Dodie,2013-10-03 17:45:00,Adoption,,Dog,Spayed Female,10 months,Pit Bull Mix,Blue
A718123,Joy Smeagle,2015-12-27 17:03:00,Adoption,,Dog,Neutered Male,2 years,Pekingese Mix,Brown/White


## standardize AgeuponOutcome to years

In [5]:
# Tally the raw age strings (missing values included) as a baseline to
# compare against once the column has been converted.
data['AgeuponOutcome'].value_counts(dropna=False)

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
12 years      234
9 months      224
1 weeks       171
11 months     166
1 week        146
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
NaN            18
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [6]:
def transform_to_years(age_string):
    '''Convert a raw AgeuponOutcome value such as '3 weeks' into an age in years.

    Parameters
    ----------
    age_string : str or missing
        Text of the form '<count> <unit>', where the unit is year(s),
        month(s), week(s) or day(s). Missing values (np.nan, float('nan'),
        None) are accepted.

    Returns
    -------
    float
        The age in years, using 12 months, 52 weeks or 365 days per year.
        np.nan is returned for missing input, for strings with fewer than
        two tokens, and for unrecognised units.

    Raises
    ------
    ValueError
        If the unit is recognised but the count token is not a valid number.
    '''
    # pd.isna also catches None and NaN objects that are not the np.nan
    # singleton; an identity test (`is np.nan`) would let those fall through
    # to .split() and crash.
    if pd.isna(age_string):
        return np.nan

    tokens = age_string.split()
    if len(tokens) < 2:
        return np.nan

    count, unit = tokens[0], tokens[1]

    # How many of each unit make up one year; plural forms are handled by
    # dropping the trailing 's'.
    units_per_year = {'year': 1, 'month': 12, 'week': 52, 'day': 365}
    divisor = units_per_year.get(unit.rstrip('s'))
    if divisor is None:
        # Unknown unit: return an explicit NaN rather than an implicit None.
        return np.nan

    return float(count) / divisor

In [7]:
# Replace the age text with its numeric value in years.
# NOTE: this overwrites the column in place, so the cell is meant to run once
# per freshly loaded frame.
data['AgeuponOutcome'] = data['AgeuponOutcome'].map(transform_to_years)

In [8]:
# Same tally after the conversion: the counts should mirror the raw listing,
# with the '1 week' and '1 weeks' spellings merged into a single value.
data['AgeuponOutcome'].value_counts(dropna=False)

1.000000     3969
2.000000     3742
0.166667     3397
3.000000     1823
0.083333     1281
0.250000     1277
4.000000     1071
5.000000      992
0.333333      888
6.000000      670
0.057692      659
0.416667      652
0.500000      588
8.000000      536
7.000000      531
0.038462      529
0.833333      457
10.000000     446
0.666667      402
0.076923      334
0.019231      317
9.000000      288
0.583333      288
12.000000     234
0.750000      224
0.916667      166
13.000000     143
11.000000     126
0.008219      109
0.005479       99
14.000000      97
15.000000      85
0.002740       66
0.010959       50
0.016438       50
16.000000      36
0.013699       24
0.000000       22
NaN            18
17.000000      17
0.096154       11
18.000000      10
19.000000       3
20.000000       2
Name: AgeuponOutcome, dtype: int64

## transform date column

In [9]:
# For now only the year and the month are extracted, since these are the
# features needed for data understanding.
# The column is parsed once and both parts are read through the .dt accessor,
# instead of running strptime twice for every row.
_outcome_timestamps = pd.to_datetime(data['DateTime'], format='%Y-%m-%d %H:%M:%S')
data['year'] = _outcome_timestamps.dt.year
data['month'] = _outcome_timestamps.dt.month


## transform SexuponOutcome

In [10]:
# Tally the raw sex/sterilisation labels (missing values included) before transforming them.
data['SexuponOutcome'].value_counts(dropna=False)

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
NaN                 1
Name: SexuponOutcome, dtype: int64

In [11]:
def transform_sex(sex_string, neutralized=False):
    '''Split a raw SexuponOutcome value into the sex or the sterilisation status.

    Parameters
    ----------
    sex_string : str or missing
        Text such as 'Neutered Male', 'Spayed Female' or 'Intact Male'.
        Missing values (np.nan, float('nan'), None) are accepted.
    neutralized : bool, default False
        If False, return the sex (the second token). If True, return whether
        the animal was neutered/spayed, i.e. whether the first token is
        anything other than 'Intact'.

    Returns
    -------
    str, bool or float
        The sex token, the sterilisation flag, or np.nan when the value is
        missing, equal to 'Unknown', or has fewer than two tokens.
    '''
    # pd.isna also catches None and NaN objects that are not the np.nan
    # singleton; an identity test (`is np.nan`) would let those fall through
    # to .split() and crash.
    if pd.isna(sex_string) or sex_string == 'Unknown':
        return np.nan

    tokens = sex_string.split()
    if len(tokens) < 2:
        # A lone word carries no '<status> <sex>' pair to split.
        return np.nan

    status, sex = tokens[0], tokens[1]
    if neutralized:
        return status != 'Intact'
    return sex

In [12]:
# Derive the sterilisation flag first: it needs the raw two-word labels,
# which the second assignment replaces with the bare sex.
data['neutralized'] = data['SexuponOutcome'].apply(transform_sex, neutralized=True)
data['SexuponOutcome'] = data['SexuponOutcome'].apply(transform_sex)

In [13]:
# Check the result: only Male/Female should remain, with the former
# 'Unknown' rows and the missing value counted together as NaN.
data['SexuponOutcome'].value_counts(dropna=False)

Male      13304
Female    12331
NaN        1094
Name: SexuponOutcome, dtype: int64

In [14]:
# Check the new flag: the NaN count should match the one of the sex column.
data['neutralized'].value_counts(dropna=False)

True     18599
False     7036
NaN       1094
Name: neutralized, dtype: int64

## save data with these features

In [15]:
# Look at ten random rows once more, now with the derived columns in place.
data.sample(n=10)

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,year,month,neutralized
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A666174,,2013-10-28 17:18:00,Transfer,Partner,Dog,Male,0.333333,Chihuahua Shorthair Mix,Tricolor,2013,10,False
A699577,Joy,2015-05-10 14:46:00,Adoption,,Dog,Female,1.0,Pit Bull Mix,Blue/White,2015,5,True
A675054,,2014-04-07 17:22:00,Adoption,,Cat,Female,0.583333,Domestic Shorthair Mix,Blue,2014,4,True
A667532,Buzz,2013-11-21 14:56:00,Euthanasia,Aggressive,Dog,Male,2.0,Labrador Retriever/Pit Bull,Black,2013,11,False
A714168,,2015-10-19 12:06:00,Euthanasia,Suffering,Dog,Male,0.5,Labrador Retriever Mix,Black/White,2015,10,False
A690386,,2014-10-20 12:23:00,Euthanasia,Suffering,Cat,Male,0.333333,Domestic Medium Hair Mix,Brown Tabby/White,2014,10,False
A671155,Tonka,2014-02-13 16:42:00,Transfer,Partner,Dog,Male,2.0,Staffordshire Mix,White,2014,2,True
A676993,,2014-04-17 19:23:00,Transfer,Partner,Cat,Male,0.057692,Domestic Shorthair Mix,Black/White,2014,4,False
A669241,Chipper,2013-12-18 11:55:00,Return_to_owner,,Dog,Male,3.0,Chihuahua Shorthair Mix,White/Chocolate,2013,12,True
A680183,Coyote,2014-06-28 17:58:00,Adoption,,Dog,Male,3.0,Chihuahua Longhair Mix,Black/White,2014,6,True


In [16]:
#save data
#data.to_csv('data/transformed_data.csv',index_label='AnimalID')

## transform to numeric features

Transform the non-numeric features into numeric ones for modeling.

In [18]:
# Start the numeric data set from a copy without the columns that are not carried over.
data_num = data.drop(columns=['Name', 'DateTime', 'OutcomeSubtype', 'Breed', 'Color'])

In [21]:
# First five rows before the encoding step.
data_num.head(n=5)

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,year,month,neutralized
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A671945,Return_to_owner,Dog,Male,1.0,2014,2,True
A656520,Euthanasia,Cat,Female,1.0,2013,10,True
A686464,Adoption,Dog,Male,2.0,2015,1,True
A683430,Transfer,Cat,Male,0.057692,2014,7,False
A667013,Transfer,Dog,Male,2.0,2013,11,True


In [22]:
# Encode the remaining categorical columns as numbers.
# Every column is rebuilt from `data`, which this cell does not modify, so
# re-running the cell gives the same result instead of re-encoding its own output.

# Dog -> 1, anything else -> 0.
data_num['AnimalType'] = (data['AnimalType'] == 'Dog').astype(int)

# Female -> 1, Male -> 0. A mapping keeps unknown sex as NaN rather than
# silently encoding it as 0 (Male); the column is therefore float, in line
# with `neutralized`, which also keeps NaN for those rows.
data_num['SexuponOutcome'] = data['SexuponOutcome'].map({'Female': 1.0, 'Male': 0.0})

# True -> 1.0, False -> 0.0, NaN stays NaN.
data_num['neutralized'] = data['neutralized'].astype(float)

In [23]:
# First five rows after the encoding step.
data_num.head(n=5)

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,year,month,neutralized
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A671945,Return_to_owner,1,0,1.0,2014,2,1.0
A656520,Euthanasia,0,1,1.0,2013,10,1.0
A686464,Adoption,1,0,2.0,2015,1,1.0
A683430,Transfer,0,0,0.057692,2014,7,0.0
A667013,Transfer,1,0,2.0,2013,11,1.0


In [25]:
#save data
#data_num.to_csv('data/transformed_data_num.csv',index_label='AnimalID')