In [1]:
import datetime as dt

import numpy as np
import pandas as pd

In [2]:
#lift the pandas display limits so large frames and series are shown in full instead of being elided with '...'
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None


In [3]:
#load the raw data; the first CSV column becomes the index
data = pd.read_csv(filepath_or_buffer='data/raw_data.csv', index_col=0)

In [4]:
data.sample(10, random_state=0)  #what does the data look like? (fixed seed so the preview is reproducible on re-run)

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A691402,Bobbie,2014-11-07 18:24:00,Adoption,,Dog,Neutered Male,2 years,Miniature Schnauzer Mix,Gray
A711660,Dinah,2015-12-12 17:40:00,Adoption,Foster,Cat,Spayed Female,4 months,Domestic Shorthair Mix,Black
A704805,,2015-06-09 15:54:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
A709714,,2015-08-12 14:33:00,Euthanasia,Suffering,Cat,Unknown,1 week,Domestic Shorthair Mix,Black Tabby
A701680,Bruno,2015-05-10 16:08:00,Transfer,Partner,Dog,Intact Male,2 months,Chihuahua Shorthair,Brown
A692273,Molly,2014-12-29 11:43:00,Adoption,Foster,Dog,Spayed Female,4 years,Dachshund Mix,Red
A710965,,2015-08-31 11:17:00,Transfer,Partner,Cat,Intact Female,3 weeks,Domestic Longhair Mix,Tricolor/Calico
A657228,Tiger,2013-10-11 11:34:00,Transfer,Partner,Dog,Neutered Male,8 years,Pit Bull Mix,Brown Brindle/White
A681829,,2014-06-20 18:28:00,Transfer,Partner,Cat,Intact Female,1 month,Domestic Shorthair Mix,Black
A671563,Monty,2014-04-19 11:27:00,Adoption,Foster,Cat,Neutered Male,8 months,Domestic Shorthair Mix,Brown Tabby


## standardize AgeuponOutcome to years

In [5]:
data['AgeuponOutcome'].value_counts(dropna=False)  #raw value distribution (missing values included), kept for comparison with the transformed result

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
12 years      234
9 months      224
1 weeks       171
11 months     166
1 week        146
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
NaN            18
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [6]:
def transform_to_years(age_string):
    '''Convert a raw AgeuponOutcome value such as "3 weeks" into a number of years.

    Parameters
    ----------
    age_string : str or missing value
        Text of the form "<number> <unit>", where the unit is year(s),
        month(s), week(s) or day(s). Any tokens after the unit are ignored.

    Returns
    -------
    float
        The age in years (months / 12, weeks / 52, days / 365), or np.nan when
        the value is missing, has fewer than two tokens, or uses an
        unrecognised unit.

    Raises
    ------
    ValueError
        If the first token cannot be converted to a float.
    '''
    # pd.isna also catches None, float('nan') and pd.NA; an identity check
    # against np.nan misses those and the later .split() would then crash
    if pd.isna(age_string):
        return np.nan

    tokens = age_string.split()
    if len(tokens) < 2:
        # no unit present, so the age cannot be interpreted
        return np.nan

    amount, unit = tokens[0], tokens[1]

    # how many of each unit make up one year
    units_per_year = {'year': 1, 'month': 12, 'week': 52, 'day': 365}

    # rstrip removes only the trailing plural 's' (strip would also eat a leading one)
    divisor = units_per_year.get(unit.rstrip('s'))
    if divisor is None:
        # unknown unit: report it as missing explicitly instead of an implicit None
        return np.nan

    return float(amount) / divisor

In [7]:
#replace the age strings with their value in years (overwrites the column in place, so run this cell only once per load)
data['AgeuponOutcome'] = data['AgeuponOutcome'].apply(transform_to_years)

In [8]:
data['AgeuponOutcome'].value_counts(dropna=False)  #distribution after the conversion, to compare with the raw counts

1.000000     3969
2.000000     3742
0.166667     3397
3.000000     1823
0.083333     1281
0.250000     1277
4.000000     1071
5.000000      992
0.333333      888
6.000000      670
0.057692      659
0.416667      652
0.500000      588
8.000000      536
7.000000      531
0.038462      529
0.833333      457
10.000000     446
0.666667      402
0.076923      334
0.019231      317
9.000000      288
0.583333      288
12.000000     234
0.750000      224
0.916667      166
13.000000     143
11.000000     126
0.008219      109
0.005479       99
14.000000      97
15.000000      85
0.002740       66
0.010959       50
0.016438       50
16.000000      36
0.013699       24
0.000000       22
NaN            18
17.000000      17
0.096154       11
18.000000      10
19.000000       3
20.000000       2
Name: AgeuponOutcome, dtype: int64

## transform date column

In [9]:
#for now only the year and month are extracted, since these are the features needed for data understanding

#parse the column once with vectorized pandas code instead of calling strptime twice per row;
#a missing DateTime would become NaT (year/month NaN) rather than raising inside the lambda
outcome_timestamps = pd.to_datetime(data['DateTime'], format='%Y-%m-%d %H:%M:%S')
data['year'] = outcome_timestamps.dt.year
data['month'] = outcome_timestamps.dt.month


## transform SexuponOutcome

In [10]:
data['SexuponOutcome'].value_counts(dropna=False)  #raw categories (missing values included) before the transformation

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
NaN                 1
Name: SexuponOutcome, dtype: int64

In [11]:
def transform_sex(sex_string, neutralized=False):
    '''Split a raw SexuponOutcome value such as "Neutered Male" into one of its two parts.

    Parameters
    ----------
    sex_string : str or missing value
        Text of the form "<status> <sex>", or the literal 'Unknown'.
    neutralized : bool, default False
        If False, return the sex (the second word). If True, return whether
        the animal was neutered/spayed, i.e. whether the first word is
        anything other than 'Intact'.

    Returns
    -------
    str, bool or float
        The sex string, or True/False for the neutered flag, or np.nan when
        the value is missing, equals 'Unknown', or has fewer than two words.
    '''
    # pd.isna also catches None, float('nan') and pd.NA; an identity check
    # against np.nan misses those and the later .split() would then crash
    if pd.isna(sex_string) or sex_string == 'Unknown':
        return np.nan

    words = sex_string.split()
    if len(words) < 2:
        # status and sex cannot both be read from a single word
        return np.nan

    status, sex = words[0], words[1]

    if neutralized:
        return status != 'Intact'
    return sex

In [12]:
#derive both columns from the raw strings; the neutralized flag has to be computed first,
#because the second statement overwrites SexuponOutcome with the bare sex
data['neutralized'] = data['SexuponOutcome'].apply(transform_sex, neutralized=True)
data['SexuponOutcome'] = data['SexuponOutcome'].apply(transform_sex)

In [13]:
data['SexuponOutcome'].value_counts(dropna=False)  #only Male / Female / NaN should remain

Male      13304
Female    12331
NaN        1094
Name: SexuponOutcome, dtype: int64

In [14]:
data['neutralized'].value_counts(dropna=False)  #the NaN count should match the one of SexuponOutcome

True     18599
False     7036
NaN       1094
Name: neutralized, dtype: int64

## finish

In [15]:
data.sample(10, random_state=1)  #check another time (fixed seed so the preview is reproducible on re-run)

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,year,month,neutralized
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A575839,Sushi,2014-07-18 18:34:00,Adoption,,Cat,Female,4.0,Domestic Shorthair Mix,Tortie,2014,7,True
A695335,,2015-01-15 09:00:00,Transfer,SCRP,Cat,,1.0,Domestic Shorthair Mix,Brown Tabby,2015,1,
A689218,Nelson,2014-11-25 12:50:00,Transfer,Partner,Dog,Male,1.0,Catahoula Mix,White/Tan,2014,11,True
A676668,,2014-04-13 17:36:00,Transfer,Partner,Cat,Male,0.057692,Domestic Shorthair Mix,Brown Tabby/White,2014,4,False
A693195,Leia,2014-12-07 13:08:00,Adoption,,Dog,Female,5.0,Miniature Poodle Mix,White,2014,12,False
A701477,Lucy,2015-04-30 18:34:00,Return_to_owner,,Dog,Female,7.0,Border Collie,Black/White,2015,4,True
A688151,Thor,2014-09-24 14:20:00,Adoption,,Dog,Male,5.0,German Shepherd,Black/Brown,2014,9,True
A714763,Lucy,2015-12-16 14:38:00,Transfer,Partner,Dog,Female,2.0,Pit Bull Mix,Fawn/White,2015,12,True
A692891,,2014-11-27 13:51:00,Euthanasia,Suffering,Dog,Female,0.25,Pit Bull Mix,Blue/White,2014,11,False
A671226,Chloe,2014-01-27 16:07:00,Transfer,Partner,Dog,Female,2.0,Dachshund/Beagle,Black/White,2014,1,True


In [16]:
#save data (the export below is commented out; uncomment it to write the transformed CSV)
#data.to_csv('data/transformed_data.csv',index_label='AnimalID')