In [1]:
import datetime as dt
import numpy as np
import pandas as pd

In [2]:
#make pandas display large datasets without '...'s
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


In [3]:
data = pd.read_csv('data/raw_data.csv', index_col=0)

In [4]:
data.sample(10)  #what does the data look like?

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A702735,Spartacus,2015-05-18 16:41:00,Transfer,Partner,Cat,Intact Male,4 weeks,Domestic Shorthair Mix,Orange Tabby/White
A669381,Nugget,2013-12-21 18:26:00,Return_to_owner,,Dog,Neutered Male,3 years,Golden Retriever Mix,Gold
A669159,Ghost,2013-12-22 18:29:00,Adoption,,Cat,Spayed Female,9 months,Domestic Shorthair Mix,Brown Tabby/White
A691866,,2014-11-25 18:34:00,Transfer,SCRP,Cat,Intact Female,1 year,Domestic Shorthair Mix,White/Blue Tabby
A698544,,2015-03-14 09:00:00,Transfer,SCRP,Cat,Intact Female,1 year,Domestic Shorthair Mix,Brown Tabby
A668218,,2013-12-09 14:34:00,Adoption,,Cat,Spayed Female,2 months,Domestic Shorthair Mix,Black/White
A678172,Bella,2014-05-10 18:18:00,Adoption,,Dog,Spayed Female,1 year,Chihuahua Shorthair Mix,Red
A670213,Izzy,2014-01-12 15:03:00,Transfer,Partner,Dog,Intact Female,2 months,Pointer Mix,Black/Brown
A720857,Princess,2016-02-18 11:50:00,Return_to_owner,,Cat,Spayed Female,2 years,Domestic Shorthair Mix,Cream Tabby/White
A704607,Midnight,2015-06-07 14:58:00,Return_to_owner,,Dog,Spayed Female,15 years,Pembroke Welsh Corgi Mix,Black


## standardize AgeuponOutcome to years

In [5]:
data.AgeuponOutcome.value_counts(dropna=False)  #what are the old values? for comparison with the result

1 year       3969
2 years      3742
2 months     3397
3 years      1823
1 month      1281
3 months     1277
4 years      1071
5 years       992
4 months      888
6 years       670
3 weeks       659
5 months      652
6 months      588
8 years       536
7 years       531
2 weeks       529
10 months     457
10 years      446
8 months      402
4 weeks       334
9 years       288
7 months      288
12 years      234
9 months      224
1 weeks       171
11 months     166
1 week        146
13 years      143
11 years      126
3 days        109
2 days         99
14 years       97
15 years       85
1 day          66
6 days         50
4 days         50
16 years       36
5 days         24
0 years        22
NaN            18
17 years       17
5 weeks        11
18 years       10
19 years        3
20 years        2
Name: AgeuponOutcome, dtype: int64

In [6]:
def transform_to_years(age_string):
    '''takes as input the AgeuponOutcome feature of the raw data and outputs the corresponding number of years'''

    if age_string is np.nan:
        return np.nan

    split_string = age_string.split()

    if split_string[1].strip('s') == 'year':
        return float(split_string[0])

    elif split_string[1].strip('s') == 'month':
        return float(split_string[0]) / 12

    elif split_string[1].strip('s') == 'week':
        return float(split_string[0]) / 52

    elif split_string[1].strip('s') == 'day':
        return float(split_string[0]) / 365

In [7]:
data['AgeuponOutcome'] = data.AgeuponOutcome.apply(transform_to_years)  #apply the function

In [8]:
data.AgeuponOutcome.value_counts(dropna=False)  #looks good

1.000000     3969
2.000000     3742
0.166667     3397
3.000000     1823
0.083333     1281
0.250000     1277
4.000000     1071
5.000000      992
0.333333      888
6.000000      670
0.057692      659
0.416667      652
0.500000      588
8.000000      536
7.000000      531
0.038462      529
0.833333      457
10.000000     446
0.666667      402
0.076923      334
0.019231      317
9.000000      288
0.583333      288
12.000000     234
0.750000      224
0.916667      166
13.000000     143
11.000000     126
0.008219      109
0.005479       99
14.000000      97
15.000000      85
0.002740       66
0.010959       50
0.016438       50
16.000000      36
0.013699       24
0.000000       22
NaN            18
17.000000      17
0.096154       11
18.000000      10
19.000000       3
20.000000       2
Name: AgeuponOutcome, dtype: int64

## transform date column

In [9]:
#for now i just extract the year and month since these are the features we need for data understanding

data['year'] = data.DateTime.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').year)
data['month'] = data.DateTime.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').month)
data['weekday'] = data.DateTime.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').weekday())


## transform SexuponOutcome

In [10]:
data.SexuponOutcome.value_counts(dropna=False)  #what are the old values?

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
NaN                 1
Name: SexuponOutcome, dtype: int64

In [11]:
def transform_sex(sex_string, neutralized=False):
    '''takes as input the SexuponOutcome feature and returns just the sex or if the animal was neutralized'''

    if (sex_string is np.nan) or (sex_string == 'Unknown'):
        return np.nan

    split_string = sex_string.split()

    if neutralized:
        if split_string[0] == 'Intact':
            return False
        else:
            return True
    else:
        return split_string[1]

In [12]:
#apply the function
data['neutralized'] = data.SexuponOutcome.apply(lambda x: transform_sex(x, True))
data['SexuponOutcome'] = data.SexuponOutcome.apply(transform_sex)

In [13]:
data.SexuponOutcome.value_counts(dropna=False)  #looks good

Male      13304
Female    12331
NaN        1094
Name: SexuponOutcome, dtype: int64

In [14]:
data.neutralized.value_counts(dropna=False)  #looks good

True     18599
False     7036
NaN       1094
Name: neutralized, dtype: int64

## Colors
Let's look at the colors next:

In [15]:
print(f"Current number of unique colors: {data.Color.nunique()}")

Current number of unique colors: 366


Since these are a bit too many to handle we will try and reduce the number of colors.
Let's look at what values we are dealing with and how often they occur.

In [16]:
data.Color.value_counts()

Black/White                    2824
Black                          2292
Brown Tabby                    1635
Brown Tabby/White               940
White                           931
Brown/White                     884
Orange Tabby                    841
Tan/White                       773
Tricolor                        752
Blue/White                      702
Black/Tan                       672
White/Black                     643
Brown                           639
Tan                             628
White/Brown                     569
Tortie                          530
Calico                          517
Orange Tabby/White              455
Blue                            450
Brown Brindle/White             450
Black/Brown                     436
Blue Tabby                      433
White/Tan                       389
Red                             337
Torbie                          335
Brown/Black                     333
Red/White                       331
Blue Tabby/White            

In [17]:
data["ColorMix"] = data.Color.str.contains('/')
data.Color=data.Color.str.split('/', expand=True)[0]
data.Color.value_counts()

Black                6422
White                3344
Brown Tabby          2592
Brown                1951
Tan                  1674
Orange Tabby         1299
Blue                 1199
Tricolor              800
Red                   779
Brown Brindle         699
Blue Tabby            678
Tortie                580
Calico                552
Chocolate             448
Torbie                398
Sable                 324
Cream Tabby           277
Buff                  267
Yellow                230
Gray                  227
Cream                 221
Fawn                  209
Lynx Point            183
Blue Merle            165
Seal Point            158
Black Brindle          99
Flame Point            85
Gold                   77
Brown Merle            72
Black Smoke            62
Black Tabby            61
Silver                 53
Red Merle              52
Gray Tabby             51
Blue Tick              44
Orange                 43
Silver Tabby           41
Red Tick               40
Lilac Point 

In [18]:
data.ColorMix.value_counts()

True     13924
False    12805
Name: ColorMix, dtype: int64

In [19]:
print(f"We were able to reduce the number of unique colors to: {data.Color.nunique()}")

We were able to reduce the number of unique colors to: 57


We were able to reduce the number of colors quite a bit while still retaining the information that an animal has a pure 
or mixed color (which is actually almost half of the animals). This will probably suffice for this use case.

## save data with these features

In [20]:
data.sample(10)  #check another time

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,year,month,weekday,neutralized,ColorMix
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A658690,Priscilla,2013-10-19 12:58:00,Adoption,,Dog,Female,1.0,Pit Bull Mix,Black,2013,10,5,True,True
A705938,Baby Bella,2015-12-26 16:54:00,Return_to_owner,,Dog,Female,0.583333,Chihuahua Shorthair Mix,Black,2015,12,5,True,True
A668253,Scarlett,2013-12-13 18:31:00,Adoption,,Dog,Female,12.0,Australian Shepherd Mix,Brown,2013,12,4,True,True
A715557,Dynah,2015-11-18 13:18:00,Adoption,,Cat,Female,0.333333,Domestic Shorthair Mix,Brown Tabby,2015,11,2,True,False
A706697,Toby,2015-09-19 19:02:00,Adoption,,Cat,Male,0.416667,Domestic Shorthair Mix,Orange Tabby,2015,9,5,True,False
A687188,Miss Sally,2014-09-06 18:10:00,Adoption,,Dog,Female,0.166667,Pit Bull Mix,White,2014,9,5,True,False
A685155,Scooby,2014-08-11 15:17:00,Transfer,Partner,Dog,Male,1.0,Dachshund Mix,Sable,2014,8,0,True,True
A717152,,2015-12-03 14:04:00,Transfer,Partner,Dog,Male,0.083333,German Shepherd Mix,Tricolor,2015,12,3,False,False
A679335,Rosie,2014-05-25 15:30:00,Return_to_owner,,Dog,Female,12.0,Chihuahua Shorthair/Miniature Schnauzer,Black,2014,5,6,True,True
A692676,,2014-11-28 15:58:00,Transfer,Partner,Dog,Male,0.333333,Manchester Terrier Mix,Black,2014,11,4,False,True


In [21]:
#save data
#data.to_csv('data/transformed_data.csv',index_label='AnimalID')

## Drop OutcomeSubtype & Transform Breed

In [22]:
data.OutcomeSubtype.value_counts(dropna=False) #a lot of NA values

NaN                    13612
Partner                 7816
Foster                  1800
SCRP                    1599
Suffering               1002
Aggressive               320
Offsite                  165
In Kennel                114
Behavior                  86
Rabies Risk               74
Medical                   66
In Foster                 52
Enroute                    8
Court/Investigation        6
At Vet                     4
In Surgery                 3
Barn                       2
Name: OutcomeSubtype, dtype: int64

In [23]:
data = data.drop(["OutcomeSubtype"], axis = 1)

In [24]:
data["BreedMix"] = data.Breed.str.contains("BreedMix") #creating new boolean column for Mix

In [25]:
data["Breed"] = data['Breed'].str.replace(" Mix", "")

In [26]:
data["Breed"] = data['Breed'].str.replace("/.*", "")

  """Entry point for launching an IPython kernel.


In [27]:
data[data.AnimalType == 'Dog'].Breed.nunique() #reduced unqiue breeds by over 1000

187

In [28]:
data[data.AnimalType == 'Dog'].Breed.value_counts()

Chihuahua Shorthair                   2145
Pit Bull                              2113
Labrador Retriever                    1915
German Shepherd                        826
Australian Cattle Dog                  511
Dachshund                              510
Boxer                                  360
Border Collie                          334
Miniature Poodle                       310
Australian Shepherd                    229
Yorkshire Terrier                      226
Jack Russell Terrier                   222
Miniature Schnauzer                    220
Beagle                                 220
Catahoula                              218
Rat Terrier                            215
Siberian Husky                         194
Rottweiler                             181
Shih Tzu                               176
Chihuahua Longhair                     168
Cairn Terrier                          142
Pointer                                139
Great Pyrenees                         131
American Bu

In [29]:
data[data.AnimalType == 'Cat'].Breed.nunique() #still a reduction by about 50%, but initial values were much lower anyway

33

In [30]:
data[data.AnimalType == 'Cat'].Breed.value_counts()

Domestic Shorthair      8958
Domestic Medium Hair     883
Domestic Longhair        547
Siamese                  426
Snowshoe                  78
Manx                      48
Maine Coon                47
Russian Blue              35
Himalayan                 18
Persian                   14
Ragdoll                   12
American Shorthair         9
Angora                     7
Japanese Bobtail           6
Balinese                   5
Bombay                     5
Bengal                     5
British Shorthair          4
Tonkinese                  4
Pixiebob Shorthair         3
Devon Rex                  2
Javanese                   2
Exotic Shorthair           2
Sphynx                     2
Abyssinian                 2
Turkish Van                2
Cymric                     2
Cornish Rex                1
Burmese                    1
Ocicat                     1
Munchkin Longhair          1
Norwegian Forest Cat       1
Havana Brown               1
Name: Breed, dtype: int64

In [31]:
#data.to_csv('data/transformed_data.csv',index_label='AnimalID')

## transform to numeric features

transform non-numeric features for modeling

In [32]:
data_num = data.drop(labels=['Name','DateTime','Breed','Color'],axis=1)

In [33]:
data_num.head()

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,year,month,weekday,neutralized,ColorMix,BreedMix
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A671945,Return_to_owner,Dog,Male,1.0,2014,2,2,True,True,False
A656520,Euthanasia,Cat,Female,1.0,2013,10,6,True,False,False
A686464,Adoption,Dog,Male,2.0,2015,1,5,True,True,False
A683430,Transfer,Cat,Male,0.057692,2014,7,4,False,False,False
A667013,Transfer,Dog,Male,2.0,2013,11,4,True,False,False


In [34]:
data_num['AnimalType'] = data_num.AnimalType.apply(lambda x: 1 if x == 'Dog' else 0)

data_num['SexuponOutcome'] = data_num.SexuponOutcome.apply(lambda x: 1 if x == 'Female' else 0)

data_num['BreedMix'] = data_num.SexuponOutcome.apply(lambda x: 1 if x == 'True' else 0)

data_num['ColorMix'] = data_num.SexuponOutcome.apply(lambda x: 1 if x == 'True' else 0)

data_num['neutralized'] = data_num.neutralized.apply(float)

In [35]:
data_num.head()

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,year,month,weekday,neutralized,ColorMix,BreedMix
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A671945,Return_to_owner,1,0,1.0,2014,2,2,1.0,0,0
A656520,Euthanasia,0,1,1.0,2013,10,6,1.0,0,0
A686464,Adoption,1,0,2.0,2015,1,5,1.0,0,0
A683430,Transfer,0,0,0.057692,2014,7,4,0.0,0,0
A667013,Transfer,1,0,2.0,2013,11,4,1.0,0,0


In [36]:
#save data
# data_num.to_csv('data/transformed_data_num.csv',index_label='AnimalID')