In [97]:
import pandas as pd
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
import seaborn as sns
import re

In [98]:
# Read Excel Files
puppy_info = pd.read_excel('PuppyInfotest.xls')
puppy_trainer_outcome = pd.read_excel('PuppyTrainerOutcome.xlsx')
trainer_info = pd.read_excel('TrainerInfo.xlsx')

In [99]:
print(len(puppy_info), len(puppy_trainer_outcome), len(trainer_info))

4782 28063 1688


## Processing Data

The first step in preparing our data for our machine learning algorithm is processing. We will start by cleaning up the data for a few of the important columns.

In [100]:
## Computing % of missing values of puppy_info features

In [101]:
(1.0 - puppy_info.count()/len(puppy_info))*100.0

Breed                        0.041824
Color                        0.062735
Sex                          0.104559
Health                       0.000000
StoolFirm                    0.000000
EnergyLevel                  0.000000
EliminationInCrate           0.000000
QuietInCrate                 0.000000
RespondsToCommandKennel      0.000000
NoInappropriateChewing       0.000000
Housemanners                 0.000000
LeftUnattended               0.000000
EliminationInHouse           0.000000
PlaybitePeople               0.000000
StealsFood                   0.000000
OnFurniture                  0.000000
BarksExcessively             0.000000
RaidsGarbage                 0.000000
CounterSurfingJumpOnDoors    0.000000
JumpOnPeople                 0.000000
FriendlyWAnimals             0.000000
GoodWKids                    0.000000
GoodWStrangers               0.000000
WalksWellOnLeash             0.000000
KnowCommandGetBusy           0.000000
EliminatesOnRoute            0.000000
ChasingAnima

In [102]:
## SEX

In [103]:
def process_sex(df):
    target_for_values = {
    'F': ['bitch','remale','f','Female','fem','fema;e','F','    F','Femae','Femail','Femaile','Femal','Femal3','Female','Femalw','femle','FEMALE','female','Girl','ID#2099','girl','n/a','None','own','Unknown','1364 & 655','1112/1329','065 102 601','2052','2235','11796','1972','1677','1649','1590','1395','1070','219','0','696','1018','ID# 2099','femal','femalw','Famale','femaile','femail']}

    for k, v in target_for_values.items():
        df.loc[df.Sex.isin(v), 'Sex'] = k

    target_for_values = {
    'M': ['Male','1110','1231','1627','1644','1766','1870','2019','??','1JJ11','boy','Crate from Val and Jim Hazlin','don\'t have one.','M - neutered','maie','Mail','Maile','Make','make','Male - neutered','male (neutered)','"Male, neutered"','Male1832','mine doesn\'t have a number?','N/A','NA','Neutered Male','new crate','none','own crate','Weren\'t given a crate','m','male','MALE','Male', 'Male','neutered mail','mail','Male, neutered',' Neutered Male']}

    for k, v in target_for_values.items():
        df.loc[df.Sex.isin(v), 'Sex'] = k

process_sex(puppy_info)
print(puppy_info['Sex'].fillna('M', inplace = True))
        
print(puppy_info['Sex'].value_counts())
print(puppy_info['Sex'].unique())

puppy_info['Sex']=puppy_info['Sex'].map({'F':0, 'M':1, 0:0, 1:1}).astype(int)

None
M    2400
F    2382
Name: Sex, dtype: int64
['F' 'M']


## Attends Classes


In [104]:
print(puppy_info['AttendsClasses'].value_counts())

# Replace missing values with 4.0 (mean is 796 which is close to 4.0
# category)
puppy_info['AttendsClasses'].fillna(4.0,inplace = True)

5.0    3393
4.0     771
0.0     417
3.0     135
1.0      41
2.0      23
Name: AttendsClasses, dtype: int64


## Exercise Amount

In [105]:
# Extract the first number and process it into minute values
def getFirstNumber(string):
    find_num = re.search(r'\d+', string)
    if(find_num == None):
        # Process words into numbers
        if "one" in string or "an hour" in string:
            return 60
        elif "two" in string:
            return 120
        elif "three" in string:
            return 180
        elif "hour and half" in string:
            return 90
        elif "twenty" in string:
            return 20
        elif "forty" in string:
            return 45
        elif "hour" in string and not "hours" in string:
            return 60
        return 0
    
    # Process numbers into minute values
    num = int(find_num.group())
    if int(num < 10):
        return num*60
    return num

# Process minute values to most important/common times
def averageTime(num, replaceZeroValue = 60):
    if num == 0:
        return replaceZeroValue
    
    time_classes = [0, 20, 40, 60, 90, 120, 180, 440]
    for time_val in time_classes:
        if time_val >= num:
            return time_val
    
    return 440

def process_exercise_amt(puppy_info):
    puppy_exercise_amt = puppy_info["ExericeAmount"].map(lambda x: str(x).lower())
    puppy_exercise_processed = list(map(lambda x: averageTime(int(getFirstNumber(str(x)))), puppy_exercise_amt))

    puppy_info["ExericeAmount"] = puppy_exercise_processed

process_exercise_amt(puppy_info)
print("Distinct time values:", puppy_info["ExericeAmount"].unique())

exercise_amt_counts = Counter(puppy_info["ExericeAmount"])
exercise_amt_counts = list(map(lambda x: (x, exercise_amt_counts[x]), exercise_amt_counts))
exercise_amt_counts.sort()
print("Counts:", exercise_amt_counts)

Distinct time values: [ 90 120  60 440 180  40  20]
Counts: [(20, 147), (40, 604), (60, 2137), (90, 608), (120, 924), (180, 263), (440, 99)]


## Breed

In [106]:
def breed_mappings(breed):
    if breed == "golden labrador" or breed == "golden lab":
        return "golden labrador"
    elif breed in ["belgian shepherd", "belgen sharpart"]:
        return "belgian shepherd"
    elif "rador" in breed or "lab" in breed or breed[0:2] == "la" or breed in ["lr", "l", "bl", "labrador retriever"]:
        return "labrador retriever"
    elif "german shep" in breed or breed in ["gs", "gds", "greman shepherd", "geman shepherd", "gsd", "gsp", "g.s.", "german shepherd"] or "german sh" in breed or "shep" in breed:
        return "german shepherd"
    elif breed in ["gr", "golden retriever"] or "golden retr" in breed:
        return "golden retriever"
    elif breed in ["unknown", "black", "golden", "lym", "lbm", "ly m", "ged", "gdd", "noble", "nan"]:
        # Replace with the most common dog breed: Labrador retriever
        return "labrador retriever"
        # return "unknown"
    else:
        return breed

def process_breed(puppy_info):
    puppy_info["Breed"] = puppy_info["Breed"].map(lambda x: breed_mappings(str(x).lower()))
    
process_breed(puppy_info)
print(puppy_info["Breed"].unique())

['labrador retriever' 'german shepherd' 'golden labrador'
 'golden retriever' 'belgian shepherd']


In [107]:
breed_counts = Counter(puppy_info["Breed"])
print(breed_counts)

puppy_info['Breed']=puppy_info['Breed'].map({
    'labrador retriever': 1,
    'german shepherd': 2,
    'golden labrador': 3,
    'golden retriever': 4,
    'belgian shepherd': 5,
    '1':1,
    '2':2,
    '3':3,
    '4':4,
    '5':5}).astype(int)

print(puppy_info["Breed"].unique())

Counter({'labrador retriever': 4397, 'german shepherd': 347, 'golden retriever': 33, 'golden labrador': 4, 'belgian shepherd': 1})
[1 2 3 4 5]


## Color

In [108]:
target_for_color = {
    'Sable': ['Coated Sable','Sable','sable'],
    'Yellow': ['blond','Blond/Yellow','Blonde','blondelab lode','Butterscotch','Carmel Yellow','cream','Cream','darkish brown','fox red','Gold','gold','golden','GOLDEN','Golden Yellow','Lab','light tan','light yellow','Light Yellow','red','Red','Red Fox','Rust','Tan','tan','WELLOW','Wheat','white','White','White and yellow','White/Yellow','Y','y','yel','Yel','Yellllow','Yello','YELLO','yello','yelloiw','Yellow','yellow','YELLOW','Yellow - Dark','Yellow (red)','Yellow & White','yellow lab','Yellow with black trim','Yellow/Butterscotch','yellow/cream','Yellow/White','yellow1','yellowf','Light yellow','Yellowf'],
    'Golden': ['camel','golden/red','goldish','honey','Light Golden','Medium Gold','red/gold','reddish gold','warm gold','warm honey','Tan/Gold'],
    'Black/tan': ['B & T','b/t','B&T','B+T','bl and tan','Black & Tan','Black &tan','Black + Tan','Black and ran','Black and tan','Black and tan (?)','Black Brown','black tan','black w/ tan','Black, tan','Black, tan, silver','Black,tan','Black/ Tan','black/brown','Black/Tan','black+ tan','Blk & Tan','Blk and Tan','Blk/Tan','Brown & Black','brown black','Brown-Black','Brown, black','Brown/Black','Brown/Black/Tan','Coated Black','tan and black','Tan/Black','black and tan','Black and Tan','Bicolor (Black & red)','Bicolor (black w/ brown legs)','Black & red','Black and ran','Black and Red','black and white','Tri','Tri color','Brindle','GSD','B/T','b&t','Black / Tan','black & tan','black, tan','Black/tan','blk and tan','Blk/TAn','tan/black','Blk and tan','Black & tan','black/tan',],
    'Black': ['Color', 'B','Bl','Bl','blac','black (and beautiful)','blck,','Blk','Blk.','blsck','back','black','BLACK','blk','BLK','color','lab',' Black','blck','Back','BLK.','BLack','nan']
}

def process_color(puppyinfo):
    # Replace messy color values
    for k, v in target_for_color.items():
        puppyinfo.loc[puppyinfo.Color.isin(v), 'Color'] = k
    
    # Replaced missing values with Black
    puppyinfo['Color'].fillna('Black',inplace = True)

process_color(puppy_info)
print(puppy_info.Color.unique())

puppy_info['Color']=puppy_info['Color'].map({'Black': 1, 'Yellow': 2, 'Black/tan': 3, 'Sable': 4, 'Golden': 5, 1:1, 2:2, 3:3, 4:4, 5:5}).astype(int)


['Black' 'Yellow' 'Black/tan' 'Sable' 'Golden']


# Status Code

In [109]:
def process_status_code(df):
    df.dog_SubStatusCode.replace([23,25,26,27,55,98,121,169],[1,1,1,1,1,1,1,1], inplace=True)
    df.dog_SubStatusCode[df.dog_SubStatusCode != 1] = 0

process_status_code(puppy_trainer_outcome)
puppy_trainer_outcome['dog_SubStatusCode'].unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


array([1, 0])

In [110]:
puppy_info.shape

(4782, 43)

In [111]:
(1.0-puppy_info.count()/len(puppy_info))*100

Breed                        0.000000
Color                        0.000000
Sex                          0.000000
Health                       0.000000
StoolFirm                    0.000000
EnergyLevel                  0.000000
EliminationInCrate           0.000000
QuietInCrate                 0.000000
RespondsToCommandKennel      0.000000
NoInappropriateChewing       0.000000
Housemanners                 0.000000
LeftUnattended               0.000000
EliminationInHouse           0.000000
PlaybitePeople               0.000000
StealsFood                   0.000000
OnFurniture                  0.000000
BarksExcessively             0.000000
RaidsGarbage                 0.000000
CounterSurfingJumpOnDoors    0.000000
JumpOnPeople                 0.000000
FriendlyWAnimals             0.000000
GoodWKids                    0.000000
GoodWStrangers               0.000000
WalksWellOnLeash             0.000000
KnowCommandGetBusy           0.000000
EliminatesOnRoute            0.000000
ChasingAnima

In [112]:
print(puppy_info["NailCutting"].unique())

[  3.   5.   4.   1.   2.   0.  nan]


In [113]:
print(puppy_info['NailCutting'].value_counts())


4.0    1627
5.0    1334
3.0    1026
0.0     312
2.0     292
1.0     190
Name: NailCutting, dtype: int64


In [114]:
puppy_info['NailCutting'].fillna(3.0,inplace = True)

In [115]:
(1.0-puppy_info.count()/len(puppy_info))*100

Breed                        0.0
Color                        0.0
Sex                          0.0
Health                       0.0
StoolFirm                    0.0
EnergyLevel                  0.0
EliminationInCrate           0.0
QuietInCrate                 0.0
RespondsToCommandKennel      0.0
NoInappropriateChewing       0.0
Housemanners                 0.0
LeftUnattended               0.0
EliminationInHouse           0.0
PlaybitePeople               0.0
StealsFood                   0.0
OnFurniture                  0.0
BarksExcessively             0.0
RaidsGarbage                 0.0
CounterSurfingJumpOnDoors    0.0
JumpOnPeople                 0.0
FriendlyWAnimals             0.0
GoodWKids                    0.0
GoodWStrangers               0.0
WalksWellOnLeash             0.0
KnowCommandGetBusy           0.0
EliminatesOnRoute            0.0
ChasingAnimals               0.0
TrafficFear                  0.0
NoiseFear                    0.0
Stairs                       0.0
SitsOnComm

In [116]:
from sklearn.preprocessing import LabelEncoder

In [117]:
lb_make = LabelEncoder()

In [118]:
puppy_info["ExceriseAmount"]=lb_make.fit_transform(puppy_info["ExericeAmount"])

In [119]:
puppy_info["ExceriseAmount"].unique()

array([3, 4, 2, 6, 5, 1, 0])

In [120]:
(1.0-puppy_info.count()/len(puppy_info))*100

Breed                        0.0
Color                        0.0
Sex                          0.0
Health                       0.0
StoolFirm                    0.0
EnergyLevel                  0.0
EliminationInCrate           0.0
QuietInCrate                 0.0
RespondsToCommandKennel      0.0
NoInappropriateChewing       0.0
Housemanners                 0.0
LeftUnattended               0.0
EliminationInHouse           0.0
PlaybitePeople               0.0
StealsFood                   0.0
OnFurniture                  0.0
BarksExcessively             0.0
RaidsGarbage                 0.0
CounterSurfingJumpOnDoors    0.0
JumpOnPeople                 0.0
FriendlyWAnimals             0.0
GoodWKids                    0.0
GoodWStrangers               0.0
WalksWellOnLeash             0.0
KnowCommandGetBusy           0.0
EliminatesOnRoute            0.0
ChasingAnimals               0.0
TrafficFear                  0.0
NoiseFear                    0.0
Stairs                       0.0
SitsOnComm

In [121]:
puppy_info.drop('ExericeAmount', axis=1)

Unnamed: 0,Breed,Color,Sex,Health,StoolFirm,EnergyLevel,EliminationInCrate,QuietInCrate,RespondsToCommandKennel,NoInappropriateChewing,...,ComeOnLeash,ComeOffLeash,CanGivePills,EarCleaning,NailCutting,AttendsClasses,BehavesWellClass,AttendsHomeSwitches,ogr_DogID,ExceriseAmount
0,1,1,0,3,4,4,5,4,3,3,...,3,2,5,3,3.0,3.0,3,3,20446,3
1,1,2,0,4,5,5,5,5,5,4,...,5,5,5,5,5.0,5.0,5,5,20427,4
2,1,2,1,5,5,5,5,5,4,1,...,3,4,1,5,5.0,5.0,5,5,20423,4
3,1,2,0,5,5,4,5,2,5,3,...,5,3,5,5,4.0,5.0,3,5,20421,3
4,1,2,0,5,5,5,5,5,4,1,...,5,5,5,5,1.0,5.0,3,3,20420,2
5,1,1,0,5,4,5,5,4,4,5,...,4,4,5,5,5.0,5.0,5,1,20414,2
6,1,2,0,5,5,5,4,5,5,2,...,4,3,1,4,4.0,4.0,4,1,20412,3
7,1,2,0,5,4,4,5,4,4,3,...,4,4,4,3,4.0,5.0,4,5,20410,2
8,1,2,0,3,5,5,5,4,4,5,...,5,4,1,5,5.0,5.0,5,5,20402,3
9,1,2,0,4,5,5,5,5,4,4,...,3,3,5,5,1.0,5.0,3,1,20400,2


# Merge Data

In [122]:
puppy_info.rename(columns={'ogr_DogID': 'DogID', 'ExericeAmt': 'ExerciseAmount'}, inplace=True)
puppy_trainer_outcome.rename(columns={'dog_DogID': 'DogID'}, inplace=True)

In [123]:
puppy_info = puppy_info.drop_duplicates(['DogID'], keep='first')
puppy_trainer_outcome = puppy_trainer_outcome.drop_duplicates(['DogID'], keep='first')

print(len(puppy_info), len(puppy_trainer_outcome))

2138 12677


In [124]:
puppy_trainer_outcome

Unnamed: 0,DogID,dog_Tattoo,dog_CallName,dog_LitterID,dog_BreederID,dog_DOB,dog_Sex,dog_StatusChangeDate,dog_SubStatusCode,dbc_DogBreedDescription,Dog_EndReasonText,dbcc_ColorDescription,ogr_RelationID,ogr_PersonID,ogr_DateStart,ogr_DateEnd,ogr_DateRefused,Relationship_Description
0,20500,5S315,Sparky,S315,7105.0,2015-10-29,M,2015-10-29,1,Labrador Retriever,No_END,Black,52210,22749,2015-12-30,,,Puppy Raiser/primary
1,20496,1S315,Spirit,S315,7105.0,2015-10-29,F,2015-10-29,1,Labrador Retriever,No_END,Yellow,52212,4909,2015-12-30,,,Adopter of dog
2,20506,6T315,Topher,T315,7105.0,2015-11-06,M,2015-11-06,1,German Shepherd,No_END,Black/tan,52223,14273,2015-12-31,,,Adopter of dog
3,20494,9R315,Raj,R315,7105.0,2015-10-22,M,2015-12-17,0,Labrador Retriever,No_END,Yellow,52203,20181,2015-12-27,,,Puppy Raiser/primary
4,20490,5R315,Rhyme,R315,7105.0,2015-10-22,F,2015-12-18,0,Labrador Retriever,No_END,Yellow,52213,4909,2015-12-30,,,Adopter of dog
5,20491,6R315,Regal,R315,7105.0,2015-10-22,F,2015-12-18,0,Labrador Retriever,No_END,Black,52192,18842,2015-12-20,,,Puppy Raiser/primary
6,20455,2M315,Maggie,M315,7105.0,2015-10-07,F,2015-11-30,0,Labrador Retriever,No_END,Black,52193,20106,2015-12-19,,,Puppy Raiser/primary
7,20480,4P315,Peace,P315,7105.0,2015-10-21,F,2015-12-17,0,Labrador Retriever,No_END,Yellow,52179,26252,2015-12-19,,,Puppy Raiser/primary
8,20488,3R315,Riley,R315,7105.0,2015-10-22,F,2015-12-17,0,Labrador Retriever,No_END,Yellow,52191,21435,2015-12-20,,,Puppy Raiser/primary
9,18413,6K313,Kenner,K313,7105.0,2013-08-16,M,2014-12-16,0,Labrador Retriever,No_END,Black,47324,17477,2013-11-23,2014-07-18 00:00:00,,Puppy Raiser/primary


In [125]:
dog_id_count = Counter(puppy_info['DogID'])
num_repeated_dog_ids = Counter( [dog_id_count[i] for i in dog_id_count] )
print(len(puppy_info), num_repeated_dog_ids)

2138 Counter({1: 2138})


In [126]:
puppy_merged = puppy_info.merge(puppy_trainer_outcome, on=['DogID'], how='inner')

print(len(puppy_info), len(puppy_trainer_outcome), len(puppy_merged))

2138 12677 2110


In [127]:
puppy_merged

Unnamed: 0,Breed,Color,Sex,Health,StoolFirm,EnergyLevel,EliminationInCrate,QuietInCrate,RespondsToCommandKennel,NoInappropriateChewing,...,dog_SubStatusCode,dbc_DogBreedDescription,Dog_EndReasonText,dbcc_ColorDescription,ogr_RelationID,ogr_PersonID,ogr_DateStart,ogr_DateEnd,ogr_DateRefused,Relationship_Description
0,1,1,0,3,4,4,5,4,3,3,...,0,Labrador Retriever,No_END,Black,52109,28298,2015-12-14,,,Puppy Raiser/primary
1,1,2,0,4,5,5,5,5,5,4,...,0,Labrador Retriever,No_END,Yellow,52105,2481,2015-12-12,,,Puppy Raiser/primary
2,1,2,1,5,5,5,5,5,4,1,...,0,Labrador Retriever,No_END,Yellow,52058,28396,2015-12-05,,,Puppy Raiser/primary
3,1,2,0,5,5,4,5,2,5,3,...,0,Labrador Retriever,No_END,Yellow,51994,28143,2015-11-21,,,Puppy Raiser/primary
4,1,2,0,5,5,5,5,5,4,1,...,0,Labrador Retriever,No_END,Yellow,51993,28326,2015-11-21,,,Puppy Raiser/primary
5,1,1,0,5,4,5,5,4,4,5,...,0,Labrador Retriever,No_END,Black,52057,21290,2015-12-05,,,Puppy Raiser/primary
6,1,2,0,5,5,5,4,5,5,2,...,0,Labrador Retriever,No_END,Yellow,51926,22827,2015-11-10,,,Puppy Raiser/primary
7,1,2,0,5,4,4,5,4,4,3,...,0,Labrador Retriever,No_END,Yellow,51927,20468,2015-11-10,,,Puppy Raiser/primary
8,1,2,0,3,5,5,5,4,4,5,...,0,Labrador Retriever,No_END,Yellow,51919,12865,2015-11-09,,,Puppy Raiser/primary
9,1,2,0,4,5,5,5,5,4,4,...,0,Labrador Retriever,No_END,Yellow,52021,27227,2015-11-30,,,Puppy Raiser/primary


In [128]:
puppy_merged.shape

(2110, 61)

In [129]:
puppy_merged.dtypes

Breed                                 int64
Color                                 int64
Sex                                   int64
Health                                int64
StoolFirm                             int64
EnergyLevel                           int64
EliminationInCrate                    int64
QuietInCrate                          int64
RespondsToCommandKennel               int64
NoInappropriateChewing                int64
Housemanners                          int64
LeftUnattended                        int64
EliminationInHouse                    int64
PlaybitePeople                        int64
StealsFood                            int64
OnFurniture                           int64
BarksExcessively                      int64
RaidsGarbage                          int64
CounterSurfingJumpOnDoors             int64
JumpOnPeople                          int64
FriendlyWAnimals                      int64
GoodWKids                             int64
GoodWStrangers                  

In [130]:
puppy_merged.columns.values 

array(['Breed', 'Color', 'Sex', 'Health', 'StoolFirm', 'EnergyLevel',
       'EliminationInCrate', 'QuietInCrate', 'RespondsToCommandKennel',
       'NoInappropriateChewing', 'Housemanners', 'LeftUnattended',
       'EliminationInHouse', 'PlaybitePeople', 'StealsFood', 'OnFurniture',
       'BarksExcessively', 'RaidsGarbage', 'CounterSurfingJumpOnDoors',
       'JumpOnPeople', 'FriendlyWAnimals', 'GoodWKids', 'GoodWStrangers',
       'WalksWellOnLeash', 'KnowCommandGetBusy', 'EliminatesOnRoute',
       'ChasingAnimals', 'TrafficFear', 'NoiseFear', 'Stairs',
       'SitsOnCommand', 'DownOnCommand', 'StaysOnCommand', 'ComeOnLeash',
       'ComeOffLeash', 'CanGivePills', 'EarCleaning', 'NailCutting',
       'AttendsClasses', 'BehavesWellClass', 'AttendsHomeSwitches',
       'DogID', 'ExericeAmount', 'ExceriseAmount', 'dog_Tattoo',
       'dog_CallName', 'dog_LitterID', 'dog_BreederID', 'dog_DOB',
       'dog_Sex', 'dog_StatusChangeDate', 'dog_SubStatusCode',
       'dbc_DogBreedDescriptio

In [131]:
puppy_merged=puppy_merged.drop(['dog_Tattoo',
       'dog_CallName', 'dog_LitterID', 'dog_BreederID', 'dog_DOB',
       'dog_Sex', 'dog_StatusChangeDate',
       'dbc_DogBreedDescription', 'Dog_EndReasonText',
       'dbcc_ColorDescription', 'ogr_RelationID', 'ogr_PersonID',
       'ogr_DateStart', 'ogr_DateEnd', 'ogr_DateRefused',
       'Relationship_Description'],axis=1)

In [132]:
puppy_merged.shape

(2110, 45)

In [133]:
puppy_merged.columns.values 

array(['Breed', 'Color', 'Sex', 'Health', 'StoolFirm', 'EnergyLevel',
       'EliminationInCrate', 'QuietInCrate', 'RespondsToCommandKennel',
       'NoInappropriateChewing', 'Housemanners', 'LeftUnattended',
       'EliminationInHouse', 'PlaybitePeople', 'StealsFood', 'OnFurniture',
       'BarksExcessively', 'RaidsGarbage', 'CounterSurfingJumpOnDoors',
       'JumpOnPeople', 'FriendlyWAnimals', 'GoodWKids', 'GoodWStrangers',
       'WalksWellOnLeash', 'KnowCommandGetBusy', 'EliminatesOnRoute',
       'ChasingAnimals', 'TrafficFear', 'NoiseFear', 'Stairs',
       'SitsOnCommand', 'DownOnCommand', 'StaysOnCommand', 'ComeOnLeash',
       'ComeOffLeash', 'CanGivePills', 'EarCleaning', 'NailCutting',
       'AttendsClasses', 'BehavesWellClass', 'AttendsHomeSwitches',
       'DogID', 'ExericeAmount', 'ExceriseAmount', 'dog_SubStatusCode'], dtype=object)

In [134]:
puppy_merged=puppy_merged.drop(['DogID','ExericeAmount'],axis=1)

In [135]:
puppy_merged.shape

(2110, 43)

In [136]:
puppy_merged.head()

Unnamed: 0,Breed,Color,Sex,Health,StoolFirm,EnergyLevel,EliminationInCrate,QuietInCrate,RespondsToCommandKennel,NoInappropriateChewing,...,ComeOnLeash,ComeOffLeash,CanGivePills,EarCleaning,NailCutting,AttendsClasses,BehavesWellClass,AttendsHomeSwitches,ExceriseAmount,dog_SubStatusCode
0,1,1,0,3,4,4,5,4,3,3,...,3,2,5,3,3.0,3.0,3,3,3,0
1,1,2,0,4,5,5,5,5,5,4,...,5,5,5,5,5.0,5.0,5,5,4,0
2,1,2,1,5,5,5,5,5,4,1,...,3,4,1,5,5.0,5.0,5,5,4,0
3,1,2,0,5,5,4,5,2,5,3,...,5,3,5,5,4.0,5.0,3,5,3,0
4,1,2,0,5,5,5,5,5,4,1,...,5,5,5,5,1.0,5.0,3,3,2,0


In [137]:
puppy_merged.to_csv('puppymerged_num.csv')

In [139]:
target = np.ravel(puppy_merged['dog_SubStatusCode'].values)

In [140]:
features = puppy_merged.drop(['dog_SubStatusCode'], axis=1)

In [141]:
from sklearn.ensemble import RandomForestRegressor


In [143]:
rf = RandomForestRegressor()
rf.fit(features, target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [144]:
names=puppy_merged.columns.values 
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names), 
             reverse=True))

Features sorted by their score:
[(0.045499999999999999, 'ExceriseAmount'), (0.039399999999999998, 'NailCutting'), (0.033399999999999999, 'StaysOnCommand'), (0.032899999999999999, 'AttendsHomeSwitches'), (0.032800000000000003, 'BehavesWellClass'), (0.031099999999999999, 'ChasingAnimals'), (0.030599999999999999, 'JumpOnPeople'), (0.030300000000000001, 'LeftUnattended'), (0.030200000000000001, 'BarksExcessively'), (0.0293, 'PlaybitePeople'), (0.028500000000000001, 'EliminatesOnRoute'), (0.028400000000000002, 'RaidsGarbage'), (0.028199999999999999, 'CanGivePills'), (0.027799999999999998, 'Color'), (0.0269, 'OnFurniture'), (0.026200000000000001, 'EarCleaning'), (0.0258, 'CounterSurfingJumpOnDoors'), (0.025600000000000001, 'WalksWellOnLeash'), (0.024400000000000002, 'ComeOffLeash'), (0.0235, 'NoInappropriateChewing'), (0.0235, 'AttendsClasses'), (0.022599999999999999, 'Stairs'), (0.022499999999999999, 'StealsFood'), (0.022100000000000002, 'KnowCommandGetBusy'), (0.021999999999999999, 'QuietI