In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
import copy
import datetime as dt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss


data_train = pd.read_csv('/Users/Hooshmand/Desktop/Kaggle_Animal_shelter/train.csv')
#rename data_train 'AnimalID' column into 'ID' to match the test set
data_train = data_train.rename(columns={'AnimalID':'ID'})
data_test = pd.read_csv('/Users/Hooshmand/Desktop/Kaggle_Animal_shelter/test.csv')

In [63]:
def animal_feature_transform(dataframe, has_outcome=True):
    
    data_train = copy.deepcopy(dataframe)
    
    # AnimalType: 1 - Cat, 0 - Dog
    NA = np.nan
    data_train['AnimalType'] = data_train['AnimalType'].map(lambda x: 1 if x == 'Cat' else (0 if x == 'Dog' else NA))
    
    #split SexuponOutcome into 2 columns: Sex (Male/Female), and S/N (Spayed/Neutered), then delete SexuponOutcome column
    data_train['Sex'] = data_train['SexuponOutcome'].str.split().str[-1]
    data_train['S/N'] = data_train['SexuponOutcome'].str.split().str[0]
    data_train = data_train.drop('SexuponOutcome', 1)
    
    #Sex: 1 - Male, 0 - Female, NA - if missing
    data_train['Sex'] = data_train['Sex'].map(lambda x: 1 if x == 'Male' else (0 if x == 'Female' else NA))
    
    # S/N: 1 - if spayed/neutered, 0 - if not, NA if missing
    data_train['S/N'] = data_train['S/N'].map(lambda x: 1 if ((x == 'Neutered') | (x == 'Spayed')) else (0 if x == 'Intact' else NA))
    
    # split DateTime into 3 columns: Year, Month, Day
    data_train['DateTime'] = data_train['DateTime'].str.split().str[0]
    data_train['Year'] = data_train['DateTime'].str.split('-').str[0]
    data_train['Month'] = data_train['DateTime'].str.split('-').str[1]
    #data_train['Day'] = data_train['DateTime'].str.split('-').str[2]
    data_train['Hour'] = data_train['DateTime'].map(lambda x: 12 if len(x) == 10 else dt.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    data_train['Day'] = data_train['DateTime'].map(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").weekday() if len(x) == 10 else dt.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday())
    data_train = data_train.drop('DateTime', 1)
    
    # create column of ages: weeks, months, years
    data_train['AgeUnit'] = data_train['AgeuponOutcome'].str.split().str[-1]
    data_train['AgeUnit'] = data_train['AgeUnit'].map(lambda x: 52 if (x == 'year' or x == 'years') else (4 if (x == 'month' or x == 'months') else 1))
    # create a column of numerical age
    data_train['AgeCoeff'] = data_train['AgeuponOutcome'].str[0]
    # convert the column above into numeric
    data_train['AgeCoeff'] = data_train['AgeCoeff'].convert_objects(convert_numeric=True)
    # create a column of tuples
    data_train['Age']= data_train['AgeCoeff'] * data_train['AgeUnit']
    
    #Color
    ##########################################################################################################
    data_train['Black/White'] = data_train['Color'].map(lambda x:1 if ('Black/White' in x or 'White/Black' in x) else 0)
    data_train['Tricolor'] = data_train['Color'].map(lambda x:1 if ('Calico' in x or 'Tricolor' in x or 'Tortie/White' in x or 'Torbie/White' in x) else 0)
    data_train['BW Tbb/BW Tbb White'] = data_train['Color'].map(lambda x:1 if ('Brown Tabby' in x or 'Brown Tabby/White' in x or 'White/Brown Tabby' in x) else 0)

    data_train['White'] = data_train['Color'].map(lambda x:1 if (x=='White') else 0)
    data_train['Black'] = data_train['Color'].map(lambda x:1 if (x=='Black') else 0)
    data_train['Org Tbb/Org Tbb White'] = data_train['Color'].map(lambda x:1 if (x=='Orange Tabby' or x=='Orange Tabby/White' or x=='Cream Tabby') else 0)
    
    data_train['Brown'] = data_train['Color'].map(lambda x:1 if (x=='Brown' or x=='Brown Brindle' or x=='Chocolate') else 0)
    data_train['Tortoise/Blk BW'] = data_train['Color'].map(lambda x:1 if (x=='Black/Tan' or x=='Red/Black' or x == 'Tortie' or x == 'Torbie' or x=='Black/Brown' or x=='Brown/Black' or x=='Sable' or x=='Brown Brindle/White') else 0)
    data_train['Red/Red White'] = data_train['Color'].map(lambda x:1 if ('Red' in x or 'Red/White' in x) else 0)
    
    data_train['Tan'] = data_train['Color'].map(lambda x:1 if (x=='Tan' or x=='Gold' or x=='Cream' or x=='Buff' or x=='Buff/Tan' or x=='Yellow' or x=='Tan/Cream' or x=='Tan/Tan') else 0)
    data_train['BW White/Seal Pt'] = data_train['Color'].map(lambda x:1 if ('White/Brown' in x or 'Brown/White' in x or 'Seal Point' in x or 'Lynx Point' in x or 'Chocolate/White' in x or 'White/Chocolate' in x or 'White/Brown Brindle' in x) else 0)
    data_train['Blue'] = data_train['Color'].map(lambda x:1 if (x=='Blue' or x=='Grey' or x=='Blue Merle') else 0)
    
    data_train['Blue Tbb/Blue Tbb White'] = data_train['Color'].map(lambda x:1 if (x=='Blue Tabby' or x=='Blue Tabby/White' or x=='Blue/White' or x=='White/Blue' or x=='White/Gray') else 0)
    data_train['BW/White'] = data_train['Color'].map(lambda x:1 if (x=='Brown/White' or x=='White/Brown' or x=='White/Brown Brindle') else 0)
    data_train['White/Tan'] = data_train['Color'].map(lambda x:1 if (x=='White/Tan' or x=='Tan/White') else 0)

    #Breed
    ##########################################################################################################
    data_train['Domes Sh/h'] = data_train['Breed'].map(lambda x:1 if (x == 'Domestic Shorthair Mix' or 'Manx' in x) else 0)
    data_train['Domes M/h'] = data_train['Breed'].map(lambda x:1 if 'Domestic Medium' in x else 0)
    data_train['Domes L/h'] = data_train['Breed'].map(lambda x:1 if 'Domestic Longhair' in x else 0)
    data_train['Siamese/Snowshow'] = data_train['Breed'].map(lambda x:1 if ('Siamese' in x or 'Snowshow' in x) else 0)
    
    # dangerous, territorial, dominant
    data_train['Dangerous_dog'] = data_train['Breed'].map(lambda x:1 if ('Staffordshire Terrier' in x or 'Pit Bull' in x or 'Pitbull' in x or 'Dane' in x or 'Boxer' in x or 'Doberman' in x or 'Rottweiler' in x or 'Bull Terrier' in x or 'Bulldog'in x or 'Mastiff' in x or 'Dogue' in x) else 0)
    
    # hyper, energetic, destructive, working, hunting
    data_train['Destructive'] = data_train['Breed'].map(lambda x:1 if ('Australian Cattle' in x or 'Australian Shephard' in x or 'Greyhound' in x or 'German Shepherd' in x or 'Dachshund' in x or 'Jack Russell' in x or 'Beagle' in x or 'Spaniel'in x or 'Cairn Terrier' in x or 'Pointer' in x or 'Plott Hound' in x or 'Great Pyrenees' in x or 'Collie' in x or 'Corgi' in x) else 0)

    # dogs that love children
    data_train['Loving_dog'] = data_train['Breed'].map(lambda x:1 if ('Retriever' in x or 'Labrador' in x or 'Miniature Poodle' in x or 'Poodle' in x) else 0)
    
    # small lap breeds
    data_train['Small'] = data_train['Breed'].map(lambda x:1 if ('Chihuahua' in x or 'Pug' in x or 'Shih Tzu' in x or 'Schnauzer' in x or 'Maltese' in x or 'Lhasa Apso' in x) else 0)

    ##########################################################################################################
    
    # Convert 'Name' to 1/0

    data_train['Name']= data_train['Name'].fillna(0)
    data_train['Name'] = data_train['Name'].map(lambda x: 0 if x==0 else 1)

    
    data_train.fillna(data_train.mean(), inplace=True)
    #dropped HOUR
    X_df = data_train[['AnimalType','Name', 'Sex','S/N','Age','Year','Month', 'Day', 'Black/White', 'Tricolor', 'BW Tbb/BW Tbb White', 'White', 'Black', 'Org Tbb/Org Tbb White', 'Brown', 'Tortoise/Blk BW', 'Red/Red White', 'Tan', 'BW White/Seal Pt', 'Blue', 'Blue Tbb/Blue Tbb White', 'BW/White', 'White/Tan','Domes Sh/h','Domes M/h', 'Domes L/h', 'Siamese/Snowshow', 'Dangerous_dog', 'Destructive', 'Loving_dog', 'Small']]  
    X = X_df.as_matrix()
    
    if has_outcome:
        unique_label = sorted(list(set(np.asarray(data_train['OutcomeType']))))
        data_train['OutcomeType'] = data_train['OutcomeType'].map(lambda x: unique_label.index(x))
        y = np.asarray(data_train['OutcomeType'])
        
        return X, y, X_df
    else:
        return X


In [74]:
def create_submission(y):
    submission_file = open('/Users/Hooshmand/Desktop/Kaggle_Animal_shelter/sub.csv', 'w')
    #labels = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    submission_file.write('ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer\n')
    for i, prob in enumerate(y):
        entry = [i+1] + list(np.around(prob, decimals=3))
        #entry[label + 1] = 1
        submission_file.write(str(entry).replace('[', '').replace(']', '')+'\n')
    submission_file.close()

In [65]:
X_train, y_train, X_df = animal_feature_transform(data_train)
X_test = animal_feature_transform(data_test, has_outcome=False)

In [66]:
######################################################################################################################
# Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [67]:
clf = GradientBoostingClassifier(n_estimators=254, random_state=241, learning_rate=0.2)    
clf.fit(X_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.2, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=254,
              presort='auto', random_state=241, subsample=1.0, verbose=0,
              warm_start=False)

In [68]:
clf.fit(X_train, y_train).score(X_train, y_train)
# 0.67746642223801867 with hour
# 0.67746642223801867

0.67746642223801867

In [69]:
clf.feature_importances_
#clf.feature_importances_>0.05

array([ 0.02251449,  0.04309179,  0.02055669,  0.05970022,  0.21806364,
        0.06642588,  0.14581638,  0.105837  ,  0.01126751,  0.01203369,
        0.00960881,  0.01429326,  0.01014265,  0.00574786,  0.01686928,
        0.01378339,  0.00928846,  0.01393244,  0.01529781,  0.00576774,
        0.0089562 ,  0.01092484,  0.00997178,  0.01818138,  0.00808242,
        0.03012908,  0.00783924,  0.02771053,  0.01927857,  0.0190446 ,
        0.01984237])

In [75]:
y = clf.predict_proba(X_test)
create_submission(y)

In [88]:
######################################################################################################################
#Train-Test split
X_ttrain, X_ttest, y_ttrain, y_ttest = train_test_split(X_train, y_train, test_size = 0.8, random_state = 241)

In [89]:
#Gradient Boosting######################################################################################################################
#test loss
score = []
clf = GradientBoostingClassifier(n_estimators=450, random_state=241, learning_rate=0.2)    
clf.fit(X_ttrain, y_ttrain)
for i, y_decision in enumerate(clf.staged_decision_function(X_ttest)):
    y_pred = 1.0 / (1.0 + np.exp(-y_decision))
    loss = log_loss(y_ttest, y_pred)
    score.append(loss)
print np.argmin(score), np.min(score)


254 0.999586424856


In [90]:
clf = RandomForestClassifier(n_estimators=254, random_state=241)
clf.fit(X_ttrain, y_ttrain)
pred = clf.predict_proba(X_ttest)
round(log_loss(y_ttest, pred), 2)

1.03

In [47]:
#Gradient Boosting######################################################################################################################
#test loss
score = []
clf = GradientBoostingClassifier(n_estimators=450, random_state=241, learning_rate=0.2)    
clf.fit(X_train, y_train)
for i, y_decision in enumerate(clf.staged_decision_function(X_test)):
    y_pred = 1.0 / (1.0 + np.exp(-y_decision))
    loss = log_loss(y_test, y_pred)
    score.append(loss)
print np.argmin(score), np.min(score)
#242 1.00030373454
#254 0.999639824874

NameError: name 'y_test' is not defined