In [1]:
import pandas as pd
import numpy as np
import random
import re
from sklearn import cross_validation, metrics, preprocessing
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve, validation_curve #temporary
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
%matplotlib inline

TRAINING_FLAG = 0

# ------------------------------------------------------------------
def encode_ordinal(data_col):
    """
    Function encodes categorical data ordinally, accepting a dataframe column
    and returning the encoded dataframe column with a mapping list.
    """
    
    mapping = []
    
    categories = list(set(data_col.values))
    random.shuffle(categories)
    for idx, val in enumerate(categories):
        data_col.loc[data_col == val] = str(idx)
    data_col = data_col.astype(int)
    mapping.append({'mapping': [(x[1], x[0]) for x in list(enumerate(categories))]},)
    
    return data_col, mapping
# ------------------------------------------------------------------

def breed_to_breed_group(data_col):   
    """
    Function simplifies dog breeds into dog breed categories, yielding more
    examples per category. Accepts a dataframe column of dog breeds and returns
    a dataframe column of dog breeds mapped to categories. Function uses
    data and structure of Andy's Kaggle script, which can be found at: 
    kaggle.com/andraszsom/shelter-animal-outcomes/dog-breeds-dog-groups/comments
    """
    
    # Import Andy's breed groups as dictionary
    fname = './Input/dog_breed_to_category.csv'
    breed_to_category_dict = pd.read_csv(fname, names=['Category'], index_col=0, header=None).to_dict()
    
    unknowns = []
    for idx, element in enumerate(data_col):
        # Simplify breed by removing unnecessary words
        element = element.replace(' Shorthair','')
        element = element.replace(' Medium Hair','')
        element = element.replace(' Longhair','')
        element = element.replace(' Wirehair','')
        element = element.replace(' Rough','')
        element = element.replace(' Smooth Coat','')
        element = element.replace(' Smooth','')
        element = element.replace(' Black/Tan','')
        element = element.replace('Black/Tan ','')
        element = element.replace(' Flat Coat','')
        element = element.replace('Flat Coat ','')
        element = element.replace(' Coat','')

        # If more than one breed, split using '/' separator
        if '/' in element:
            i = 0
            split_element = element.split('/')
            a = ''
            for j in split_element:
                if j[-3:] == 'Mix':
                    if not j[:-4] in breed_to_category_dict['Category'].keys():
                        a += 'Unknown Mix'
                        unknowns.append(j[:-4])
                    else:
                        a += breed_to_category_dict['Category'][j[:-4]]
                else:
                    if not j in breed_to_category_dict['Category'].keys():
                        a += 'Unknown'
                        unknowns.append(j)
                    else:
                        a += breed_to_category_dict['Category'][j]
                a += ' / '
            a = a[:-3]
            data_col.iloc[idx] = a
        else:
            if element[-3:] == 'Mix':
                data_col.iloc[idx] = element[:-4]
                if not element[:-4] in breed_to_category_dict['Category'].keys():
                    data_col.iloc[idx] = "Unknown Mix"
                    unknowns.append(element[:-4])
            else:
                data_col.iloc[idx] = element
                if not element in breed_to_category_dict['Category'].keys():
                    data_col.iloc[idx] = "Unknown"
                    unknowns.append(element)
                    
    print('Unknown dog breeds: %s\n' % list(set(unknowns)))
    
    breeds = set([val for val in breed_to_category_dict['Category'].keys()])
    # Map breed to category
    for breed in breeds:
        data_col.loc[data_col == breed] = breed_to_category_dict['Category'][breed]        
    return data_col

# ------------------------------------------------------------------
def find_number(string):
    number = re.match('^[0-9]+', string).group(0)
    return int(number)

# ------------------------------------------------------------------
def age_to_years(data_col):
    '''
    Takes list of strings with ages in the regex format ^[0-9]+\b\s+
    and returns a list of integers with the age in years. E.g. 
    '24 months' becomes '2'.
    '''
    
    data_col = data_col.reset_index(drop=True)
    ages = set(data_col)
    for age in ages:
        if age[-3:] == 'day' or age[-4:] == 'days':
            age_number = find_number(age) * 1/365
        elif age[-4:] == 'week' or age[-5:] == 'weeks':
            age_number = find_number(age) * 1/52
        elif age[-5:] == 'month' or age[-6:] == 'months':
            age_number = find_number(age) * 1/12
        elif age[-4:] == 'year' or age[-5:] == 'years':
            age_number = find_number(age)
        else:
            age_number = -1
            print age_number
        data_col.loc[data_col == age] = int(age_number)

    return data_col

# ------------------------------------------------------------------
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and traning learning curve. Taken from:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html.
    Currently fails.
    """
    print train_sizes
    print type(train_sizes)
    
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
                estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

# ------------------------------------------------------------------
def plot_validation_curve(estimator, X, y, param_range=np.logspace(-9, 3, 9), cv=None):

    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name="estimator__gamma", param_range=param_range,
        cv=cv, scoring="accuracy", n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with SVM")
    plt.xlabel("$\gamma$")
    plt.ylabel("Score")
    #plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    return plt

# ------------------------------------------------------------------

def retreive_map(mapping, x):
    return([item[1] for item in mapping[0]['mapping'] if item[0] == x][0])

# ------------------------------------------------------------------

def export_to_csv(pred):
    """
    Add headers and export the predictions to csv.
    """
    
    f = open('./Output/shelter_animal_outcomes_results.csv','w')
    result_str = 'ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer\n'
    total = 1
    for i in range(len(pred)):
        for j in range(len(pred[i])):
            result_str += str(pred[i][j].astype(int)) + ','
        result_str += '\n'
        total += 1
    f.write(result_str)
    print('Wrote %i records to csv file.' % total)
    return

#----------------------------------------------------------------------
#----------------------------------------------------------------------

# Import
df = pd.read_csv('./input/train.csv')
df['Regime'] = 'train'
if TRAINING_FLAG != 1:
    df_test = pd.read_csv('./input/test.csv')
    df_test['Regime'] = 'test'
    df_test['OutcomeSubtype'] = ''
    df_test['OutcomeType'] = ''
    df_test['AnimalID'] = ''
    #df_test.set_index('ID')
    df = df.append(df_test)
    
original_rows = df.shape[0]

## Perform preprocessing
# Separate sex and neutered status. The following functions were helpfully taken
# from Eugenia Uchaeva's 'Take a look at the data' script:
# https://www.kaggle.com/uchayder/shelter-animal-outcomes/take-a-look-at-the-data
def get_sex(element):
    """
    Takes string input, searches for sex within string, returns sex
    """
    element = str(element)
    if element.find('Male') >= 0: return 'male'
    if element.find('Female') >= 0: return 'female'
    return 'unknown'

def get_neutered(element):
    """
    Takes string input, searches for neutered status within string, returns 
    status
    """
    element = str(element)
    if element.find('Spayed') >= 0: return 'neutered'
    if element.find('Neutered') >= 0: return 'neutered'
    if element.find('Intact') >= 0: return 'intact'
    return 'unknown'

# Split SexuopnOutcome into Sex and NeuteredStatus
df['Sex'] = df.SexuponOutcome.apply(get_sex)
df['NeuteredStatus'] = df.SexuponOutcome.apply(get_neutered)

# Convert ages into years, being careful to remove NaNs
df = df.dropna(subset = ['AgeuponOutcome'])
df.AgeuponOutcome = age_to_years(df.AgeuponOutcome) #could use .apply instead
df = df.dropna(subset = ['AgeuponOutcome'])
new_rows = df.shape[0]
print('Dropping NaNs reduced the dataframe by %i rows.\n' % (original_rows - new_rows))

# Categorise dog breeds into breed groups
df.Breed[df.AnimalType == 'Dog'] = breed_to_breed_group(df.Breed[df.AnimalType == 'Dog'])

# Expand dates
df.DateTime = pd.to_datetime(df.DateTime)
df['Year'] = df.DateTime.map(lambda x: x.year).astype(str)
df['Month'] = df.DateTime.map(lambda x: x.month).astype(str)
df['Day'] = df.DateTime.map(lambda x: x.dayofweek).astype(str)

# Encode whether animal has a name (note: loses info on whether
# name affects adoption for sake of simplicity!)
def has_name(name):
    if len(str(name)) > 0: return 1
    else: return 0

#df.Name = df.Name.replace(r'\s+', 0, regex=True)
df.Name = df.Name.map(has_name) #Probably not working properly...
print df.groupby('Name').count()

df['Color_1'] = df.Color.str.split('/| ').str.get(0)
    
# Start with ordinal encoding, then try binary encoding and compare
# performance. Ordinal encoding may be acceptable for age.
df.AnimalType, animaltype_mapping = encode_ordinal(df.AnimalType)
df.Sex, sex_mapping = encode_ordinal(df.Sex)
df.NeuteredStatus, neuteredstatus_mapping = encode_ordinal(df.NeuteredStatus)
df.AgeuponOutcome, ageuponoutcome_mapping = encode_ordinal(df.AgeuponOutcome)
df.Breed, breed_mapping = encode_ordinal(df.Breed)
df.Color_1, color_mapping = encode_ordinal(df.Color_1)

# Line below would perform one-hot encoding
#df = pd.concat([data_raw, pd.get_dummies(data_raw['Breed']).rename(columns = lambda x: 'Breed_' + str(x))], axis=1)

#if TRAINING_FLAG != 1:
#    df.AnimalType = df.AnimalType.map(lambda x: retreive_map(animaltype_mapping, x))
#    df.Sex = df.Sex.map(lambda x: retreive_map(sex_mapping, x))
#    df.NeuteredStatus = df.NeuteredStatus.map(lambda x: retreive_map(neuteredstatus_mapping, x))
#    df.AgeuponOutcome = df.AgeuponOutcome.map(lambda x: retreive_map(ageuponoutcome_mapping, x))
#    df.Breed = df.Breed.map(lambda x: retreive_map(breed_mapping, x))
#    df.Color_1 = df.Color_1.map(lambda x: retreive_map(color_mapping, x))

# Impute missing values
#from sklearn.preprocessing import Imputer
#imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)
#imp.fit(df)
#df = imp.transform(df) #Note spits out list of lists, not df

# Split df back into train and test
df_train = df[df['Regime'] == 'train']
df_test = df[df['Regime'] == 'test']

# Keep only relevant columns (training set)
drop_list = ['AnimalID', 'SexuponOutcome', 'DateTime', 'Color', 'Regime', 'ID']
df_train.drop(drop_list, axis = 1, inplace = True)
# Encode training data labels
data_labels = df_train.OutcomeType
lab_enc = preprocessing.LabelEncoder()
data_labels = lab_enc.fit(data_labels).transform(data_labels)
#data_labels = pd.get_dummies(data_labels)
df_train.drop(['OutcomeType', 'OutcomeSubtype'], axis = 1, inplace = True)

# Keep only relevant columns (test set)
drop_list = ['AnimalID', 'OutcomeType', 'OutcomeSubtype', 'SexuponOutcome', 'DateTime', 'Color', 'Regime']
df_test.drop(drop_list, axis = 1, inplace = True)

## Train a classifier
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    df_train, data_labels, test_size = 0.33, random_state = 0)

# Fit scaler on x_train
print x_train.head()
xy_scaler = preprocessing.StandardScaler()
xy_scaler.fit(x_train)
x_train = xy_scaler.transform(x_train)

# Install PCA or FA to reduce dimensions (currently set to use PCA)
n_features = x_train.shape[1]
n_components = np.arange(0, n_features, 1) #options

def compute_scores(X):
    # Fit the models
    pca = PCA()
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_validation.cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_validation.cross_val_score(fa, X)))
    
    return pca_scores, fa_scores

for x in [(x_train)]:
    pca_scores, fa_scores = compute_scores(x)
    n_components_pca = n_components[np.argmax(pca_scores)]
    n_components_fa = n_components[np.argmax(fa_scores)]

    pca = PCA(n_components='mle')
    pca.fit(x)
    n_components_pca_mle = pca.n_components_

    print("best n_components by PCA CV = %d" % n_components_pca)
    print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
    print("best n_components by PCA MLE = %d" % n_components_pca_mle)


pca = PCA(n_components= n_components_pca)
x_train = pca.fit(x_train).transform(x_train)

#pca = PCA(n_components= n_components_pca_mle)
#x_train = pca.fit(X_train).transform(X_train)

#fa = FactorAnalysis(n_components= n_components_fa)
#x_train = fa.fit(x_train).transform(x_train)

# Define SVM - basic, not optimised. Use cross validation / grid search 
# to find optimal hyperparameters
cv = cross_validation.ShuffleSplit(df_train.shape[1], n_iter=50, #should df_train be x_train?
                                   test_size=0.1, random_state=0) 
clf_untuned = OneVsRestClassifier(svm.SVC(kernel='rbf', decision_function_shape='ovr'))
params = {
    'estimator__C': [1, 10, 100],
    'estimator__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}
clf_tuned = GridSearchCV(clf_untuned, cv=cv, param_grid=params)
clf_tuned.fit(x_train, y_train)
print('Best parameters: %s' % clf_tuned.best_params_)
print('Best score: %s' % clf_tuned.best_score_)

# Plot learning curve for best params
#title = 'Learning curve (SVM)'
#plot_learning_curve(clf_tuned.best_estimator_, title, x_train, y_train, ylim=(0.6, 1.05), cv=cv)
#plt.show()

# Plot validation curve
#plot_validation_curve(clf_tuned.best_estimator_, x_train, y_train, cv=cv)
#plt.show()

# Fit scaler, PCA to x_test (training) or df_test (live) and predict
def evaluate_and_predict(x):
    x = xy_scaler.transform(x)
    x = pca.transform(x)
    predicted = clf_tuned.predict(x)
    return predicted

if TRAINING_FLAG == 1:
    predicted = evaluate_and_predict(x_test)
    print("Classification report for classifier %s:\n%s\n"
          % (clf_untuned, metrics.classification_report(y_test, predicted)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted))
else:
    df_test = df_test.set_index('ID', drop = True)
    print('Preparing to scale the following dataframe:\n')
    print(df_test.head())
    predicted = evaluate_and_predict(df_test)
    df_test['ID'] = df_test.index
    predicted = np.append(pd.DataFrame(df_test.ID), pd.get_dummies(predicted), 1)
    export_to_csv(predicted)

print('Complete!')

Dropping NaNs reduced the dataframe by 24 rows.

Unknown dog breeds: ['Picardy Sheepdog', 'Dogo Argentino', 'English Pointer', 'Plott Hound', 'Unknown', 'Bedlington Terr', 'Glen Of Imaal', 'Yorkshire', 'Bichon Frise', 'Entlebucher', 'Dutch Shepherd', 'Cirneco', 'Landseer', 'Wire Hair Fox Terrier', 'Coton De Tulear', 'Spanish Mastiff', 'Carolina Dog', 'Port Water Dog', 'Jindo', 'American Eskimo', 'Sealyham Terr', 'Treeing Tennesse Brindle', 'Feist', 'American Pit Bull Terrier', 'English Coonhound', 'Patterdale Terr', 'Swiss Hound', 'Treeing Cur', 'Presa Canario', 'Pbgv', 'Bull Terrier Miniature', 'Dachshund Stan', 'Anatol Shepherd', 'English Shepherd', 'Eng Toy Spaniel', 'Chinese Crested', 'German Pointer', 'Hovawart', 'Bruss Griffon', 'Alaskan Husky', 'Redbone Hound', 'Germaned Pointer', 'Lowchen', 'West Highland', 'Dandie Dinmont', 'Old English Bulldog', 'Schnauzer Giant', 'Mexican Hairless', 'Bluetick Hound', 'Podengo Pequeno', 'Chesa Bay Retr', 'Softed Wheaten Terrier', 'Hound', 'Ca

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



       AgeuponOutcome  AnimalType  Breed  Name  Sex  NeuteredStatus  Year  \
11731               9           0     91     1    1               1  2013   
21329               9           1     39     1    1               0  2014   
13372               6           0     91     1    1               1  2014   
11293              17           0     60     1    0               0  2015   
415                 9           0    160     1    0               1  2014   

      Month Day  Color_1  
11731    10   0       16  
21329     1   2       19  
13372    10   0       16  
11293     6   4       12  
415       8   0       19  
best n_components by PCA CV = 8
best n_components by FactorAnalysis CV = 5
best n_components by PCA MLE = 9
Best parameters: {'estimator__C': 100, 'estimator__gamma': 0.001}
Best score: 0.56
Preparing to scale the following dataframe:

    AgeuponOutcome  AnimalType  Breed  Name  Sex  NeuteredStatus  Year Month  \
ID                                                        

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis", "AdaBoosted decision trees"]

knn = KNeighborsClassifier(5)
linear_svm = svm.SVC(kernel="linear", C=0.025)
rbf_svm = svm.SVC(gamma=2, C=1)
d_tree = DecisionTreeClassifier(max_depth=5)
rf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
adaboost = AdaBoostClassifier()
naive_bayes = GaussianNB()
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
adaboost_rf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                         algorithm="SAMME",
                         n_estimators=200)

classifiers = [
    knn,
    linear_svm,
    rbf_svm,
    d_tree,
    rf,
    adaboost,
    naive_bayes,
    lda,
    qda,
    adaboost_rf
]

def est_score(X, y):

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X, y)
        score = clf.score(X, y)
        print(name, score)
        
est_score(x_train, y_train)

('Nearest Neighbors', 0.66422500559159026)
('Linear SVM', 0.60495414895996424)
('RBF SVM', 0.89213822411093713)
('Decision Tree', 0.57968016103779918)
('Random Forest', 0.56413554014761802)
('AdaBoost', 0.56256989487810338)
('Naive Bayes', 0.5712368597629166)
('Linear Discriminant Analysis', 0.60411541042272421)
('Quadratic Discriminant Analysis', 0.53807872959069558)
('AdaBoosted decision trees', 0.59270856631626034)




Wrote 11451 records to csv file.
