In [143]:
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [144]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

In [145]:
#Data Preprocessing and Exploratory Analysis 
train.describe()

Unnamed: 0,dob_year,dob_month,count,age_upon_intake_(days),age_upon_intake_(years),intake_month,intake_year,intake_hour,intake_number,time_in_shelter_days,age_upon_outcome_(days),age_upon_outcome_(years),outcome_month,outcome_year,outcome_hour,outcome_number
count,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0
mean,2012.197728,6.337531,1.0,841.679915,2.305972,6.596113,2014.600046,13.549087,1.144552,17.143366,855.319729,2.343342,6.650587,2014.638893,14.381545,1.144552
std,3.068255,3.342038,0.0,1100.557953,3.015227,3.460172,0.905731,3.179397,0.500055,46.600727,1103.598596,3.023558,3.490263,0.912706,3.456145,0.500055
min,1992.0,1.0,1.0,0.0,0.0,1.0,2013.0,0.0,1.0,0.000694,0.0,0.0,1.0,2013.0,0.0,1.0
25%,2011.0,4.0,1.0,60.0,0.164384,4.0,2014.0,11.0,1.0,1.049653,90.0,0.246575,4.0,2014.0,12.0,1.0
50%,2013.0,6.0,1.0,365.0,1.0,7.0,2015.0,13.0,1.0,4.972222,365.0,1.0,7.0,2015.0,15.0,1.0
75%,2014.0,9.0,1.0,1095.0,3.0,10.0,2015.0,16.0,1.0,13.113889,1095.0,3.0,10.0,2015.0,17.0,1.0
max,2016.0,12.0,1.0,8030.0,22.0,12.0,2018.0,23.0,13.0,1606.194444,8030.0,22.0,12.0,2018.0,23.0,13.0


In [146]:
#Checking for missing values 
train.isnull().sum()
values = {'sex_upon_outcome': 'Unknown'}
train = train.fillna(value=values)



In [147]:
train.outcome_type.value_counts()

Adoption           12967
Transfer           10345
Return to Owner     9718
Euthanasia          4144
Died                2829
Missing             2599
Relocate            2573
Rto-Adopt           2450
Disposal             178
Name: outcome_type, dtype: int64

In [148]:
#dropping the features that will not impact the classification or prediction. For eg- Animal id outcome is primary key and is unique for every animal and won't contribute to the prediction
train = train.drop([ 
#        'animal_id_outcome',
#        'dob_month',
#       'dob_year',
#        'animal_type',
        'age_upon_intake_(days)',
        'age_upon_intake_(years)',
        'intake_datetime',
        'intake_monthyear',
        'count',
        'intake_hour',
        'sex_upon_intake',
        'date_of_birth',
        'age_upon_outcome',
        'age_upon_outcome_(days)',
        'time_in_shelter',
        'age_upon_outcome_(years)',
        'outcome_datetime',
        'outcome_number',
        'outcome_monthyear'], axis=1)

test = test.drop([ 
#        'animal_id_outcome',
#        'dob_month',
#       'dob_year',
#        'animal_type',
        'age_upon_intake_(days)',
        'age_upon_intake_(years)',
        'intake_datetime',
        'intake_monthyear',
        'count',
        'intake_hour',
        'sex_upon_intake',
        'date_of_birth',
        'age_upon_outcome',
        'age_upon_outcome_(days)',
        'time_in_shelter',
        'age_upon_outcome_(years)',
        'outcome_datetime',
        'outcome_number',
        'outcome_monthyear'], axis=1)

def float_to_int_intake(row):
    return int(row['intake_number'])
def float_to_int_outcome(row):
    return int(row['outcome_number'])
train['intake_number']=train.apply (lambda row: float_to_int_intake (row),axis=1)
test['intake_number']=test.apply (lambda row: float_to_int_intake (row),axis=1)

def time_in_shelter_days(row):
    return int(row['time_in_shelter_days'])*365
train['time_in_shelter_days']=train.apply (lambda row: time_in_shelter_days (row),axis=1)
test['time_in_shelter_days']=test.apply (lambda row: time_in_shelter_days (row),axis=1)



In [149]:
train.columns.tolist()


['animal_id_outcome',
 'dob_year',
 'dob_month',
 'age_upon_intake',
 'animal_type',
 'breed',
 'color',
 'intake_condition',
 'intake_type',
 'age_upon_intake_age_group',
 'intake_month',
 'intake_year',
 'intake_weekday',
 'intake_number',
 'time_in_shelter_days',
 'sex_upon_outcome',
 'age_upon_outcome_age_group',
 'outcome_month',
 'outcome_year',
 'outcome_weekday',
 'outcome_hour',
 'outcome_type']

In [150]:
# converting categorical values 

feature_cols = [
 'animal_id_outcome',
 'dob_year',
 'dob_month',
 'age_upon_intake',
 'animal_type',
 'breed',
 'color',
 'intake_condition',
 'intake_type',
 'age_upon_intake_age_group',
 'intake_month',
 'intake_year',
 'intake_weekday',
 'intake_number',
 'time_in_shelter_days',
 'sex_upon_outcome',
 'age_upon_outcome_age_group',
 'outcome_month',
 'outcome_year',
 'outcome_weekday',
 'outcome_hour',
 'outcome_type'
 ]

train['train']=1
test['train']=0

combined=pd.concat([train,test])



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [151]:
combined.isnull().sum()
values = {'outcome_type': 'Unknown'}
combined = combined.fillna(value=values)

In [152]:
le = LabelEncoder()
encoded_X = combined[feature_cols].apply(le.fit_transform)
X = encoded_X
y = le.fit_transform(combined.outcome_type)
print('done1')

#df= pd.get_dummies(combined['outcome_type'])
combined['outcome_type'] = pd.factorize(combined['outcome_type'])[0] 
#print('done2')
#combined=pd.concat([combined,df],axis=1)
print('done3')
train_df = combined[combined["train"]==1]
test_df = combined[combined["train"]==0]
print('done4')
train_df.drop(["train"],axis=1, inplace=True)
test_df.drop(["train"],axis=1, inplace=True)
print('done')

done1
done3
done4
done


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [154]:
# Compute Logistic Regression to Predict Outcome
logreg = LogisticRegression(random_state=1)
y_train = train_df.outcome_type
x_train = train_df.drop(['outcome_type'],axis=1)
y_test = test_df.outcome_type
x_test = test_df.drop(['outcome_type'],axis=1)

logreg.fit(x_train,y_train)
OutcomePredLog = logreg.predict(X_test)
print('Model Accuracy:',metrics.accuracy_score(y_test, OutcomePredLog))



ValueError: could not convert string to float: 'Unknown'