In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn import metrics

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
#Data Preprocessing and Exploratory Analysis 
train.describe()

Unnamed: 0,dob_year,dob_month,count,age_upon_intake_(days),age_upon_intake_(years),intake_month,intake_year,intake_hour,intake_number,time_in_shelter_days,age_upon_outcome_(days),age_upon_outcome_(years),outcome_month,outcome_year,outcome_hour,outcome_number
count,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0,47803.0
mean,2012.197728,6.337531,1.0,841.679915,2.305972,6.596113,2014.600046,13.549087,1.144552,17.143366,855.319729,2.343342,6.650587,2014.638893,14.381545,1.144552
std,3.068255,3.342038,0.0,1100.557953,3.015227,3.460172,0.905731,3.179397,0.500055,46.600727,1103.598596,3.023558,3.490263,0.912706,3.456145,0.500055
min,1992.0,1.0,1.0,0.0,0.0,1.0,2013.0,0.0,1.0,0.000694,0.0,0.0,1.0,2013.0,0.0,1.0
25%,2011.0,4.0,1.0,60.0,0.164384,4.0,2014.0,11.0,1.0,1.049653,90.0,0.246575,4.0,2014.0,12.0,1.0
50%,2013.0,6.0,1.0,365.0,1.0,7.0,2015.0,13.0,1.0,4.972222,365.0,1.0,7.0,2015.0,15.0,1.0
75%,2014.0,9.0,1.0,1095.0,3.0,10.0,2015.0,16.0,1.0,13.113889,1095.0,3.0,10.0,2015.0,17.0,1.0
max,2016.0,12.0,1.0,8030.0,22.0,12.0,2018.0,23.0,13.0,1606.194444,8030.0,22.0,12.0,2018.0,23.0,13.0


In [5]:
#Checking for missing values 
train.isnull().sum()
values = {'sex_upon_outcome': 'Unknown'}
train = train.fillna(value=values)

In [6]:
train.outcome_type.value_counts()

Adoption           12967
Transfer           10345
Return to Owner     9718
Euthanasia          4144
Died                2829
Missing             2599
Relocate            2573
Rto-Adopt           2450
Disposal             178
Name: outcome_type, dtype: int64

In [7]:
#dropping the features that will not impact the classification or prediction
train = train.drop([ 
        'animal_id_outcome',
        'age_upon_intake_(days)',
        'age_upon_intake_(years)',
        'intake_datetime',
        'intake_monthyear',
        'count',
        'intake_hour',
        'sex_upon_intake',
        'date_of_birth',
        'age_upon_outcome',
        'age_upon_outcome_(days)',
        'time_in_shelter',
        'age_upon_outcome_(years)',
        'outcome_datetime',
        'outcome_number',
        'outcome_monthyear'], axis=1)

test = test.drop([ 
        'animal_id_outcome',
        'age_upon_intake_(days)',
        'age_upon_intake_(years)',
        'intake_datetime',
        'intake_monthyear',
        'count',
        'intake_hour',
        'sex_upon_intake',
        'date_of_birth',
        'age_upon_outcome',
        'age_upon_outcome_(days)',
        'time_in_shelter',
        'age_upon_outcome_(years)',
        'outcome_datetime',
        'outcome_number',
        'outcome_monthyear'], axis=1)

def float_to_int_intake(row):
    return int(row['intake_number'])
def float_to_int_outcome(row):
    return int(row['outcome_number'])
train['intake_number']=train.apply (lambda row: float_to_int_intake (row),axis=1)

def time_in_shelter_days(row):
    return int(row['time_in_shelter_days'])*365
train['time_in_shelter_days']=train.apply (lambda row: time_in_shelter_days (row),axis=1)




In [8]:
train.columns.tolist()


['dob_year',
 'dob_month',
 'age_upon_intake',
 'animal_type',
 'breed',
 'color',
 'intake_condition',
 'intake_type',
 'age_upon_intake_age_group',
 'intake_month',
 'intake_year',
 'intake_weekday',
 'intake_number',
 'time_in_shelter_days',
 'sex_upon_outcome',
 'age_upon_outcome_age_group',
 'outcome_month',
 'outcome_year',
 'outcome_weekday',
 'outcome_hour',
 'outcome_type']

In [9]:
test.columns.tolist()

['dob_year',
 'dob_month',
 'age_upon_intake',
 'animal_type',
 'breed',
 'color',
 'intake_condition',
 'intake_type',
 'age_upon_intake_age_group',
 'intake_month',
 'intake_year',
 'intake_weekday',
 'intake_number',
 'time_in_shelter_days',
 'sex_upon_outcome',
 'age_upon_outcome_age_group',
 'outcome_month',
 'outcome_year',
 'outcome_weekday',
 'outcome_hour']

In [10]:
# converting categorical values 

feature_cols = [
 
 'dob_year',
 'dob_month',
 'age_upon_intake',
 'animal_type',
 'breed',
 'color',
 'intake_condition',
 'intake_type',
 'age_upon_intake_age_group',
 'intake_month',
 'intake_year',
 'intake_weekday',
 'intake_number',
 'time_in_shelter_days',
 'sex_upon_outcome',
 'age_upon_outcome_age_group',
 'outcome_month',
 'outcome_year',
 'outcome_weekday',
 'outcome_hour',
 'outcome_type',
 ]


In [11]:
#le = LabelEncoder()
#encoded_X = train[feature_cols].apply(le.fit_transform)
#X = encoded_X
#y = le.fit_transform(train.outcome_type)

encoder = LabelEncoder()

#df['sex_upon_intake'] = encoder.fit_transform(df['sex_upon_intake'].astype('str'))

train['sex_upon_outcome'] = encoder.fit_transform(train['sex_upon_outcome'].astype('str'))

train['animal_type'] = encoder.fit_transform(train['animal_type'].astype('str'))

train['breed'] = encoder.fit_transform(train['breed'].astype('str'))

train['color'] = encoder.fit_transform(train['color'].astype('str'))

train['age_upon_intake_age_group'] = encoder.fit_transform(train['intake_condition'].astype('str'))
train['sex_upon_outcome'] = encoder.fit_transform(train['sex_upon_outcome'].astype('str'))
train['age_upon_outcome_age_group'] = encoder.fit_transform(train['age_upon_outcome_age_group'].astype('str'))
train['outcome_weekday'] = encoder.fit_transform(train['outcome_weekday'].astype('str'))

train['outcome_type'] = encoder.fit_transform(train['outcome_type'].astype('str'))

train['intake_type'] = encoder.fit_transform(train['intake_type'].astype('str'))

train['intake_weekday'] = encoder.fit_transform(train['intake_weekday'].astype('str'))

train['intake_condition'] = encoder.fit_transform(train['intake_condition'].astype('str'))

def label_age_upon_intake(row):
    x = row['age_upon_intake'].split(' ')
    value = 0
    if x=='years':
        value = int(row['age_upon_intake'][:-6])*365
    elif x=='weeks':
        return int(row['age_upon_intake'][:-6])*7
    elif x=='months':
        return int(row['age_upon_intake'][:-7])*30
    elif x=='days':
        return int(row['age_upon_intake'][:-5])
    elif x=='year':
        return int(row['age_upon_intake'][:-5])*365
    elif x=='day':
        return int(row['age_upon_intake'][:-4])
    elif x=='month':
        return int(row['age_upon_intake'][:-6])*30
    elif x=='week':
        return int(row['age_upon_intake'][:-5])*7
    return value
    
train['age_upon_intake']=train.apply (lambda row: label_age_upon_intake (row),axis=1)

In [12]:
# Compute Logistic Regression to Predict Outcome
train, test = train_test_split(train, test_size=0.2)

y_train = train.outcome_type
x_train = train.drop(['outcome_type'],axis=1)
y_test = test.outcome_type
x_test = test.drop(['outcome_type'],axis=1)

lr = LogisticRegression()
lr.fit(x_train,y_train)
y_pred= lr.predict(x_test)

print(f1_score(y_test, y_pred, average="micro"))





0.4926262943206777


In [195]:
rfm = RandomForestClassifier(n_estimators=70, oob_score=70, n_jobs=-1,
                            random_state=101,max_features=None, min_samples_leaf=30)
rfm.fit(x_train,y_train)
y_pred=rfm.predict(x_test)

print(f1_score(y_test, y_pred, average="micro"))

0.6184499529337936


In [196]:
dtree = DecisionTreeClassifier(max_depth=100, random_state=101,
                              max_features=None, min_samples_leaf=15)
dtree.fit(x_train,y_train)
y_pred=dtree.predict(x_test)

print(f1_score(y_test, y_pred, average="micro"))

0.5919882857441691


In [197]:
#Since RFM gave the best F Score
rfm = RandomForestClassifier(n_estimators=70, oob_score=70, n_jobs=-1,
                            random_state=101,max_features=None, min_samples_leaf=30)

Y_train = train.outcome_type
X_train.dtype = train.drop(['outcome_type'],axis=1)
X_test.dtype = test

rfm.fit(X_train.dtype,Y_train)
y_pred=rfm.predict(X_test.dtype)

prediction = pd.DataFrame(rfm, columns=['animal_id_outcome','predictions']).to_csv('prediction.csv')

  
  import sys


ValueError: Number of features of the model must match the input. Model n_features is 20 and input n_features is 21 