In [1]:
# import data manipulation tools
import numpy as np
import pandas as pd
# import visualization tools
import seaborn as sns
import matplotlib.pyplot as plt
# import classification modeling functions
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, recall_score,\
accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import wrangle_austin_shelter as w

In [2]:
# Load the shelter data through the project's wrangle module; the call returns
# four objects, unpacked here as the full frame (presumably) plus the
# train / validate / test partitions used for modeling below.
animals, train, validate, test = w.wrangle_austin_animal_shelter()

reading intake data from local file
reading outcome data from local file


In [3]:
# List the columns available in the training split before choosing features
train.columns

Index(['animal_id', 'datetime_in', 'found_location', 'intake_type',
       'intake_condition', 'animal_type', 'sex_upon_intake', 'age_upon_intake',
       'breed', 'color', 'name', 'datetime_out', 'date_of_birth',
       'outcome_type', 'sex_upon_outcome', 'age_upon_outcome',
       'outcome_subtype', 'outcome', 'has_name', 'intake_type_encoded',
       'intake_condition_encoded', 'animal_type_encoded',
       'sex_upon_intake_encoded', 'breed_encoded', 'color_encoded',
       'sex_upon_outcome_encoded', 'outcome_subtype_encoded', 'adopted',
       'death', 'transfered'],
      dtype='object')

In [4]:
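# NOTE: superseded -- the column listing of `train` already contains 'adopted',
# 'death' and 'transfered' (presumably built by the wrangle module), so these
# outcome dummies no longer need to be created here. Kept for reference only.
# train = pd.concat([train, pd.get_dummies(train.outcome)], axis=1)
# validate = pd.concat([validate, pd.get_dummies(validate.outcome)], axis=1)
# test = pd.concat([test, pd.get_dummies(test.outcome)], axis=1)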

In [5]:
# Preview the first rows of the training split
train.head()

Unnamed: 0,animal_id,datetime_in,found_location,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color,...,intake_condition_encoded,animal_type_encoded,sex_upon_intake_encoded,breed_encoded,color_encoded,sex_upon_outcome_encoded,outcome_subtype_encoded,adopted,death,transfered
110679,A733324,2016-08-20 12:30:00,11752 Von Quintus Dr in Austin (TX),Stray,Normal,Dog,Intact Male,1 month,Rat Terrier Mix,Black/White,...,10,2,1,2113,59,2,10,1,0,0
77632,A706156,2015-06-25 15:15:00,Farm To Market 620 N & Hudson Bend Rd in Austi...,Stray,Normal,Cat,Intact Male,1 month,Domestic Shorthair Mix,Brown Tabby/White,...,10,1,1,1127,171,2,19,0,0,1
176479,A852394,2022-03-02 12:56:00,1156 West Cesar Chavez Street in Austin (TX),Stray,Normal,Cat,Intact Female,10 months,Domestic Shorthair,Tortie,...,10,1,0,1126,480,3,19,0,0,1
80305,A703780,2015-05-28 18:01:00,1200 Mearns Meadow Blvd in Austin (TX),Stray,Normal,Dog,Intact Female,1 year,Chihuahua Shorthair/Dachshund,Black/Tan,...,10,2,0,818,56,3,16,1,0,0
121037,A779201,2018-08-27 12:52:00,Austin (TX),Owner Surrender,Normal,Cat,Spayed Female,3 years,Domestic Shorthair Mix,Black,...,10,1,3,1127,7,3,16,1,0,0


In [6]:
# Feature columns handed to every classifier in this notebook.
# NOTE(review): 'sex_upon_outcome_encoded' is recorded at outcome time, so it may
# leak information about the outcome being predicted -- confirm it is intended.
X_cols = [
    'has_name',
    'sex_upon_outcome_encoded',
    'intake_type_encoded',
    'animal_type_encoded',
]
# Name of the target column
y_col = 'adopted'

In [7]:
# For each partition, separate the feature matrix (X) from the target vector (y)
train_X, train_y = train[X_cols], train[y_col]
validate_X, validate_y = validate[X_cols], validate[y_col]
test_X, test_y = test[X_cols], test[y_col]

In [8]:
# Class balance of the target in the training split (used to pick the baseline)
train[y_col].value_counts()

1    74497
0    33342
Name: adopted, dtype: int64

In [9]:
# Baseline model: predict class 1 for every training row
# (a plain Python list of ints, one entry per row of `train`)
baseline = np.ones(len(train), dtype=int).tolist()

In [10]:
# Sanity check: there is exactly one baseline prediction per training row
len(baseline) == len(train)

True

In [11]:
# Share of training rows whose actual target equals the baseline prediction
baseline_accuracy = (train[y_col] == baseline).mean()
print(f'The baseline prediction score accuracy is: {baseline_accuracy:.2%}')

The baseline prediction score accuracy is: 69.08%


In [26]:
def get_baseline(train, y_col):
    '''
    Print the accuracy of the baseline model, i.e. the model that predicts the
    most frequent class of the target for every row.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing the target column.
    y_col : str
        Name of the target column in ``train``.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    '''
    target = train[y_col]
    # Use the observed majority class instead of assuming that the positive
    # class (1) is the most common one; fall back to 1 when there are no rows
    # to take a mode of.
    modes = target.mode()
    baseline_class = modes.iloc[0] if len(modes) else 1
    baseline_accuracy = (target == baseline_class).mean()
    print(f'The baseline prediction score accuracy is: {baseline_accuracy:.2%}')

In [28]:
# Report the baseline accuracy for the training split using the helper
get_baseline(train, y_col)

The baseline prediction score accuracy is: 69.08%


In [24]:
def get_decision_tree(train_X, validate_X, train_y, validate_y,
                      max_depth=6, random_state=42):
    '''
    Fit a decision tree classifier on the training split, print its accuracy
    and recall on the train and validate splits, and return the fitted model.

    Parameters
    ----------
    train_X, validate_X : array-like
        Feature matrices for the train and validate splits.
    train_y, validate_y : array-like
        Target values for the train and validate splits. Recall is computed
        with scikit-learn's default (binary) averaging.
    max_depth : int, default 6
        Maximum depth of the tree.
    random_state : int or None, default 42
        Seed passed to the classifier so repeated runs give the same tree.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model, for use in other functions.
    '''
    # make the decision tree object; the seed keeps the fit reproducible
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    # fit the data to the dt object
    dt.fit(train_X, train_y)
    # predict with the dt object
    dt_preds = dt.predict(train_X)
    dt_val_preds = dt.predict(validate_X)
    # "Model Type"
    # "evaluation metric" on train: "evaluation result"
    # "evaluation metric" on validate: "evaluation result"
    print('Decision Tree Model')
    print(f'Accuracy score on train: {accuracy_score(train_y, dt_preds):.2%}')
    print(f'Accuracy score on validate: {accuracy_score(validate_y, dt_val_preds):.2%}')
    print(f'Recall score on train: {recall_score(train_y, dt_preds):.2%}')
    print(f'Recall score on validate: {recall_score(validate_y, dt_val_preds):.2%}')
    # return the decision tree model for use in other functions
    return dt

In [25]:
# Fit the decision tree, print its train / validate scores, keep the fitted model
dt = get_decision_tree(train_X, validate_X, train_y, validate_y)

Decision Tree Model
Accuracy score on train: 84.91%
Accuracy score on validate: 85.27%
Recall score on train: 97.28%
Recall score on validate: 97.47%


In [16]:
def get_random_forest(train_X, validate_X, train_y, validate_y, random_state=42):
    '''
    Fit a random forest classifier on the training split, print its accuracy
    and recall on the train and validate splits, and return the fitted model.

    Parameters
    ----------
    train_X, validate_X : array-like
        Feature matrices for the train and validate splits.
    train_y, validate_y : array-like
        Target values for the train and validate splits. Recall is computed
        with scikit-learn's default (binary) averaging.
    random_state : int or None, default 42
        Seed passed to the classifier; without it the bootstrap samples and
        feature subsets differ between runs, so the printed scores drift.

    Returns
    -------
    RandomForestClassifier
        The fitted model, for use in other functions.
    '''
    # make the random forest object; the seed keeps the fit reproducible
    rf = RandomForestClassifier(random_state=random_state)
    # fit the data to the rf object
    rf.fit(train_X, train_y)
    # predict with the rf object
    rf_preds = rf.predict(train_X)
    rf_val_preds = rf.predict(validate_X)
    # "Model Type"
    # "evaluation metric" on train: "evaluation result"
    # "evaluation metric" on validate: "evaluation result"
    print('Random Forest Model')
    print(f'Accuracy score on train: {accuracy_score(train_y, rf_preds):.2%}')
    print(f'Accuracy score on validate: {accuracy_score(validate_y, rf_val_preds):.2%}')
    print(f'Recall score on train: {recall_score(train_y, rf_preds):.2%}')
    print(f'Recall score on validate: {recall_score(validate_y, rf_val_preds):.2%}')
    # return the random forest model for use in other functions
    return rf

In [17]:
# Fit the random forest, print its train / validate scores, keep the fitted model
rf = get_random_forest(train_X, validate_X, train_y, validate_y)

Random Forest Model
Accuracy score on train: 84.96%
Accuracy score on validate: 85.29%
Recall score on train: 97.30%
Recall score on validate: 97.50%


In [18]:
def get_logistic_regression(train_X, validate_X, train_y, validate_y):
    '''
    Fit a logistic regression classifier (scikit-learn defaults) on the
    training split, print its accuracy and recall on the train and validate
    splits, and return the fitted model.
    '''
    # build the classifier and train it in one step (fit returns the estimator)
    model = LogisticRegression().fit(train_X, train_y)
    # predictions for both splits
    fitted_train = model.predict(train_X)
    fitted_validate = model.predict(validate_X)
    # report layout: model name, then each metric on train followed by validate
    print('Logistic Regression Model')
    train_accuracy = accuracy_score(train_y, fitted_train)
    print(f'Accuracy score on train: {train_accuracy:.2%}')
    validate_accuracy = accuracy_score(validate_y, fitted_validate)
    print(f'Accuracy score on validate: {validate_accuracy:.2%}')
    train_recall = recall_score(train_y, fitted_train)
    print(f'Recall score on train: {train_recall:.2%}')
    validate_recall = recall_score(validate_y, fitted_validate)
    print(f'Recall score on validate: {validate_recall:.2%}')
    # hand the fitted model back so other functions can reuse it
    return model

In [19]:
# Fit the logistic regression, print its train / validate scores, keep the fitted model
lr = get_logistic_regression(train_X, validate_X, train_y, validate_y)

Logistic Regression Model
Accuracy score on train: 76.40%
Accuracy score on validate: 76.79%
Recall score on train: 90.36%
Recall score on validate: 90.53%


In [20]:
def get_rf_test(test_X, test_y, rf):
    '''
    Score an already-fitted random forest on the test split: predict from
    test_X with the passed model and print accuracy and recall against test_y.
    Nothing is returned.
    '''
    # predictions from the supplied model on the test features
    test_predictions = rf.predict(test_X)
    # report the model name followed by each metric on the test data
    print('Random Forest Model')
    test_accuracy = accuracy_score(test_y, test_predictions)
    print(f'Accuracy score on test: {test_accuracy:.2%}')
    test_recall = recall_score(test_y, test_predictions)
    print(f'Recall score on test: {test_recall:.2%}')

In [21]:
# Evaluate the fitted random forest on the test split
get_rf_test(test_X, test_y, rf)

Random Forest Model
Accuracy score on test: 85.00%
Recall score on test: 97.14%


In [22]:
# create a df with predictions from the test data
predictions = pd.concat([
    # get the customer_ids, and reset the index to ordered 0-1408, dropping the old index
    pd.DataFrame(test.animal_id).reset_index().drop(columns='index'),
    pd.DataFrame(test.death).reset_index().drop(columns='index'),
    # get the churn prediction from the final random forest ml model
    pd.DataFrame(rf.predict(test_X)).rename(columns={0:'prediction_of_adoption'}),
    # get the churn probability predicitons from the final random forest model
    pd.DataFrame(rf.predict_proba(
        # drop the no_adopted probalities since it is not needed
        test_X)).drop(columns=0).rename(
        # rename the column
        columns={1:'probability_of_adoption'})
    
],axis=1)
predictions

Unnamed: 0,animal_id,death,prediction_of_adoption,probability_of_adoption
0,A820063,0,0,0.024039
1,A552798,0,1,0.881092
2,A802391,0,1,0.889022
3,A828829,0,1,0.908354
4,A833771,0,0,0.401824
...,...,...,...,...
38510,A842931,0,1,0.852408
38511,A821331,0,1,0.888133
38512,A837945,0,1,0.862454
38513,A839317,0,1,0.673441


In [23]:
def get_pred_error_plot(predictions):
    '''
    Display a scatter plot of prediction error from a predictions DataFrame.

    Columns are read by position: column 0 supplies the x values and the
    plotted error is column 2 minus column 1. Axis and legend labels are taken
    from the DataFrame's own column names so they always describe the data
    that was passed in.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Frame with at least three columns, used positionally as described.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    '''
    # NOTE(review): with the `predictions` frame built in this notebook,
    # column 0 is animal_id and column 1 is death, so the plotted "error" is
    # prediction_of_adoption - death -- confirm that is the comparison intended.
    x_name = predictions.columns[0]
    subtracted_name = predictions.columns[1]
    predicted_name = predictions.columns[2]
    errors = predictions.iloc[:, 2] - predictions.iloc[:, 1]
    # set figure size
    plt.figure(figsize=(16, 12))
    # create a line at zero error
    plt.axhline(label="No Error")
    # create a scatter plot with the error amounts
    plt.scatter(predictions.iloc[:, 0], errors,
                alpha=.5, color="grey", s=100,
                label=f'{predicted_name} - {subtracted_name}')
    # label the axes from the column names instead of hard-coded text
    plt.xlabel(str(x_name), size=14)
    plt.ylabel(f'Error ({predicted_name} - {subtracted_name})', size=14)
    # add a title to the plot
    plt.title('Prediction Error', size=16)
    # create a legend
    plt.legend(loc=1)
    # display the plot
    plt.show()