In [1]:
import pandas as pd
import seaborn as sns

# 1. Create a new file named model_evaluation.py or model_evaluation.ipynb for these exercises.

# 2. Given the following confusion matrix, evaluate (by hand) the model's performance.

|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |


- positive class: Dog pic

- negative class: Cat pic

## 2.1  In the context of this problem, what is a false positive?

**FP**: We predict dog & actually get a cat

## 2.2  In the context of this problem, what is a false negative?

**FN**: We predict cat & actually get a dog

|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |


## 2.3  How would you describe this model?

In [2]:
# Confusion-matrix cells, taking "dog" as the positive class:
#   tp -> predicted dog, actually dog
#   tn -> predicted cat, actually cat
#   fp -> predicted dog, actually cat
#   fn -> predicted cat, actually dog
tp, tn, fp, fn = 46, 34, 13, 7

In [3]:
# Denominators for the three metrics, built from the confusion-matrix counts.
n_pictures = tp + tn + fp + fn      # every picture that was classified
n_actual_dogs = tp + fn             # pictures that really are dogs
n_predicted_dogs = tp + fp          # pictures the model labelled as dogs

accuracy = (tp + tn) / n_pictures
recall = tp / n_actual_dogs
precision = tp / n_predicted_dogs

summary = f"""Dog-classifier (where 'DOG' is the positive prediction)

True Positives: {tp}
False Positives: {fp}
False Negatives: {fn}
True Negatives: {tn}
______________________

Baseline is: Dog - highest frequency
Accuracy is: {accuracy}
Recall is: {recall}
Precision is: {precision}
"""
print(summary)

Dog-classifier (where 'DOG' is the positive prediction)

True Positives: 46
False Positives: 13
False Negatives: 7
True Negatives: 34
______________________

Baseline is: Dog - highest frequency
Accuracy is: 0.8
Recall is: 0.8679245283018868
Precision is: 0.7796610169491526



# 3. You are working as a data scientist for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.

Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions <a href = "https://ds.codeup.com/data/c3.csv">can be found here</a>.

## 3.1  An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a **defect** as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [4]:
# Load the C3 model predictions provided in the curriculum exercises.
# A local download is used when present; otherwise fall back to the hosted
# copy linked in the exercise text, so the notebook does not depend on a file
# sitting in one particular user's Downloads folder.
try:
    cody_df = pd.read_csv('~/Downloads/c3.csv')
except FileNotFoundError:
    cody_df = pd.read_csv('https://ds.codeup.com/data/c3.csv')

In [5]:
# Preview the first rows: one column of true labels (`actual`) plus one
# column of predictions per model.
cody_df.head()

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


In [6]:
# Inspect the column names, non-null counts and dtypes.
cody_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actual  200 non-null    object
 1   model1  200 non-null    object
 2   model2  200 non-null    object
 3   model3  200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [7]:
# Class balance of the true labels: how many ducks are actually defective
# versus not defective?
cody_df.actual.value_counts()

No Defect    184
Defect        16
Name: actual, dtype: int64

Since we are interested in 'defects', we will assign it as the 'positive class' for the classifier.
<br>
- defects = positive class
- no defect = negative class
<br>

Quality Control, our internal customer, wants the metric to identify as many defective ducks as possible
<br>

Our best metric for Quality Control here is <b>recall</b>
<br>

- i.e. of all the real positives, how many does the model catch?
- i.e. how many of the defective ducks are actually flagged as defective (positive) by the models?
- i.e let's minimize our false negatives


In [8]:
# Actual positives: keep only the ducks whose true label is "Defect".
# Recall is measured on exactly these rows.
is_defective = cody_df.actual.eq("Defect")
subset = cody_df.loc[is_defective]
subset

Unnamed: 0,actual,model1,model2,model3
13,Defect,No Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect
65,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect
74,Defect,No Defect,No Defect,Defect
87,Defect,No Defect,Defect,Defect
118,Defect,No Defect,Defect,No Defect
135,Defect,Defect,No Defect,Defect
140,Defect,No Defect,Defect,Defect
147,Defect,Defect,No Defect,Defect


In [9]:
# Recall = share of the truly defective ducks that a model also flags as
# "Defect". The defective rows are selected here directly from `cody_df`
# instead of being read from the shared `subset` variable, because other cells
# rebind `subset` to different slices; relying on it would give wrong numbers
# whenever this cell is re-run out of order.
actual_defects = cody_df[cody_df.actual == "Defect"]

# Model 1 recall
model1_recall = (actual_defects.model1 == "Defect").mean()

# Model 2 recall
model2_recall = (actual_defects.model2 == "Defect").mean()

# Model 3 recall
model3_recall = (actual_defects.model3 == "Defect").mean()

print(f"""Model 1 Recall: {model1_recall:.2%}
Model 2 Recall: {model2_recall:.2%}
Model 3 Recall: {model3_recall:.2%}
""")

Model 1 Recall: 50.00%
Model 2 Recall: 56.25%
Model 3 Recall: 81.25%



<div class="alert alert-block alert-success">
    <b>Takeaways:</b>
        <br>
        <br>
-- Quality Control should select a model with higher recall (to avoid false negatives)
<br>
    -- Quality Control should use <b>Model 3</b>
</div>

## 3.2 Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. 
The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to **predict which ducks will have defects**, but tell you they really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. 
<br>
Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

- positive case = defect 
- negative case = no defect

In [10]:
# Positive predictions of model 1: keep only the rows it labelled "Defect".
# Precision is measured on exactly these rows.
flagged_by_model1 = cody_df.model1.eq('Defect')
subset = cody_df.loc[flagged_by_model1]
subset

Unnamed: 0,actual,model1,model2,model3
3,No Defect,Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect
62,No Defect,Defect,No Defect,No Defect
65,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect
135,Defect,Defect,No Defect,Defect
147,Defect,Defect,No Defect,Defect
163,Defect,Defect,Defect,Defect
194,Defect,Defect,No Defect,Defect
196,Defect,Defect,No Defect,No Defect


In [11]:
# Model 1 precision: among the rows model 1 flagged (held in `subset`),
# the share whose actual label agrees with the prediction.
model1_precision = subset.actual.eq(subset.model1).mean()
model1_precision

0.8

In [12]:
# Precision for models 2 and 3, following the same recipe as model 1:
# restrict to the rows the model flagged as "Defect", then take the share of
# those rows whose actual label matches the prediction.
subset2 = cody_df.loc[cody_df.model2.eq('Defect')]
model2_precision = subset2.actual.eq(subset2.model2).mean()

subset3 = cody_df.loc[cody_df.model3.eq('Defect')]
model3_precision = subset3.actual.eq(subset3.model3).mean()

precision_summary = f"""Model 1 Precision: {model1_precision:.2%}
Model 2 Precision: {model2_precision:.2%}
Model 3 Precision: {model3_precision:.2%}
"""
print(precision_summary)

Model 1 Precision: 80.00%
Model 2 Precision: 10.00%
Model 3 Precision: 13.13%



<div class="alert alert-block alert-success">
    <b>Takeaways:</b>
        <br>
        <br>
        Use model 1 since it will minimize the false positive predictions of defects
        </div>

# 4. You are working as a data scientist for Gives You Paws ™, a subscription based service that shows you cute pictures of dogs or cats *(or both for an additional fee)*.

At Gives You Paws, anyone can upload pictures of their cats or dogs. The photos are then put through a two step process. 
- First an automated algorithm tags pictures as either a cat or a dog (Phase I). 
- Next, the photos that have been initially identified are put through another round of review, possibly with some human oversight, before being presented to the users (Phase II).

Several models have already been developed with the data, and you can <a href = "https://ds.codeup.com/data/gives_you_paws.csv">find their results here</a>.

Given this dataset, use pandas to create a baseline model (i.e. a model that just predicts the most common class) and answer the following questions:

In [13]:
# Load the Gives You Paws model predictions. A local download is used when
# present; otherwise fall back to the hosted copy linked in the exercise text,
# so the notebook does not depend on a file sitting in one particular user's
# Downloads folder.
try:
    paws_df = pd.read_csv('~/Downloads/gives_you_paws.csv')
except FileNotFoundError:
    paws_df = pd.read_csv('https://ds.codeup.com/data/gives_you_paws.csv')

In [14]:
# Preview the first rows: true labels in `actual`, one prediction column per model.
paws_df.head()

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [15]:
# Inspect the column names, non-null counts and dtypes.
paws_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actual  5000 non-null   object
 1   model1  5000 non-null   object
 2   model2  5000 non-null   object
 3   model3  5000 non-null   object
 4   model4  5000 non-null   object
dtypes: object(5)
memory usage: 195.4+ KB


In [16]:
# Class balance of the true labels (dog vs. cat).
paws_df.actual.value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [17]:
# The most frequent actual label; this is the value a baseline model always predicts.
paws_df.actual.value_counts().idxmax()

'dog'

In [18]:
# Baseline model: predict the most common actual label for every row.
most_common_label = paws_df.actual.value_counts().idxmax()
paws_df['baseline'] = most_common_label

In [19]:
# Confirm that the `baseline` column was added alongside the model columns.
paws_df.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


## 4a. In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?

- dog = positive class
- cat = negative class

In [21]:
# Baseline accuracy: share of rows where the always-majority-class prediction is right.
paws_df.actual.eq(paws_df.baseline).mean()

0.6508

In [20]:
# Model 1 accuracy: share of rows where the prediction equals the actual label.
paws_df.actual.eq(paws_df.model1).mean()

0.8074

In [22]:
# Model 2 accuracy: share of rows where the prediction equals the actual label.
paws_df.actual.eq(paws_df.model2).mean()

0.6304

In [23]:
# Model 3 accuracy: share of rows where the prediction equals the actual label.
paws_df.actual.eq(paws_df.model3).mean()

0.5096

In [24]:
# Model 4 accuracy: share of rows where the prediction equals the actual label.
paws_df.actual.eq(paws_df.model4).mean()

0.7426

In [25]:
# List the column names; every column after `actual` holds predictions.
paws_df.columns

Index(['actual', 'model1', 'model2', 'model3', 'model4', 'baseline'], dtype='object')

In [26]:
# Position 1 is the first prediction column, so slicing columns[1:] skips `actual`.
print(paws_df.columns[1])

model1


In [27]:
# Accuracy of every prediction column (the four models plus the baseline):
# the share of rows whose prediction equals the actual label.
# Stored as [column name, accuracy] pairs.
model_acc = [
    [model, paws_df.actual.eq(paws_df[model]).mean()]
    for model in paws_df.columns[1:]
]

model_acc

[['model1', 0.8074],
 ['model2', 0.6304],
 ['model3', 0.5096],
 ['model4', 0.7426],
 ['baseline', 0.6508]]

In [28]:
# Present the [model, accuracy] pairs as a two-column table for easier comparison.

pd.DataFrame(model_acc, columns=['model', 'accuracy'])

Unnamed: 0,model,accuracy
0,model1,0.8074
1,model2,0.6304
2,model3,0.5096
3,model4,0.7426
4,baseline,0.6508


<div class="alert alert-block alert-success">
    <b>Takeaways:</b>
        <br>
        <br>
In terms of accuracy, model 1 and model 4 perform better than baseline
</div>

## 4b. Suppose you are working on a team that solely deals with **dog pictures**. Which of these models would you recommend?

- dog = positive class
- cat = negative class

<b>Phase I</b>: Automated algorithm tags pictures as either a cat or a dog
<br>
For Phase I, we should choose the model with the highest recall

In [29]:
# Actual positives for the dog team: rows whose true label is "dog".
is_dog = paws_df.actual.eq('dog')
subset = paws_df.loc[is_dog]

subset.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
5,dog,dog,dog,dog,dog,dog
8,dog,dog,cat,dog,dog,dog


In [30]:
# Dog recall per prediction column, measured on the actual-dog rows held in
# `subset`: the share of real dog pictures that the column also labels "dog".
# Stored as [column name, recall] pairs.
model_recall = [
    [model, subset.actual.eq(subset[model]).mean()]
    for model in subset.columns[1:]
]

model_recall

[['model1', 0.803318992009834],
 ['model2', 0.49078057775046097],
 ['model3', 0.5086047940995697],
 ['model4', 0.9557467732022127],
 ['baseline', 1.0]]

<div class="alert alert-block alert-success">
    <b>Takeaways:</b>
        <br>
        <br>
It appears that Model 4 is performing the best, with Recall of 0.96
</div>

<b>Phase II</b>: Photos that have been initially identified are put through another round of review
<br>
Subscribers use this service to see dog pictures, so what do we want to minimize?
- we want to minimize false positives
<br>
Precision is the appropriate metric since we are trying to minimize false positives

In [31]:
# Re-display the data as a reminder of the available prediction columns.
paws_df.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


In [32]:
# For each model, keep only the rows it labelled "dog"; these positive
# predictions are the denominator of dog precision.
subset1 = paws_df.loc[paws_df.model1.eq('dog')]
subset2 = paws_df.loc[paws_df.model2.eq('dog')]
subset3 = paws_df.loc[paws_df.model3.eq('dog')]
subset4 = paws_df.loc[paws_df.model4.eq('dog')]

In [33]:
# Model 1 dog precision: share of its "dog" predictions that are really dogs.
subset1.actual.eq(subset1.model1).mean()

0.8900238338440586

In [34]:
# Model 2 dog precision: share of its "dog" predictions that are really dogs.
subset2.actual.eq(subset2.model2).mean()

0.8931767337807607

In [35]:
# Model 3 dog precision: share of its "dog" predictions that are really dogs.
subset3.actual.eq(subset3.model3).mean()

0.6598883572567783

In [36]:
# Model 4 dog precision: share of its "dog" predictions that are really dogs.
subset4.actual.eq(subset4.model4).mean()

0.7312485304490948

<div class="alert alert-block alert-success">
    <b>Takeaways:</b>
        <br>
        <br>
It appears that Model 2 is performing best with Precision of 0.893
</div>

## 4c. Suppose you are working on a team that solely deals with **cat pictures**. Which of these models would you recommend?    

- cat = positive class
- dog = negative class

Phase I: we would use recall again because we're working with the actual pictures.
Phase II: we want to minimize the false positives; therefore, we will use precision again.

In [38]:
# Phase II (cat team): precision with "cat" as the positive class.
# For each prediction column, look only at the rows it labelled "cat" and take
# the share of those rows that are actually cats.
# The baseline column never predicts "cat", so its selection is empty and its
# precision comes out as nan.

model_prec = []

for model in paws_df.columns[1:]:
    predicted_cat = paws_df[model].eq('cat')
    cat_precision = paws_df.loc[predicted_cat, 'actual'].eq('cat').mean()
    model_prec.append([model, cat_precision])

model_prec

[['model1', 0.6897721764420747],
 ['model2', 0.4841220423412204],
 ['model3', 0.358346709470305],
 ['model4', 0.8072289156626506],
 ['baseline', nan]]

<div class="alert alert-block alert-success">
    <b>Takeaway for Cat team:</b>
        <br>
        <br>
-- we had to maximize precision to minimize the false positives
<br>
-- therefore, we should use model 4
        </div>

# 5. Follow the links below to read the documentation about each function, then apply those functions to the data from the previous problem .

- <a href = "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html">`sklearn.metrics.accuracy_score`</a>
- <a href = "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html">`sklearn.metrics.precision_score`</a>
- <a href = "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html">`sklearn.metrics.recall_score`</a>
- <a href = "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html">`sklearn.metrics.classification_report`</a>

In [39]:
from sklearn.metrics import classification_report

In [41]:
print("Model 1")

# Per-class precision/recall/F1 for model 1, returned as a dict and
# transposed so that each class or average is one row.
model1_report = classification_report(
    paws_df.actual,
    paws_df.model1,
    labels=['cat', 'dog'],
    output_dict=True,
)
pd.DataFrame(model1_report).transpose()

Model 1


Unnamed: 0,precision,recall,f1-score,support
cat,0.689772,0.815006,0.747178,1746.0
dog,0.890024,0.803319,0.844452,3254.0
accuracy,0.8074,0.8074,0.8074,0.8074
macro avg,0.789898,0.809162,0.795815,5000.0
weighted avg,0.820096,0.8074,0.810484,5000.0


In [42]:
print("Model 2")

# Per-class precision/recall/F1 for model 2, returned as a dict and
# transposed so that each class or average is one row.
model2_report = classification_report(
    paws_df.actual,
    paws_df.model2,
    labels=['cat', 'dog'],
    output_dict=True,
)
pd.DataFrame(model2_report).transpose()

Model 2


Unnamed: 0,precision,recall,f1-score,support
cat,0.484122,0.890607,0.627269,1746.0
dog,0.893177,0.490781,0.633479,3254.0
accuracy,0.6304,0.6304,0.6304,0.6304
macro avg,0.688649,0.690694,0.630374,5000.0
weighted avg,0.750335,0.6304,0.63131,5000.0


In [43]:
print("Model 3")

# Per-class precision/recall/F1 for model 3, returned as a dict and
# transposed so that each class or average is one row.
model3_report = classification_report(
    paws_df.actual,
    paws_df.model3,
    labels=['cat', 'dog'],
    output_dict=True,
)
pd.DataFrame(model3_report).transpose()

Model 3


Unnamed: 0,precision,recall,f1-score,support
cat,0.358347,0.511455,0.421425,1746.0
dog,0.659888,0.508605,0.574453,3254.0
accuracy,0.5096,0.5096,0.5096,0.5096
macro avg,0.509118,0.51003,0.497939,5000.0
weighted avg,0.55459,0.5096,0.521016,5000.0


In [44]:
print("Model 4")

# Per-class precision/recall/F1 for model 4, returned as a dict and
# transposed so that each class or average is one row.
model4_report = classification_report(
    paws_df.actual,
    paws_df.model4,
    labels=['cat', 'dog'],
    output_dict=True,
)
pd.DataFrame(model4_report).transpose()

Model 4


Unnamed: 0,precision,recall,f1-score,support
cat,0.807229,0.345361,0.483755,1746.0
dog,0.731249,0.955747,0.82856,3254.0
accuracy,0.7426,0.7426,0.7426,0.7426
macro avg,0.769239,0.650554,0.656157,5000.0
weighted avg,0.757781,0.7426,0.708154,5000.0


In [45]:
from sklearn.metrics import precision_score, recall_score

In [47]:
def calculate_precision(predictions, positive='dog', actual=None):
    """
    Return the precision of a set of predictions for the positive class.

    Parameters
    ----------
    predictions : array-like of labels
        One predicted label per row (e.g. a model column of `paws_df`).
    positive : label, default 'dog'
        The label treated as the positive class.
    actual : array-like of labels, optional
        True labels, row-aligned with `predictions`. When omitted, the
        `actual` column of the notebook-level `paws_df` is used, so existing
        calls such as `df.apply(calculate_precision)` keep working unchanged.

    Returns
    -------
    The value produced by `sklearn.metrics.precision_score` with
    `pos_label=positive`.
    """
    if actual is None:
        # Default to the notebook's ground-truth column.
        actual = paws_df.actual
    return precision_score(actual, predictions, pos_label=positive)

In [48]:
def calculate_recall(predictions, positive='dog', actual=None):
    """
    Return the recall of a set of predictions for the positive class.

    Parameters
    ----------
    predictions : array-like of labels
        One predicted label per row (e.g. a model column of `paws_df`).
    positive : label, default 'dog'
        The label treated as the positive class.
    actual : array-like of labels, optional
        True labels, row-aligned with `predictions`. When omitted, the
        `actual` column of the notebook-level `paws_df` is used, so existing
        calls such as `df.apply(calculate_recall)` keep working unchanged.

    Returns
    -------
    The value produced by `sklearn.metrics.recall_score` with
    `pos_label=positive`.
    """
    if actual is None:
        # Default to the notebook's ground-truth column.
        actual = paws_df.actual
    return recall_score(actual, predictions, pos_label=positive)

In [49]:
# Score every prediction column (model1 through baseline) with both helpers
# and lay the results side by side: one row per column, one column per metric.
prediction_columns = paws_df.loc[:, 'model1':'baseline']

pd.DataFrame({
    'recall': prediction_columns.apply(calculate_recall),
    'precision': prediction_columns.apply(calculate_precision),
})

Unnamed: 0,recall,precision
model1,0.803319,0.890024
model2,0.490781,0.893177
model3,0.508605,0.659888
model4,0.955747,0.731249
baseline,1.0,0.6508
