In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix

### Model Evaluation Exercise

#### (1) Create a new file named model_evaluation.py or model_evaluation.ipynb for these exercises.

##### (2) Given the following confusion matrix, evaluate (by hand) the model's performance.


|               | actual cat | actual dog |
|:------------  |-----------:|-----------:|
| predicted cat |         34 |          7 |
| predicted dog |         13 |         46 |


- In the context of this problem, what is a false positive?
    - <b><i>Predicting cat when actually dog</i></b>
- In the context of this problem, what is a false negative?
    - <b><i>Predicting dog when actually cat</i></b>
- How would you describe this model?
    - <b><i>When looking at the model through the lense of 'actual cat' being our True Positive, the accuracy is 89% overall, the precision show we predicted people like cats right 83% of the time and our recall showed we predicted 72% of the time that people liked cats

#### (3) You are working as a datascientist working for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.

- Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions can be found here.

Use the predictions dataset and pandas to help answer the following questions:

In [2]:
c3 = pd.read_csv(r'/Users/davidberchelmann/codeup-data-science/classification-exercises/cs3.csv')

In [3]:
c3.head()

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


##### - An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [4]:
pd.crosstab(c3.actual, c3.model1)

model1,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,8,8
No Defect,2,182


In [5]:
pd.crosstab(c3.actual, c3.model2)

model2,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,9,7
No Defect,81,103


In [6]:
pd.crosstab(c3.actual, c3.model3)

model3,Defect,No Defect
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,13,3
No Defect,86,98


In [7]:
c3.actual.value_counts()

No Defect    184
Defect        16
Name: actual, dtype: int64

In [21]:
c3.model3.value_counts()

No Defect    101
Defect        99
Name: model3, dtype: int64

In [15]:
c3['baseline_prediction'] = 'Defect'

In [17]:
model_accuracy = (c3.model3 == c3.actual).mean()
baseline_accuracy = (c3.baseline_prediction == c3.actual).mean()

print(f'   model accuracy: {model_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

   model accuracy: 55.50%
baseline accuracy: 8.00%


In [81]:
y_true = c3.actual
y_pred = c3.model1


In [82]:
(tp, fn, fp, tn) = confusion_matrix(y_true, y_pred,
                                    labels = ['Defect', 'No Defect']).ravel()
(tp, fn, fp, tn)

(8, 8, 2, 182)

In [83]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", recall)
print("Precision is", precision)

True Positives 8
False Positives 2
False Negatives 8
True Negatives 182
-------------
Accuracy is 0.95
Recall is 0.5
Precision is 0.8


In [45]:
c3['baseline_prediction'] = 'Defect'

In [47]:
(c3.actual == c3.baseline_prediction).mean()

0.08

In [50]:
model_accuracy = (c3.actual == c3.model3).mean()
baseline_accuracy = (c3.actual == c3.baseline_prediction).mean()

print(f'   model accuracy: {model_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

   model accuracy: 55.50%
baseline accuracy: 8.00%


In [52]:
subset = c3[c3.actual == 'Defect']
subset

Unnamed: 0,actual,model1,model2,model3,baseline_prediction
13,Defect,No Defect,Defect,Defect,Defect
30,Defect,Defect,No Defect,Defect,Defect
65,Defect,Defect,Defect,Defect,Defect
70,Defect,Defect,Defect,Defect,Defect
74,Defect,No Defect,No Defect,Defect,Defect
87,Defect,No Defect,Defect,Defect,Defect
118,Defect,No Defect,Defect,No Defect,Defect
135,Defect,Defect,No Defect,Defect,Defect
140,Defect,No Defect,Defect,Defect,Defect
147,Defect,Defect,No Defect,Defect,Defect


In [53]:
model_recall = (subset.actual == subset.model3).mean()
baseline_recall = (subset.actual == subset.baseline_prediction).mean()

print(f'   model recall: {model_recall:.2%}')
print(f'baseline recall: {baseline_recall:.2%}')

   model recall: 81.25%
baseline recall: 100.00%


In [54]:
subset = c3[c3.model3 == 'Defect']
subset

Unnamed: 0,actual,model1,model2,model3,baseline_prediction
1,No Defect,No Defect,Defect,Defect,Defect
3,No Defect,Defect,Defect,Defect,Defect
5,No Defect,No Defect,No Defect,Defect,Defect
9,No Defect,No Defect,No Defect,Defect,Defect
13,Defect,No Defect,Defect,Defect,Defect
...,...,...,...,...,...
193,No Defect,No Defect,Defect,Defect,Defect
194,Defect,Defect,No Defect,Defect,Defect
195,No Defect,No Defect,Defect,Defect,Defect
198,No Defect,No Defect,Defect,Defect,Defect


In [55]:
model_precision = (subset.actual == subset.model3).mean()
print(f'model precision: {model_precision:.2%}')

model precision: 13.13%


In [56]:
subset = c3[c3.baseline_prediction == 'Defect']
baseline_precision = (subset.actual == subset.model3).mean()

print(f'baseline precision: {baseline_precision:.2%}')

baseline precision: 55.50%


- <span style="color:red"><b> - The best model to use will be model 3 since it was able to identify the most defects and best metric is recall since it does the best job of capturing Defects.</b></span>

###### Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. 

The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you the really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. 

Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

- <font color=red><b>Precision is the best metric to use here since false positives are low while model 1 will be the best fit to evaluate since it has the highest precision rate out of the 3.</b></font>

#### You are working as a data scientist for Gives You Paws ™, a subscription based service that shows you cute pictures of dogs or cats (or both for an additional fee).

At Gives You Paws, anyone can upload pictures of their cats or dogs. The photos are then put through a two step process. First an automated algorithm tags pictures as either a cat or a dog (Phase I). Next, the photos that have been initially identified are put through another round of review, possibly with some human oversight, before being presented to the users (Phase II).

Given this dataset, use pandas to create a baseline model (i.e. a model that just predicts the most common class) and answer the following questions:

- <font color = blue>In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?</font>



- <font color = green> Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recomend for Phase I? For Phase II?</font>



- <font color = purple>Suppose you are working on a team that solely deals with cat pictures. Which of these models would you recomend for Phase I? For Phase II?</font>


In [84]:
 paws = pd.read_csv(r'/Users/davidberchelmann/codeup-data-science/classification-exercises/gives_you_paws.csv')

In [85]:
paws.head()

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [88]:
paws['actual'].value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [90]:
paws['baseline_prediction'] = 'dog'

In [126]:
(paws.actual == paws.baseline_prediction).mean()

0.6508

In [92]:
paws.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline_prediction
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


In [93]:
pd.crosstab(paws.actual, paws.model1)

model1,cat,dog
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,1423,323
dog,640,2614


In [167]:
y_true = paws.actual
y_pred = paws.model1


In [168]:
(tn, fp, fn, tp) = confusion_matrix(y_true, y_pred,
                                    labels = ['cat', 'dog']).ravel()
(tn, fp, fn, tp)

(1423, 323, 640, 2614)

In [169]:
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", recall)
print("Precision is", precision)

True Positives 2614
False Positives 323
False Negatives 640
True Negatives 1423
-------------
Accuracy is 0.8074
Recall is 0.803318992009834
Precision is 0.8900238338440586


In [174]:
model_accuracy = (paws.actual == paws.model1).mean()
baseline_accuracy = (paws.actual == paws.baseline_prediction).mean()

print(f'   model accuracy: {model_accuracy:.2%}')
print(f'baseline accuracy: {baseline_accuracy:.2%}')

   model accuracy: 80.74%
baseline accuracy: 65.08%


- <font color = blue>In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?</font>
    - <b>Models 1 & 4 beat the baseline in terms of accuracy. Baseline accuracy is 65.08%. Model 1 cames in at 80.74% while Model 4 is at 74.26%

- <font color = green> Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recomend for Phase I? For Phase II?</font>

- my notes.....Model 4 has a high recal in the 90s, Model 1 is best overall with highest precision at 89...Model 2 is close as well in regards to precision...89 as well. Model 3 is poor all around