In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, \
                            accuracy_score, classification_report

- False positive (FP), arbitrarily treating cat as the positive class: predicted cat, actual dog
- False negative (FN): predicted dog, actual cat
- This model has about 80% accuracy, so I would describe it as a good model.

## Problem 2

In [2]:
# Problem 2 data: one row per observation, with the actual label and each
# model's predicted label side by side.
c3_url = 'https://ds.codeup.com/data/c3.csv'
df = pd.read_csv(c3_url)
df.head(n=5)

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


In [3]:
# Class balance of the true labels (used to choose the baseline prediction).
df['actual'].value_counts()

No Defect    184
Defect        16
Name: actual, dtype: int64

In [4]:
# Baseline model: always predict the most frequent actual class.
# Deriving the label from the data (instead of hard-coding 'No Defect')
# keeps the baseline correct if the dataset changes; ties resolve to the
# first value returned by Series.mode().
df['baseline'] = df['actual'].mode().iloc[0]
df.head()

Unnamed: 0,actual,model1,model2,model3,baseline
0,No Defect,No Defect,Defect,No Defect,No Defect
1,No Defect,No Defect,Defect,Defect,No Defect
2,No Defect,No Defect,Defect,No Defect,No Defect
3,No Defect,Defect,Defect,Defect,No Defect
4,No Defect,No Defect,Defect,No Defect,No Defect


In [5]:
positive = 'Defect'
# Prediction columns to evaluate, in the order they are reported.
model_cols = ['model1', 'model2', 'model3', 'baseline']

# precision -- how good are our positive predictions?
# precision -- model performance | predicted positive
# A column that never predicts `positive` (e.g. the baseline) yields an empty
# subset, so its precision is undefined and is reported as NaN.
precision_by_model = {}
for col in model_cols:
    predicted_positive = df[df[col] == positive]
    precision_by_model[col] = (
        predicted_positive[col] == predicted_positive['actual']
    ).mean()

# recall -- how good are we at detecting actual positives?
# recall -- model performance | actual positive
actual_positive = df[df['actual'] == positive]
recall_by_model = {
    col: (actual_positive[col] == actual_positive['actual']).mean()
    for col in model_cols
}

for col in model_cols:
    print(f'{col} precision: {precision_by_model[col]:.2%}')
print()
for col in model_cols:
    print(f'{col} recall: {recall_by_model[col]:.2%}')

model1 precision: 80.00%
model2 precision: 10.00%
model3 precision: 13.13%
baseline precision: nan%

model1 recall: 50.00%
model2 recall: 56.25%
model3 recall: 81.25%
baseline recall: 0.00%


In [6]:
# Cross-check the hand-computed model3 precision and recall with scikit-learn.
(
    precision_score(df['actual'], df['model3'],
                    average='binary', pos_label='Defect'),
    recall_score(df['actual'], df['model3'],
                 average='binary', pos_label='Defect'),
)

(0.13131313131313133, 0.8125)

In [7]:
# Placeholder only: accuracy_score() is never actually called for Problem 2.
# TODO(review): compute accuracy for each model here, or delete this cell.
# accuracy_score()

If we treat Defect as the positive class, we should optimize for recall, because a missed defect (a false negative) is the costly error. The best model for this is model3 (recall 81.25%).

In this case false positives carry a high cost, so we want to optimize for precision. The best model for this is model1 (precision 80.00%).

## Problem 3

In [8]:
# Problem 3 data: actual cat/dog labels plus four models' predictions.
paws_url = 'https://ds.codeup.com/data/gives_you_paws.csv'
df = pd.read_csv(paws_url)
df.head(n=5)

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [9]:
# Class balance of the true labels (used to choose the baseline prediction).
df['actual'].value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [10]:
# Baseline model: always predict the most frequent actual class.
# Deriving the label from the data (instead of hard-coding 'dog') keeps the
# baseline correct if the dataset changes; ties resolve to the first value
# returned by Series.mode().
df['baseline'] = df['actual'].mode().iloc[0]
df.head(3)

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog


In [15]:
positive = 'cat'
# Prediction columns to evaluate, in the order they are reported.
model_cols = ['model1', 'model2', 'model3', 'model4', 'baseline']

# accuracy -- overall hit rate
accuracy_by_model = {
    col: (df[col] == df['actual']).mean() for col in model_cols
}

# precision -- how good are our positive predictions?
# precision -- model performance | predicted positive
# A column that never predicts `positive` (e.g. the baseline) yields an empty
# subset, so its precision is undefined and is reported as NaN.
precision_by_model = {}
for col in model_cols:
    predicted_positive = df[df[col] == positive]
    precision_by_model[col] = (
        predicted_positive[col] == predicted_positive['actual']
    ).mean()

# recall -- how good are we at detecting actual positives?
# recall -- model performance | actual positive
actual_positive = df[df['actual'] == positive]
recall_by_model = {
    col: (actual_positive[col] == actual_positive['actual']).mean()
    for col in model_cols
}

for col in model_cols:
    print(f'{col} accuracy: {accuracy_by_model[col]:.2%}')
print()
for col in model_cols:
    print(f'{col} precision: {precision_by_model[col]:.2%}')
print()
for col in model_cols:
    print(f'{col} recall: {recall_by_model[col]:.2%}')

model1 accuracy: 80.74%
model2 accuracy: 63.04%
model3 accuracy: 50.96%
model4 accuracy: 74.26%
baseline accuracy: 65.08%

model1 precision: 68.98%
model2 precision: 48.41%
model3 precision: 35.83%
model4 precision: 80.72%
baseline precision: nan%

model1 recall: 81.50%
model2 recall: 89.06%
model3 recall: 51.15%
model4 recall: 34.54%
baseline recall: 0.00%


In [30]:
# Cross-check the hand-computed model3 metrics (cat as positive) with
# scikit-learn: (accuracy, precision, recall).
(
    accuracy_score(df['actual'], df['model3']),
    precision_score(df['actual'], df['model3'], pos_label='cat'),
    recall_score(df['actual'], df['model3'], pos_label='cat'),
)

(0.5096, 0.358346709470305, 0.5114547537227949)

In [28]:
# Per-class precision/recall/F1 for model3; printed so the text table keeps
# its column alignment.
model3_report = classification_report(df['actual'], df['model3'])
print(model3_report)

              precision    recall  f1-score   support

         cat       0.36      0.51      0.42      1746
         dog       0.66      0.51      0.57      3254

    accuracy                           0.51      5000
   macro avg       0.51      0.51      0.50      5000
weighted avg       0.55      0.51      0.52      5000



Model1 (80.74%) and model4 (74.26%) both have better accuracy than the baseline model (65.08%).

Dog: 
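High accuracy is the most important criterion for Phase I, so it should use model1 (the highest accuracy, 80.74%). For Phase II, false positives are more costly because they would have to be interpreted rather than simply dropped, so the model with the best precision should be used: model1. (Note: the precision figures printed above use cat as the positive class; re-run the metrics cell with `positive = 'dog'` to reproduce the dog-positive numbers.)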

Cat:
High accuracy is the most important criterion for Phase I, so it should use model1 (the highest accuracy, 80.74%). For Phase II, false positives are more costly because they would have to be interpreted rather than simply dropped, so the model with the best precision should be used: model4 (precision 80.72%).