In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score
from meta_data import *

## Load Model Predictions

In [2]:
# Load the pickled prediction results. NOTE: pickle.load can execute arbitrary
# code, so only open pickle files that come from a trusted source.
with open('./passerine.pickle', 'rb') as pickle_file:
    passerine = pickle.load(pickle_file)

# Entries of "file" are decoded to str via .decode(); "gt" is copied as-is.
decoded_files = [raw_name.decode() for raw_name in passerine["file"]]
df = pd.DataFrame({"file": decoded_files, "gt": passerine["gt"]})

# Every integer key contributes one column holding that entry's "predict" values
# (presumably one key per trained model -- confirm against the pickle's producer).
for key in passerine:
    if type(key) is int:
        df[key] = passerine[key]["predict"]

## Load Bird MetaData

In [3]:
# Root directory of the bird dataset. Set the NABIRDS_DIR environment variable to
# point somewhere else; the original local path is kept as the fallback so existing
# setups keep working. os.path.join(..., '') guarantees a trailing separator, which
# the original hard-coded path also had.
BIRD_DIR = os.path.join(
    os.environ.get('NABIRDS_DIR', '/Users/dmorton/Local Documents/Cornell Birds/nabirds/'),
    '')


def read_meta(bird_dir=None):
    """Build train/test metadata frames and their box frames for the bird dataset.

    Parameters
    ----------
    bird_dir : str, optional
        Dataset root handed to every ``read_*`` helper. Defaults to the
        module-level ``BIRD_DIR``.

    Returns
    -------
    tuple
        ``(train_meta, test_meta, train_boxes, test_boxes)``. The two ``*_meta``
        frames are the rows of the merged metadata with ``is_train`` equal to 1
        and 0 respectively (``is_train`` column dropped); the two ``*_boxes``
        values are whatever ``make_box_df(..., label_col="class_id")`` returns.

    Notes
    -----
    ``read_hierarchy``, ``read_class_labels``, ``read_classes``, ``read_images``,
    ``read_boxes``, ``read_sizes``, ``read_train_test``, ``correct_record`` and
    ``make_box_df`` are not defined in this notebook; presumably they come from
    ``from meta_data import *`` -- confirm.
    """
    if bird_dir is None:
        bird_dir = BIRD_DIR

    # The first returned value (the hierarchy itself) is not used here.
    _, parent_map, top_levels, terminal_levels = read_hierarchy(bird_dir)
    class_labels = read_class_labels(top_levels, parent_map, bird_dir)
    classes, terminal_classes = read_classes(terminal_levels, bird_dir)

    # Second merge attaches the class table again under class_name / class_id.
    class_lookup = classes.rename(columns={'label_name': 'class_name',
                                           'id': 'class_id'})\
                          .drop(columns=['annotation', 'name'])
    meta = class_labels.merge(classes).merge(class_lookup)

    # 1-based id per distinct name, in order of first appearance.
    name_map = {name: pos + 1
                for pos, name in enumerate(meta['name'].drop_duplicates())}
    # 1-based id derived from each row's index label in terminal_classes.
    terminal_map = {label: idx + 1
                    for idx, label in terminal_classes['label_name'].items()}
    # Direct dict lookups so an unmapped label raises KeyError instead of becoming NaN.
    meta['name_id'] = meta['name'].apply(lambda n: name_map[n])
    meta['terminal_id'] = meta['label_name'].apply(lambda n: terminal_map[n])

    images = read_images(bird_dir)
    boxes = read_boxes(bird_dir)
    sizes = read_sizes(bird_dir)
    train_test = read_train_test(bird_dir)
    train_test_meta = images.merge(meta).merge(boxes).merge(sizes).merge(train_test)\
                            .reset_index(drop=True)

    # Downcast every int64 column to int32.
    for col, dtype in train_test_meta.dtypes.items():
        if dtype == np.dtype('int64'):
            train_test_meta[col] = train_test_meta[col].astype(np.int32)

    is_train = train_test_meta['is_train'] == 1
    is_test = train_test_meta['is_train'] == 0
    train_meta = train_test_meta[is_train].drop(columns='is_train').reset_index(drop=True)
    test_meta = train_test_meta[is_test].drop(columns='is_train').reset_index(drop=True)

    # One record in each split is passed through correct_record
    # (what the correction does is not visible in this notebook).
    bad_train_file = '0853/1b7756d652e24d3cab075360168d5960.jpg'
    bad_test_file = '0554/f8e98e5ae4b34355ab635b92a74f1779.jpg'
    train_meta = correct_record(train_meta, bad_train_file)
    test_meta = correct_record(test_meta, bad_test_file)

    train_boxes = make_box_df(train_meta, label_col="class_id")
    test_boxes = make_box_df(test_meta, label_col="class_id")

    return train_meta, test_meta, train_boxes, test_boxes


train_meta, test_meta, train_boxes, test_boxes = read_meta()

## Add class names to model predictions

In [6]:
# Attach each test image's class_name to its prediction row. No `on` is given,
# so the join uses the columns the two frames have in common.
df = pd.merge(test_meta[["file", "class_name"]], df)

## Class Accuracy - 0% mislabeled training data.

Cuckoos, which can plausibly be mistaken for passerines, had the lowest accuracy.

Perching Birds (aka passerines) fall in the middle range for accuracy.

In [7]:
# Per-class accuracy (%) for prediction column 0: the share of rows in each class
# whose thresholded prediction (> 0.5) equals gt, sorted ascending.
(
    df.groupby('class_name')
      .apply(lambda grp: (grp['gt'] == (grp[0] > 0.5)).sum() / len(grp))
      .sort_values()
      .mul(100)
      .round(1)
)

class_name
Cuckoos                                                    86.7
Nightjars                                                  92.3
Parrots                                                    96.4
Kingfishers and Allies                                     96.7
Woodpeckers                                                96.8
Hawks, Kites, Eagles, and Allies                           98.2
Swifts and Hummingbirds                                    98.2
Owls                                                       98.4
Grouse, Quail, and Allies                                  98.5
Perching Birds                                             98.6
Loons                                                      98.7
Cranes and Rails                                           99.0
Caracaras and Falcons                                      99.1
Pigeons and Doves                                          99.2
Frigatebirds, Boobies, Cormorants, Darters, and Allies     99.3
Grebes                       

## Class Accuracy - 40% mislabeled data

Passerines are the most accurate class. The model struggles on the non-passerine classes: none of them exceeds 60% accuracy, and non-passerine accuracy is around 50% overall.

In [8]:
# Per-class accuracy (%) for prediction column 40: the share of rows in each class
# whose thresholded prediction (> 0.5) equals gt, sorted ascending.
(
    df.groupby('class_name')
      .apply(lambda grp: (grp['gt'] == (grp[40] > 0.5)).sum() / len(grp))
      .sort_values()
      .mul(100)
      .round(1)
)

class_name
Cuckoos                                                   23.0
Loons                                                     24.8
Ducks, Geese, and Swans                                   40.4
Nightjars                                                 41.0
Parrots                                                   42.9
Grebes                                                    44.8
Skuas and Alcids                                          45.1
Woodpeckers                                               45.4
Frigatebirds, Boobies, Cormorants, Darters, and Allies    47.7
Pigeons and Doves                                         50.4
Gulls, Terns, and Allies                                  51.1
Plovers, Sandpipers, and Allies                           51.2
Cranes and Rails                                          56.0
Grouse, Quail, and Allies                                 56.1
Hawks, Kites, Eagles, and Allies                          56.7
Kingfishers and Allies                      

Passerine vs. non-passerine accuracy (%) for each model. Ground truth (`gt`) is 1 for passerine, 0 otherwise; each column is the percentage of mislabeled training data the model was trained with.

In [9]:
# Prediction columns to score.
model_cols = [0, 1, 10, 20, 30, 40]

# Threshold each prediction at 0.5, cast to 0.0/1.0, and flag where it equals gt.
thresholded = (df[model_cols] > 0.5) * 1.0
is_correct = thresholded == df[['gt']].values

# Put gt next to the per-model correctness flags (aligned on the row index).
tp_fp = df[['gt']].merge(is_correct, left_index=True, right_index=True)

# Percentage of correct rows per ground-truth value and model.
by_truth = tp_fp.groupby('gt')
np.round(by_truth.sum() / by_truth.count() * 100, 1)

Unnamed: 0_level_0,0,1,10,20,30,40
gt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,98.9,98.2,96.4,89.9,86.9,50.1
1,98.6,98.3,94.4,95.2,88.2,85.5


ROC AUC remains high — still over 90% for the model trained with 30% label errors — indicating that the predicted scores still separate the two classes well.

In [11]:
# ROC AUC (as a percentage, one decimal) of every column from position 3 onward,
# scored against gt. The slice relies on the prediction columns starting at index 3.
for col in df.columns[3:]:
    auc_pct = np.round(roc_auc_score(df['gt'], df[col]) * 100, 1)
    print(f"{col}: {auc_pct}")

40: 77.5
30: 94.7
20: 98.0
10: 99.0
1: 99.8
0: 99.9
