In [1]:
import os
import math
import pandas as pd

data_dir = 'D:\\data\\caltecBirds\\CUB_200_2011'

In [2]:
# Load class and attribute data from the supplied text files

train_test_split = pd.read_csv(os.path.join(data_dir, 'train_test_split.txt'),   sep=" ", index_col=[0], names=['image_id', 'trainset'])
image_file_names = pd.read_csv(os.path.join(data_dir, 'images.txt'),             sep=" ", index_col=[0], names=['image_id', 'file_name'])
class_labels     = pd.read_csv(os.path.join(data_dir, 'image_class_labels.txt'), sep=" ", index_col=[0], names=['image_id', 'class_id'])
image_attribute_labels = pd.read_csv(os.path.join(data_dir, 'attributes', 'image_attribute_labels.txt'), sep=" ", 
                                     names = ['image_id', 'attribute_id', 'present', 'certainty_id', 'time', 'd1', 'd2'])
image_attribute_labels = image_attribute_labels.merge(train_test_split, on='image_id', how='left' )
image_attribute_labels = image_attribute_labels.merge(class_labels, on='image_id', how='left' )

expert_probabilities = pd.read_csv(os.path.join(data_dir, 'attributes', 'class_attribute_labels_continuous.txt'), sep=" ",  names=[i for i in range(1, 313)]).transpose() / 100

test_attribute_labels  = image_attribute_labels.loc[image_attribute_labels['trainset']==1].copy()
test_attribute_probabilities = test_attribute_labels.pivot_table(index='attribute_id', columns='class_id')['present']



In [3]:
# Class attribute probabilities based on human knowledge

expert_probabilities

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
1,0.000000,0.044118,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.215686,0.302632,0.315789,0.027027,0.183007,0.107595,0.000000,0.000000
2,0.029197,0.044118,0.039735,0.014599,0.000000,0.000000,0.000000,0.029412,0.060150,0.168919,...,0.649351,0.464286,0.098039,0.322368,0.046053,0.128378,0.117647,0.367089,0.020690,0.082803
3,0.014599,0.029412,0.033113,0.102190,0.032258,0.000000,0.016129,0.107843,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.014706,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.006536,0.013158,0.000000,0.000000,0.032680,0.075949,0.020690,0.000000
5,0.598540,0.573529,0.708609,0.000000,0.010753,0.000000,0.016129,0.078431,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.006329,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,0.319149,0.000000,0.161765,0.021429,0.045455,0.049020,0.000000,0.000000,0.035461,0.000000,...,0.005952,0.000000,0.164773,0.045752,0.107784,0.296053,0.227848,0.418750,0.192053,0.380368
309,0.535714,0.771429,0.710280,0.841270,0.884615,0.162162,0.638095,0.641975,0.753846,0.152318,...,0.069620,0.000000,0.536424,0.019355,0.309211,0.143791,0.167832,0.187919,0.124138,0.564885
310,0.053571,0.038095,0.009346,0.023810,0.000000,0.121622,0.028571,0.000000,0.023077,0.013245,...,0.031646,0.770115,0.046358,0.658065,0.072368,0.228758,0.251748,0.442953,0.344828,0.030534
311,0.214286,0.104762,0.074766,0.031746,0.038462,0.486486,0.152381,0.197531,0.092308,0.072848,...,0.037975,0.160920,0.304636,0.206452,0.480263,0.562092,0.321678,0.187919,0.331034,0.106870


In [4]:
# Train a Naive Bayes classifier based on the attributes contained in the training dataset

train_attribute_labels = image_attribute_labels.loc[image_attribute_labels['trainset']==0].copy()
class_attribute_probabilities = train_attribute_labels.pivot_table(index='attribute_id', columns='class_id')['present']
round(class_attribute_probabilities,2)

class_id,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
attribute_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.00,0.07,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.17,0.23,0.23,0.03,0.27,0.10,0.00,0.00
2,0.03,0.07,0.04,0.00,0.00,0.00,0.00,0.00,0.07,0.10,...,0.60,0.50,0.07,0.30,0.07,0.13,0.20,0.23,0.03,0.13
3,0.00,0.03,0.04,0.13,0.07,0.00,0.00,0.06,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.03,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.03,0.13,0.00,0.00
5,0.53,0.50,0.68,0.00,0.00,0.00,0.00,0.17,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.03,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,0.30,0.00,0.18,0.03,0.00,0.00,0.00,0.00,0.07,0.00,...,0.03,0.00,0.17,0.07,0.13,0.30,0.30,0.50,0.17,0.47
309,0.37,0.60,0.50,0.70,0.50,0.09,0.48,0.56,0.66,0.17,...,0.10,0.00,0.33,0.07,0.23,0.17,0.27,0.27,0.13,0.40
310,0.07,0.03,0.04,0.00,0.00,0.18,0.04,0.00,0.00,0.03,...,0.07,0.80,0.07,0.50,0.03,0.23,0.20,0.30,0.40,0.07
311,0.13,0.07,0.14,0.03,0.07,0.18,0.13,0.17,0.10,0.10,...,0.03,0.13,0.40,0.33,0.47,0.53,0.27,0.13,0.27,0.13


In [6]:
classifiers = {'Expert': expert_probabilities, 'Trained': class_attribute_probabilities, 'TrainedOnTest': test_attribute_probabilities}
for name, classifier in classifiers.items():
    count, correct = 0, 0
    for image_id, _ in train_test_split.loc[train_test_split['trainset'] == 1].iterrows():
        present_attribute_ids = image_attribute_labels.loc[((image_attribute_labels['image_id']==image_id) & (image_attribute_labels['present']==1))]['attribute_id']
        class_probability_factors = classifier.loc[classifier.index.isin([attribute_id for attribute_id in present_attribute_ids]) ]
        class_probabilities = class_probability_factors.prod()
        total_prob          = sum(class_probabilities)
        if total_prob > 0:
            norm_probabilities  = class_probabilities / total_prob
            predicted_class = norm_probabilities.idxmax(skipna=True)
            labelled_class  = class_labels.loc[class_labels.index == image_id]['class_id'].tolist()[0]
            if predicted_class == labelled_class: correct +=1
        count+=1
    print(name, correct / count)
    
        
    

Expert 0.016016016016016016
Trained 0.29796463129796463
TrainedOnTest 0.7055388722055389
