In [44]:
import os
import socket
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB, GaussianNB, CategoricalNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.metrics import accuracy_score

if socket.gethostname() == 'LTSSL-sKTPpP5Xl':
    data_dir = 'C:\\Users\\ams90\\PycharmProjects\\ConceptsBirds\\data'
elif socket.gethostname() == 'LAPTOP-NA88OLS1':
    data_dir = 'D:\\data\\caltecBirds\\CUB_200_2011'
else:
    data_dir = '/home/bwc/ams90/datasets/caltecBirds/CUB_200_2011'


In [60]:
attributes       = pd.read_csv(os.path.join(data_dir, 'attributes.txt'),         sep=" ", index_col=[0], names=['attribute_id', 'attribute_name'])
train_test_split = pd.read_csv(os.path.join(data_dir, 'train_test_split.txt'),   sep=" ", index_col=[0], names=['image_id', 'trainset'])
image_file_names = pd.read_csv(os.path.join(data_dir, 'images.txt'),             sep=" ", index_col=[0], names=['image_id', 'file_name'])
class_labels     = pd.read_csv(os.path.join(data_dir, 'image_class_labels.txt'), sep=" ", index_col=[0], names=['image_id', 'class_id'])
image_attribute_labels = pd.read_csv(os.path.join(data_dir, 'attributes', 'image_attribute_labels.txt'), sep=" ", 
                                     names = ['image_id', 'attribute_id', 'present', 'certainty_id', 'time', 'd1', 'd2'])
image_attribute_labels = image_attribute_labels.merge(train_test_split, on='image_id', how='left' )
image_attribute_labels = image_attribute_labels.merge(class_labels, on='image_id', how='left' )

In [3]:
y_train, y_test = np.array(class_labels.loc[train_test_split['trainset']==1]['class_id']), np.array(class_labels.loc[train_test_split['trainset']==0]['class_id'])
X_train = np.array(image_attribute_labels.loc[image_attribute_labels['trainset']==1].copy().pivot_table(index='image_id', columns='attribute_id')['present'], dtype=np.int8)
X_test = np.array(image_attribute_labels.loc[image_attribute_labels['trainset']==0].copy().pivot_table(index='image_id', columns='attribute_id')['present'], dtype=np.int8)

In [73]:
# Naive Bayes
classifiers = {'Bernoulli': BernoulliNB(alpha=1), 'Gaussian': GaussianNB(), 'Categorical': CategoricalNB(), 'Multinomial': MultinomialNB()}

for name, classifier in classifiers.items():

    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Evaluate the model's accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Naive Bayes {name}  Accuracy: {accuracy:.4f}")

Naive Bayes Bernoulli  Accuracy: 0.4256
Naive Bayes Gaussian  Accuracy: 0.2133
Naive Bayes Categorical  Accuracy: 0.4256
Naive Bayes Multinomial  Accuracy: 0.4342


In [74]:
# Decision Trees
classifiers = {'Entropy ': DecisionTreeClassifier(criterion='entropy'),
               'Gini    ': DecisionTreeClassifier(criterion='gini'),
               'LogLoss ': DecisionTreeClassifier(criterion='log_loss'),
               }


for name, classifier in classifiers.items():

    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Evaluate the model's accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Decision Tree {name}  Accuracy: {accuracy:.4f}")

Decision Tree Entropy   Accuracy: 0.2100
Decision Tree Gini      Accuracy: 0.2463
Decision Tree LogLoss   Accuracy: 0.2092


In [75]:
# Ensembles
classifiers = {'Bagging ': BaggingClassifier(DecisionTreeClassifier(criterion='gini'), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1),
               'RandomForest': RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1),
               'AdaBoost':     AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), n_estimators=200, algorithm="SAMME", learning_rate=0.5)              
               }


for name, classifier in classifiers.items():

    classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Evaluate the model's accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Ensemble {name}  Accuracy: {accuracy:.4f}")

Ensemble Bagging   Accuracy: 0.3309
Ensemble RandomForest  Accuracy: 0.2768
Ensemble AdaBoost  Accuracy: 0.2794


In [86]:
# Feature Importance
classifier = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

feature_scores = pd.DataFrame(classifier.feature_importances_, columns=['score']).merge(attributes, left_index=True, right_index=True)


In [89]:
for i, record in enumerate(feature_scores.sort_values(by='score', ascending=False).iterrows()):
    print(i+1, '\t',  record[1]['attribute_name'], '\t',record[1]['score'], )

1 	 has_crown_color::white 	 0.02753056649675947
2 	 has_underparts_color::grey 	 0.027486618226833605
3 	 has_throat_color::grey 	 0.026280445391938464
4 	 has_throat_color::orange 	 0.024922679008873787
5 	 has_bill_shape::hooked 	 0.023318933351797962
6 	 has_bill_color::buff 	 0.021665684603047353
7 	 has_forehead_color::grey 	 0.020681704573849404
8 	 has_bill_length::shorter_than_head 	 0.020356383825627263
9 	 has_shape::long-legged-like 	 0.01990580015893844
10 	 has_bill_length::longer_than_head 	 0.019837104186804257
11 	 has_primary_color::grey 	 0.019551520666991125
12 	 has_bill_shape::needle 	 0.019344783031616436
13 	 has_forehead_color::white 	 0.019339484032629833
14 	 has_nape_color::white 	 0.019215394082178712
15 	 has_bill_shape::all-purpose 	 0.018333155201875676
16 	 has_breast_color::grey 	 0.018303926754650725
17 	 has_belly_color::grey 	 0.017799181855396078
18 	 has_breast_color::white 	 0.017547841718096587
19 	 has_underparts_color::white 	 0.01716044614128