In [8]:
'''
playground for testing evaluation metrics for consistency with another author's
implementation (Ben Hamner) and for plausibility checks
'''
import numpy as np
from utilities import *

In [9]:
# code from 
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items. Only the first occurrence of a relevant item in `predicted` is
    counted as a hit; repeated predictions of the same item score nothing.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists, in [0, 1].
            0.0 if `actual` is empty/None or if k is not positive.
    """
    # Nothing can be scored without relevant items or with a non-positive
    # cutoff; bail out early so the normalisation below never divides by zero.
    # len() is used instead of truthiness so array-like inputs work too.
    if actual is None or len(actual) == 0 or k <= 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A hit counts only the first time a relevant item appears in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    # Normalise by the best achievable number of hits. Without this the result
    # is a plain sum of precisions and exceeds 1 when several items are
    # relevant. For a single relevant item the divisor is 1.
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over a batch of queries.

    Pairs up the i-th entry of `actual` with the i-th entry of `predicted`,
    scores each pair with `apk`, and averages the per-pair scores.
    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order inside each list is irrelevant)
    predicted : list
                One ranked list of predicted elements per query
                (order inside each list is significant)
    k : int, optional
        Cutoff: only the first k predictions of each query are scored
    Returns
    -------
    score : double
            Mean of the per-query average precision at k
    """
    per_query_scores = []
    for relevant, ranking in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranking, k))
    return np.mean(per_query_scores)

In [10]:
'''
evaluation metrics MAP@5
sources: 
https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
https://www.kaggle.com/c/FacebookRecruiting/discussion/2002
https://en.wikipedia.org/wiki/Information_retrieval
Note that the metric is designed for "document retrieval",
where many outcomes might be true (= "relevant documents").
Our case is specific, as there is only one "relevant document" per prediction
(= the true label).
'''

# Test own implementation (from utilities) against Ben Hamner's reference one.
# Both implementations are fed the SAME fixtures so the comparison is meaningful.

# Shared fixtures: one true label per query plus a ranked prediction list each.
# (the name `true_lables` is kept as-is in case other cells refer to it)
true_lables = [12,15,8]
model_predictions = [[1,13,3,12,8], [5,3,12,3,6], [8,6,11,8,4]]

# --- single query: label 12 is first hit at rank 2, the repeat at rank 4 must not count
ap_mine = average_precision([1,12,3,12,8],12,5)
print("average precision my take: ",ap_mine)

ap_reference = apk([12],[1,12,3,12,8],5)
print("average precision Ben Hammer: ",ap_reference)
assert np.isclose(ap_mine, ap_reference), "average precision differs from reference"

# --- several queries
map_mine = mean_average_precision(model_predictions, true_lables, 5)
print("mean average precision my take: ",map_mine)

# the reference implementation expects a list of relevant items per query
map_reference = mapk([[label] for label in true_lables], model_predictions, 5)
print("mean average precision Ben Hammer: ",map_reference)
assert np.isclose(map_mine, map_reference), "mean average precision differs from reference"

# --- labels do not have to be numeric
test = average_precision(['ga','gi','go','gu','ge'], 'go', 5)
print("average precision with strings: ",test)

average precision my take:  0.5
average precision Ben Hammer:  0.5
mean average precision my take:  0.416666666667
mean average precision Ben Hammer:  0.416666666667
average precision with strings:  0.3333333333333333


In [11]:
'''
As a benchmark, create the case of a "dumb" model without any predictive power and test how this model performs,
measured with the MAP@5 metric.
The test set contains 15,610 images.
Assume that a "dumb" model maps the images to the individuals randomly.
'''

# Random-guess baseline: score uniformly random rankings with MAP@max_pred.

# Rebuilds the toy data set on disk (see the helper's log output below the cell).
# It is not used for the baseline itself unless the toy read_csv line is enabled.
create_small_case(sel_whales = [1,2,3],
                  small_dir = "data/dumb_train", 
                  small_csv = "data/dumb_train.csv")
 
# train_list = read_csv(file_name = "data/dumb_train.csv")   # for testing toy data set
train_list = read_csv(file_name = "data/train.csv")   # for testing whole train data set

whales, counts = get_whales(train_list)
print("{} individuals".format(len(counts)))  

# To each image in train_list map a ranked list of max_pred whales, drawn as
# random integers in [0, number of individuals), i.e. indices into the whale list.
# A dedicated, seeded generator makes the baseline reproducible on re-run
# without touching numpy's global random state.
max_pred = 5
dummy_seed = 0
dummy_rng = np.random.RandomState(dummy_seed)
dummy_preds = [dummy_rng.randint(0, len(counts), max_pred)
               for _ in range(len(train_list))]

# Get list of true labels: translate each image's whale name (img[1]) into the
# index of the first entry of `whales` whose first field equals that name.
# A dict lookup replaces a full scan of `whales` per image.
whale_index = {}
for idx, whale in enumerate(whales):
    whale_index.setdefault(whale[0], idx)   # setdefault keeps the first occurrence
true_labels = [whale_index[img[1]] for img in train_list]

print("dummy_preds first 10: \n",dummy_preds[:10])
print("true_labels  first 50: \n",true_labels[:50])

MAP = mean_average_precision(dummy_preds, true_labels, max_pred)
print("\n MAP", MAP)




old directory removed data/dumb_train
copy 34 images for whale # 1 in ordered list, called w_1287fbc
copy 27 images for whale # 2 in ordered list, called w_98baff9
copy 26 images for whale # 3 in ordered list, called w_7554f44
write csv file: data/dumb_train.csv
4251 individuals
dummy_preds first 10: 
 [array([  72, 3093, 3448, 2663,  554]), array([2245, 3498, 2332, 2935, 2695]), array([2736, 1627, 4152, 3065, 1350]), array([2923, 1929, 1590, 1756, 3128]), array([1433, 4196,   87, 2150,  457]), array([2374, 3760, 3333, 1425, 4179]), array([ 729,  954, 2395, 1622, 3709]), array([ 860, 2564, 2405, 3753, 3033]), array([3093, 3533, 2155,  172, 3333]), array([ 710, 1760,  280,  444,  917])]
true_labels  first 50: 
 [3978, 1, 55, 2234, 1984, 772, 4058, 0, 62, 662, 340, 2957, 796, 7, 689, 828, 406, 3763, 945, 0, 0, 744, 274, 860, 3597, 1175, 0, 664, 1324, 282, 0, 1230, 46, 579, 76, 2915, 1658, 1606, 566, 1237, 0, 183, 816, 2373, 38, 442, 3083, 4097, 1634, 664]

 MAP 0.000653130287648
