In [27]:
import warnings
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
from glob import glob
import os
from typing import List, Set
from statistics import mean
from sklearn.metrics import average_precision_score
from scipy.stats import ttest_ind
import numpy as np

In [28]:
ground_truth = pd.read_csv('merged_annotation.csv', header=None, names=('Path', 'Score'), sep=',', skipinitialspace=True, quotechar="'")
ground_truth['id'] = ground_truth['Path'].str[:-4]
ground_truth.set_index('id', inplace=True)
# ground_truth.loc['55-If3ebb17bc7963bfa8d81f040'].Score (=='Off Topic')

In [31]:
def calculate_precision_at_k(run_result_path : str, k : int):
    df = pd.read_csv(run_result_path, delim_whitespace=True, header=None, names=('QueryId', 'Stance', 'ImageId', 'Score', 'Rank', 'Tag'))
    
    precision_history = []
    for (image_id, stance), results in df.groupby(['QueryId', 'Stance']):
        true_positives = 0
        for _, row in results.iterrows():
            if row['Rank'] > k:
                continue

            lookup_id = f'{row["QueryId"]}-{row["ImageId"]}'
            score = ground_truth.loc[lookup_id].Score
            if (stance == 'pro' and score == 'Pro') or (stance == 'con' and score == 'Con'):
                true_positives += 1
        precision_history.append(true_positives/k)
    
    print(f'Precision at {k}: {mean(precision_history)}')

def calculate_map(run_result_path : str):
    ap_history = get_ap_history(run_result_path)
    
    print(f'Mean Average Precision: {mean(ap_history)}')

def get_ap_history(run_result_path : str):
    df = pd.read_csv(run_result_path, delim_whitespace=True, header=None, names=('QueryId', 'Stance', 'ImageId', 'Score', 'Rank', 'Tag'))
    
    ap_history = []
    for (query_id, stance), results in df.sort_values(['Rank'], ascending=True).groupby(['QueryId', 'Stance']):
        def get_true_score(row : pd.Series):
            true_stance = ground_truth.loc[row.LookupId].Score
            true_score = int((stance == 'pro' and true_stance == 'Pro') or (stance == 'con' and true_stance == 'Con'))
            return true_score

        results['LookupId'] = str(query_id) + '-' + results['ImageId']
        results['TrueScore'] = results.apply(get_true_score, axis=1)
        ap = average_precision_score(np.array(list(results['TrueScore'])), np.array([1] * len(results['Score'])) )
        ap_history.append(ap)
    
    return ap_history


def calculate_t_test(run_result_path : str):
    if run_result_path == 'run-results\\run-1.txt':
        return
    
    baseline_ap = get_ap_history('run-results/run-1.txt')
    run_ap = get_ap_history(run_result_path)


    print('T-Test: ', ttest_ind(baseline_ap, run_ap))

for path in glob('run-results/run*.txt'):
    print(path)
    calculate_precision_at_k(path, 10)
    calculate_precision_at_k(path, 1)
    calculate_map(path)
    calculate_t_test(path)
    print()

run-results\run-1.txt
Precision at 10: 0.147
Precision at 1: 0.14
Mean Average Precision: 0.147

run-results\run0.txt
Precision at 10: 0.15
Precision at 1: 0.18
Mean Average Precision: 0.15
T-Test:  Ttest_indResult(statistic=-0.10437250165798578, pvalue=0.9169793923642124)

run-results\run1.txt
Precision at 10: 0.13
Precision at 1: 0.08
Mean Average Precision: 0.13216666666666665
T-Test:  Ttest_indResult(statistic=0.5273562197081764, pvalue=0.598536639782616)

run-results\run2.txt
Precision at 10: 0.1148936170212766
Precision at 1: 0.14893617021276595
Mean Average Precision: 0.18527524484971294
T-Test:  Ttest_indResult(statistic=-1.0942843243485154, pvalue=0.2752012087288021)

run-results\run3.txt
Precision at 10: 0.148
Precision at 1: 0.07
Mean Average Precision: 0.15533333333333332
T-Test:  Ttest_indResult(statistic=-0.28502055175917607, pvalue=0.7759263018806899)

