In [18]:
import warnings
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
from glob import glob
import os
from typing import List, Set
from statistics import mean
from sklearn.metrics import average_precision_score
from scipy.stats import ttest_ind
import numpy as np
import re

In [9]:
def get_ground_truth(path : str):
    ground_truth = pd.read_csv(path, header=None, names=('Path', 'Score'), sep=',', skipinitialspace=True, quotechar="'")
    ground_truth['id'] = ground_truth['Path'].str[:-4]
    ground_truth.set_index('id', inplace=True)
    return ground_truth
    # ground_truth.loc['55-If3ebb17bc7963bfa8d81f040'].Score (=='Off Topic')

In [47]:
def calculate_precision_at_k(run_result_path : str, k : int, ground_truth):
    df = pd.read_csv(run_result_path, delim_whitespace=True, header=None, names=('QueryId', 'Stance', 'ImageId', 'Score', 'Rank', 'Tag'))
    
    precision_history = []
    for (image_id, stance), results in df.groupby(['QueryId', 'Stance']):
        true_positives = 0
        for _, row in results.iterrows():
            if row['Rank'] > k:
                continue

            lookup_id = f'{row["QueryId"]}-{row["ImageId"]}'
            score = ground_truth.loc[lookup_id].Score
            if (stance.lower() == 'pro' and score.lower() == 'pro') or (stance.lower() == 'con' and score.lower() == 'con'):
                true_positives += 1
        precision_history.append(true_positives/k)
    
    return mean(precision_history)

def calculate_map(run_result_path : str, ground_truth):
    ap_history = get_ap_history(run_result_path, ground_truth)
    
    return mean(ap_history)

def get_ap_history(run_result_path : str, ground_truth):
    df = pd.read_csv(run_result_path, delim_whitespace=True, header=None, names=('QueryId', 'Stance', 'ImageId', 'Score', 'Rank', 'Tag'))
    
    ap_history = []
    for (query_id, stance), results in df.sort_values(['Rank'], ascending=True).groupby(['QueryId', 'Stance']):
        def get_true_score(row : pd.Series):
            true_stance = ground_truth.loc[row.LookupId].Score
            true_score = int((stance.lower() == 'pro' and true_stance.lower() == 'pro') or (stance.lower() == 'con' and true_stance == 'con'))
            return true_score

        results['LookupId'] = str(query_id) + '-' + results['ImageId']
        results['TrueScore'] = results.apply(get_true_score, axis=1)
        ap = average_precision_score(np.array(list(results['TrueScore'])), np.array([1] * len(results['Score'])) )
        ap_history.append(ap)
    
    return ap_history


def calculate_t_test(run_result_path : str, ground_truth):
    if run_result_path == 'run-results\\run-1.txt':
        return 0
    
    baseline_ap = get_ap_history('run-results/run-1.txt', ground_truth)
    run_ap = get_ap_history(run_result_path, ground_truth)

    return ttest_ind(baseline_ap, run_ap).pvalue


print('{Pipeline ID} & {Precision@10} & {Precision@1} & {MAP} & {p-value} \\\\\midrule')
ground_truth = get_ground_truth('qrel_converted.csv')
for path in glob('run-results/run*.txt'):
    print(re.search('run(-?\d*?)\.txt', path).group(1), end=' & ')
    print(f'{calculate_precision_at_k(path, 10, ground_truth):.3f}', end=' & ')
    print(f'{calculate_precision_at_k(path, 1, ground_truth):.3f}', end=' & ')
    print(f'{calculate_map(path, ground_truth):.3f}', end=' & ')
    print(f'{calculate_t_test(path, ground_truth):.3f}', end=' \\\\\n')

{Pipeline ID} & {Precision@10} & {Precision@1} & {MAP} & {p-value} \\\midrule
-1 & 0.127 & 0.130 & 0.097 & 0.000 \\
0 & 0.141 & 0.170 & 0.100 & 0.911 \\
1 & 0.115 & 0.120 & 0.084 & 0.602 \\
2 & 0.090 & 0.106 & 0.113 & 0.621 \\
3 & 0.122 & 0.110 & 0.096 & 0.956 \\
