In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from preprocessing.contextualFeaturesGenerator.utils.LETORIterator import LETORIterator
from scipy.stats import ttest_rel
from utils.evaluate import Evaluate 
%matplotlib inline

In [125]:
def get_all_LTR_metrics(file, ids, metrics):
    results = []
    for idx in ids:
        df = pd.read_pickle(file.format(idx))
        for column in df:
            if type(column) is int:
                # Get a ranking with the respective scores.
                ranking = df[str(column) + "_s"].as_matrix()
                
                # Remove all nans
                ranking = [int(x) for x in ranking if not math.isnan(float(x))]
                
                # Calculate the evaluation scores
                scores = Evaluate.compute_scores(ranking)
                
                # Append all evaluation scores together with its query id in the correct order
                results.append([column] + [scores[i] for i in metrics])
                
    df = pd.DataFrame(results, columns=["query_id"] + metrics)
    return df.groupby("query_id", as_index=False).mean().sort_values(by=["query_id"])

def get_baseline_df(test_file, score_file, metrics):
    queries = []
    scores = []
    prev_query_id = -1
    for i in range(1, 6):
        iterator = LETORIterator(test_file.format(i))
        with open(score_file.format(i), "r") as f:
            for line, (d_query_id, doc_id, rel_score, _) in zip(f, iterator.feature_iterator()):
                s_query_id, _, score = line.rstrip().split("\t")
                s_query_id, score = int(s_query_id), float(score)

                assert int(s_query_id) == int(d_query_id), str(s_query_id) + " != " + str(d_query_id)
                
                if prev_query_id == -1:
                    prev_query_id = s_query_id
                    
                
                if s_query_id != prev_query_id:
                    prev_query_id = s_query_id
                    if len(scores) > 0:
                        scores = sorted(scores, key=lambda x: -x[1])
                        queries.append((s_query_id, scores))
                        scores = []
                


                scores.append((int(rel_score), score))
    if len(scores) > 0:
        scores = sorted(scores, key=lambda x: -x[1])
        queries.append((s_query_id, scores))
        scores = []
    
    results = []
    for query, rank in queries:
        ranking = list(zip(*rank))[0]

        # Calculate the evaluation scores
        scores = Evaluate.compute_scores(ranking)
                
        # Append all evaluation scores together with its query id in the correct order
        results.append([query] + [scores[i] for i in metrics])
   
    df = pd.DataFrame(results, columns=["query_id"] + metrics)
    return df.sort_values(by=["query_id"])

def t_test(df_1, df_2, metrics):
    results = []
    for metric in metrics:
        results.append((metric, ttest_rel(df_1[metric], df_2[metric]).pvalue))
    return results
    

In [249]:
pickle_path = "storage/logs"
files = ["_baseline_masks_{}.pkl",
        "_ViP_snapshots_{}.pkl",
        "_ViP_highlights_{}.pkl",
        "_vgg16_snapshots_{}.pkl",
        "_vgg16_highlights_{}.pkl",
        "_vgg16_saliency_{}.pkl"]
metrics = ["p@1","p@5","p@10","ndcg@1","ndcg@5","ndcg@10","map"]

ids = range(1, 26)
vis_dfs = []
for file in files: 
    vis_dfs.append(get_all_LTR_metrics(os.path.join(pickle_path, file), ids, metrics).set_index("query_id"))
    
rankboost_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/vali.txt", "storage/baseline/scores/rankboost_{}", metrics).set_index("query_id")
lambdamart_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/vali.txt", "storage/baseline/scores/lambdamart_{}", metrics).set_index("query_id")
adarank_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/vali.txt", "storage/baseline/scores/adarank_{}", metrics).set_index("query_id")

rankboost_img_df = get_baseline_df("storage/clueweb12_3.0_images/Fold{}/vali.txt", "storage/baseline/scores/rankboost_img_{}", metrics).set_index("query_id")
lambdamart_img_df = get_baseline_df("storage/clueweb12_3.0_images/Fold{}/vali.txt", "storage/baseline/scores/lambdamart_img_{}", metrics).set_index("query_id")
adarank_img_df = get_baseline_df("storage/clueweb12_3.0_images/Fold{}/vali.txt", "storage/baseline/scores/adarank_img_{}", metrics).set_index("query_id")

In [252]:

t_test( vis_dfs[4], lambdamart_img_df, metrics)
# t_test( adarank_df, lambdamart_df, metrics)


[('p@1', 0.1686891822549483),
 ('p@5', 0.02258328305624949),
 ('p@10', 0.023535194855763938),
 ('ndcg@1', 0.10725594035410112),
 ('ndcg@5', 0.04033611190120678),
 ('ndcg@10', 0.019671659970410094),
 ('map', 0.2611927839414759)]

In [216]:
lambdamart_img_df = lambdamart_img_df.set_index("query_id")

In [262]:
(lambdamart_img_df.reset_index(drop=True) <vis_dfs[4].reset_index(drop=True)).sum() - (lambdamart_img_df.reset_index(drop=True) >vis_dfs[4].reset_index(drop=True)).sum()

p@1         8
p@5        19
p@10       16
ndcg@1     16
ndcg@5     24
ndcg@10    27
map         4
dtype: int64

In [190]:
vis_dfs[5].mean()

p@1        0.554000
p@5        0.478400
p@10       0.453200
ndcg@1     0.309833
ndcg@5     0.295540
ndcg@10    0.301756
map        0.421551
dtype: float64

In [238]:
ttest_rel(rankboost_img_df["p@1"]*10, vis_dfs[4]["p@1"]*10)

Ttest_relResult(statistic=-1.633197682259768, pvalue=0.1056039630262534)

query_id
201    0.84
202    0.00
203    0.72
204    0.76
205    0.72
206    0.96
207    1.00
208    0.20
209    0.32
210    0.36
211    0.40
212    0.64
213    0.60
214    1.00
215    0.40
216    0.96
217    0.88
218    0.28
219    0.44
220    0.28
221    0.96
222    0.76
223    0.96
224    0.28
225    0.00
226    0.44
227    0.28
228    0.68
229    0.68
230    0.28
       ... 
271    0.00
272    0.80
273    0.08
274    0.28
275    0.00
276    0.76
277    0.44
278    0.00
279    0.60
280    0.96
281    0.92
282    0.96
283    0.40
284    1.00
285    0.96
286    0.44
287    0.04
288    0.80
289    0.00
290    0.28
291    0.32
292    0.88
293    0.68
294    0.92
295    0.48
296    0.92
297    0.96
298    0.48
299    0.84
300    0.52
Name: p@5, Length: 100, dtype: float64