In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from preprocessing.contextualFeaturesGenerator.utils.LETORIterator import LETORIterator
from scipy.stats import ttest_rel
from utils.evaluate import Evaluate 
%matplotlib inline

In [32]:
def get_all_LTR_metrics(file, ids, metrics):
    results = []
    for idx in ids:
        df = pd.read_pickle(file.format(idx))
        for column in df:
            if type(column) is int:
                # Get a ranking with the respective scores.
                ranking = df[str(column) + "_s"].as_matrix()
                
                # Remove all nans
                ranking = [int(x) for x in ranking if not math.isnan(float(x))]
                
                # Calculate the evaluation scores
                scores = Evaluate.compute_scores(ranking)
                
                # Append all evaluation scores together with its query id in the correct order
                results.append([column] + [scores[i] for i in metrics])
                
    df = pd.DataFrame(results, columns=["query_id"] + metrics)
    return df.groupby("query_id", as_index=False).mean().sort_values(by=["query_id"])

def get_baseline_df(test_file, score_file, metrics):
    queries = []
    scores = []
    prev_query_id = -1
    for i in range(1, 6):
        iterator = LETORIterator(test_file.format(i))
        with open(score_file.format(i), "r") as f:
            for line, (d_query_id, doc_id, rel_score, _) in zip(f, iterator.feature_iterator()):
                s_query_id, _, score = line.rstrip().split("\t")
                s_query_id, score = int(s_query_id), float(score)

                assert int(s_query_id) == int(d_query_id), str(s_query_id) + " != " + str(s_query_id)
                
                if prev_query_id == -1:
                    prev_query_id = s_query_id
                    
                
                if s_query_id != prev_query_id:
                    prev_query_id = s_query_id
                    if len(scores) > 0:
                        scores = sorted(scores, key=lambda x: -x[1])
                        queries.append((s_query_id, scores))
                        scores = []
                


                scores.append((int(rel_score), score))
    if len(scores) > 0:
        scores = sorted(scores, key=lambda x: -x[1])
        queries.append((s_query_id, scores))
        scores = []
    
    results = []
    for query, rank in queries:
        ranking = list(zip(*rank))[0]

        # Calculate the evaluation scores
        scores = Evaluate.compute_scores(ranking)
                
        # Append all evaluation scores together with its query id in the correct order
        results.append([query] + [scores[i] for i in metrics])
   
    df = pd.DataFrame(results, columns=["query_id"] + metrics)
    return df.sort_values(by=["query_id"])

def t_test(df_1, df_2, metrics):
    results = []
    for metric in metrics:
        results.append((metric, ttest_rel(df_1[metric], df_2[metric]).pvalue))
    return results
    

In [39]:
pickle_path = "storage/logs"
files = ["_baseline_masks_{}.pkl",
        "_ViP_snapshots_{}.pkl",
        "_ViP_highlights_{}.pkl",
        "_vgg16_snapshots_{}.pkl",
        "_vgg16_highlights_{}.pkl",
        "_vgg16_saliency_{}.pkl"]
metrics = ["p@1","p@5","p@10","ndcg@1","ndcg@5","ndcg@10","map"]

ids = range(1, 26)
vis_dfs = []
for file in files: 
    vis_dfs.append(get_all_LTR_metrics(os.path.join(pickle_path, file), ids, metrics))
    
rankboost_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/vali.txt", "storage/baseline/scores/rankboost_{}", metrics)
lambdamart_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/vali.txt", "storage/baseline/scores/lambdamart_{}", metrics)
adarank_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/vali.txt", "storage/baseline/scores/adarank_{}", metrics)


In [53]:
t_test( vis_dfs[0], lambdamart_df, metrics)

[('p@1', 0.08960816958126622),
 ('p@5', 0.00963157804344407),
 ('p@10', 0.07182853967570887),
 ('ndcg@1', 0.15789906902414283),
 ('ndcg@5', 0.013354235829421737),
 ('ndcg@10', 0.028756056417114255),
 ('map', 0.8033449468254168)]

In [45]:
# lambdamart_df = get_baseline_df("storage/clueweb12_3.0/Fold{}/test.txt", "storage/baseline/scores/lambdamart_P@5_{}", metrics)
vis_dfs[0].mean()

query_id    250.500000
p@1           0.338000
p@5           0.358800
p@10          0.370000
ndcg@1        0.189167
ndcg@5        0.215016
ndcg@10       0.232574
map           0.414786
dtype: float64

In [37]:
lambdamart_same_df.mean()

query_id    250.690000
p@1           0.440000
p@5           0.442000
p@10          0.467000
ndcg@1        0.242500
ndcg@5        0.268343
ndcg@10       0.293948
map           0.433493
dtype: float64

In [144]:
vis_dfs[5].mean()

query_id    250.500000
p@1           0.560000
p@5           0.546800
p@10          0.519800
ndcg@1        0.322500
ndcg@5        0.336874
ndcg@10       0.346342
map           0.455765
dtype: float64