In [42]:
import os
import json
import pickle as pkl
import numpy as np
import pytrec_eval
import read_ap
import download_ap
import scipy.stats
import timeit

from collections import defaultdict, Counter
from tf_idf import TfIdfRetrieval

In [54]:
# Notebook-wide switch. When True, the cells that read the data, build the
# models, run the queries, evaluate and write results are all skipped, and the
# per-query metrics are loaded from ./results/<model>.json instead.
LOAD_METRICS = True

def write_results(model, mdic):
    """Persist one model's evaluation output under the ``results`` directory.

    Two files are written:
      * ``results/<model>.json`` -- ``mdic["metrics"]`` dumped as JSON.
      * ``results/<model>.dat``  -- one line per (query, document) pair in the
        form ``<qid> Q0 <docid> <rank> <score> STANDARD``, with ranks starting
        at 1 and following the iteration order of the per-query mapping.

    Args:
        model: Name of the model; used as the base file name.
        mdic: Dict with a "metrics" entry (JSON-serialisable) and a "results"
            entry mapping qid -> {docid: score}. Each inner mapping must
            iterate in non-increasing score order.

    Raises:
        Exception: if, within a query, a score is larger than the one before
            it. Lines written before the offending entry stay in the .dat file.
    """
    results_path = "results"
    # exist_ok avoids a failure if the directory appears between a check and
    # the creation.
    os.makedirs(results_path, exist_ok=True)

    #dump metrics to JSON
    with open(os.path.join(results_path, model + ".json"), "w") as writer:
        json.dump(mdic["metrics"], writer, indent=1)

    #write file with all query-doc pairs, scores, ranks, etc.
    # The context manager closes the file on every exit path, including the
    # ordering error below and any unexpected error while writing.
    with open(os.path.join(results_path, model + ".dat"), "w") as f:
        for qid, doc_scores in mdic["results"].items():
            # Infinity (rather than a large finite sentinel) so that a
            # legitimately huge first score is never reported as out of order.
            prevscore = float("inf")
            for rank, (docid, score) in enumerate(doc_scores.items(), 1):
                if score > prevscore:
                    raise Exception("'results_dic' not ordered! Stopped writing results")
                f.write(f"{qid} Q0 {docid} {rank} {score} STANDARD\n")
                prevscore = score
    
def perform_ttest(m1, m2, metric, models, thresh=0.05, print_res=True):
    """Run a paired t-test between two models' per-query scores for one metric.

    The queries are taken from ``models[m1]["metrics"]``; the same query ids
    are looked up for ``m2``, so both score lists are aligned per query.

    Args:
        m1: Key of the first model in ``models``.
        m2: Key of the second model in ``models``.
        metric: Metric name to compare (a key of each per-query dict).
        models: Dict mapping model name -> {"metrics": {qid: {metric: value}}}.
        thresh: Significance threshold; the difference is reported as
            significant when the p-value is below it (usually 0.05).
        print_res: When True (default), print a one-line summary of the test.

    Returns:
        The p-value of ``scipy.stats.ttest_rel`` on the two score lists.

    Raises:
        KeyError: if ``m2`` has no entry for a query id present for ``m1``, or
            a query lacks ``metric``.
    """
    metrics1 = models[m1]["metrics"]
    metrics2 = models[m2]["metrics"]

    qids = list(metrics1)
    scores1 = [metrics1[qid][metric] for qid in qids]
    scores2 = [metrics2[qid][metric] for qid in qids]
    pvalue = scipy.stats.ttest_rel(scores1, scores2).pvalue

    # Honour the print_res flag so callers can collect p-values silently.
    if print_res:
        conclusion = "significant diff" if pvalue < thresh else "insignificant diff"
        print("{:<12} {:<12} {:<19} {:<7} p-value = {:<5.3}".format(m1, m2, conclusion, "("+metric+")", pvalue))
    return pvalue

In [44]:
#read data
# Only needed when metrics are computed from scratch (LOAD_METRICS is False).
# `docs` is handed to the retrieval model constructors in the next cell;
# `qrels` and `queries` drive the query loop and the evaluator further down.
if not LOAD_METRICS:
    docs = read_ap.get_processed_docs()
    qrels, queries = read_ap.read_qrels()

In [45]:
#prepare models
# Each entry of `models` is a dict with three slots: the retrieval object under
# "model", and two empty dicts, "results" and "metrics", that later cells fill.
# NOTE(review): LDARetrieval is not imported in the import cell visible at the
# top of this notebook, so this cell would raise NameError on a fresh kernel
# with LOAD_METRICS = False -- confirm which module provides it and import it.

if not LOAD_METRICS:
    models = {}

    models["TF-IDF"]     = {"model": TfIdfRetrieval(docs), "results": {}, "metrics": {}}
    models["LDA500"]     = {"model": LDARetrieval(docs, get_model=True, 
                                                        num_topics=500, 
                                                        passes=6, 
                                                        iterations=40, 
                                                        prep_search=True), "results": {}, "metrics": {}}
    # Placeholders for further models, not wired up yet:
    # models["word2vec"]   = {"model": ..., "results": {}, "metrics": {}}
    # models["doc2vec"]    = {"model": ..., "results": {}, "metrics": {}}
    # models["LSI-BoW"]    = {"model": ..., "results": {}, "metrics": {}}
    # models["LSI-TF-IDF"] = {"model": ..., "results": {}, "metrics": {}}
    # models["LDA"]        = {"model": ..., "results": {}, "metrics": {}}

In [46]:
#run each query for each model
# For every model, search with the text of every query that has relevance
# judgements and keep the outcome as a dict under that model's "results".

if not LOAD_METRICS:
    for entry in models.values():
        retriever = entry["model"]
        ranked = entry["results"]
        for qid in qrels:
            ranked[qid] = dict(retriever.search(queries[qid]))


In [47]:
#evaluate results

# Measures requested from the evaluator; also iterated by the reporting cells.
metrics = {'map', 'ndcg'}

if not LOAD_METRICS:
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, metrics)

    # Store the evaluator's output next to each model's raw results.
    for entry in models.values():
        entry["metrics"] = evaluator.evaluate(entry["results"])

In [49]:
#write results
# Persist metrics and rankings of every freshly evaluated model.

if not LOAD_METRICS:
    for name, entry in models.items():
        write_results(name, entry)

In [51]:
# Skip the expensive pipeline: rebuild `models` with only a "metrics" slot per
# model, read back from the JSON files stored in ./results.
if LOAD_METRICS:
    models = {name: {} for name in ("LDA10", "LDA50", "LDA100", "LDA500")}
    for name, entry in models.items():
        with open("./results/" + f"{name}.json","r") as fp:
            entry["metrics"] = json.load(fp)

In [52]:
#print avg metrics
# One line per (metric, model): mean and standard deviation over all queries.
for metric in metrics:
    for name, entry in models.items():
        per_query = np.array([scores[metric] for scores in entry["metrics"].values()])
        summary = "{:<12} {:<6}: {:<3.2} +/- {:<3.2}".format(name, metric, np.mean(per_query), np.std(per_query))
        print(summary)

LDA10        ndcg  : 0.31 +/- 0.12
LDA50        ndcg  : 0.36 +/- 0.14
LDA100       ndcg  : 0.39 +/- 0.17
LDA500       ndcg  : 0.4 +/- 0.19
LDA10        map   : 0.0064 +/- 0.013
LDA50        map   : 0.02 +/- 0.038
LDA100       map   : 0.04 +/- 0.077
LDA500       map   : 0.058 +/- 0.1


In [None]:
#perform t-tests

# ttest maps "<model1> <model2>" -> {metric: p-value}, with one entry per
# unordered pair of models and one p-value per metric inside it.
ttest = {}
model_names = list(models)
for metric in metrics:
    for idx, model1 in enumerate(model_names):
        # Pair each model only with the ones that come after it, so every
        # unordered pair is tested exactly once per metric. (Checking whether
        # the pair key already exists would skip all metrics after the first.)
        for model2 in model_names[idx + 1:]:
            pair_key = model1 + " " + model2
            # setdefault keeps p-values already stored for other metrics.
            ttest.setdefault(pair_key, {})[metric] = perform_ttest(model1, model2, metric, models)