In [42]:
import os
import json
import pickle as pkl
import numpy as np
import pytrec_eval
import read_ap
import download_ap
import scipy.stats
import timeit

from collections import defaultdict, Counter
from tf_idf import TfIdfRetrieval

In [98]:
# Pipeline switch: when False, the cells below read the data, build the models,
# run every query, evaluate the rankings and write them under results/; when
# True, those cells are skipped and only the per-model metrics previously saved
# as results/*.json are loaded.
LOAD_METRICS = False

def write_results(model, mdic):
    """Persist one model's evaluation output under the ``results`` directory.

    Two files are written, both named after ``model``:

    * ``results/<model>.json`` -- ``mdic["metrics"]`` dumped as JSON.
    * ``results/<model>.dat``  -- one line per query/document pair in the form
      ``<qid> Q0 <docid> <rank> <score> STANDARD``, with ranks starting at 1.

    Args:
        model: Name used as the base file name of both output files.
        mdic: Dict with a JSON-serialisable ``"metrics"`` entry and a
            ``"results"`` entry mapping qid -> {docid: score}; each inner dict
            must iterate in non-increasing score order.

    Raises:
        Exception: If, within one query, a score is larger than the score
            before it. Lines written before the violation stay in the file.
    """
    results_path = "results"
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(results_path, exist_ok=True)

    #dump metrics to JSON
    with open(os.path.join(results_path, model + ".json"), "w") as writer:
        json.dump(mdic["metrics"], writer, indent=1)

    #write file with all query-doc pairs, scores, ranks, etc.
    # The context manager closes the file on every exit path, including errors
    # raised while formatting or writing a line.
    with open(os.path.join(results_path, model + ".dat"), "w") as f:
        for qid, ranking in mdic["results"].items():
            # +inf sentinel: the first score of a query can never look
            # "out of order", whatever its magnitude.
            prevscore = float("inf")
            for rank, (docid, score) in enumerate(ranking.items(), 1):
                if score > prevscore:
                    raise Exception("'results_dic' not ordered! Stopped writing results")
                f.write(f"{qid} Q0 {docid} {rank} {score} STANDARD\n")
                prevscore = score
    
def perform_ttest(m1, m2, metric, models, thresh=0.05, print_res=True):
    """Run a paired t-test between two models on one per-query metric.

    The queries are taken from ``models[m1]["metrics"]``; for each of them the
    value of ``metric`` is read from both models and the two score lists are
    compared with ``scipy.stats.ttest_rel``.

    Args:
        m1: Key of the first model in ``models``.
        m2: Key of the second model in ``models``.
        metric: Name of the per-query metric to compare.
        models: Dict mapping model name -> {"metrics": {qid: {metric: value}}}.
        thresh: Significance level; a p-value below it is reported as a
            significant difference (0.05 by default).
        print_res: When true, print a one-line summary of the test. When
            false, nothing is printed and only the p-value is returned.

    Returns:
        The p-value of the paired t-test.

    Raises:
        KeyError: If ``m2`` has no entry for a query of ``m1``, or a query
            lacks ``metric``.
    """
    per_query_1 = models[m1]["metrics"]
    per_query_2 = models[m2]["metrics"]

    # Fix the query order once so both score lists are aligned pair by pair.
    qids = list(per_query_1)
    scores1 = [per_query_1[qid][metric] for qid in qids]
    scores2 = [per_query_2[qid][metric] for qid in qids]
    pvalue = scipy.stats.ttest_rel(scores1, scores2).pvalue

    if print_res:
        #if pvalue < thresh (usually 0.05), then diff is significant
        conclusion = "significant diff" if pvalue < thresh else "insignificant diff"
        print("{:<25} {:<25} {:<19} {:<7} p-value = {:<5.3}".format(m1, m2, conclusion, "("+metric+")", pvalue))
    return pvalue

In [99]:
# Read the data needed to run and evaluate the models from scratch.
# Skipped entirely when only saved metrics are loaded (LOAD_METRICS is True).
if not LOAD_METRICS:
    # presumably the preprocessed document collection -- confirm in read_ap
    docs = read_ap.get_processed_docs()
    # qrels is later passed to pytrec_eval.RelevanceEvaluator and queries is
    # indexed by the qids of qrels
    qrels, queries = read_ap.read_qrels()

Docs already processed. Loading from disk


In [100]:
# Prepare the models to run. Each entry of `models` holds the retrieval object
# under "model" and two initially empty dicts, "results" and "metrics", that the
# following cells fill in.

if not LOAD_METRICS:
    models = {}

#     models["TF-IDF"]     = {"model": TfIdfRetrieval(docs), "results": {}, "metrics": {}}
    # NOTE(review): LDARetrieval is not imported in the import cell at the top
    # (only TfIdfRetrieval is), so this line raises NameError on a fresh kernel
    # unless it is imported elsewhere -- TODO confirm and add the import.
    models["LDA500"]     = {"model": LDARetrieval(docs, get_model=True, 
                                                        num_topics=500, 
                                                        passes=6, 
                                                        iterations=40, 
                                                        prep_search=False), "results": {}, "metrics": {}}
    # run by everyone individually, metrics loaded from file
    # models["word2vec"]   = {"model": ..., "results": {}, "metrics": {}}
    # models["doc2vec"]    = {"model": ..., "results": {}, "metrics": {}}
    # models["LSI-BoW"]    = {"model": ..., "results": {}, "metrics": {}}
    # models["LSI-TF-IDF"] = {"model": ..., "results": {}, "metrics": {}}
    # models["LDA"]        = {"model": ..., "results": {}, "metrics": {}}

Loading corpus from disk...


In [46]:
# Run every judged query through every model and keep the scores per document.

if not LOAD_METRICS:
    for model, entry in models.items():
        retriever = entry["model"]
        for qid in qrels:
            query_text = queries[qid]
            # search() output is turned into a dict and stored under the query id
            ranking = dict(retriever.search(query_text))
            entry["results"][qid] = ranking

In [47]:
# Score each model's rankings against the relevance judgments.

metrics = {'map', 'ndcg'}

if not LOAD_METRICS:
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, metrics)

    # One evaluate() call per model; its return value replaces the "metrics" entry.
    for model, entry in models.items():
        entry["metrics"] = evaluator.evaluate(entry["results"])

In [49]:
# Persist the metrics and rankings of every model via write_results.

if not LOAD_METRICS:
    for model, mdic in models.items():
        write_results(model, mdic)

In [89]:
# Load previously saved per-model metrics instead of recomputing them.
# Every "<name>.json" file in the results directory becomes models["<name>"].
if LOAD_METRICS:
    models = {}
    results_path = "./results/"
    # os.listdir returns entries in arbitrary order; sorting makes the model
    # order (and therefore the printed tables below) reproducible.
    for fname in sorted(os.listdir(results_path)):
        # Split on the real extension so only "*.json" files match and the
        # model name is exactly the file name without ".json".
        model, ext = os.path.splitext(fname)
        if ext != ".json":
            continue
        with open(os.path.join(results_path, fname), "r") as fp:
            # Assign only after a successful load so a broken file does not
            # leave an empty entry behind.
            models[model] = {"metrics": json.load(fp)}

In [96]:
# Print mean +/- standard deviation over all queries, per metric and model.
bestmodels = ["d2v_5", "LDA500", "lsi_bow2000", "lsi_tfidf2000", "TF-IDF", "bow10"]

for metric in metrics:
    for model, entry in models.items():
        # collect this metric's value for every query of the model
        res = np.array([per_query[metric] for per_query in entry["metrics"].values()])
        mean, std = np.mean(res), np.std(res)
        string = "{:<25} {:<6}: {:<3.2} +/- {:<3.2}".format(model, metric, mean, std)
        print(string)

bow10                     ndcg  : 0.31 +/- 0.11
d2v_10                    ndcg  : 0.27 +/- 0.099
d2v_100k                  ndcg  : 0.27 +/- 0.098
d2v_10k                   ndcg  : 0.26 +/- 0.092
d2v_15                    ndcg  : 0.27 +/- 0.1
d2v_20                    ndcg  : 0.27 +/- 0.1
d2v_200k                  ndcg  : 0.27 +/- 0.098
d2v_25k                   ndcg  : 0.27 +/- 0.1
d2v_300                   ndcg  : 0.27 +/- 0.098
d2v_400                   ndcg  : 0.27 +/- 0.093
d2v_5                     ndcg  : 0.28 +/- 0.11
d2v_500                   ndcg  : 0.27 +/- 0.096
d2v_50k                   ndcg  : 0.27 +/- 0.099
d2v_bestvalid_metrics_5   ndcg  : 0.28 +/- 0.11
LDA10                     ndcg  : 0.31 +/- 0.12
LDA100                    ndcg  : 0.39 +/- 0.17
LDA1000                   ndcg  : 0.38 +/- 0.18
LDA50                     ndcg  : 0.36 +/- 0.14
LDA500                    ndcg  : 0.4 +/- 0.19
lsi_bow10                 ndcg  : 0.31 +/- 0.11
lsi_bow100                ndcg  : 0.

In [97]:
# Perform paired t-tests between every pair of models listed in `bestmodels`,
# once per metric. Results are stored as ttest["<model1> <model2>"][metric].

# Keep the iteration order of `models`, so each unordered pair is visited once
# with the earlier model first.
candidates = [name for name in models if name in bestmodels]

ttest = {}
for metric in metrics:
    for idx, model1 in enumerate(candidates):
        for model2 in candidates[idx + 1:]:
            # setdefault keeps one dict per pair and adds each metric to it;
            # keying the "already done" check on the pair alone would skip
            # every metric after the first one.
            ttest.setdefault(model1 + " " + model2, {})[metric] = perform_ttest(model1, model2, metric, models)

bow10                     d2v_5                     significant diff    (ndcg)  p-value = 2.15e-06
bow10                     LDA500                    significant diff    (ndcg)  p-value = 1.4e-14
bow10                     lsi_bow2000               significant diff    (ndcg)  p-value = 9.2e-32
bow10                     lsi_tfidf2000             significant diff    (ndcg)  p-value = 1e-25
bow10                     TF-IDF                    significant diff    (ndcg)  p-value = 1.11e-33
d2v_5                     LDA500                    significant diff    (ndcg)  p-value = 6.78e-19
d2v_5                     lsi_bow2000               significant diff    (ndcg)  p-value = 6.1e-35
d2v_5                     lsi_tfidf2000             significant diff    (ndcg)  p-value = 6.22e-29
d2v_5                     TF-IDF                    significant diff    (ndcg)  p-value = 3.66e-37
LDA500                    lsi_bow2000               significant diff    (ndcg)  p-value = 9.13e-12
LDA500          

In [104]:
# Disabled: print the top-10 words of the first five LDA500 topics.
# if not LOAD_METRICS:
#     num_topics = 500
#     ldamodel = models["LDA500"]["model"].model
#     top_topics = ldamodel.top_topics(models["LDA500"]["model"].corpus, topn=10) 
#     for i, (top, _)  in enumerate(top_topics[:5]):   
#         print(i)
#         for _, word in top:
#             print(word)
#         print()