In [None]:
import pandas as pd
import numpy as np
import csv

In [None]:
def compute_num_of_rel_doc(query_table, rel_table, cutoff):
    """Count how many of the first `cutoff` retrieved rows are judged documents.

    Args:
        query_table: DataFrame of retrieved documents for one query; only its
            first `cutoff` rows and the 'doc_number' column are read.
            Presumably rows are already in rank order -- verify against caller.
        rel_table: DataFrame of relevance judgements with a 'doc_id' column.
        cutoff: Number of leading rows of `query_table` to inspect.

    Returns:
        The number of inspected rows whose 'doc_number' occurs in
        rel_table['doc_id'].
    """
    judged_ids = rel_table['doc_id']
    top_doc_numbers = query_table[:cutoff]['doc_number']
    hit_mask = top_doc_numbers.isin(judged_ids)
    # Summing the boolean mask counts its True entries.
    return sum(hit_mask)

In [None]:
def compute_AP_for_query(query_table, rel_table):
    """Compute average precision (AP) for one query.

    Precision is accumulated at every rank where the retrieved document is
    listed in `rel_table`, and the sum is divided by the number of rows in
    `rel_table`.

    Args:
        query_table: DataFrame with a 'doc_number' column; rows are walked in
            their stored order (presumably rank order -- verify against caller).
        rel_table: DataFrame with a 'doc_id' column listing judged documents.

    Returns:
        The average precision as a float; 0.0 when `rel_table` has no rows
        (previously a division by zero).
    """
    num_judged = len(rel_table)
    if num_judged == 0:
        return 0.0

    # A set gives constant-time membership tests.
    judged_ids = set(rel_table['doc_id'].tolist())

    # Keep a running hit count instead of recounting the whole prefix at every
    # relevant rank; this yields the same precision values in linear time.
    hits_so_far = 0
    precision_sum = 0.0
    for rank, doc_number in enumerate(query_table['doc_number'].tolist(), start=1):
        if doc_number in judged_ids:
            hits_so_far += 1
            precision_sum += hits_so_far / rank
    return precision_sum / num_judged

In [None]:
def compute_DCG_for_query(relevance_list):
    """Compute discounted cumulative gain for a ranked list of relevance grades.

    The first grade is taken undiscounted; the grade at 1-based position p
    (p >= 2) is divided by log2(p).

    Args:
        relevance_list: Relevance grades in rank order.

    Returns:
        The DCG value; 0 for an empty list (previously an IndexError).
    """
    if len(relevance_list) == 0:
        return 0
    DCG_result = relevance_list[0]
    # Positions are 1-based, so the second element is discounted by log2(2).
    for position, gain in enumerate(relevance_list[1:], start=2):
        DCG_result += gain / np.log2(position)
    return DCG_result

In [None]:
def compute_nDCG_for_query(query_table, rel_table, cutoff):
    """Compute normalised DCG at `cutoff` for one query.

    Args:
        query_table: DataFrame with a 'doc_number' column; its first `cutoff`
            rows are scored (presumably in rank order -- verify against caller).
        rel_table: DataFrame with 'doc_id' and 'relevance' columns.
        cutoff: Number of leading rows to evaluate.

    Returns:
        DCG of the retrieved prefix divided by the DCG of the ideal ranking,
        or 0.0 when the ideal DCG is zero or there is nothing to score.
    """
    judged_ids = rel_table['doc_id'].tolist()
    grades = rel_table['relevance'].tolist()

    # Look grades up through a dict rather than int() on a filtered Series:
    # that call is deprecated in recent pandas and fails outright when a
    # doc_id occurs more than once. The first occurrence wins.
    grade_by_doc = {}
    for doc_id, grade in zip(judged_ids, grades):
        grade_by_doc.setdefault(doc_id, int(grade))

    retrieved = query_table[:cutoff]['doc_number'].tolist()
    relevance_list = [grade_by_doc.get(doc_number, 0) for doc_number in retrieved]

    # The ideal ranking places the highest grades first; sort explicitly so
    # the result does not depend on the row order of the judgements.
    ideal_relevance_list = sorted(grades, reverse=True)[:cutoff]
    ideal_relevance_list.extend([0] * (cutoff - len(ideal_relevance_list)))

    if not relevance_list or not ideal_relevance_list:
        return 0.0
    iDCG_result = compute_DCG_for_query(ideal_relevance_list)
    if iDCG_result == 0:
        # No positive grade is available, so avoid dividing 0 by 0.
        return 0.0
    return compute_DCG_for_query(relevance_list) / iDCG_result

In [None]:
def compute_ir_eval_metrics(query_table, rel_table):
    """Compute the six evaluation metrics for one (system, query) pair.

    Args:
        query_table: DataFrame of retrieved documents for the query.
        rel_table: DataFrame of relevance judgements for the query.

    Returns:
        A list [P@10, R@50, R-precision, AP, nDCG@10, nDCG@20].
    """
    num_judged = len(rel_table)

    def hits_at(depth):
        # Number of judged documents among the first `depth` retrieved rows.
        return compute_num_of_rel_doc(query_table, rel_table, depth)

    precision_at_10 = hits_at(10) / 10
    recall_at_50 = hits_at(50) / num_judged
    # R-precision: precision at a depth equal to the number of judged docs.
    r_precision = hits_at(num_judged) / num_judged
    average_precision = compute_AP_for_query(query_table, rel_table)
    ndcg_at_10 = compute_nDCG_for_query(query_table, rel_table, 10)
    ndcg_at_20 = compute_nDCG_for_query(query_table, rel_table, 20)
    return [precision_at_10, recall_at_50, r_precision,
            average_precision, ndcg_at_10, ndcg_at_20]

In [None]:
def write_ir_eval_result_file(ir_eval_results_list):
    """Write evaluation rows to 'ir_eval.csv' in the current directory.

    Args:
        ir_eval_results_list: Rows whose first two entries are identifiers
            (written via str()) and whose remaining entries are numeric
            metrics (written with three decimal places).
    """
    header_line = "system_number,query_number,P@10,R@50,r-precision,AP,nDCG@10,nDCG@20\n"
    with open('ir_eval.csv', 'w') as f:
        f.write(header_line)
        for row in ir_eval_results_list:
            metric_fields = ['{:.3f}'.format(value) for value in row[2:]]
            id_fields = [str(field) for field in row[:2]]
            f.write(','.join(id_fields + metric_fields) + '\n')

In [None]:
def compute_ir_eval(sys_results, qrels):
    """Evaluate every system on every query and write the results to disk.

    For each system, one row per query is produced, followed by a 'mean' row
    holding the column-wise mean of that system's metrics. All rows are then
    passed to write_ir_eval_result_file.

    Args:
        sys_results: DataFrame with 'system_number' and 'query_number'
            columns plus whatever compute_ir_eval_metrics reads.
        qrels: DataFrame with a 'query_id' column plus the judgement columns.
    """
    ir_eval_results_list = []
    for sys_num in pd.unique(sys_results['system_number']):
        sys_table = sys_results[sys_results['system_number'] == sys_num]
        per_query_metrics = []
        for query_num in pd.unique(sys_table['query_number']):
            query_table = sys_table[sys_table['query_number'] == query_num]
            rel_table = qrels[qrels['query_id'] == query_num]
            metrics_results = compute_ir_eval_metrics(query_table, rel_table)
            per_query_metrics.append(metrics_results)
            ir_eval_results_list.append([sys_num, query_num] + metrics_results)
        # Average only the metric columns. Packing the id columns into the
        # same array coerces it to strings whenever an id is non-numeric,
        # which makes np.mean fail.
        mean_ir_eval_result = np.mean(
            np.array(per_query_metrics, dtype=float), axis=0).tolist()
        ir_eval_results_list.append([sys_num, 'mean'] + mean_ir_eval_result)
    write_ir_eval_result_file(ir_eval_results_list)

In [1]:
import pandas as pd
import numpy as np

In [2]:
def read_system_results(file_name):
    """Read ranked retrieval results from a comma-separated file.

    The first line is treated as a header and skipped. Each remaining line is
    split on commas into system number, query number, document number, rank
    and score. Blank lines are ignored.

    Args:
        file_name: Path of the UTF-8 encoded results file.

    Returns:
        A nested dict {system_number: {query_number: table}} keyed by the
        raw strings from the file, where each table is
        {'doc_number': [int, ...], 'rank_of_doc': [int, ...],
        'score': [float, ...]} in file order.
    """
    sys_results = {}
    with open(file_name, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for raw_line in f:
            line = raw_line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            fields = line.split(',')
            sys_num, query_num = fields[0], fields[1]
            table = sys_results.setdefault(sys_num, {}).setdefault(
                query_num, {'doc_number': [], 'rank_of_doc': [], 'score': []})
            table['doc_number'].append(int(fields[2]))
            table['rank_of_doc'].append(int(fields[3]))
            table['score'].append(float(fields[4]))
    return sys_results

In [3]:
def read_qrels(file_name):
    """Read relevance judgements from a comma-separated file.

    The first line is treated as a header and skipped. Each remaining line is
    split on commas into query number, document id and relevance grade.
    Blank lines are ignored.

    Args:
        file_name: Path of the UTF-8 encoded judgements file.

    Returns:
        A dict {query_number: {'doc_id': [int, ...], 'relevance': [int, ...]}}
        keyed by the raw query string from the file, in file order.
    """
    qrels = {}
    with open(file_name, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for raw_line in f:
            line = raw_line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            fields = line.split(',')
            table = qrels.setdefault(fields[0], {'doc_id': [], 'relevance': []})
            table['doc_id'].append(int(fields[1]))
            table['relevance'].append(int(fields[2]))
    return qrels

In [4]:
def compute_ir_eval(sys_results, qrels):
    """Evaluate every system on every query.

    Args:
        sys_results: Nested dict {system_number: {query_number: query_table}}.
        qrels: Dict {query_number: rel_table}; must contain every query
            number that occurs in `sys_results`.

    Returns:
        A list of rows [system_number, query_number, *metrics]. After the
        per-query rows of each system comes one row whose query_number is
        the string 'mean' and whose metrics are the column-wise means.
    """
    ir_eval_results_list = []
    for sys_num, query_tables in sys_results.items():
        per_query_metrics = []
        for query_num, query_table in query_tables.items():
            rel_table = qrels[query_num]
            metrics_results = compute_ir_eval_metrics(query_table, rel_table)
            per_query_metrics.append(metrics_results)
            ir_eval_results_list.append([sys_num, query_num] + metrics_results)
        # Average the numeric metrics directly. The previous code put the
        # string ids into the same array and cast back with np.float, an
        # alias that NumPy 1.24 removed.
        mean_ir_eval_result = np.mean(
            np.array(per_query_metrics, dtype=float), axis=0).tolist()
        ir_eval_results_list.append([sys_num, 'mean'] + mean_ir_eval_result)
    return ir_eval_results_list

In [5]:
def write_ir_eval_result_file(ir_eval_results_list, file_name='ir_eval.csv'):
    """Write evaluation rows to a comma-separated file.

    Args:
        ir_eval_results_list: Rows whose first two entries are identifiers
            (written via str()) and whose remaining entries are numeric
            metrics (written with three decimal places).
        file_name: Output path. Defaults to 'ir_eval.csv', the path that was
            previously hard-coded.
    """
    header_line = "system_number,query_number,P@10,R@50,r-precision,AP,nDCG@10,nDCG@20\n"
    with open(file_name, 'w') as f:
        f.write(header_line)
        for row in ir_eval_results_list:
            id_fields = [str(field) for field in row[:2]]
            metric_fields = ['{:.3f}'.format(value) for value in row[2:]]
            f.write(','.join(id_fields + metric_fields) + '\n')

In [6]:
def compute_ir_eval_metrics(query_table, rel_table):
    """Compute the six evaluation metrics for one (system, query) pair.

    Args:
        query_table: Retrieved-document table for the query.
        rel_table: Relevance-judgement table for the query; the length of
            its 'doc_id' entry is used as the number of judged documents.

    Returns:
        A list [P@10, R@50, R-precision, AP, nDCG@10, nDCG@20].
    """
    rel_num = len(rel_table['doc_id'])
    metrics_results = [
        # P@10
        compute_num_of_rel_doc(query_table, rel_table, 10) / 10,
        # R@50
        compute_num_of_rel_doc(query_table, rel_table, 50) / rel_num,
        # R-precision: precision at a depth equal to the number of judged docs
        compute_num_of_rel_doc(query_table, rel_table, rel_num) / rel_num,
        # AP
        compute_AP_for_query(query_table, rel_table),
        # nDCG@10 and nDCG@20
        compute_nDCG_for_query(query_table, rel_table, 10),
        compute_nDCG_for_query(query_table, rel_table, 20),
    ]
    return metrics_results

In [7]:
def compute_num_of_rel_doc(query_table, rel_table, cutoff):
    """Count how many of the first `cutoff` retrieved documents are judged.

    Args:
        query_table: Mapping whose 'doc_number' entry is a sliceable sequence
            of retrieved document numbers.
        rel_table: Mapping whose 'doc_id' entry holds the judged document ids.
        cutoff: Number of leading retrieved documents to inspect.

    Returns:
        The count of inspected document numbers found in rel_table['doc_id'].
    """
    judged_ids = rel_table['doc_id']
    top_doc_numbers = query_table['doc_number'][:cutoff]
    return sum(1 for doc_num in top_doc_numbers if doc_num in judged_ids)

In [8]:
def compute_AP_for_query(query_table, rel_table):
    """Compute average precision (AP) for one query.

    Precision is accumulated at every position where the retrieved document
    is listed in rel_table['doc_id'], and the sum is divided by the number of
    listed documents.

    Args:
        query_table: Mapping whose 'doc_number' entry is the sequence of
            retrieved document numbers (presumably in rank order -- verify
            against caller).
        rel_table: Mapping whose 'doc_id' entry holds the judged document ids.

    Returns:
        The average precision as a float; 0.0 when there are no judged
        documents (previously a division by zero).
    """
    rel_docnums = rel_table['doc_id']
    if len(rel_docnums) == 0:
        return 0.0

    # A set gives constant-time membership tests.
    judged_ids = set(rel_docnums)

    # Keep a running hit count instead of recounting the whole prefix at every
    # relevant rank; this yields the same precision values in linear time.
    hits_so_far = 0
    precision_sum = 0.0
    for rank, doc_number in enumerate(query_table['doc_number'], start=1):
        if doc_number in judged_ids:
            hits_so_far += 1
            precision_sum += hits_so_far / rank
    return precision_sum / len(rel_docnums)

In [9]:
def compute_DCG_for_query(relevance_list):
    """Compute discounted cumulative gain for a ranked list of relevance grades.

    The grade at rank 1 is added as-is; the grade at 1-based rank r (r >= 2)
    is divided by log2(r) before being added.

    Args:
        relevance_list: Relevance grades in rank order.

    Returns:
        The DCG value; 0 for an empty list (previously an IndexError).
    """
    if not len(relevance_list):
        # An empty ranking accumulates no gain.
        return 0
    DCG_result = relevance_list[0]
    for index in range(1, len(relevance_list)):
        rank = index + 1  # convert 0-based index to 1-based rank
        DCG_result += relevance_list[index] / np.log2(rank)
    return DCG_result

In [10]:
def compute_nDCG_for_query(query_table, rel_table, cutoff):
    """Compute normalised DCG at `cutoff` for one query.

    Args:
        query_table: Mapping whose 'doc_number' entry is the sequence of
            retrieved document numbers (presumably in rank order -- verify
            against caller).
        rel_table: Mapping with parallel 'doc_id' and 'relevance' sequences.
            It is not modified.
        cutoff: Number of leading retrieved documents to evaluate.

    Returns:
        DCG of the retrieved prefix divided by the DCG of the ideal ranking,
        or 0.0 when the ideal DCG is zero or there is nothing to score.
    """
    query_docnums = query_table['doc_number'][:cutoff]

    # Map each judged document to its grade. The first occurrence wins, which
    # matches what list.index() returned before.
    grade_by_doc = {}
    for doc_id, grade in zip(rel_table['doc_id'], rel_table['relevance']):
        grade_by_doc.setdefault(doc_id, grade)
    relevance_list = [grade_by_doc.get(doc_number, 0) for doc_number in query_docnums]

    # Build the ideal ranking on a sorted COPY. The previous code padded
    # rel_table['relevance'] in place, so every call with fewer than `cutoff`
    # judgements permanently lengthened the caller's list; it also assumed
    # the judgements were already ordered by grade.
    ideal_relevance_list = sorted(rel_table['relevance'], reverse=True)[:cutoff]
    ideal_relevance_list.extend([0] * (cutoff - len(ideal_relevance_list)))

    if not relevance_list or not ideal_relevance_list:
        return 0.0
    iDCG_result = compute_DCG_for_query(ideal_relevance_list)
    if iDCG_result == 0:
        # No positive grade is available, so avoid dividing 0 by 0.
        return 0.0
    return compute_DCG_for_query(relevance_list) / iDCG_result

In [11]:
from scipy import stats
import numpy as np

In [34]:
# Load the system rankings and the relevance judgements from CSV files in the
# current directory, then compute the metric rows for every system and query.
# The three names defined here are read by later cells.
sys_results = read_system_results('./system_results.csv')
qrels = read_qrels('./qrels.csv')
ir_eval_results_list = compute_ir_eval(sys_results, qrels)

In [35]:
def parse_ir_eval_results(ir_eval_results_list):
    """Regroup evaluation rows by metric name.

    Args:
        ir_eval_results_list: Rows of the form
            [system_number, query_number, P@10, R@50, r-precision, AP,
            nDCG@10, nDCG@20]; rows whose second entry equals 'mean' are the
            per-system mean rows.

    Returns:
        A tuple (mean_metrics_dict, metrics_dict):
          * mean_metrics_dict maps each metric name to the list of values
            taken from the 'mean' rows, in row order.
          * metrics_dict maps each metric name to {system_number: [values]}
            built from all other rows, in row order.
    """
    metric_names = ['P@10', 'R@50', 'r-precision', 'AP', 'nDCG@10', 'nDCG@20']
    mean_metrics_dict = {name: [] for name in metric_names}
    metrics_dict = {name: {} for name in metric_names}

    for row in ir_eval_results_list:
        is_mean_row = row[1] == 'mean'
        # Metric values start at column 2, after the two identifier columns.
        for column, name in enumerate(metric_names, start=2):
            if is_mean_row:
                mean_metrics_dict[name].append(row[column])
            else:
                metrics_dict[name].setdefault(row[0], []).append(row[column])
    return mean_metrics_dict, metrics_dict

In [41]:
def perform_ttest(mean_metrics_dict, metrics_dict):
    """Run an independent two-sample t-test between the best and second-best systems.

    For each metric, the systems with the highest mean score are compared with
    the systems holding the second-highest distinct mean score. When several
    systems tie, every (best, second) pair is tested and the statistics and
    p-values are averaged.

    Args:
        mean_metrics_dict: {metric: [mean score per system]}.
        metrics_dict: {metric: {system_id: [per-query scores]}}. The key
            order of each inner dict is taken to match the order of the mean
            scores for that metric.

    Returns:
        A tuple (inputs_list, t_statistics_list, pvalues_list) with one entry
        per metric: the list of (best_id, second_id) pairs tested, the mean
        t statistic rounded to 4 decimals and the mean p-value rounded to
        4 decimals.

    Raises:
        IndexError: If a metric has fewer than two distinct mean scores.
    """
    inputs_list, t_statistics_list, pvalues_list = [], [], []
    for key in mean_metrics_dict:
        mean_scores = mean_metrics_dict[key]

        # Use the real system ids instead of assuming they are "1".."N" in
        # order. Fall back to the positional ids if the two structures do not
        # line up.
        system_ids = list(metrics_dict[key])
        if len(system_ids) != len(mean_scores):
            system_ids = [str(position + 1) for position in range(len(mean_scores))]

        ranked_scores = sorted(set(mean_scores), reverse=True)
        best_score, runner_up_score = ranked_scores[0], ranked_scores[1]
        largest_sys_nums = [sys_id for sys_id, score in zip(system_ids, mean_scores)
                            if score == best_score]
        second_sys_nums = [sys_id for sys_id, score in zip(system_ids, mean_scores)
                           if score == runner_up_score]

        t_statistics, pvalues, inputs = [], [], []
        for largest_num in largest_sys_nums:
            for second_num in second_sys_nums:
                inputs.append((largest_num, second_num))
                # Run the test once and read both fields from the result.
                result = stats.ttest_ind(metrics_dict[key][largest_num],
                                         metrics_dict[key][second_num])
                t_statistics.append(result.statistic)
                pvalues.append(result.pvalue)

        t_statistics_list.append(round(sum(t_statistics) / len(t_statistics), 4))
        pvalues_list.append(round(sum(pvalues) / len(pvalues), 4))
        inputs_list.append(inputs)
    return inputs_list, t_statistics_list, pvalues_list

In [42]:
# Regroup the evaluation rows by metric, then t-test the best system against
# the second-best system for each metric.
mean_metrics_dict, metrics_dict = parse_ir_eval_results(ir_eval_results_list)
inputs_list, t_statistics_list, pvalues_list = perform_ttest(mean_metrics_dict, metrics_dict)

In [47]:
def write_ttest_file(inputs_list, t_statistics_list, pvalues_list, file_name='ttest.csv'):
    """Write the t-test summary as a four-row CSV file.

    The rows are: a header with the metric names, the compared system pairs
    (each list rendered with str()), the t statistics and the p-values (both
    formatted with four decimal places).

    Args:
        inputs_list: One list of compared (best, second) pairs per metric.
        t_statistics_list: One t statistic per metric.
        pvalues_list: One p-value per metric.
        file_name: Output path. Defaults to 'ttest.csv', the path that was
            previously hard-coded.
    """
    header_row = ['info', 'P@10', 'R@50', 'r-precision',
                  'AP', 'nDCG@10', 'nDCG@20']
    input_cells = [str(inputs) for inputs in inputs_list]
    t_stat_cells = ['{:.4f}'.format(t_statistic) for t_statistic in t_statistics_list]
    pvalue_cells = ['{:.4f}'.format(pvalue) for pvalue in pvalues_list]
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_row)
        writer.writerow(['input'] + input_cells)
        writer.writerow(['t-stats'] + t_stat_cells)
        writer.writerow(['p-value'] + pvalue_cells)

In [50]:
# Save the compared system pairs, t statistics and p-values to 'ttest.csv'.
write_ttest_file(inputs_list, t_statistics_list, pvalues_list)