In [4]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse
import json
import math
import os
import random
import re
import sys
import threading
import time
from datetime import datetime
from typing import *

import numpy as np
import openai
from rapidfuzz import fuzz
from tqdm import tqdm

In [None]:
def read_jsonl(fpath: str) -> List[Dict]:
    """Read a JSON Lines file into a list of parsed records.

    Args:
        fpath: Path of the file to read; it is decoded as UTF-8.

    Returns:
        One parsed JSON value per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps parsing independent of the platform's default
    # encoding. Blank / whitespace-only lines (typically a trailing newline
    # at EOF) are skipped instead of aborting the whole load.
    with open(fpath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

def write_jsonl(obj, fpath: str):
    """Serialize every element of ``obj`` as one JSON document per line.

    The file at ``fpath`` is opened in write mode, so any existing content
    is replaced. Each element is followed by a single newline.
    """
    with open(fpath, 'w') as sink:
        for record in obj:
            json.dump(record, sink)
            sink.write('\n')


In [None]:
# Load the three saved result sets (one JSON record per line); each is
# scored further down by passing it to eval_ndcg.
gpt4o = read_jsonl('code/outputs/gpt4o.jsonl')
gpt4o_rank_break_down = read_jsonl('code/outputs/gpt4o_break_down.jsonl')
gpt4o_break_down_with_rationale = read_jsonl('code/outputs/gpt4o_break_down_with_rationale.jsonl')

In [25]:
def hit_judge(msg: str, target: str, thres: float=80):
    """Report whether ``msg`` and ``target`` fuzzily match above ``thres``.

    Both strings are lower-cased and stripped of every character that is not
    an ASCII letter, a digit, or whitespace; the cleaned strings are then
    scored with ``fuzz.partial_ratio``.

    Returns:
        True when the score is strictly greater than ``thres``, else False.
    """
    def _normalize(text: str) -> str:
        # Case-fold, then drop punctuation and other symbols.
        return re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())

    score = fuzz.partial_ratio(_normalize(msg), _normalize(target))
    return True if score > thres else False

def _extract_rank_dict(pred: str) -> Optional[Dict[str, Any]]:
    """Return the first brace-delimited span of ``pred`` that parses as a JSON object, or None."""
    # Non-greedy match: every candidate ends at the first closing brace, so
    # only flat (non-nested) objects can be recovered.
    for candidate in re.findall(r"\{.*?\}", pred, re.DOTALL):
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError. Keep scanning: the span
            # may be a non-JSON fragment (e.g. a "{title: rank}" format hint)
            # that precedes the real answer.
            continue
        if isinstance(parsed, dict):
            return {key.strip(): value for key, value in parsed.items()}
    return None


def _to_rank(value) -> Optional[int]:
    """Coerce a parsed JSON value to ``int``; return None when it cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: null / list / dict; ValueError: non-numeric string or
        # NaN; OverflowError: Infinity.
        return None


def get_rank(pred: str, target: str):
    """Extract the rank assigned to ``target`` from a model response.

    The response is scanned for brace-delimited spans and the first one that
    parses as a JSON object is used, with surrounding whitespace stripped
    from its keys. ``target`` is looked up by exact key first; if absent,
    the keys are tried in order with ``hit_judge`` at threshold 90 and the
    first matching key whose value converts to ``int`` wins.

    Args:
        pred: Raw response text.
        target: Key whose rank is wanted.

    Returns:
        The rank as an ``int``, or -1 when ``pred`` is not a string, holds no
        parseable JSON object, has no matching key, or the matching value
        cannot be converted to ``int``.
    """
    if not isinstance(pred, str):
        return -1

    # Extract the dict portion of the answer.
    ranks = _extract_rank_dict(pred)
    if ranks is None:
        return -1

    if target in ranks:
        rank = _to_rank(ranks[target])
        return -1 if rank is None else rank

    # No exact key: fall back to fuzzy matching on the keys.
    for key, value in ranks.items():
        if hit_judge(key, target, 90):
            rank = _to_rank(value)
            if rank is not None:
                return rank
    return -1

def eval_ndcg(data, k: int = 20):
    """Compute the mean NDCG@k of the target item over a set of responses.

    For every record the target's rank is obtained with
    ``get_rank(record['response'], record['target'])``. A record scores
    ``1 / log2(rank + 1)`` when ``1 <= rank <= k`` and 0 otherwise, i.e. when
    no rank was found or the rank lies beyond the cutoff.

    Args:
        data: Iterable of records providing ``'response'`` and ``'target'``.
        k: Rank cutoff; ranks greater than ``k`` earn no credit. Defaults to
            20, matching the reported ``NDCG@20`` label.

    Returns:
        A one-entry dict ``{"NDCG@<k>": mean_score}``. The mean is 0.0 when
        ``data`` yields no records.
    """
    scores = []
    for d in tqdm(data):
        rank = get_rank(d['response'], d['target'])
        # A rank was found and it lies inside the top-k window.
        found_in_top_k = 0 < rank <= k
        scores.append(1 / math.log2(rank + 1) if found_in_top_k else 0.0)

    # Guard the empty case instead of dividing by zero.
    final_ndcg = sum(scores) / len(scores) if scores else 0.0
    return {f"NDCG@{k}": final_ndcg}

In [28]:
# Score the records loaded from 'code/outputs/gpt4o.jsonl'.
eval_ndcg(gpt4o)

100%|██████████| 100/100 [00:00<00:00, 25189.50it/s]


{'NDCG@20': 0.3589020506309217}

In [26]:
# Score the records loaded from 'code/outputs/gpt4o_break_down.jsonl'.
eval_ndcg(gpt4o_rank_break_down)

100%|██████████| 100/100 [00:00<00:00, 24358.58it/s]


{'NDCG@20': 0.3532655614231814}

In [34]:
# Score the records loaded from 'code/outputs/gpt4o_break_down_with_rationale.jsonl'.
eval_ndcg(gpt4o_break_down_with_rationale)

100%|██████████| 100/100 [00:00<00:00, 24858.08it/s]


{'NDCG@20': 0.3656314316236756}