# 调整benchmark.json以使其合法

In [15]:
import json

# Directory containing the benchmark result file that load_existing_results
# reads and rewrites.
OUTPUT_DIR = "results"

def load_existing_results(path=None):
    """Drop duplicate evaluation records from a benchmark JSON file, in place.

    The file must contain a JSON list. ``None`` entries are discarded. Two
    records count as duplicates when they share the same ``task_name``,
    ``model_name``, ``chunking_strategy`` and ``chunk_size``; the first
    occurrence is kept and every later one is printed and dropped. The
    de-duplicated list is then written back to the same file.

    Args:
        path: File to clean. Defaults to ``"{OUTPUT_DIR}/benchmark copy.json"``.

    Returns:
        None. The result is the rewritten file plus the printed report.

    Raises:
        KeyError: If a non-``None`` record lacks one of the four key fields.
    """
    if path is None:
        path = f"{OUTPUT_DIR}/benchmark copy.json"

    # Load the existing results. The encoding is pinned to UTF-8 because the
    # file is written below as UTF-8 with ensure_ascii=False; relying on the
    # platform default here could fail to decode this function's own output.
    with open(path, "r", encoding="utf-8") as f:
        content = json.load(f)

    # Maps the canonical setting key -> (first record seen, its index).
    evaluated_key = {}
    correct = []
    for idx, eval_res in enumerate(content):
        if eval_res is None:
            continue
        eval_setting = {
            "task_name": eval_res["task_name"],
            "model_name": eval_res["model_name"],
            "chunking_strategy": eval_res["chunking_strategy"],
            "chunk_size": eval_res["chunk_size"],
        }
        # sort_keys makes the serialized key independent of dict ordering.
        key = json.dumps(eval_setting, sort_keys=True)
        if key in evaluated_key:
            first_res, first_idx = evaluated_key[key]
            print(f"key: {key}")
            print(f"found@{idx}: {eval_res}")
            print(f"duplicate@{first_idx}: {first_res}")
            print("\n\n")
            continue
        correct.append(eval_res)
        evaluated_key[key] = (eval_res, idx)

    with open(path, "w", encoding="utf-8") as f:
        json.dump(correct, f, ensure_ascii=False, indent=4)

# Run the de-duplication; this rewrites the benchmark file in place.
load_existing_results()

# 统计各数据集的大小

In [3]:
import json
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

from chunked_pooling.chunked_eval_tasks import *
from chunked_pooling.wrappers import load_model
from transformers import AutoModel, AutoTokenizer

# Resolve the task registry and load the embedding model. The loop below only
# consumes `tokenizer` and `has_instructions`; `model` itself is not used there.
task_name_to_cls = get_eval_tasks()
model_name = "jinaai/jina-embeddings-v2-small-en"
model, has_instructions = load_model(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model.eval()

# The chunking settings are the same for every task, so build them once.
chunking_args = dict(
    chunk_size=1024,
    n_sentences=5,
    chunking_strategy="fixed_text",
    model_has_instructions=has_instructions,
    embedding_model_name=model_name,
)

sample_count = {}
for task_name, task_cls in task_name_to_cls.items():
    task = task_cls(
        tokenizer=tokenizer,
        prune_size=None,
        truncate_max_length=False,
        **chunking_args,
    )
    task.load_data()

    # Prefer the "test" split and fall back to "dev"; tasks that expose
    # neither split are skipped entirely.
    split = next((name for name in ("test", "dev") if name in task.queries), None)
    if split is None:
        continue
    queries, corpus = task.queries[split], task.corpus[split]

    num_queries, num_corpus = len(queries), len(corpus)
    sample_count[task_name] = dict(queries=num_queries, corpus=num_corpus)
    print(f"{task_name}: NumOfQueries: {num_queries}, NumOfCorpus:{num_corpus}, total: {num_queries+num_corpus}")

    # The counts file is rewritten after every counted task (presumably so
    # partial results survive an interrupted run).
    with open("sample_count.json", "w") as fout:
        json.dump(sample_count, fout)

Map: 100%|██████████| 1406/1406 [00:00<00:00, 23623.07 examples/s]


ArguAnaChunked: NumOfQueries: 1406, NumOfCorpus:8674, total: 10080


Map: 100%|██████████| 4681/4681 [00:00<00:00, 30193.97 examples/s]


ClimateFEVERChunked: NumOfQueries: 1535, NumOfCorpus:5416593, total: 5418128


Casting the dataset: 100%|██████████| 5673/5673 [00:00<00:00, 426352.14 examples/s]
Map: 100%|██████████| 5673/5673 [00:00<00:00, 34252.65 examples/s]
Filter: 100%|██████████| 467/467 [00:00<00:00, 60934.51 examples/s]
Map: 100%|██████████| 43515/43515 [00:01<00:00, 32656.56 examples/s]


DBPediaChunked: NumOfQueries: 400, NumOfCorpus:4635922, total: 4636322


Casting the dataset: 100%|██████████| 140085/140085 [00:00<00:00, 2462042.58 examples/s]
Map: 100%|██████████| 140085/140085 [00:04<00:00, 32528.11 examples/s]
Filter: 100%|██████████| 123142/123142 [00:00<00:00, 454318.34 examples/s]
Casting the dataset: 100%|██████████| 8079/8079 [00:00<00:00, 1532946.48 examples/s]
Map: 100%|██████████| 8079/8079 [00:00<00:00, 33278.35 examples/s]
Filter: 100%|██████████| 123142/123142 [00:00<00:00, 525737.45 examples/s]
Map: 100%|██████████| 7937/7937 [00:00<00:00, 29251.47 examples/s]


FEVERChunked: NumOfQueries: 6666, NumOfCorpus:5416568, total: 5423234


Casting the dataset: 100%|██████████| 170000/170000 [00:00<00:00, 2207145.76 examples/s]
Map: 100%|██████████| 170000/170000 [00:04<00:00, 34141.46 examples/s]
Filter: 100%|██████████| 97852/97852 [00:00<00:00, 430816.31 examples/s]
Casting the dataset: 100%|██████████| 10894/10894 [00:00<00:00, 1073204.34 examples/s]
Map: 100%|██████████| 10894/10894 [00:00<00:00, 31693.97 examples/s]
Filter: 100%|██████████| 97852/97852 [00:00<00:00, 521912.51 examples/s]
Map: 100%|██████████| 14810/14810 [00:00<00:00, 32806.87 examples/s]


HotpotQAChunked: NumOfQueries: 7405, NumOfCorpus:5233329, total: 5240734


Casting the dataset: 100%|██████████| 532751/532751 [00:00<00:00, 2807534.67 examples/s]
Map: 100%|██████████| 532751/532751 [00:17<00:00, 31083.68 examples/s]
Filter: 100%|██████████| 509962/509962 [00:01<00:00, 412982.29 examples/s]
Casting the dataset: 100%|██████████| 7437/7437 [00:00<00:00, 317761.31 examples/s]
Map: 100%|██████████| 7437/7437 [00:00<00:00, 29931.40 examples/s]
Filter: 100%|██████████| 509962/509962 [00:01<00:00, 504039.54 examples/s]
Map: 100%|██████████| 9260/9260 [00:00<00:00, 33182.95 examples/s]


MSMARCOChunked: NumOfQueries: 43, NumOfCorpus:8841823, total: 8841866


Casting the dataset: 100%|██████████| 919/919 [00:00<00:00, 220689.65 examples/s]
Map: 100%|██████████| 919/919 [00:00<00:00, 21059.86 examples/s]
Filter: 100%|██████████| 1109/1109 [00:00<00:00, 222463.20 examples/s]
Map: 100%|██████████| 339/339 [00:00<00:00, 24582.37 examples/s]


SciFactChunked: NumOfQueries: 300, NumOfCorpus:5183, total: 5483
NarrativeQAChunked: NumOfQueries: 10557, NumOfCorpus:355, total: 10912


Map: 100%|██████████| 12334/12334 [00:00<00:00, 33299.35 examples/s]


NFCorpusChunked: NumOfQueries: 323, NumOfCorpus:3633, total: 3956


Map: 100%|██████████| 7626/7626 [00:00<00:00, 32457.30 examples/s]
Map: 100%|██████████| 15675/15675 [00:00<00:00, 32985.30 examples/s]


QuoraChunked: NumOfQueries: 10000, NumOfCorpus:522931, total: 532931


Map: 100%|██████████| 14166/14166 [00:00<00:00, 32800.23 examples/s]
Map: 100%|██████████| 1238/1238 [00:00<00:00, 23219.79 examples/s]
Map: 100%|██████████| 1706/1706 [00:00<00:00, 31877.23 examples/s]


FiQA2018Chunked: NumOfQueries: 648, NumOfCorpus:57638, total: 58286


Map: 100%|██████████| 66336/66336 [00:01<00:00, 34460.22 examples/s]


TRECCOVIDChunked: NumOfQueries: 50, NumOfCorpus:171332, total: 171382
LEMBWikimQARetrievalChunked: NumOfQueries: 300, NumOfCorpus:300, total: 600


Map: 100%|██████████| 29928/29928 [00:00<00:00, 34001.48 examples/s]


SCIDOCSChunked: NumOfQueries: 1000, NumOfCorpus:25657, total: 26657


Map: 100%|██████████| 2214/2214 [00:00<00:00, 25155.67 examples/s]


Touche2020Chunked: NumOfQueries: 49, NumOfCorpus:382545, total: 382594
CmedqaRetrievalChunked: NumOfQueries: 3999, NumOfCorpus:100001, total: 104000
CovidRetrievallChunked: NumOfQueries: 949, NumOfCorpus:100001, total: 100950
DuRetrievalChunked: NumOfQueries: 2000, NumOfCorpus:100001, total: 102001
EcomRetrievalChunked: NumOfQueries: 1000, NumOfCorpus:100902, total: 101902


Using the latest cached version of the dataset since C-MTEB/MedicalRetrieval couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___medical_retrieval/default/0.0.0/2039188fb5800a9803ba5048df7b76e6fb151fc6 (last modified on Wed Oct  9 10:34:14 2024).
Using the latest cached version of the dataset since C-MTEB/MedicalRetrieval-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___medical_retrieval-qrels/default/0.0.0/37b8efec53c54c3d9c6af212f6710b62ccdf895c (last modified on Wed Oct  9 10:34:20 2024).
Using the latest cached version of the dataset since C-MTEB/MMarcoRetrieval couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___m_marco_retrieval/default/0.0.0/539bbde593d947e2a124ba72651aafc09eb33fc2 (last

MedicalRetrievalChunked: NumOfQueries: 1000, NumOfCorpus:100999, total: 101999


Using the latest cached version of the dataset since C-MTEB/MMarcoRetrieval-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___m_marco_retrieval-qrels/default/0.0.0/bae08bb7bddbedb96c7e7db52018a55167b67f89 (last modified on Wed Oct  9 10:36:15 2024).
Using the latest cached version of the dataset since C-MTEB/T2Retrieval couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___t2_retrieval/default/0.0.0/8731a845f1bf500a4f111cf1070785c793d10e64 (last modified on Wed Oct  9 10:42:58 2024).


MMarcoRetrievalChunked: NumOfQueries: 6980, NumOfCorpus:106813, total: 113793


Using the latest cached version of the dataset since C-MTEB/T2Retrieval-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___t2_retrieval-qrels/default/0.0.0/1c83b8d1544e529875e3f6930f3a1fcf749a8e97 (last modified on Wed Oct  9 10:43:05 2024).
Using the latest cached version of the dataset since C-MTEB/VideoRetrieval couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___video_retrieval/default/0.0.0/58c2597a5943a2ba48f4668c3b90d796283c5639 (last modified on Wed Oct  9 10:50:47 2024).


T2RetrievalChunked: NumOfQueries: 22812, NumOfCorpus:118605, total: 141417


Using the latest cached version of the dataset since C-MTEB/VideoRetrieval-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/C-MTEB___video_retrieval-qrels/default/0.0.0/faa71382b6a29cf1778d1f436b963e75cb5b927c (last modified on Wed Oct  9 10:50:53 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalQasEn2Zh couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_qas_en2_zh/default/0.0.0/40e9603cb70de463b5927796467354c6761cbfca (last modified on Wed Oct  9 15:21:27 2024).


VideoRetrievalChunked: NumOfQueries: 1000, NumOfCorpus:100930, total: 101930


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalQasEn2Zh-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_qas_en2_zh-qrels/default/0.0.0/30811ae53ff352f322509ccc72bb995a6efbb9f6 (last modified on Wed Oct  9 15:29:41 2024).


CrosslingualRetrievalQasEn2ZhChunked: NumOfQueries: 20000, NumOfCorpus:79955, total: 99955


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalBooksEn2Zh couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_books_en2_zh/default/0.0.0/a6635b6e7a2fde2b89934b3a6893966bc47dee91 (last modified on Wed Oct  9 15:56:24 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalBooksEn2Zh-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_books_en2_zh-qrels/default/0.0.0/af060b6e545f7ddc317f2cb8f78ca5e9a8212e5c (last modified on Wed Oct  9 15:59:57 2024).


CrosslingualRetrievalBooksEn2ZhChunked: NumOfQueries: 31172, NumOfCorpus:4614, total: 35786


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalBooksZh2En couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_books_zh2_en/default/0.0.0/0bd118d1fabe3618a79cc21a8cf060b190df76ec (last modified on Wed Oct  9 15:58:07 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalBooksZh2En-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_books_zh2_en-qrels/default/0.0.0/855524ebb691026a5818f02056df77fcc078c333 (last modified on Wed Oct  9 16:01:24 2024).


CrosslingualRetrievalBooksZh2EnChunked: NumOfQueries: 31172, NumOfCorpus:4614, total: 35786


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalFinanceEn2Zh couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_finance_en2_zh/default/0.0.0/e3fdb61b92df7b08c8408f5899b4d79535772ac8 (last modified on Wed Oct  9 15:56:36 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalFinanceEn2Zh-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_finance_en2_zh-qrels/default/0.0.0/68a0bd8bc697b82f5ee59bdd4080996306075a3b (last modified on Wed Oct  9 16:00:29 2024).


CrosslingualRetrievalFinanceEn2ZhChunked: NumOfQueries: 25045, NumOfCorpus:4668, total: 29713


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalFinanceZh2En couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_finance_zh2_en/default/0.0.0/1275094b1120bc158a7b7affc26293fc8cdc48b9 (last modified on Wed Oct  9 15:58:22 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalFinanceZh2En-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_finance_zh2_en-qrels/default/0.0.0/1c33dd7b1fa3bf20978045cd7d431b6d2dd702ed (last modified on Wed Oct  9 16:01:30 2024).


CrosslingualRetrievalFinanceZh2EnChunked: NumOfQueries: 25020, NumOfCorpus:4668, total: 29688


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalLawEn2Zh couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_law_en2_zh/default/0.0.0/662a7835baa0d01b75f35bdfcca2206b1f6fd4c5 (last modified on Wed Oct  9 15:55:08 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalLawEn2Zh-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_law_en2_zh-qrels/default/0.0.0/a4357a5466ac22f915661ff68290c219d8ca24ec (last modified on Wed Oct  9 15:36:20 2024).


CrosslingualRetrievalLawEn2ZhChunked: NumOfQueries: 26642, NumOfCorpus:4899, total: 31541


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalLawZh2En couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_law_zh2_en/default/0.0.0/15bc6fe6bac5f9994821e237e5099682f15ed6fe (last modified on Wed Oct  9 15:57:47 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalLawZh2En-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_law_zh2_en-qrels/default/0.0.0/dee9d5954180d1a8e026899218606731220d0755 (last modified on Wed Oct  9 16:01:19 2024).


CrosslingualRetrievalLawZh2EnChunked: NumOfQueries: 26653, NumOfCorpus:4897, total: 31550


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalPaperEn2Zh couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_paper_en2_zh/default/0.0.0/7b51e14b8de01ad31bc504e6804a069400182acd (last modified on Wed Oct  9 15:56:47 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalPaperEn2Zh-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_paper_en2_zh-qrels/default/0.0.0/a4a00ba721bac75cab899f1777de9e1c57a4924b (last modified on Wed Oct  9 16:00:37 2024).


CrosslingualRetrievalPaperEn2ZhChunked: NumOfQueries: 22456, NumOfCorpus:5076, total: 27532


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalPaperZh2En couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_paper_zh2_en/default/0.0.0/91772595b92a9eaa3de8da680fa2df1289d234aa (last modified on Wed Oct  9 15:58:38 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalPaperZh2En-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_paper_zh2_en-qrels/default/0.0.0/080ddcb391a28646cc3db6343baa292bef1c869d (last modified on Wed Oct  9 16:01:35 2024).


CrosslingualRetrievalPaperZh2EnChunked: NumOfQueries: 22515, NumOfCorpus:5101, total: 27616


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalWikiEn2Zh couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_wiki_en2_zh/default/0.0.0/e005f084db042b883a672a62f111c15c2e570dee (last modified on Wed Oct  9 15:57:00 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalWikiEn2Zh-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_wiki_en2_zh-qrels/default/0.0.0/48ffbd2c616e176d786e6a1b42c3d6af79d4964b (last modified on Wed Oct  9 16:01:12 2024).


CrosslingualRetrievalWikiEn2ZhChunked: NumOfQueries: 34060, NumOfCorpus:6506, total: 40566


Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalWikiZh2En couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_wiki_zh2_en/default/0.0.0/8d0fceb8bda8345eea22c42f4b57b2ceebd51010 (last modified on Wed Oct  9 15:59:42 2024).
Using the latest cached version of the dataset since maidalun1020/CrosslingualRetrievalWikiZh2En-qrels couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/zyh/.cache/huggingface/datasets/maidalun1020___crosslingual_retrieval_wiki_zh2_en-qrels/default/0.0.0/97e68a0530c3bb26403fe76837e5df131bad96f1 (last modified on Wed Oct  9 16:01:40 2024).


CrosslingualRetrievalWikiZh2EnChunked: NumOfQueries: 34062, NumOfCorpus:6504, total: 40566


In [4]:
# Report lines that occur more than once in master.log. Lines are compared
# with surrounding whitespace (including the trailing newline) removed.
with open("master.log", "r") as f:
    lines = f.readlines()

dedup = set()
for line in lines:
    # Normalize once and use the same value for BOTH the membership test and
    # the insert. Testing the raw line against a set of stripped lines never
    # matches for any line that ends in a newline.
    stripped = line.strip()
    if not stripped:
        # Blank lines carry no content; do not report them as duplicates.
        continue
    if stripped in dedup:
        print("Duplicate: " + stripped)
    dedup.add(stripped)