# 调整benchmark.json以使其合法

In [2]:
import json

# Directory containing benchmark.json, which load_existing_results() reads and rewrites.
OUTPUT_DIR = "results"

def load_existing_results(output_dir=None):
    """Clean ``benchmark.json`` in place by dropping invalid and duplicate records.

    The file is expected to contain a JSON list of evaluation records. A record
    is dropped when it is:

    * ``None`` (JSON ``null``);
    * a ``semantic_langchain`` run on one of the task names listed in
      ``excluded_tasks`` below whose ``model_name`` does not contain ``"bce"``;
    * a repeat of an earlier kept record with the same ``task_name``,
      ``model_name``, ``chunking_strategy`` and ``chunk_size``. The first
      occurrence wins and every repeat is printed for inspection.

    The surviving records are written back to the same file, in their original
    relative order, as UTF-8 JSON indented by four spaces.

    Args:
        output_dir: Directory containing ``benchmark.json``. When ``None``
            (the default) the module-level ``OUTPUT_DIR`` is used.

    Returns:
        None. The result is the rewritten file.

    Raises:
        KeyError: If a non-null record lacks one of the fields read here.
        OSError, json.JSONDecodeError: Propagated from reading/parsing the file.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    benchmark_path = f"{output_dir}/benchmark.json"

    # Tasks whose `semantic_langchain` results are discarded unless produced by a "bce" model.
    # NOTE(review): "CovidRetrievallChunked" has a doubled "l" - confirm it matches the
    # task name actually stored in benchmark.json; it is left unchanged here.
    excluded_tasks = (
        "SciFactChunked",
        "NFCorpusChunked",
        "FiQA2018Chunked",
        "LEMBWikimQARetrievalChunked",
        "SCIDOCSChunked",
        "CmedqaRetrievalChunked",
        "CovidRetrievallChunked",
        "DuRetrievalChunked",
        "T2RetrievalChunked",
    )

    # Read with the same encoding used for writing below; the file is written with
    # ensure_ascii=False, so it may contain raw non-ASCII characters.
    with open(benchmark_path, "r", encoding="utf-8") as f:
        content = json.load(f)

    # Maps the canonical JSON of an evaluation setting -> (first record seen, its index).
    evaluated_key = dict()
    correct = []
    for idx, eval_res in enumerate(content):
        if eval_res is None:
            continue
        if (
            eval_res["task_name"] in excluded_tasks
            and eval_res["chunking_strategy"] == "semantic_langchain"
            and "bce" not in eval_res["model_name"]
        ):
            continue
        eval_setting = {
            "task_name": eval_res["task_name"],
            "model_name": eval_res["model_name"],
            "chunking_strategy": eval_res["chunking_strategy"],
            "chunk_size": eval_res["chunk_size"],
        }
        # sort_keys makes the serialized setting a stable dictionary key.
        key = json.dumps(eval_setting, sort_keys=True)
        if key in evaluated_key:
            print(f"key: {key}")
            print(f"found@{idx}: {eval_res}")
            print(f"duplicate@{evaluated_key[key][1]}: {evaluated_key[key][0]}")
            print("\n\n")
            continue
        correct.append(eval_res)
        evaluated_key[key] = (eval_res, idx)

    with open(benchmark_path, "w", encoding="utf-8") as f:
        json.dump(correct, f, ensure_ascii=False, indent=4)

# Run the clean-up once; this rewrites benchmark.json under OUTPUT_DIR in place.
load_existing_results()

# 统计各数据集的大小

In [1]:
import json
import os
import concurrent.futures
# Set before the model libraries below are imported; an empty value presumably
# hides every GPU so that this cell runs on CPU only - TODO confirm intent.
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np
# NOTE(review): wildcard import; get_eval_tasks() used below presumably comes from here.
from chunked_pooling.chunked_eval_tasks import *
from chunked_pooling.wrappers import load_model
from transformers import AutoModel, AutoTokenizer

# Mapping indexed by task name; task() below looks up a class in it and instantiates it.
task_name_to_cls = get_eval_tasks()
model_name = "jinaai/jina-embeddings-v2-small-en"
# load_model returns a pair; the second element is forwarded to each task as
# `model_has_instructions`.
model, has_instructions = load_model(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# `model` is not used again in this cell after this call.
model.eval()

def task(task_name):
    """Load one evaluation task and summarise the size of its queries and corpus.

    The task class registered under ``task_name`` in ``task_name_to_cls`` is
    instantiated with fixed chunking settings, its data is loaded, and the
    ``"test"`` split is inspected (falling back to ``"dev"`` when the queries
    have no ``"test"`` entry).

    Args:
        task_name: Key into the module-level ``task_name_to_cls`` mapping.

    Returns:
        A single-entry dict ``{task_name: {"queries": ..., "corpus": ...,
        "average_corpus_length": ...}}`` where the average is the mean of
        ``len(str(doc))`` over the corpus values. The same figures are also
        printed as one summary line.
    """
    eval_task = task_name_to_cls[task_name](
        tokenizer=tokenizer,
        prune_size=None,
        truncate_max_length=False,
        chunk_size=1024,
        n_sentences=5,
        chunking_strategy="fixed_text",
        model_has_instructions=has_instructions,
        embedding_model_name=model_name,
    )
    eval_task.load_data()

    # Prefer the test split; fall back to dev when there is no test split.
    if "test" in eval_task.queries:
        split = "test"
    else:
        split = "dev"
    split_queries = eval_task.queries[split]
    split_corpus = eval_task.corpus[split]

    # Length is measured on the string form of each corpus value.
    avg_doc_chars = np.array([len(str(doc)) for doc in split_corpus.values()]).mean()

    n_queries = len(split_queries)
    n_docs = len(split_corpus)
    print(f"{task_name}: NumOfQueries: {n_queries}, NumOfCorpus:{n_docs}, total: {n_queries+n_docs}, average_corpus_length: {avg_doc_chars}")
    return {task_name: {"queries": n_queries, "corpus": n_docs, "average_corpus_length": avg_doc_chars}}


# Run task() for every registered task name in worker processes, then store the
# merged per-task statistics in sample_count.json.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # list() drains the map iterator, so every task has finished past this line.
    results = list(executor.map(task, task_name_to_cls.keys()))
    # Each result is a {task_name: stats} dict; fold them into a single mapping.
    sample_count = dict(
        entry for per_task in results for entry in per_task.items()
    )

    with open("sample_count.json", "w") as f:
        json.dump(sample_count, f)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 1406/1406 [00:00<00:00, 26776.45 examples/s]


ArguAnaChunked: NumOfQueries: 1406, NumOfCorpus:8674, total: 10080, average_corpus_length: 1054.4003919760203


Map: 100%|██████████| 4681/4681 [00:00<00:00, 38471.56 examples/s]
Map: 100%|██████████| 5673/5673 [00:00<00:00, 42874.67 examples/s]
Filter: 100%|██████████| 467/467 [00:00<00:00, 98816.46 examples/s]
Map: 100%|██████████| 140085/140085 [00:03<00:00, 43588.90 examples/s]
Filter: 100%|██████████| 123142/123142 [00:00<00:00, 512098.62 examples/s]


ClimateFEVERChunked: NumOfQueries: 1535, NumOfCorpus:5416593, total: 5418128, average_corpus_length: 563.2428945649045


Map: 100%|██████████| 43515/43515 [00:01<00:00, 41679.02 examples/s]
Filter: 100%|██████████| 467/467 [00:00<00:00, 170000.00 examples/s]
Map: 100%|██████████| 8079/8079 [00:00<00:00, 39267.97 examples/s]
Filter: 100%|██████████| 123142/123142 [00:00<00:00, 594758.00 examples/s]


DBPediaChunked: NumOfQueries: 400, NumOfCorpus:4635922, total: 4636322, average_corpus_length: 334.4551515318851


Map: 100%|██████████| 7937/7937 [00:00<00:00, 39013.28 examples/s]
Filter: 100%|██████████| 123142/123142 [00:00<00:00, 567393.59 examples/s]


FEVERChunked: NumOfQueries: 6666, NumOfCorpus:5416568, total: 5423234, average_corpus_length: 563.2350253887702


: 

In [4]:
# Report every line of master.log that repeats an earlier line, comparing
# lines with leading/trailing whitespace removed.
with open("master.log", "r") as f:
    lines = f.readlines()

dedup = set()
for line in lines:
    # readlines() keeps the trailing newline, so the raw line never matched the
    # stripped entries stored in the set. Normalise once and use the same form
    # for both the lookup and the insertion.
    stripped = line.strip()
    if stripped in dedup:
        print("Duplicate: " + line)
    dedup.add(stripped)

# 下载数据集

In [None]:
from datasets import load_dataset

# Fetch every cross-lingual retrieval dataset and its "-qrels" companion.
# The return values are discarded; the calls are made only for the download.
HUB_NAMESPACE = "maidalun1020"

# Domains available per translation direction. "Qas" is only listed for En2Zh.
DOMAINS_BY_DIRECTION = {
    "En2Zh": ["Qas", "Law", "Books", "Finance", "Paper", "Wiki"],
    "Zh2En": ["Law", "Books", "Finance", "Paper", "Wiki"],
}

# First the datasets themselves, then the matching "-qrels" datasets, each in
# the order En2Zh -> Zh2En.
for suffix in ("", "-qrels"):
    for direction, domains in DOMAINS_BY_DIRECTION.items():
        for domain in domains:
            load_dataset(f"{HUB_NAMESPACE}/CrosslingualRetrieval{domain}{direction}{suffix}")

# 统计各个数据集Corpus文本的长度

In [None]:
import json
import os
import concurrent.futures
# Set before the model libraries below are imported; an empty value presumably
# hides every GPU so that this cell runs on CPU only - TODO confirm intent.
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np
# NOTE(review): wildcard import; get_eval_tasks() used below presumably comes from here.
from chunked_pooling.chunked_eval_tasks import *
from chunked_pooling.wrappers import load_model
from transformers import AutoModel, AutoTokenizer

# Mapping indexed by task name; task() below looks up a class in it and instantiates it.
task_name_to_cls = get_eval_tasks()
model_name = "jinaai/jina-embeddings-v2-small-en"
# load_model returns a pair; the second element is forwarded to each task as
# `model_has_instructions`.
model, has_instructions = load_model(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# `model` is not used again in this cell after this call.
model.eval()


def task(task_name):
    """Load one evaluation task and measure the length of each corpus document.

    The task class registered under ``task_name`` in ``task_name_to_cls`` is
    instantiated with fixed chunking settings and its data is loaded. The
    corpus of the ``"test"`` split is used, falling back to ``"dev"`` when the
    queries have no ``"test"`` entry.

    Args:
        task_name: Key into the module-level ``task_name_to_cls`` mapping.

    Returns:
        A single-entry dict ``{task_name: lengths}`` where ``lengths`` is a
        NumPy array holding ``len(str(doc))`` for each corpus value, in
        iteration order.
    """
    eval_task = task_name_to_cls[task_name](
        tokenizer=tokenizer,
        prune_size=None,
        truncate_max_length=False,
        chunk_size=1024,
        n_sentences=5,
        chunking_strategy="fixed_text",
        model_has_instructions=has_instructions,
        embedding_model_name=model_name,
    )
    eval_task.load_data()

    # Prefer the test split; fall back to dev when there is no test split.
    if "test" in eval_task.queries:
        split = "test"
    else:
        split = "dev"
    split_corpus = eval_task.corpus[split]

    # Length is measured on the string form of each corpus value.
    char_counts = np.array([len(str(doc)) for doc in split_corpus.values()])
    return {task_name: char_counts}


# Run task() for every registered task name in worker processes and save the
# list of {task_name: lengths} dicts to doc_len.npy.
with concurrent.futures.ProcessPoolExecutor() as executor:
    # The comprehension drains the map iterator, so every task has finished here.
    doc_len_list = [
        per_task for per_task in executor.map(task, task_name_to_cls.keys())
    ]
    # NOTE(review): a list of dicts is presumably stored as a pickled object
    # array, so reading it back would need np.load(..., allow_pickle=True) - confirm.
    np.save(file="doc_len.npy", arr=doc_len_list)
