In [None]:
import xxhash
import json
import numpy as np
from functools import lru_cache
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from transformers import AutoTokenizer
import tqdm
import torch

In [None]:
@lru_cache(maxsize=10)
def _cache_json(path: Path):
    """Load a UTF-8 encoded JSON file and memoize the parsed result.

    This is a plain module-level function, so it is decorated with
    ``lru_cache`` only. Wrapping it in ``staticmethod`` outside a class body
    would hide ``cache_info()`` / ``cache_clear()`` and make the name
    uncallable on Python versions before 3.10.

    Args:
        path: Location of the JSON file. It is used as the cache key, so it
            must be hashable (``str`` and ``Path`` both are); a ``str`` and a
            ``Path`` naming the same file are cached as separate entries.

    Returns:
        The decoded JSON document. Cache hits return the *same* object, so
        callers should treat the result as read-only.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def _compute_hash(token_ids: list[int]):
    """Compute a 64-bit xxHash digest over a full prompt token sequence.

    The ids are converted to a fixed-width int64 buffer before hashing so
    that the digest depends only on the token values, not on the platform's
    default integer width or on the dtype of an array passed by the caller.

    Args:
        token_ids: Token ids of the complete prompt.

    Returns:
        The xxh64 digest as a Python ``int``.
    """
    # Pin the dtype: without it, the same ids hash differently as a list vs.
    # an int32 array, and on platforms whose default NumPy int is 32-bit.
    raw = np.asarray(token_ids, dtype=np.int64).tobytes()
    return xxhash.xxh64(raw).intdigest()

def read_jsonl(path: str | Path) -> list:
    """读取 JSONL 文件并返回 list"""
    file_path = Path(path)
    
    if not file_path.exists():
        print(f"文件不存在: {file_path}")
        raise FileNotFoundError(f"文件不存在: {file_path}")

    data_list = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # 跳过空行
                data_list.append(json.loads(line))
    return data_list

def serialize_token_ids(token_ids: list[int] | np.ndarray) -> str:
    """将 token_ids 序列化为 [数值,数值,数值] 格式"""
    if isinstance(token_ids, np.ndarray):
        return str(token_ids.tolist())
    else:
        return str(token_ids)

In [None]:
# Scratch cell: compare two ways of inserting size-1 axes into a 1-D tensor.
t1 = torch.Tensor([1, 2])
# Indexing with None adds a trailing axis -> shape (2, 1).
t2 = t1[:, None]
# Surround the data axis with singleton axes -> shape (1, 2, 1, 1).
t3 = t1.reshape(1, -1, 1, 1)
print(t1, t2, t3, sep="\n")

In [None]:
# Draw Exponential(rate=1) noise into a fresh tensor shaped like t1 (t1 itself
# is untouched), then shift by a tiny epsilon.
# NOTE(review): the epsilon presumably keeps later math away from exact zero - confirm.
torch.empty_like(t1).exponential_(lambd=1).add(1e-10)

In [None]:
# In-place variant: exponential_ overwrites t1's own values with
# Exponential(rate=1) samples; the epsilon shift yields a new tensor.
t1.exponential_(lambd=1).add(1e-10)