In [1]:
import numpy as np

def from_ibin(filename, start_idx = 0, chunk_size = None):
    """Read int32 vectors from an ``.ibin`` file into a 2-D array.

    The file is read as two leading int32 values (vector count, dimension)
    followed by the int32 payload, which is reshaped to ``(nvecs, dim)``.

    Args:
        filename: Path of the file to read (a ``str``; it is concatenated
            into the progress message).
        start_idx: Index of the first vector to read. Must lie in
            ``[0, total vectors in file]``.
        chunk_size: Number of vectors to read. ``None`` reads everything
            from ``start_idx`` to the end of the file.

    Returns:
        np.ndarray of dtype int32 and shape ``(nvecs, dim)``.

    Raises:
        ValueError: If the header is incomplete, ``start_idx`` or
            ``chunk_size`` is out of range, or the file holds fewer values
            than requested.
    """
    start_idx = int(start_idx)
    with open(filename, "rb") as f:
        header = np.fromfile(f, count = 2, dtype = np.int32)
        if header.size != 2:
            raise ValueError(f"{filename}: missing (nvecs, dim) header")
        # Convert to Python ints: arithmetic on np.int32 scalars is done in
        # 32 bits, so nvecs * dim or the byte offset would wrap/raise once
        # they exceed 2**31 - 1 (i.e. for files of a few GB).
        total_vecs, dim = int(header[0]), int(header[1])
        if not 0 <= start_idx <= total_vecs:
            raise ValueError(
                f"start_idx={start_idx} is outside [0, {total_vecs}]")
        if chunk_size is None:
            nvecs = total_vecs - start_idx
        else:
            nvecs = int(chunk_size)
            if nvecs < 0:
                raise ValueError(f"chunk_size must be >= 0, got {chunk_size}")
        # offset is in bytes, relative to the current position (just past
        # the 8-byte header); each vector occupies 4 * dim bytes.
        arr = np.fromfile(f, count = nvecs * dim, dtype = np.int32,
                          offset = start_idx * 4 * dim)
    print(filename + ":  " + str(nvecs) + "   " + str(dim))
    if arr.size != nvecs * dim:
        # np.fromfile silently returns fewer items on a short read.
        raise ValueError(
            f"{filename}: expected {nvecs * dim} values, read {arr.size}")
    return arr.reshape(nvecs, dim)

def to_ibin_padded(filename, arr, target_width=128, pad_value=-1,
                   write_header=False):
    """Write a 2-D array as int32 rows of exactly ``target_width`` columns.

    Rows narrower than ``target_width`` are right-padded with ``pad_value``;
    wider rows are truncated to the first ``target_width`` columns.

    Args:
        filename: Destination path; an existing file is overwritten.
        arr: 2-D array of shape ``(nvecs, dim)``.
        target_width: Number of columns per row in the output file.
        pad_value: Fill value for the added columns.
        write_header: When True, prefix the payload with two int32 values
            ``(nvecs, target_width)``, the layout ``from_ibin`` reads.
            The default (False) writes the raw payload only, so the file
            cannot be read back with ``from_ibin`` as-is.

    Returns:
        The padded/truncated array. When ``dim >= target_width`` this is
        ``arr`` itself or a view of it, with ``arr``'s dtype.
    """
    nvecs, dim = arr.shape
    if dim > target_width:
        print(f"警告: 输入维度({dim})大于目标宽度({target_width})，数据将被截断")
        padded_arr = arr[:, :target_width]
    elif dim == target_width:
        print(f"输入维度({dim})等于目标宽度({target_width})，无需填充")
        padded_arr = arr
    else:
        padded_arr = np.full((nvecs, target_width), pad_value, dtype=np.int32)
        padded_arr[:, :dim] = arr
        print(f"已将输入数据从形状({nvecs}, {dim})填充到({nvecs}, {target_width})")
    # copy=False avoids duplicating an array that is already int32; for the
    # 10M x 128 case that is a multi-GB temporary saved.
    payload = padded_arr.astype(np.int32, copy=False)
    header_bytes = 0
    with open(filename, "wb") as f:
        if write_header:
            header = np.array([nvecs, target_width], dtype=np.int32)
            header.tofile(f)
            header_bytes = header.nbytes
        payload.tofile(f)
    file_size_mb = (nvecs * target_width * 4 + header_bytes) / (1024 * 1024)
    print(f"已保存到{filename}: {nvecs}行 × {target_width}列")
    print(f"文件大小: {file_size_mb:.2f} MB")

    return padded_arr

In [2]:
# Load the whole ground-truth table: default start_idx / chunk_size read every row.
gt = from_ibin(filename="./data/t2i-10M/gt.train.10M.ibin")

./data/t2i-10M/gt.train.10M.ibin:  10000000   100


In [3]:
# Pad each ground-truth row to 128 columns and write it out; the returned
# array is the cell's displayed value.
to_ibin_padded(
    filename="./data/t2i-10000000-200/gt_10000000_128.ibin",
    arr=gt,
    target_width=128,
)

已将输入数据从形状(10000000, 100)填充到(10000000, 128)
已保存到./data/t2i-10000000-200/gt_10000000_128.ibin: 10000000行 × 128列
文件大小: 4882.81 MB


array([[ 591476, 8059312, 5695992, ...,      -1,      -1,      -1],
       [3276391, 9621456, 6119637, ...,      -1,      -1,      -1],
       [2024867, 5269721,  912634, ...,      -1,      -1,      -1],
       ...,
       [2588306, 5394456, 9255485, ...,      -1,      -1,      -1],
       [6742606,  511560, 1978136, ...,      -1,      -1,      -1],
       [6358973,  318644, 9713310, ...,      -1,      -1,      -1]],
      shape=(10000000, 128), dtype=int32)