In [1]:
import requests
import time
import json
import numpy as np
from tqdm import tqdm
import joblib

In [2]:
BATCH_SIZE = 64

model = "BAAI/bge-m3"  # 查看模型信息 https://cloud.siliconflow.cn/models
api_key = ""  # API密钥

In [3]:
cs_papers = []
with open("arxiv_cs-metadata.json", "r") as fi:
    for line in tqdm(fi):
        data = json.loads(line)
        cs_papers.append(data)


62081it [00:00, 111703.08it/s]


In [4]:
np.random.random((len(cs_papers), 1024)).shape


(62081, 1024)

In [5]:
cs_papers[0]

{'id': '0704.1274',
 'title': 'Parametric Learning and Monte Carlo Optimization',
 'abstract': "  This paper uncovers and explores the close relationship between Monte Carlo\nOptimization of a parametrized integral (MCO), Parametric machine-Learning\n(PL), and `blackbox' or `oracle'-based optimization (BO). We make four\ncontributions. First, we prove that MCO is mathematically identical to a broad\nclass of PL problems. This identity potentially provides a new application\ndomain for all broadly applicable PL techniques: MCO. Second, we introduce\nimmediate sampling, a new version of the Probability Collectives (PC) algorithm\nfor blackbox optimization. Immediate sampling transforms the original BO\nproblem into an MCO problem. Accordingly, by combining these first two\ncontributions, we can apply all PL techniques to BO. In our third contribution\nwe validate this way of improving BO by demonstrating that cross-validation and\nbagging improve immediate sampling. Finally, conventional

In [6]:
def get_embeddings(texts, model, api_key, max_retries=3, sleep_time=20):
    url = "https://api.siliconflow.cn/v1/embeddings"
    payload = {
        "model": model,  # 配置LLM embedding model
        "input": texts,
        "encoding_format": "float"
    }
    headers = {
        "Authorization": api_key,  # 添加siliconflow API 密钥，https://cloud.siliconflow.cn/account/ak
        "Content-Type": "application/json"
    }
    for attempt in range(max_retries):
        response = requests.request("POST", url, json=payload, headers=headers)
        if response.status_code == 200:
            response_embeds = json.loads(response.text)["data"]
            embeddings = [response_embeds[i]["embedding"] for i in range(len(texts))]
            return np.array(embeddings).astype(np.float32)  # (1, 1024)
        elif response.status_code == 429:
            print(f"遇到429错误，正在进行第{attempt+1}次重试...")
            time.sleep(sleep_time)  # 暂停几秒后重试
            continue
        else:
            print(f"Error: {response.status_code}")
            return None
    print(f"达到最大重试次数 {max_retries}，获取embeddings失败")
    return None

In [7]:
# 批量处理数据
embeddings = []  # concat(title, abstract)
titles = []
ids = []
abs = []
for i in tqdm(range(0, len(cs_papers), BATCH_SIZE)):
    batch = cs_papers[i:i + BATCH_SIZE]
    texts = [paper["title"].strip() + " " + paper["abstract"].strip() for paper in batch]

    batch_embeddings = get_embeddings(texts, model, api_key, sleep_time=10)  # ~ 30min
    if batch_embeddings is not None:
        embeddings.extend(batch_embeddings)
        titles.extend([paper["title"] for paper in batch])
        ids.extend([paper["id"] for paper in batch])
        abs.extend([paper["abstract"] for paper in batch])


  0%|          | 4/971 [00:03<12:17,  1.31it/s]

遇到429错误，正在进行第1次重试...


  1%|          | 8/971 [00:16<29:35,  1.84s/it]  

遇到429错误，正在进行第1次重试...


  1%|          | 11/971 [00:29<43:24,  2.71s/it] 

遇到429错误，正在进行第1次重试...


  2%|▏         | 21/971 [00:46<14:53,  1.06it/s]  

遇到429错误，正在进行第1次重试...


  3%|▎         | 33/971 [01:06<12:27,  1.25it/s]  

遇到429错误，正在进行第1次重试...


  5%|▍         | 45/971 [01:25<12:05,  1.28it/s]  

遇到429错误，正在进行第1次重试...


  6%|▌         | 56/971 [01:43<12:37,  1.21it/s]

遇到429错误，正在进行第1次重试...


  7%|▋         | 66/971 [02:02<13:27,  1.12it/s]

遇到429错误，正在进行第1次重试...


  8%|▊         | 76/971 [02:19<12:56,  1.15it/s]

遇到429错误，正在进行第1次重试...


  9%|▉         | 87/971 [02:38<12:20,  1.19it/s]

遇到429错误，正在进行第1次重试...


 10%|▉         | 96/971 [02:56<13:42,  1.06it/s]

遇到429错误，正在进行第1次重试...


 11%|█         | 106/971 [03:14<12:47,  1.13it/s]

遇到429错误，正在进行第1次重试...


 12%|█▏        | 116/971 [03:32<12:37,  1.13it/s]

遇到429错误，正在进行第1次重试...


 13%|█▎        | 126/971 [03:49<12:25,  1.13it/s]

遇到429错误，正在进行第1次重试...


 14%|█▍        | 136/971 [04:07<12:24,  1.12it/s]

遇到429错误，正在进行第1次重试...


 15%|█▌        | 147/971 [04:26<11:28,  1.20it/s]

遇到429错误，正在进行第1次重试...


 16%|█▌        | 156/971 [04:43<12:43,  1.07it/s]

遇到429错误，正在进行第1次重试...


 17%|█▋        | 165/971 [05:00<12:44,  1.05it/s]

遇到429错误，正在进行第1次重试...


 18%|█▊        | 176/971 [05:19<10:49,  1.22it/s]

遇到429错误，正在进行第1次重试...


 19%|█▉        | 186/971 [05:37<11:27,  1.14it/s]

遇到429错误，正在进行第1次重试...


 20%|██        | 196/971 [05:54<11:21,  1.14it/s]

遇到429错误，正在进行第1次重试...


 21%|██        | 206/971 [06:12<11:06,  1.15it/s]

遇到429错误，正在进行第1次重试...


 22%|██▏       | 215/971 [06:29<11:52,  1.06it/s]

遇到429错误，正在进行第1次重试...


 23%|██▎       | 225/971 [06:47<10:59,  1.13it/s]

遇到429错误，正在进行第1次重试...


 24%|██▍       | 234/971 [07:04<11:14,  1.09it/s]

遇到429错误，正在进行第1次重试...


 25%|██▌       | 244/971 [07:22<10:44,  1.13it/s]

遇到429错误，正在进行第1次重试...


 26%|██▌       | 253/971 [07:40<11:18,  1.06it/s]

遇到429错误，正在进行第1次重试...


 27%|██▋       | 263/971 [07:58<10:30,  1.12it/s]

遇到429错误，正在进行第1次重试...


 28%|██▊       | 272/971 [08:15<10:56,  1.07it/s]

遇到429错误，正在进行第1次重试...


 29%|██▉       | 280/971 [08:31<11:51,  1.03s/it]

遇到429错误，正在进行第1次重试...


 30%|██▉       | 289/971 [08:49<10:48,  1.05it/s]

遇到429错误，正在进行第1次重试...


 31%|███       | 297/971 [09:05<11:46,  1.05s/it]

遇到429错误，正在进行第1次重试...


 32%|███▏      | 306/971 [09:23<10:54,  1.02it/s]

遇到429错误，正在进行第1次重试...


 32%|███▏      | 315/971 [09:40<10:15,  1.07it/s]

遇到429错误，正在进行第1次重试...


 33%|███▎      | 324/971 [09:57<10:31,  1.03it/s]

遇到429错误，正在进行第1次重试...


 34%|███▍      | 331/971 [10:13<12:10,  1.14s/it]

遇到429错误，正在进行第1次重试...


 35%|███▍      | 339/971 [10:30<11:09,  1.06s/it]

遇到429错误，正在进行第1次重试...


 36%|███▌      | 347/971 [10:47<11:05,  1.07s/it]

遇到429错误，正在进行第1次重试...


 37%|███▋      | 355/971 [11:03<10:34,  1.03s/it]

遇到429错误，正在进行第1次重试...


 37%|███▋      | 364/971 [11:20<09:39,  1.05it/s]

遇到429错误，正在进行第1次重试...


 38%|███▊      | 371/971 [11:36<11:30,  1.15s/it]

遇到429错误，正在进行第1次重试...


 39%|███▉      | 380/971 [11:54<09:38,  1.02it/s]

遇到429错误，正在进行第1次重试...


 40%|███▉      | 387/971 [12:10<11:21,  1.17s/it]

遇到429错误，正在进行第1次重试...


 41%|████      | 395/971 [12:26<10:31,  1.10s/it]

遇到429错误，正在进行第1次重试...


 42%|████▏     | 403/971 [12:43<10:28,  1.11s/it]

遇到429错误，正在进行第1次重试...


 42%|████▏     | 411/971 [13:00<09:58,  1.07s/it]

遇到429错误，正在进行第1次重试...


 43%|████▎     | 419/971 [13:17<10:00,  1.09s/it]

遇到429错误，正在进行第1次重试...


 44%|████▍     | 427/971 [13:34<09:32,  1.05s/it]

遇到429错误，正在进行第1次重试...


 45%|████▍     | 435/971 [13:50<09:37,  1.08s/it]

遇到429错误，正在进行第1次重试...


 46%|████▌     | 443/971 [14:07<09:16,  1.05s/it]

遇到429错误，正在进行第1次重试...


 46%|████▋     | 451/971 [14:24<09:18,  1.07s/it]

遇到429错误，正在进行第1次重试...


 47%|████▋     | 459/971 [14:41<09:22,  1.10s/it]

遇到429错误，正在进行第1次重试...


 48%|████▊     | 467/971 [14:58<09:11,  1.09s/it]

遇到429错误，正在进行第1次重试...


 49%|████▉     | 474/971 [15:14<10:00,  1.21s/it]

遇到429错误，正在进行第1次重试...


 50%|████▉     | 482/971 [15:31<08:45,  1.07s/it]

遇到429错误，正在进行第1次重试...


 50%|█████     | 490/971 [15:48<09:06,  1.14s/it]

遇到429错误，正在进行第1次重试...


 51%|█████     | 497/971 [16:04<09:29,  1.20s/it]

遇到429错误，正在进行第1次重试...


 52%|█████▏    | 505/971 [16:20<08:20,  1.07s/it]

遇到429错误，正在进行第1次重试...


 53%|█████▎    | 513/971 [16:37<08:31,  1.12s/it]

遇到429错误，正在进行第1次重试...


 54%|█████▎    | 521/971 [16:54<08:06,  1.08s/it]

遇到429错误，正在进行第1次重试...


 54%|█████▍    | 528/971 [17:11<09:02,  1.22s/it]

遇到429错误，正在进行第1次重试...


 55%|█████▌    | 536/971 [17:27<07:54,  1.09s/it]

遇到429错误，正在进行第1次重试...


 56%|█████▌    | 544/971 [17:44<07:44,  1.09s/it]

遇到429错误，正在进行第1次重试...


 57%|█████▋    | 552/971 [18:01<07:37,  1.09s/it]

遇到429错误，正在进行第1次重试...


 58%|█████▊    | 559/971 [18:17<08:08,  1.19s/it]

遇到429错误，正在进行第1次重试...


 58%|█████▊    | 567/971 [18:34<07:16,  1.08s/it]

遇到429错误，正在进行第1次重试...


 59%|█████▉    | 575/971 [18:50<06:52,  1.04s/it]

遇到429错误，正在进行第1次重试...


 60%|█████▉    | 582/971 [19:06<07:45,  1.20s/it]

遇到429错误，正在进行第1次重试...


 61%|██████    | 590/971 [19:23<06:52,  1.08s/it]

遇到429错误，正在进行第1次重试...


 62%|██████▏   | 598/971 [19:40<06:39,  1.07s/it]

遇到429错误，正在进行第1次重试...


 62%|██████▏   | 605/971 [19:56<07:19,  1.20s/it]

遇到429错误，正在进行第1次重试...


 63%|██████▎   | 613/971 [20:13<06:23,  1.07s/it]

遇到429错误，正在进行第1次重试...


 64%|██████▍   | 620/971 [20:29<07:04,  1.21s/it]

遇到429错误，正在进行第1次重试...


 65%|██████▍   | 627/971 [20:45<06:58,  1.22s/it]

遇到429错误，正在进行第1次重试...


 65%|██████▌   | 634/971 [21:01<06:53,  1.23s/it]

遇到429错误，正在进行第1次重试...


 66%|██████▌   | 642/971 [21:18<05:53,  1.07s/it]

遇到429错误，正在进行第1次重试...


 67%|██████▋   | 650/971 [21:34<05:46,  1.08s/it]

遇到429错误，正在进行第1次重试...


 68%|██████▊   | 657/971 [21:50<06:09,  1.18s/it]

遇到429错误，正在进行第1次重试...


 68%|██████▊   | 664/971 [22:07<06:18,  1.23s/it]

遇到429错误，正在进行第1次重试...


 69%|██████▉   | 671/971 [22:23<05:55,  1.19s/it]

遇到429错误，正在进行第1次重试...


 70%|██████▉   | 679/971 [22:40<05:18,  1.09s/it]

遇到429错误，正在进行第1次重试...


 71%|███████   | 686/971 [22:56<05:40,  1.19s/it]

遇到429错误，正在进行第1次重试...


 71%|███████▏  | 694/971 [23:13<05:10,  1.12s/it]

遇到429错误，正在进行第1次重试...


 72%|███████▏  | 701/971 [23:41<22:38,  5.03s/it]

Error: 502


 73%|███████▎  | 712/971 [23:51<04:01,  1.07it/s]

遇到429错误，正在进行第1次重试...


 74%|███████▍  | 718/971 [24:06<05:41,  1.35s/it]

遇到429错误，正在进行第1次重试...


 75%|███████▍  | 725/971 [24:22<04:57,  1.21s/it]

遇到429错误，正在进行第1次重试...


 75%|███████▌  | 732/971 [24:38<04:53,  1.23s/it]

遇到429错误，正在进行第1次重试...


 76%|███████▌  | 739/971 [24:54<04:41,  1.21s/it]

遇到429错误，正在进行第1次重试...


 77%|███████▋  | 747/971 [25:11<03:59,  1.07s/it]

遇到429错误，正在进行第1次重试...


 78%|███████▊  | 754/971 [25:27<04:17,  1.19s/it]

遇到429错误，正在进行第1次重试...


 78%|███████▊  | 761/971 [25:43<04:16,  1.22s/it]

遇到429错误，正在进行第1次重试...


 79%|███████▉  | 768/971 [25:59<04:06,  1.21s/it]

遇到429错误，正在进行第1次重试...


 80%|███████▉  | 775/971 [26:15<04:04,  1.25s/it]

遇到429错误，正在进行第1次重试...


 81%|████████  | 782/971 [26:31<03:49,  1.22s/it]

遇到429错误，正在进行第1次重试...


 81%|████████▏ | 790/971 [26:48<03:17,  1.09s/it]

遇到429错误，正在进行第1次重试...


 82%|████████▏ | 797/971 [27:04<03:27,  1.19s/it]

遇到429错误，正在进行第1次重试...


 83%|████████▎ | 804/971 [27:20<03:23,  1.22s/it]

遇到429错误，正在进行第1次重试...


 84%|████████▎ | 812/971 [27:37<02:53,  1.09s/it]

遇到429错误，正在进行第1次重试...


 84%|████████▍ | 818/971 [27:53<03:28,  1.36s/it]

遇到429错误，正在进行第1次重试...


 85%|████████▌ | 826/971 [28:09<02:37,  1.09s/it]

遇到429错误，正在进行第1次重试...


 86%|████████▌ | 833/971 [28:25<02:46,  1.20s/it]

遇到429错误，正在进行第1次重试...


 87%|████████▋ | 840/971 [28:41<02:38,  1.21s/it]

遇到429错误，正在进行第1次重试...


 87%|████████▋ | 847/971 [28:58<02:32,  1.23s/it]

遇到429错误，正在进行第1次重试...


 88%|████████▊ | 854/971 [29:14<02:25,  1.24s/it]

遇到429错误，正在进行第1次重试...


 89%|████████▊ | 861/971 [29:30<02:12,  1.21s/it]

遇到429错误，正在进行第1次重试...


 89%|████████▉ | 868/971 [29:46<02:05,  1.22s/it]

遇到429错误，正在进行第1次重试...


 90%|█████████ | 875/971 [30:02<01:56,  1.22s/it]

遇到429错误，正在进行第1次重试...


 91%|█████████ | 883/971 [30:19<01:36,  1.10s/it]

遇到429错误，正在进行第1次重试...


 92%|█████████▏| 890/971 [30:35<01:35,  1.18s/it]

遇到429错误，正在进行第1次重试...


 92%|█████████▏| 897/971 [30:51<01:29,  1.21s/it]

遇到429错误，正在进行第1次重试...


 93%|█████████▎| 904/971 [31:07<01:20,  1.21s/it]

遇到429错误，正在进行第1次重试...


 94%|█████████▍| 911/971 [31:23<01:12,  1.22s/it]

遇到429错误，正在进行第1次重试...


 95%|█████████▍| 918/971 [31:39<01:03,  1.20s/it]

遇到429错误，正在进行第1次重试...


 95%|█████████▌| 925/971 [31:56<00:55,  1.21s/it]

遇到429错误，正在进行第1次重试...


 96%|█████████▌| 932/971 [32:12<00:47,  1.23s/it]

遇到429错误，正在进行第1次重试...


 97%|█████████▋| 939/971 [32:28<00:39,  1.23s/it]

遇到429错误，正在进行第1次重试...


 97%|█████████▋| 946/971 [32:44<00:30,  1.22s/it]

遇到429错误，正在进行第1次重试...


 98%|█████████▊| 953/971 [33:00<00:21,  1.22s/it]

遇到429错误，正在进行第1次重试...


 99%|█████████▉| 962/971 [33:18<00:08,  1.04it/s]

遇到429错误，正在进行第1次重试...


100%|██████████| 971/971 [33:34<00:00,  2.08s/it]


In [10]:
len(embeddings), len(titles), len(ids), len(abs)


(970, 62017, 62017, 62017)

In [14]:
embeddings = np.concatenate([item.reshape(-1, 1024) for item in embeddings], 0)


In [None]:
title + abstract  ==> embedding   # 62000

input: new search query, ==> API ==> embedding

# embedding retrieval
O(N)

# ANN, approximate nearest neighbor  # 向量近似检索


In [15]:
import joblib
joblib.dump((embeddings, titles, ids, abs), "processed_data/embeddings.pkl")


['processed_data/embeddings.pkl']

In [16]:
# import joblib 
embeddings, titles, ids, abs = joblib.load("processed_data/embeddings.pkl")


In [17]:
import faiss  # Facebook AI Search Similarity

nlist = 100  # the number of cells in IndexIVFFlat
k = 20  # the number of cells (out of nlist) that are visited to perform a search
d = 1024  # dimension of embeddings
quantizer = faiss.IndexFlatL2(d)  # the other index
index = faiss.IndexIVFFlat(quantizer, d, nlist)
# assert not index.is_trained
index.train(embeddings)
# assert index.is_trained

index.add(embeddings)      
index.nprobe = k            # add may be a bit slower as well



In [18]:
# 保存索引
faiss.write_index(index, "processed_data/faiss_index.bin")
