In [1]:
import json
import faiss 
import torch
import numpy as np
from tqdm import trange
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


### Load the Embedding Model
### 加载embedding模型

In [2]:
# Local checkpoint directory of the embedding model. Kept in its own name so
# `embedding_model` only ever refers to the loaded model object.
embedding_model_path = "./model/nomic-embed-text-v1"
# trust_remote_code=True permits running the model code bundled with the checkpoint.
embedding_model = SentenceTransformer(embedding_model_path, trust_remote_code=True)

# Use the GPU when one is available; otherwise stay on CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_model.to(device)

  state_dict = torch.load(f"{model_name}/pytorch_model.bin")
<All keys matched successfully>


SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: NomicBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

### Read the pre-prepared JSON file in the format required by TinyDB (refer to arxiv_paper_db.json for the specific format). Each index corresponds to a single paper.
### 读取准备好符合tinydb要求json文件（具体格式参考arxiv_paper_db.json）, 每个索引index对应一篇paper

In [3]:
# Read the TinyDB-style JSON database. The encoding is given explicitly so the
# non-ASCII text in the records is decoded the same way on every platform.
with open('./database/arxiv_paper_db.json', 'r', encoding='utf-8') as f:
    papers = json.load(f)
# List of (key, record) pairs taken from the 'cs_paper_info' table.
papers_l = list(papers['cs_paper_info'].items())

In [4]:
# Preview the first ten (key, record) pairs to check the structure of the loaded data.
papers_l[:10]

[('1',
  {'id': '1811.06122v1',
   'title': 'The case for shifting the Renyi Entropy',
   'url': 'http://arxiv.org/pdf/1811.06122v1',
   'date': '2018-11-14',
   'abs': 'We introduce a variant of the R\\\'enyi entropy definition that aligns it with\nthe well-known H\\"older mean: in the new formulation, the r-th order R\\\'enyi\nEntropy is the logarithm of the inverse of the r-th order H\\"older mean. This\nbrings about new insights into the relationship of the R\\\'enyi entropy to\nquantities close to it, like the information potential and the partition\nfunction of statistical mechanics. We also provide expressions that allow us to\ncalculate the R\\\'enyi entropies from the Shannon cross-entropy and the escort\nprobabilities. Finally, we discuss why shifting the R\\\'enyi entropy is fruitful\nin some applications.',
   'cat': 'cs.IT',
   'authors': ['Francisco José Valverde-Albacete', 'Carmen Peláez-Moreno']}),
 ('2',
  {'id': '1811.06115v1',
   'title': 'Deep Learning in the Wavele

## Get embeddings of abs and title
## 对title和abs做embedding

In [5]:
def get_embeddings(text_l, batch_size=32, prefix='search_document: '):
    """Embed a collection of texts with the notebook-level ``embedding_model``.

    Each text is prepended with ``prefix`` before encoding. The texts are
    processed in slices of ``batch_size`` (with a progress bar) and the
    per-slice results are stacked row-wise.

    Args:
        text_l: Iterable of strings to embed.
        batch_size: Number of texts per slice; also forwarded to
            ``embedding_model.encode`` so it controls the real forward-pass
            batch size. Must be positive.
        prefix: String prepended to every text. Defaults to the document
            prefix used originally; pass a different prefix (e.g. for
            queries) when needed.

    Returns:
        A 2-D numpy array with one row per input text. For empty input an
        array of shape ``(0, dim)`` is returned, where ``dim`` is whatever
        ``embedding_model.get_sentence_embedding_dimension()`` reports.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    texts = list(text_l)
    if not texts:
        # np.concatenate raises on an empty list, so return an empty matrix instead.
        # float32 is assumed to match the dtype of the encoder output -- TODO confirm.
        dim = embedding_model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    chunks = []
    for start in trange(0, len(texts), batch_size):
        batch_text = [prefix + text for text in texts[start:start + batch_size]]
        chunks.append(embedding_model.encode(batch_text, batch_size=batch_size))
    return np.concatenate(chunks, axis=0)

In [6]:
# Collect the title and abstract text of every paper, in the same order as papers_l.
title_l = [record['title'] for _key, record in papers_l]
abs_l = [record['abs'] for _key, record in papers_l]

In [7]:
# Embed every paper title; the result has one row per title.
# The recorded run of this cell took roughly nine and a half minutes.
title_embeddings = get_embeddings(title_l)

100%|██████████| 16803/16803 [09:33<00:00, 29.31it/s]


In [None]:
# Embed every paper abstract; the result has one row per abstract.
abs_embeddings = get_embeddings(abs_l)

### Convert embeddings into faiss-index
### 将向量储存为faiss index

In [None]:
# Build one flat L2 index per field; each index is sized from the
# second axis of its own embedding matrix.
title_dim = title_embeddings.shape[1]
title_index = faiss.IndexFlatL2(title_dim)
title_index.add(title_embeddings)

abs_dim = abs_embeddings.shape[1]
abs_index = faiss.IndexFlatL2(abs_dim)
abs_index.add(abs_embeddings)

### Save the FAISS indexes to disk (`titles.index` and `abstracts.index`), then use them to replace the corresponding .bin files in the database folder.
### 向量保存到本地，替换掉database文件夹中的.bin文件

In [None]:
# Both indexes are created with faiss.IndexFlatL2, i.e. they are already CPU
# indexes, so they can be written directly. faiss.index_gpu_to_cpu is only
# present in GPU builds of faiss and would fail on a CPU-only install.
faiss.write_index(title_index, 'titles.index')

faiss.write_index(abs_index, 'abstracts.index')

### Save the mapping from paper ID to index locally as `paperid_to_index.json`, and use it to replace the arxivid_to_index_abs.json file.
### 将paper id到索引的映射保存到本地，替换掉 arxivid_to_index_abs.json文件

In [26]:
# Map each paper's 'id' field to its database key converted to int.
# NOTE(review): the first key shown in the preview is '1', so these values look
# 1-based, while rows added to the FAISS indexes are numbered from 0 -- confirm
# that the consumer of this file accounts for the offset.
paperid_2_index = {record['id']: int(key) for key, record in papers_l}
with open('./paperid_to_index.json', 'w') as out_file:
    json.dump(paperid_2_index, out_file, indent=4)

### Modify the file paths in the `__init__` function within src/database.py.
### 对src/database.py中的__init__部分的初始化文件路径做相应的修改