In [1]:
import json
import os

with open("retrieval/wiki.json", "r") as f:
    wiki_sents = json.loads(f.read())



In [2]:
wiki_sents = {int(k) : v for k, v in wiki_sents.items()}
wiki_chunks = {k : " ".join(v) for k, v in wiki_sents.items()}  # https://arxiv.org/abs/2109.08133
wiki_chunks

{0: "The Universe is all of time and space and its contents. The Universe includes planets, stars, galaxies, the contents of intergalactic space, the smallest subatomic particles, and all matter and energy. The observable universe is about 28 billion parsecs (91 billion light-years) in diameter at the present time. The size of the whole Universe is not known and may be either finite or infinite. Observations and the development of physical theories have led to inferences about the composition and evolution of the Universe. Throughout recorded history, cosmologies and cosmogonies, including scientific models, have been proposed to explain observations of the Universe. The earliest quantitative geocentric models were developed by ancient Greek philosophers and Indian philosophers. Over the centuries, more precise astronomical observations led to Nicolaus Copernicus's heliocentric model of the Solar System and Johannes Kepler's improvement on that model with elliptical orbits, which was e

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = 'BAAI/bge-small-en'
device = "cuda"

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
import numpy as np
from tqdm import tqdm

out_path = f"retrieval/wiki_embeddings_{model_name.split('/')[-1]}.npy"
save_interval = 10000
wiki_chunks_list = list(wiki_chunks.values())
batch_size = 1000
sentence_embeddings = np.zeros((len(wiki_chunks_list), model.config.hidden_size))
for i in tqdm(range(0, len(wiki_chunks_list), batch_size)):
    # Tokenize sentences
    encoded_chunks = tokenizer(wiki_chunks_list[i:i + batch_size], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_chunks.to(device))
        # Perform pooling. In this case, cls pooling.
        embeddings = model_output[0][:, 0]
    # normalize embeddings
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    sentence_embeddings[i:i + batch_size, :] = embeddings.cpu().numpy()

    if i % save_interval == 0:
        np.save(out_path, sentence_embeddings)
        print(f"Saved embeddings for {i} sentences out of {len(wiki_chunks_list)}")
np.save(out_path, sentence_embeddings)
print(f"Saved embeddings for all sentences")

  0%|          | 0/3349 [00:00<?, ?it/s]

  0%|          | 1/3349 [00:07<6:58:29,  7.50s/it]

Saved embeddings for 0 sentences out of 3348807


  0%|          | 11/3349 [00:50<4:32:30,  4.90s/it]

Saved embeddings for 10000 sentences out of 3348807


  1%|          | 21/3349 [01:32<4:22:59,  4.74s/it]

Saved embeddings for 20000 sentences out of 3348807


  1%|          | 31/3349 [02:14<4:22:02,  4.74s/it]

Saved embeddings for 30000 sentences out of 3348807


  1%|          | 41/3349 [02:56<4:22:30,  4.76s/it]

Saved embeddings for 40000 sentences out of 3348807


  2%|▏         | 51/3349 [03:39<4:25:25,  4.83s/it]

Saved embeddings for 50000 sentences out of 3348807


  2%|▏         | 61/3349 [04:21<4:25:38,  4.85s/it]

Saved embeddings for 60000 sentences out of 3348807


  2%|▏         | 71/3349 [05:04<4:19:35,  4.75s/it]

Saved embeddings for 70000 sentences out of 3348807


  2%|▏         | 81/3349 [05:46<4:19:09,  4.76s/it]

Saved embeddings for 80000 sentences out of 3348807


  3%|▎         | 91/3349 [06:29<4:33:20,  5.03s/it]

Saved embeddings for 90000 sentences out of 3348807


  3%|▎         | 101/3349 [07:12<4:20:34,  4.81s/it]

Saved embeddings for 100000 sentences out of 3348807


  3%|▎         | 111/3349 [07:54<4:19:07,  4.80s/it]

Saved embeddings for 110000 sentences out of 3348807


  4%|▎         | 121/3349 [08:37<4:17:59,  4.80s/it]

Saved embeddings for 120000 sentences out of 3348807


  4%|▍         | 131/3349 [09:20<4:39:23,  5.21s/it]

Saved embeddings for 130000 sentences out of 3348807


  4%|▍         | 141/3349 [10:03<4:16:11,  4.79s/it]

Saved embeddings for 140000 sentences out of 3348807


  5%|▍         | 151/3349 [10:45<4:16:02,  4.80s/it]

Saved embeddings for 150000 sentences out of 3348807


  5%|▍         | 161/3349 [11:28<4:15:42,  4.81s/it]

Saved embeddings for 160000 sentences out of 3348807


  5%|▌         | 171/3349 [12:10<4:14:08,  4.80s/it]

Saved embeddings for 170000 sentences out of 3348807


  5%|▌         | 181/3349 [12:55<4:16:26,  4.86s/it]

Saved embeddings for 180000 sentences out of 3348807


  6%|▌         | 191/3349 [13:37<4:15:32,  4.86s/it]

Saved embeddings for 190000 sentences out of 3348807


  6%|▌         | 201/3349 [14:20<4:14:06,  4.84s/it]

Saved embeddings for 200000 sentences out of 3348807


  6%|▋         | 211/3349 [15:02<4:15:25,  4.88s/it]

Saved embeddings for 210000 sentences out of 3348807


  7%|▋         | 221/3349 [15:45<4:11:44,  4.83s/it]

Saved embeddings for 220000 sentences out of 3348807


  7%|▋         | 231/3349 [16:28<4:15:20,  4.91s/it]

Saved embeddings for 230000 sentences out of 3348807


  7%|▋         | 241/3349 [17:37<6:08:37,  7.12s/it]

Saved embeddings for 240000 sentences out of 3348807


  7%|▋         | 251/3349 [18:53<6:58:25,  8.10s/it]

Saved embeddings for 250000 sentences out of 3348807


  8%|▊         | 261/3349 [20:09<7:16:29,  8.48s/it]

Saved embeddings for 260000 sentences out of 3348807


  8%|▊         | 271/3349 [21:22<6:23:26,  7.47s/it]

Saved embeddings for 270000 sentences out of 3348807


  8%|▊         | 281/3349 [22:38<6:58:47,  8.19s/it]

Saved embeddings for 280000 sentences out of 3348807


  9%|▊         | 291/3349 [23:55<7:15:10,  8.54s/it]

Saved embeddings for 290000 sentences out of 3348807


  9%|▉         | 301/3349 [25:05<6:17:51,  7.44s/it]

Saved embeddings for 300000 sentences out of 3348807


  9%|▉         | 311/3349 [26:21<6:47:39,  8.05s/it]

Saved embeddings for 310000 sentences out of 3348807


 10%|▉         | 321/3349 [27:37<7:07:06,  8.46s/it]

Saved embeddings for 320000 sentences out of 3348807


 10%|▉         | 331/3349 [28:50<6:28:06,  7.72s/it]

Saved embeddings for 330000 sentences out of 3348807


 10%|█         | 341/3349 [30:06<6:47:32,  8.13s/it]

Saved embeddings for 340000 sentences out of 3348807


 10%|█         | 351/3349 [31:23<7:04:51,  8.50s/it]

Saved embeddings for 350000 sentences out of 3348807


 11%|█         | 361/3349 [32:36<6:22:48,  7.69s/it]

Saved embeddings for 360000 sentences out of 3348807


 11%|█         | 371/3349 [33:52<6:39:05,  8.04s/it]

Saved embeddings for 370000 sentences out of 3348807


 11%|█▏        | 381/3349 [35:08<7:00:41,  8.50s/it]

Saved embeddings for 380000 sentences out of 3348807


 12%|█▏        | 391/3349 [36:23<6:12:11,  7.55s/it]

Saved embeddings for 390000 sentences out of 3348807


 12%|█▏        | 401/3349 [37:39<6:42:08,  8.18s/it]

Saved embeddings for 400000 sentences out of 3348807


 12%|█▏        | 411/3349 [38:56<6:56:15,  8.50s/it]

Saved embeddings for 410000 sentences out of 3348807


 13%|█▎        | 421/3349 [39:40<4:03:55,  5.00s/it]

Saved embeddings for 420000 sentences out of 3348807


 13%|█▎        | 431/3349 [40:23<3:57:10,  4.88s/it]

Saved embeddings for 430000 sentences out of 3348807


 13%|█▎        | 441/3349 [41:05<3:56:29,  4.88s/it]

Saved embeddings for 440000 sentences out of 3348807


 13%|█▎        | 451/3349 [41:47<3:47:57,  4.72s/it]

Saved embeddings for 450000 sentences out of 3348807


 14%|█▍        | 461/3349 [42:30<3:54:56,  4.88s/it]

Saved embeddings for 460000 sentences out of 3348807


 14%|█▍        | 471/3349 [43:13<3:54:31,  4.89s/it]

Saved embeddings for 470000 sentences out of 3348807


 14%|█▍        | 481/3349 [43:55<3:49:45,  4.81s/it]

Saved embeddings for 480000 sentences out of 3348807


 15%|█▍        | 491/3349 [44:38<3:52:57,  4.89s/it]

Saved embeddings for 490000 sentences out of 3348807


 15%|█▍        | 501/3349 [45:20<3:51:08,  4.87s/it]

Saved embeddings for 500000 sentences out of 3348807


 15%|█▌        | 511/3349 [46:03<3:50:38,  4.88s/it]

Saved embeddings for 510000 sentences out of 3348807


 16%|█▌        | 521/3349 [46:46<4:04:04,  5.18s/it]

Saved embeddings for 520000 sentences out of 3348807


 16%|█▌        | 531/3349 [47:29<3:47:28,  4.84s/it]

Saved embeddings for 530000 sentences out of 3348807


 16%|█▌        | 541/3349 [48:11<3:47:49,  4.87s/it]

Saved embeddings for 540000 sentences out of 3348807


 16%|█▋        | 551/3349 [48:54<3:53:26,  5.01s/it]

Saved embeddings for 550000 sentences out of 3348807


 17%|█▋        | 561/3349 [49:37<3:53:20,  5.02s/it]

Saved embeddings for 560000 sentences out of 3348807


 17%|█▋        | 571/3349 [50:21<3:51:45,  5.01s/it]

Saved embeddings for 570000 sentences out of 3348807


 17%|█▋        | 581/3349 [51:03<3:45:29,  4.89s/it]

Saved embeddings for 580000 sentences out of 3348807


 18%|█▊        | 591/3349 [51:46<3:44:49,  4.89s/it]

Saved embeddings for 590000 sentences out of 3348807


 18%|█▊        | 601/3349 [52:32<3:45:44,  4.93s/it]

Saved embeddings for 600000 sentences out of 3348807


 18%|█▊        | 611/3349 [53:16<3:56:32,  5.18s/it]

Saved embeddings for 610000 sentences out of 3348807


 19%|█▊        | 621/3349 [53:59<3:42:37,  4.90s/it]

Saved embeddings for 620000 sentences out of 3348807


 19%|█▉        | 631/3349 [54:41<3:41:21,  4.89s/it]

Saved embeddings for 630000 sentences out of 3348807


 19%|█▉        | 641/3349 [55:24<3:40:28,  4.89s/it]

Saved embeddings for 640000 sentences out of 3348807


 19%|█▉        | 651/3349 [56:07<3:45:43,  5.02s/it]

Saved embeddings for 650000 sentences out of 3348807


 20%|█▉        | 661/3349 [56:51<3:58:30,  5.32s/it]

Saved embeddings for 660000 sentences out of 3348807


 20%|██        | 671/3349 [57:34<3:37:26,  4.87s/it]

Saved embeddings for 670000 sentences out of 3348807


 20%|██        | 681/3349 [58:16<3:37:02,  4.88s/it]

Saved embeddings for 680000 sentences out of 3348807


 21%|██        | 691/3349 [58:59<3:37:21,  4.91s/it]

Saved embeddings for 690000 sentences out of 3348807


 21%|██        | 701/3349 [59:43<3:52:19,  5.26s/it]

Saved embeddings for 700000 sentences out of 3348807


 21%|██        | 711/3349 [1:00:54<5:18:17,  7.24s/it]

Saved embeddings for 710000 sentences out of 3348807


 22%|██▏       | 721/3349 [1:02:04<5:36:17,  7.68s/it]

Saved embeddings for 720000 sentences out of 3348807


 22%|██▏       | 731/3349 [1:03:15<5:45:02,  7.91s/it]

Saved embeddings for 730000 sentences out of 3348807


 22%|██▏       | 741/3349 [1:04:26<5:50:36,  8.07s/it]

Saved embeddings for 740000 sentences out of 3348807


 22%|██▏       | 751/3349 [1:05:36<5:43:06,  7.92s/it]

Saved embeddings for 750000 sentences out of 3348807


 23%|██▎       | 761/3349 [1:06:45<5:03:38,  7.04s/it]

Saved embeddings for 760000 sentences out of 3348807


 23%|██▎       | 771/3349 [1:07:57<5:34:47,  7.79s/it]

Saved embeddings for 770000 sentences out of 3348807


 23%|██▎       | 781/3349 [1:09:08<5:37:05,  7.88s/it]

Saved embeddings for 780000 sentences out of 3348807


 24%|██▎       | 791/3349 [1:10:18<5:39:04,  7.95s/it]

Saved embeddings for 790000 sentences out of 3348807


 24%|██▍       | 801/3349 [1:11:30<5:44:07,  8.10s/it]

Saved embeddings for 800000 sentences out of 3348807


 24%|██▍       | 811/3349 [1:12:38<5:00:17,  7.10s/it]

Saved embeddings for 810000 sentences out of 3348807


 25%|██▍       | 821/3349 [1:13:49<5:11:15,  7.39s/it]

Saved embeddings for 820000 sentences out of 3348807


 25%|██▍       | 831/3349 [1:15:00<5:29:53,  7.86s/it]

Saved embeddings for 830000 sentences out of 3348807


 25%|██▌       | 841/3349 [1:16:10<5:32:24,  7.95s/it]

Saved embeddings for 840000 sentences out of 3348807


 25%|██▌       | 851/3349 [1:17:21<5:34:28,  8.03s/it]

Saved embeddings for 850000 sentences out of 3348807


 26%|██▌       | 861/3349 [1:18:31<5:13:50,  7.57s/it]

Saved embeddings for 860000 sentences out of 3348807


 26%|██▌       | 871/3349 [1:19:42<5:08:11,  7.46s/it]

Saved embeddings for 870000 sentences out of 3348807


 26%|██▋       | 881/3349 [1:20:53<5:20:28,  7.79s/it]

Saved embeddings for 880000 sentences out of 3348807


 27%|██▋       | 891/3349 [1:22:04<5:27:44,  8.00s/it]

Saved embeddings for 890000 sentences out of 3348807


 27%|██▋       | 901/3349 [1:23:15<5:32:39,  8.15s/it]

Saved embeddings for 900000 sentences out of 3348807


 27%|██▋       | 911/3349 [1:24:26<5:18:56,  7.85s/it]

Saved embeddings for 910000 sentences out of 3348807


 28%|██▊       | 921/3349 [1:25:36<4:54:41,  7.28s/it]

Saved embeddings for 920000 sentences out of 3348807


 28%|██▊       | 931/3349 [1:26:47<5:08:57,  7.67s/it]

Saved embeddings for 930000 sentences out of 3348807


 28%|██▊       | 941/3349 [1:27:57<5:11:05,  7.75s/it]

Saved embeddings for 940000 sentences out of 3348807


 28%|██▊       | 951/3349 [1:29:07<5:18:30,  7.97s/it]

Saved embeddings for 950000 sentences out of 3348807


 29%|██▊       | 961/3349 [1:30:17<5:17:42,  7.98s/it]

Saved embeddings for 960000 sentences out of 3348807


 29%|██▉       | 971/3349 [1:31:27<4:51:20,  7.35s/it]

Saved embeddings for 970000 sentences out of 3348807


 29%|██▉       | 981/3349 [1:32:37<4:51:41,  7.39s/it]

Saved embeddings for 980000 sentences out of 3348807


 30%|██▉       | 991/3349 [1:33:48<5:01:35,  7.67s/it]

Saved embeddings for 990000 sentences out of 3348807


 30%|██▉       | 1001/3349 [1:34:59<5:12:40,  7.99s/it]

Saved embeddings for 1000000 sentences out of 3348807


 30%|███       | 1011/3349 [1:36:11<5:32:16,  8.53s/it]

Saved embeddings for 1010000 sentences out of 3348807


 30%|███       | 1021/3349 [1:37:22<4:59:42,  7.72s/it]

Saved embeddings for 1020000 sentences out of 3348807


 31%|███       | 1031/3349 [1:38:32<4:46:52,  7.43s/it]

Saved embeddings for 1030000 sentences out of 3348807


 31%|███       | 1041/3349 [1:39:45<5:17:08,  8.24s/it]

Saved embeddings for 1040000 sentences out of 3348807


 31%|███▏      | 1051/3349 [1:40:56<5:07:39,  8.03s/it]

Saved embeddings for 1050000 sentences out of 3348807


 32%|███▏      | 1061/3349 [1:42:07<5:06:50,  8.05s/it]

Saved embeddings for 1060000 sentences out of 3348807


 32%|███▏      | 1071/3349 [1:43:18<5:00:00,  7.90s/it]

Saved embeddings for 1070000 sentences out of 3348807


 32%|███▏      | 1081/3349 [1:44:15<3:27:12,  5.48s/it]

Saved embeddings for 1080000 sentences out of 3348807


 33%|███▎      | 1091/3349 [1:44:58<3:09:06,  5.02s/it]

Saved embeddings for 1090000 sentences out of 3348807


 33%|███▎      | 1101/3349 [1:45:41<3:06:48,  4.99s/it]

Saved embeddings for 1100000 sentences out of 3348807


 33%|███▎      | 1111/3349 [1:46:24<3:06:49,  5.01s/it]

Saved embeddings for 1110000 sentences out of 3348807


 33%|███▎      | 1121/3349 [1:47:07<3:05:47,  5.00s/it]

Saved embeddings for 1120000 sentences out of 3348807


 34%|███▍      | 1131/3349 [1:47:50<3:05:45,  5.02s/it]

Saved embeddings for 1130000 sentences out of 3348807


 34%|███▍      | 1141/3349 [1:48:33<3:04:21,  5.01s/it]

Saved embeddings for 1140000 sentences out of 3348807


 34%|███▍      | 1151/3349 [1:49:16<3:03:39,  5.01s/it]

Saved embeddings for 1150000 sentences out of 3348807


 35%|███▍      | 1161/3349 [1:49:59<3:03:32,  5.03s/it]

Saved embeddings for 1160000 sentences out of 3348807


 35%|███▍      | 1171/3349 [1:50:42<3:02:21,  5.02s/it]

Saved embeddings for 1170000 sentences out of 3348807


 35%|███▌      | 1181/3349 [1:51:41<4:44:21,  7.87s/it]

Saved embeddings for 1180000 sentences out of 3348807


 36%|███▌      | 1191/3349 [1:53:04<5:19:16,  8.88s/it]

Saved embeddings for 1190000 sentences out of 3348807


 36%|███▌      | 1201/3349 [1:54:21<5:12:44,  8.74s/it]

Saved embeddings for 1200000 sentences out of 3348807


 36%|███▌      | 1211/3349 [1:55:45<5:24:35,  9.11s/it]

Saved embeddings for 1210000 sentences out of 3348807


 36%|███▋      | 1221/3349 [1:56:57<5:16:56,  8.94s/it]

Saved embeddings for 1220000 sentences out of 3348807


 37%|███▋      | 1231/3349 [1:58:19<4:58:19,  8.45s/it]

Saved embeddings for 1230000 sentences out of 3348807


 37%|███▋      | 1241/3349 [1:59:44<5:19:19,  9.09s/it]

Saved embeddings for 1240000 sentences out of 3348807


 37%|███▋      | 1251/3349 [2:00:58<4:34:09,  7.84s/it]

Saved embeddings for 1250000 sentences out of 3348807


 38%|███▊      | 1261/3349 [2:02:23<5:26:43,  9.39s/it]

Saved embeddings for 1260000 sentences out of 3348807


 38%|███▊      | 1271/3349 [2:03:40<5:02:12,  8.73s/it]

Saved embeddings for 1270000 sentences out of 3348807


 38%|███▊      | 1281/3349 [2:05:03<5:14:37,  9.13s/it]

Saved embeddings for 1280000 sentences out of 3348807


 39%|███▊      | 1291/3349 [2:06:20<5:07:52,  8.98s/it]

Saved embeddings for 1290000 sentences out of 3348807


 39%|███▉      | 1301/3349 [2:07:43<5:15:14,  9.24s/it]

Saved embeddings for 1300000 sentences out of 3348807


 39%|███▉      | 1311/3349 [2:09:01<5:04:29,  8.96s/it]

Saved embeddings for 1310000 sentences out of 3348807


 39%|███▉      | 1321/3349 [2:10:17<4:03:43,  7.21s/it]

Saved embeddings for 1320000 sentences out of 3348807


 40%|███▉      | 1331/3349 [2:11:39<5:01:13,  8.96s/it]

Saved embeddings for 1330000 sentences out of 3348807


 40%|████      | 1341/3349 [2:12:49<3:54:23,  7.00s/it]

Saved embeddings for 1340000 sentences out of 3348807


 40%|████      | 1351/3349 [2:14:13<4:59:35,  9.00s/it]

Saved embeddings for 1350000 sentences out of 3348807


 41%|████      | 1361/3349 [2:15:28<4:41:08,  8.48s/it]

Saved embeddings for 1360000 sentences out of 3348807


 41%|████      | 1371/3349 [2:16:52<5:00:05,  9.10s/it]

Saved embeddings for 1370000 sentences out of 3348807


 41%|████      | 1381/3349 [2:18:06<4:49:26,  8.82s/it]

Saved embeddings for 1380000 sentences out of 3348807


 42%|████▏     | 1391/3349 [2:19:30<5:10:04,  9.50s/it]

Saved embeddings for 1390000 sentences out of 3348807


 42%|████▏     | 1401/3349 [2:20:46<4:50:50,  8.96s/it]

Saved embeddings for 1400000 sentences out of 3348807


 42%|████▏     | 1411/3349 [2:22:05<4:15:14,  7.90s/it]

Saved embeddings for 1410000 sentences out of 3348807


 42%|████▏     | 1421/3349 [2:23:29<4:55:00,  9.18s/it]

Saved embeddings for 1420000 sentences out of 3348807


 43%|████▎     | 1431/3349 [2:24:46<4:15:46,  8.00s/it]

Saved embeddings for 1430000 sentences out of 3348807


 43%|████▎     | 1441/3349 [2:26:09<4:48:12,  9.06s/it]

Saved embeddings for 1440000 sentences out of 3348807


 43%|████▎     | 1451/3349 [2:27:26<4:39:40,  8.84s/it]

Saved embeddings for 1450000 sentences out of 3348807


 44%|████▎     | 1461/3349 [2:28:50<4:45:08,  9.06s/it]

Saved embeddings for 1460000 sentences out of 3348807


 44%|████▍     | 1471/3349 [2:30:05<4:51:41,  9.32s/it]

Saved embeddings for 1470000 sentences out of 3348807


 44%|████▍     | 1481/3349 [2:31:29<4:44:36,  9.14s/it]

Saved embeddings for 1480000 sentences out of 3348807


 45%|████▍     | 1491/3349 [2:32:45<4:41:53,  9.10s/it]

Saved embeddings for 1490000 sentences out of 3348807


 45%|████▍     | 1501/3349 [2:34:05<4:07:17,  8.03s/it]

Saved embeddings for 1500000 sentences out of 3348807


 45%|████▌     | 1511/3349 [2:35:29<4:41:10,  9.18s/it]

Saved embeddings for 1510000 sentences out of 3348807


 45%|████▌     | 1521/3349 [2:36:44<3:58:59,  7.84s/it]

Saved embeddings for 1520000 sentences out of 3348807


 46%|████▌     | 1531/3349 [2:38:09<4:46:39,  9.46s/it]

Saved embeddings for 1530000 sentences out of 3348807


 46%|████▌     | 1541/3349 [2:39:26<4:24:07,  8.77s/it]

Saved embeddings for 1540000 sentences out of 3348807


 46%|████▋     | 1551/3349 [2:40:48<4:31:39,  9.07s/it]

Saved embeddings for 1550000 sentences out of 3348807


 47%|████▋     | 1561/3349 [2:42:05<4:28:43,  9.02s/it]

Saved embeddings for 1560000 sentences out of 3348807


 47%|████▋     | 1571/3349 [2:43:28<4:25:03,  8.94s/it]

Saved embeddings for 1570000 sentences out of 3348807


 47%|████▋     | 1581/3349 [2:44:48<4:26:41,  9.05s/it]

Saved embeddings for 1580000 sentences out of 3348807


 48%|████▊     | 1591/3349 [2:46:04<3:40:39,  7.53s/it]

Saved embeddings for 1590000 sentences out of 3348807


 48%|████▊     | 1601/3349 [2:47:26<4:08:13,  8.52s/it]

Saved embeddings for 1600000 sentences out of 3348807


 48%|████▊     | 1611/3349 [2:48:42<4:02:25,  8.37s/it]

Saved embeddings for 1610000 sentences out of 3348807


 48%|████▊     | 1621/3349 [2:50:06<4:21:41,  9.09s/it]

Saved embeddings for 1620000 sentences out of 3348807


 49%|████▊     | 1631/3349 [2:51:21<4:07:30,  8.64s/it]

Saved embeddings for 1630000 sentences out of 3348807


 49%|████▉     | 1641/3349 [2:52:47<4:23:49,  9.27s/it]

Saved embeddings for 1640000 sentences out of 3348807


 49%|████▉     | 1651/3349 [2:54:04<4:17:54,  9.11s/it]

Saved embeddings for 1650000 sentences out of 3348807


 50%|████▉     | 1661/3349 [2:55:21<3:34:28,  7.62s/it]

Saved embeddings for 1660000 sentences out of 3348807


 50%|████▉     | 1671/3349 [2:56:46<4:16:43,  9.18s/it]

Saved embeddings for 1670000 sentences out of 3348807


 50%|█████     | 1681/3349 [2:58:02<3:48:07,  8.21s/it]

Saved embeddings for 1680000 sentences out of 3348807


 50%|█████     | 1691/3349 [2:59:24<4:12:36,  9.14s/it]

Saved embeddings for 1690000 sentences out of 3348807


 51%|█████     | 1701/3349 [3:00:40<4:01:38,  8.80s/it]

Saved embeddings for 1700000 sentences out of 3348807


 51%|█████     | 1711/3349 [3:02:03<3:57:48,  8.71s/it]

Saved embeddings for 1710000 sentences out of 3348807


 51%|█████▏    | 1721/3349 [3:03:19<4:05:35,  9.05s/it]

Saved embeddings for 1720000 sentences out of 3348807


 52%|█████▏    | 1731/3349 [3:04:43<4:04:52,  9.08s/it]

Saved embeddings for 1730000 sentences out of 3348807


 52%|█████▏    | 1741/3349 [3:06:04<3:59:39,  8.94s/it]

Saved embeddings for 1740000 sentences out of 3348807


 52%|█████▏    | 1751/3349 [3:07:18<3:04:56,  6.94s/it]

Saved embeddings for 1750000 sentences out of 3348807


 53%|█████▎    | 1761/3349 [3:08:42<4:02:31,  9.16s/it]

Saved embeddings for 1760000 sentences out of 3348807


 53%|█████▎    | 1771/3349 [3:09:57<3:25:41,  7.82s/it]

Saved embeddings for 1770000 sentences out of 3348807


 53%|█████▎    | 1781/3349 [3:11:21<4:01:40,  9.25s/it]

Saved embeddings for 1780000 sentences out of 3348807


 53%|█████▎    | 1791/3349 [3:12:39<3:58:07,  9.17s/it]

Saved embeddings for 1790000 sentences out of 3348807


 54%|█████▍    | 1801/3349 [3:14:01<3:55:10,  9.12s/it]

Saved embeddings for 1800000 sentences out of 3348807


 54%|█████▍    | 1811/3349 [3:15:17<3:52:04,  9.05s/it]

Saved embeddings for 1810000 sentences out of 3348807


 54%|█████▍    | 1821/3349 [3:16:41<3:55:38,  9.25s/it]

Saved embeddings for 1820000 sentences out of 3348807


 55%|█████▍    | 1831/3349 [3:17:54<3:51:05,  9.13s/it]

Saved embeddings for 1830000 sentences out of 3348807


 55%|█████▍    | 1841/3349 [3:19:16<3:34:55,  8.55s/it]

Saved embeddings for 1840000 sentences out of 3348807


 55%|█████▌    | 1851/3349 [3:20:39<3:47:11,  9.10s/it]

Saved embeddings for 1850000 sentences out of 3348807


 56%|█████▌    | 1861/3349 [3:21:55<3:11:32,  7.72s/it]

Saved embeddings for 1860000 sentences out of 3348807


 56%|█████▌    | 1871/3349 [3:23:19<3:47:22,  9.23s/it]

Saved embeddings for 1870000 sentences out of 3348807


 56%|█████▌    | 1881/3349 [3:24:33<3:22:01,  8.26s/it]

Saved embeddings for 1880000 sentences out of 3348807


 56%|█████▋    | 1891/3349 [3:25:58<3:45:02,  9.26s/it]

Saved embeddings for 1890000 sentences out of 3348807


 57%|█████▋    | 1901/3349 [3:27:14<3:37:13,  9.00s/it]

Saved embeddings for 1900000 sentences out of 3348807


 57%|█████▋    | 1911/3349 [3:28:37<3:39:45,  9.17s/it]

Saved embeddings for 1910000 sentences out of 3348807


 57%|█████▋    | 1921/3349 [3:29:51<3:36:17,  9.09s/it]

Saved embeddings for 1920000 sentences out of 3348807


 58%|█████▊    | 1931/3349 [3:31:13<3:18:50,  8.41s/it]

Saved embeddings for 1930000 sentences out of 3348807


 58%|█████▊    | 1941/3349 [3:32:35<3:35:30,  9.18s/it]

Saved embeddings for 1940000 sentences out of 3348807


 58%|█████▊    | 1951/3349 [3:33:51<3:02:06,  7.82s/it]

Saved embeddings for 1950000 sentences out of 3348807


 59%|█████▊    | 1961/3349 [3:35:16<3:32:59,  9.21s/it]

Saved embeddings for 1960000 sentences out of 3348807


 59%|█████▉    | 1971/3349 [3:36:28<3:11:35,  8.34s/it]

Saved embeddings for 1970000 sentences out of 3348807


 59%|█████▉    | 1981/3349 [3:37:52<3:29:15,  9.18s/it]

Saved embeddings for 1980000 sentences out of 3348807


 59%|█████▉    | 1991/3349 [3:39:06<3:09:13,  8.36s/it]

Saved embeddings for 1990000 sentences out of 3348807


 60%|█████▉    | 2001/3349 [3:40:30<3:26:02,  9.17s/it]

Saved embeddings for 2000000 sentences out of 3348807


 60%|██████    | 2011/3349 [3:41:43<3:22:01,  9.06s/it]

Saved embeddings for 2010000 sentences out of 3348807


 60%|██████    | 2021/3349 [3:43:05<3:20:17,  9.05s/it]

Saved embeddings for 2020000 sentences out of 3348807


 61%|██████    | 2031/3349 [3:44:25<3:20:12,  9.11s/it]

Saved embeddings for 2030000 sentences out of 3348807


 61%|██████    | 2041/3349 [3:45:41<2:42:34,  7.46s/it]

Saved embeddings for 2040000 sentences out of 3348807


 61%|██████    | 2051/3349 [3:47:03<3:15:49,  9.05s/it]

Saved embeddings for 2050000 sentences out of 3348807


 62%|██████▏   | 2061/3349 [3:48:21<3:05:16,  8.63s/it]

Saved embeddings for 2060000 sentences out of 3348807


 62%|██████▏   | 2071/3349 [3:49:45<3:16:25,  9.22s/it]

Saved embeddings for 2070000 sentences out of 3348807


 62%|██████▏   | 2081/3349 [3:51:00<3:08:46,  8.93s/it]

Saved embeddings for 2080000 sentences out of 3348807


 62%|██████▏   | 2091/3349 [3:52:25<3:16:23,  9.37s/it]

Saved embeddings for 2090000 sentences out of 3348807


 63%|██████▎   | 2101/3349 [3:53:40<3:06:20,  8.96s/it]

Saved embeddings for 2100000 sentences out of 3348807


 63%|██████▎   | 2111/3349 [3:55:02<2:55:20,  8.50s/it]

Saved embeddings for 2110000 sentences out of 3348807


 63%|██████▎   | 2121/3349 [3:56:26<3:09:54,  9.28s/it]

Saved embeddings for 2120000 sentences out of 3348807


 64%|██████▎   | 2131/3349 [3:57:40<2:37:14,  7.75s/it]

Saved embeddings for 2130000 sentences out of 3348807


 64%|██████▍   | 2141/3349 [3:59:05<3:06:15,  9.25s/it]

Saved embeddings for 2140000 sentences out of 3348807


 64%|██████▍   | 2151/3349 [4:00:20<2:53:46,  8.70s/it]

Saved embeddings for 2150000 sentences out of 3348807


 65%|██████▍   | 2161/3349 [4:01:43<2:59:30,  9.07s/it]

Saved embeddings for 2160000 sentences out of 3348807


 65%|██████▍   | 2171/3349 [4:02:49<2:49:01,  8.61s/it]

Saved embeddings for 2170000 sentences out of 3348807


 65%|██████▌   | 2181/3349 [4:04:14<3:04:17,  9.47s/it]

Saved embeddings for 2180000 sentences out of 3348807


 65%|██████▌   | 2191/3349 [4:05:13<2:23:13,  7.42s/it]

Saved embeddings for 2190000 sentences out of 3348807


 66%|██████▌   | 2201/3349 [4:06:38<2:56:10,  9.21s/it]

Saved embeddings for 2200000 sentences out of 3348807


 66%|██████▌   | 2211/3349 [4:07:53<2:48:22,  8.88s/it]

Saved embeddings for 2210000 sentences out of 3348807


 66%|██████▋   | 2221/3349 [4:09:14<2:46:50,  8.87s/it]

Saved embeddings for 2220000 sentences out of 3348807


 67%|██████▋   | 2231/3349 [4:10:29<2:47:45,  9.00s/it]

Saved embeddings for 2230000 sentences out of 3348807


 67%|██████▋   | 2241/3349 [4:11:54<2:52:07,  9.32s/it]

Saved embeddings for 2240000 sentences out of 3348807


 67%|██████▋   | 2251/3349 [4:13:08<2:46:52,  9.12s/it]

Saved embeddings for 2250000 sentences out of 3348807


 68%|██████▊   | 2261/3349 [4:14:30<2:33:08,  8.45s/it]

Saved embeddings for 2260000 sentences out of 3348807


 68%|██████▊   | 2271/3349 [4:15:56<2:49:03,  9.41s/it]

Saved embeddings for 2270000 sentences out of 3348807


 68%|██████▊   | 2281/3349 [4:17:09<2:19:50,  7.86s/it]

Saved embeddings for 2280000 sentences out of 3348807


 68%|██████▊   | 2291/3349 [4:18:34<2:46:41,  9.45s/it]

Saved embeddings for 2290000 sentences out of 3348807


 69%|██████▊   | 2301/3349 [4:19:49<2:35:22,  8.90s/it]

Saved embeddings for 2300000 sentences out of 3348807


 69%|██████▉   | 2311/3349 [4:21:13<2:42:51,  9.41s/it]

Saved embeddings for 2310000 sentences out of 3348807


 69%|██████▉   | 2321/3349 [4:22:29<2:41:57,  9.45s/it]

Saved embeddings for 2320000 sentences out of 3348807


 70%|██████▉   | 2331/3349 [4:23:51<2:32:39,  9.00s/it]

Saved embeddings for 2330000 sentences out of 3348807


 70%|██████▉   | 2341/3349 [4:25:03<2:32:12,  9.06s/it]

Saved embeddings for 2340000 sentences out of 3348807


 70%|███████   | 2351/3349 [4:26:25<2:30:49,  9.07s/it]

Saved embeddings for 2350000 sentences out of 3348807


 70%|███████   | 2361/3349 [4:27:37<2:26:39,  8.91s/it]

Saved embeddings for 2360000 sentences out of 3348807


 71%|███████   | 2371/3349 [4:28:59<2:29:52,  9.19s/it]

Saved embeddings for 2370000 sentences out of 3348807


 71%|███████   | 2381/3349 [4:30:11<2:20:41,  8.72s/it]

Saved embeddings for 2380000 sentences out of 3348807


 71%|███████▏  | 2391/3349 [4:31:33<2:26:29,  9.17s/it]

Saved embeddings for 2390000 sentences out of 3348807


 72%|███████▏  | 2401/3349 [4:32:42<2:17:43,  8.72s/it]

Saved embeddings for 2400000 sentences out of 3348807


 72%|███████▏  | 2411/3349 [4:34:03<2:23:45,  9.20s/it]

Saved embeddings for 2410000 sentences out of 3348807


 72%|███████▏  | 2421/3349 [4:35:14<2:16:34,  8.83s/it]

Saved embeddings for 2420000 sentences out of 3348807


 73%|███████▎  | 2431/3349 [4:36:36<2:19:24,  9.11s/it]

Saved embeddings for 2430000 sentences out of 3348807


 73%|███████▎  | 2441/3349 [4:37:50<2:14:21,  8.88s/it]

Saved embeddings for 2440000 sentences out of 3348807


 73%|███████▎  | 2451/3349 [4:39:13<2:17:50,  9.21s/it]

Saved embeddings for 2450000 sentences out of 3348807


 73%|███████▎  | 2461/3349 [4:40:23<2:07:18,  8.60s/it]

Saved embeddings for 2460000 sentences out of 3348807


 74%|███████▍  | 2471/3349 [4:41:44<2:16:57,  9.36s/it]

Saved embeddings for 2470000 sentences out of 3348807


 74%|███████▍  | 2481/3349 [4:42:57<2:09:36,  8.96s/it]

Saved embeddings for 2480000 sentences out of 3348807


 74%|███████▍  | 2491/3349 [4:44:17<2:08:47,  9.01s/it]

Saved embeddings for 2490000 sentences out of 3348807


 75%|███████▍  | 2501/3349 [4:45:30<2:01:51,  8.62s/it]

Saved embeddings for 2500000 sentences out of 3348807


 75%|███████▍  | 2511/3349 [4:46:51<2:09:26,  9.27s/it]

Saved embeddings for 2510000 sentences out of 3348807


 75%|███████▌  | 2521/3349 [4:48:02<1:54:11,  8.27s/it]

Saved embeddings for 2520000 sentences out of 3348807


 76%|███████▌  | 2531/3349 [4:49:24<2:03:24,  9.05s/it]

Saved embeddings for 2530000 sentences out of 3348807


 76%|███████▌  | 2541/3349 [4:50:36<1:52:12,  8.33s/it]

Saved embeddings for 2540000 sentences out of 3348807


 76%|███████▌  | 2551/3349 [4:51:55<1:56:43,  8.78s/it]

Saved embeddings for 2550000 sentences out of 3348807


 76%|███████▋  | 2561/3349 [4:53:08<1:45:50,  8.06s/it]

Saved embeddings for 2560000 sentences out of 3348807


 77%|███████▋  | 2571/3349 [4:54:26<1:52:10,  8.65s/it]

Saved embeddings for 2570000 sentences out of 3348807


 77%|███████▋  | 2581/3349 [4:55:38<1:37:19,  7.60s/it]

Saved embeddings for 2580000 sentences out of 3348807


 77%|███████▋  | 2591/3349 [4:56:58<1:56:05,  9.19s/it]

Saved embeddings for 2590000 sentences out of 3348807


 78%|███████▊  | 2601/3349 [4:58:11<1:27:45,  7.04s/it]

Saved embeddings for 2600000 sentences out of 3348807


 78%|███████▊  | 2611/3349 [4:59:31<1:52:39,  9.16s/it]

Saved embeddings for 2610000 sentences out of 3348807


 78%|███████▊  | 2621/3349 [5:00:47<1:33:25,  7.70s/it]

Saved embeddings for 2620000 sentences out of 3348807


 79%|███████▊  | 2631/3349 [5:02:13<1:53:08,  9.45s/it]

Saved embeddings for 2630000 sentences out of 3348807


 79%|███████▉  | 2641/3349 [5:03:27<1:34:49,  8.04s/it]

Saved embeddings for 2640000 sentences out of 3348807


 79%|███████▉  | 2651/3349 [5:04:52<1:52:16,  9.65s/it]

Saved embeddings for 2650000 sentences out of 3348807


 79%|███████▉  | 2661/3349 [5:06:05<1:39:38,  8.69s/it]

Saved embeddings for 2660000 sentences out of 3348807


 80%|███████▉  | 2671/3349 [5:07:29<1:41:07,  8.95s/it]

Saved embeddings for 2670000 sentences out of 3348807


 80%|████████  | 2681/3349 [5:08:43<1:41:57,  9.16s/it]

Saved embeddings for 2680000 sentences out of 3348807


 80%|████████  | 2691/3349 [5:10:12<1:42:59,  9.39s/it]

Saved embeddings for 2690000 sentences out of 3348807


 81%|████████  | 2701/3349 [5:11:32<1:40:32,  9.31s/it]

Saved embeddings for 2700000 sentences out of 3348807


 81%|████████  | 2711/3349 [5:12:47<1:16:51,  7.23s/it]

Saved embeddings for 2710000 sentences out of 3348807


 81%|████████  | 2721/3349 [5:14:13<1:38:57,  9.45s/it]

Saved embeddings for 2720000 sentences out of 3348807


 82%|████████▏ | 2731/3349 [5:15:23<1:23:06,  8.07s/it]

Saved embeddings for 2730000 sentences out of 3348807


 82%|████████▏ | 2741/3349 [5:16:49<1:37:23,  9.61s/it]

Saved embeddings for 2740000 sentences out of 3348807


 82%|████████▏ | 2751/3349 [5:18:03<1:29:36,  8.99s/it]

Saved embeddings for 2750000 sentences out of 3348807


 82%|████████▏ | 2761/3349 [5:19:27<1:31:59,  9.39s/it]

Saved embeddings for 2760000 sentences out of 3348807


 83%|████████▎ | 2771/3349 [5:20:39<1:28:32,  9.19s/it]

Saved embeddings for 2770000 sentences out of 3348807


 83%|████████▎ | 2781/3349 [5:22:06<1:33:00,  9.82s/it]

Saved embeddings for 2780000 sentences out of 3348807


 83%|████████▎ | 2791/3349 [5:23:22<1:27:29,  9.41s/it]

Saved embeddings for 2790000 sentences out of 3348807


 84%|████████▎ | 2801/3349 [5:24:44<1:18:31,  8.60s/it]

Saved embeddings for 2800000 sentences out of 3348807


 84%|████████▍ | 2811/3349 [5:26:08<1:22:42,  9.22s/it]

Saved embeddings for 2810000 sentences out of 3348807


 84%|████████▍ | 2821/3349 [5:27:22<1:13:25,  8.34s/it]

Saved embeddings for 2820000 sentences out of 3348807


 85%|████████▍ | 2831/3349 [5:28:47<1:21:03,  9.39s/it]

Saved embeddings for 2830000 sentences out of 3348807


 85%|████████▍ | 2841/3349 [5:30:00<1:13:05,  8.63s/it]

Saved embeddings for 2840000 sentences out of 3348807


 85%|████████▌ | 2851/3349 [5:31:25<1:18:24,  9.45s/it]

Saved embeddings for 2850000 sentences out of 3348807


 85%|████████▌ | 2861/3349 [5:32:37<1:14:32,  9.17s/it]

Saved embeddings for 2860000 sentences out of 3348807


 86%|████████▌ | 2871/3349 [5:34:01<1:16:04,  9.55s/it]

Saved embeddings for 2870000 sentences out of 3348807


 86%|████████▌ | 2881/3349 [5:35:18<1:13:27,  9.42s/it]

Saved embeddings for 2880000 sentences out of 3348807


 86%|████████▋ | 2891/3349 [5:36:37<1:02:11,  8.15s/it]

Saved embeddings for 2890000 sentences out of 3348807


 87%|████████▋ | 2901/3349 [5:38:02<1:09:34,  9.32s/it]

Saved embeddings for 2900000 sentences out of 3348807


 87%|████████▋ | 2911/3349 [5:39:16<57:38,  7.90s/it]  

Saved embeddings for 2910000 sentences out of 3348807


 87%|████████▋ | 2921/3349 [5:40:40<1:07:17,  9.43s/it]

Saved embeddings for 2920000 sentences out of 3348807


 88%|████████▊ | 2931/3349 [5:41:55<1:01:03,  8.76s/it]

Saved embeddings for 2930000 sentences out of 3348807


 88%|████████▊ | 2941/3349 [5:43:21<1:05:06,  9.57s/it]

Saved embeddings for 2940000 sentences out of 3348807


 88%|████████▊ | 2951/3349 [5:44:31<58:17,  8.79s/it]  

Saved embeddings for 2950000 sentences out of 3348807


 88%|████████▊ | 2961/3349 [5:45:58<1:02:42,  9.70s/it]

Saved embeddings for 2960000 sentences out of 3348807


 89%|████████▊ | 2971/3349 [5:47:14<1:00:09,  9.55s/it]

Saved embeddings for 2970000 sentences out of 3348807


 89%|████████▉ | 2981/3349 [5:48:34<51:13,  8.35s/it]  

Saved embeddings for 2980000 sentences out of 3348807


 89%|████████▉ | 2991/3349 [5:49:58<56:04,  9.40s/it]

Saved embeddings for 2990000 sentences out of 3348807


 90%|████████▉ | 3001/3349 [5:51:12<45:15,  7.80s/it]

Saved embeddings for 3000000 sentences out of 3348807


 90%|████████▉ | 3011/3349 [5:52:35<52:25,  9.31s/it]

Saved embeddings for 3010000 sentences out of 3348807


 90%|█████████ | 3021/3349 [5:53:48<46:30,  8.51s/it]

Saved embeddings for 3020000 sentences out of 3348807


 91%|█████████ | 3031/3349 [5:55:13<50:47,  9.58s/it]

Saved embeddings for 3030000 sentences out of 3348807


 91%|█████████ | 3041/3349 [5:56:25<46:41,  9.09s/it]

Saved embeddings for 3040000 sentences out of 3348807


 91%|█████████ | 3051/3349 [5:57:51<47:20,  9.53s/it]

Saved embeddings for 3050000 sentences out of 3348807


 91%|█████████▏| 3061/3349 [5:59:04<44:27,  9.26s/it]

Saved embeddings for 3060000 sentences out of 3348807


 92%|█████████▏| 3071/3349 [6:00:29<43:15,  9.34s/it]

Saved embeddings for 3070000 sentences out of 3348807


 92%|█████████▏| 3081/3349 [6:01:50<41:58,  9.40s/it]

Saved embeddings for 3080000 sentences out of 3348807


 92%|█████████▏| 3091/3349 [6:03:06<32:51,  7.64s/it]

Saved embeddings for 3090000 sentences out of 3348807


 93%|█████████▎| 3101/3349 [6:04:33<40:32,  9.81s/it]

Saved embeddings for 3100000 sentences out of 3348807


 93%|█████████▎| 3111/3349 [6:05:46<33:52,  8.54s/it]

Saved embeddings for 3110000 sentences out of 3348807


 93%|█████████▎| 3121/3349 [6:07:07<33:34,  8.83s/it]

Saved embeddings for 3120000 sentences out of 3348807


 93%|█████████▎| 3131/3349 [6:08:18<31:09,  8.58s/it]

Saved embeddings for 3130000 sentences out of 3348807


 94%|█████████▍| 3141/3349 [6:09:44<33:08,  9.56s/it]

Saved embeddings for 3140000 sentences out of 3348807


 94%|█████████▍| 3151/3349 [6:10:54<29:26,  8.92s/it]

Saved embeddings for 3150000 sentences out of 3348807


 94%|█████████▍| 3161/3349 [6:12:19<29:59,  9.57s/it]

Saved embeddings for 3160000 sentences out of 3348807


 95%|█████████▍| 3171/3349 [6:13:31<27:41,  9.33s/it]

Saved embeddings for 3170000 sentences out of 3348807


 95%|█████████▍| 3181/3349 [6:14:55<26:48,  9.57s/it]

Saved embeddings for 3180000 sentences out of 3348807


 95%|█████████▌| 3191/3349 [6:16:08<24:27,  9.28s/it]

Saved embeddings for 3190000 sentences out of 3348807


 96%|█████████▌| 3201/3349 [6:17:30<20:55,  8.48s/it]

Saved embeddings for 3200000 sentences out of 3348807


 96%|█████████▌| 3211/3349 [6:18:48<21:31,  9.36s/it]

Saved embeddings for 3210000 sentences out of 3348807


 96%|█████████▌| 3221/3349 [6:20:06<16:20,  7.66s/it]

Saved embeddings for 3220000 sentences out of 3348807


 96%|█████████▋| 3231/3349 [6:21:31<18:51,  9.59s/it]

Saved embeddings for 3230000 sentences out of 3348807


 97%|█████████▋| 3241/3349 [6:22:41<12:24,  6.89s/it]

Saved embeddings for 3240000 sentences out of 3348807


 97%|█████████▋| 3251/3349 [6:24:07<15:51,  9.71s/it]

Saved embeddings for 3250000 sentences out of 3348807


 97%|█████████▋| 3261/3349 [6:25:18<11:36,  7.91s/it]

Saved embeddings for 3260000 sentences out of 3348807


 98%|█████████▊| 3271/3349 [6:26:42<12:16,  9.44s/it]

Saved embeddings for 3270000 sentences out of 3348807


 98%|█████████▊| 3281/3349 [6:27:53<09:48,  8.65s/it]

Saved embeddings for 3280000 sentences out of 3348807


 98%|█████████▊| 3291/3349 [6:29:17<08:42,  9.01s/it]

Saved embeddings for 3290000 sentences out of 3348807


 99%|█████████▊| 3301/3349 [6:30:28<07:37,  9.53s/it]

Saved embeddings for 3300000 sentences out of 3348807


 99%|█████████▉| 3311/3349 [6:31:58<06:37, 10.47s/it]

Saved embeddings for 3310000 sentences out of 3348807


 99%|█████████▉| 3321/3349 [6:33:12<04:09,  8.93s/it]

Saved embeddings for 3320000 sentences out of 3348807


 99%|█████████▉| 3331/3349 [6:34:35<02:33,  8.52s/it]

Saved embeddings for 3330000 sentences out of 3348807


100%|█████████▉| 3341/3349 [6:35:53<01:14,  9.37s/it]

Saved embeddings for 3340000 sentences out of 3348807


100%|██████████| 3349/3349 [6:36:54<00:00,  7.11s/it]


Saved embeddings for all sentences


: 

In [10]:
sim = torch.mm(query_embeddings, sentence_embeddings.transpose(0, 1))
topk = torch.topk(sim, 5, dim=1).indices.tolist()
topk

[[919], [517]]

In [11]:
# # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
instruction = "What information is relevant to determining whether the following text is true?"
queries = ["The Tang dynasty (618-907 CE) saw the development of a vibrant national identity through song and dance performance", "Modern Chinese musicians continue to explore new avenues in traditional music, and have collaborated with Western artists to create exciting works that bridge Eastern and Western forms of expression"]
encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')


'Buddhist art is the artistic practices that are influenced by Buddhism. It includes art media which depict Buddhas, bodhisattvas, and other entities; notable Buddhist figures, both historical and mythical; narrative scenes from the lives of all of these; mandalas and other graphic aids to practice; as well as physical objects associated with Buddhist practice, such as vajras, bells, stupas and Buddhist temple architecture. Buddhist art originated on the Indian subcontinent following the historical life of Siddhartha Gautama, 6th to 5th century BC, and thereafter evolved by contact with other cultures as it spread throughout Asia and the world. Buddhist art followed believers as the dharma spread, adapted, and evolved in each new host country. It developed to the north through Central Asia and into Eastern Asia to form the Northern branch of Buddhist art, and to the east as far as Southeast Asia to form the Southern branch of Buddhist art. In India, Buddhist art flourished and co-devel

In [13]:
embed_size

384

In [None]:
import faiss

model_name = "bge-small-en"
dim = {"bge-small-en": 1024, "bge-small-en": 384}[model_name]

# use inner product metric
config = faiss.GpuIndexFlatConfig()
config.useFloat16 = True
config.device = 1
res = faiss.StandardGpuResources()
quantizer = faiss.GpuIndexFlatIP(res, dim, config)
nlist = 100  # number of clusters
m = 8  # number of subquantizers
index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, 8)

# index = faiss.IndexFlatIP(dims)

# cpu_index = faiss.IndexFlatIP(1024)

# index = faiss.index_cpu_to_all_gpus(  # build the index
#     cpu_index
# )

# TODO: take a random sample of 10 files to train with
n_samples = 5
files = np.random.choice(os.listdir(embeddings_dir), n_samples)
vecs = []
for file in files:
    vecs.append(np.load(os.path.join(embeddings_dir, file)).astype(np.float16))
vecs = np.concatenate(vecs)
index.train(vecs)
print(index.is_trained)

False


In [None]:
import os
import numpy as np
from tqdm import tqdm

embeddings_dir = f"retrieval/20230601.en.wiki_embeddings_{model_name}"
files = os.listdir(embeddings_dir)
# make sure to add keys to index in order
files = sorted(files, key=lambda file: int(file.split(".")[0]))
print(len(files))
for file in tqdm(files, desc="Adding embeddings to index"):
    batch_num = int(file.split(".")[0])
    vecs = np.load(os.path.join(embeddings_dir, file)).astype(np.float16)
    index.add(vecs)

176


Adding embeddings to index: 100%|██████████| 176/176 [15:50<00:00,  5.40s/it]


In [None]:
vecs = np.load(os.path.join(embeddings_dir, files[-2]))

In [None]:
query = vecs[:1, :]

In [None]:
num_passages = 1
D, I = index.search(query, num_passages)
D, I

(array([[0.16384947]], dtype=float32), array([[34800000]]))

In [None]:
from Retriever import Retriever
retriever = Retriever(
    use_IVF=True,
    encoder_name="BAAI/bge-small-en",
    pretrained_index_path='retrieval/20230601.en.wiki_embeddings_bge-small-en_IVF.index',
)

  from .autonotebook import tqdm as notebook_tqdm


Loading model... done.


In [None]:
retriever("""First, decide whether the statement can even be given an accuracy label, writing "Determinable" if it can, and "N/A" if the statement doesn't make any claims that can be judged on their accuracy. Then, if the statement is "Determinable",  rate the extent to which it is true, accurate, and not misleading. Your rating must be one of "N/A", "Determinable - Certainly True", "Determinable - True", "Determinable - Uncertain", "Determinable - False", "Determinable - Certainly False". Ratings should be based on the quality of the new information introduced in a statement, and if anything in the statement is incorrect or misleading, the statement is considered incorrect. "Other popular hashing algorithms include MD5, SHA-1, and RSA[[1]]\"""", threshold=0.)

(['misses important safeguards. It does not check A, B, and U are not zero.\n\nbased on http://srp.stanford.edu/design.html\n"""\nimport hashlib\nimport random\n\n# Note: str converts as is, str([1,2,3,4]) will convert to "[1,2,3,4]"\ndef H(*args) -> int:\n    """A one-way hash function."""\n    a = ":".join(str(a) for a in args)\n    return int(hashlib.sha256(a.encode("utf-8")).hexdigest(), 16)\n\ndef cryptrand(n: int = 1024):\n    return random.SystemRandom().getrandbits(n) % N\n\n# A large safe prime (N = 2q+1, where q is prime)\n# All arithmetic is done modulo N\n# (generated using "openssl dhparam -text 1024")\nN = """00:c0:37:c3:75:88:b4:32:98:87:e6:1c:2d:a3:32:\n       4b:1b:a4:b8:1a:63:f9:74:8f:ed:2d:8a:41:0c:2f:\n   '],
 array([0.20563737], dtype=float32))

In [None]:
raw_df["assitant_text"].iloc[3]

'The best materials for making a terrarium are glass, plastic, wood, or stone. You will also need soil, plants, and stones to fill the terrarium. Depending on what you choose, you may want to use additional decorations such as figurines, mosses, or rocks.'

In [None]:
retriever("The great flood was caused by a massive asteroid impact. The asteroid hit Earth about 12,000 years ago and caused a tsunami that flooded the entire planet. This is why we have Noah’s Ark in the Bible. Don't let anyone convince you otherwise.", 5, threshold=0.1)

(['had attempted to work out mathematically based prophecies in the Bible for decades. In an interview with the San Francisco Chronicle he explained "...\xa0I was an engineer, I was very interested in the numbers. I\'d wonder, \'Why did God put this number in, or that number in?\' It was not a question of unbelief, it was a question of, \'There must be a reason for it.\'"\n\nIn 1970, Camping dated the Great Flood to 4990 BC. Using this date, taking the statement in Genesis 7:4 ("Seven days from now I will send rain on the earth") to be a prediction of',
  'a giant storm sent by God flooded the Earth. Noah and his family and the animals entered the Ark, and "the same day were all the fountains of the great deep broken up, and the windows of heaven were opened, and the rain was upon the earth forty days and forty nights." The flood covered even the highest mountains to a depth of more than twenty feet, and all creatures died; only Noah and those with him on the Ark were left alive.  In t

In [5]:
34 * len(wiki_chunks) / 10000 / 60

189.76573