In [2]:
import torch
from FlagEmbedding import BGEM3FlagModel
from pathlib import Path

model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

book = '109 East Palace'
book_file_path = Path('..') / 'data' / 'bookcompanion'  / f'{book}.txt'
book_content = book_file_path.read_text(encoding='utf-8')
print(f"{book} has {len(book_content)} characters.")
book_tokenized = model.tokenizer(book_content, return_tensors='pt')['input_ids']
print(f"{book} has {len(book_tokenized[0])} tokens.")



Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


109 East Palace has 932927 characters.


Token indices sequence length is longer than the specified maximum sequence length for this model (226411 > 8192). Running this sequence through the model will result in indexing errors


109 East Palace has 226411 tokens.


In [3]:
max_length = model.tokenizer.model_max_length
print(f'{max_length=}')
print(book_tokenized.shape)

max_length=8192
torch.Size([1, 226411])


In [4]:
# naive_chunks = torch.chunk(book_tokenized,chunks=(len(book_tokenized[0])//max_length)+1,dim=1)
# print(f'{len(book_tokenized[0])=}')
# print(f'{len(naive_chunks)=}')
# print(naive_chunks[0].shape)
# # pad each tensor in naive_chunks to max_length
# naive_chunks = [torch.nn.functional.pad(chunk, (0, max_length - chunk.shape[1])) for chunk in naive_chunks]
# batch_of_chunks = torch.stack(naive_chunks).squeeze(1)
# print(batch_of_chunks.shape)

In [5]:
num_chunks=(book_tokenized.shape[1]//max_length)+1
chunk_len = len(book_content)//num_chunks
batch_of_text_chunks = [book_content[i:i+chunk_len] for i in range(0, len(book_content), chunk_len)]
vectors = model.encode(batch_of_text_chunks, max_length=max_length,batch_size=4,return_colbert_vecs=True, return_dense=True)

In [6]:
print(f'{len(batch_of_text_chunks)=}')
print(f'{len(vectors)=}')
print(vectors.keys())
print(vectors['colbert_vecs'][1].shape)

len(batch_of_text_chunks)=29
len(vectors)=3
dict_keys(['dense_vecs', 'lexical_weights', 'colbert_vecs'])
(8035, 1024)


In [151]:
def chunk_colbert_vecs(vecs):
    num_tokens = len(vecs)
    if num_tokens//100 > 0:
        chunked_vecs = torch.chunk(torch.tensor(vecs), chunks=num_tokens//200)
    else:
        chunked_vecs = (vecs,)
    
    return chunked_vecs

chunked_vecs = chunk_colbert_vecs(vectors['colbert_vecs'][0])
print(f'{len(chunked_vecs)=}')
print(f'{chunked_vecs[0].shape=}')
chunk_len = chunked_vecs[0].shape[0]
book_tokenized = model.tokenizer(
                batch_of_text_chunks,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=max_length,
            )
print(f'{book_tokenized=}')
col_vecs_size = [v.shape[0] for v in vectors['colbert_vecs']]
book_chunks = []
for tokens, attn_mask in zip(book_tokenized['input_ids'], book_tokenized['attention_mask']):
    token_num = torch.sum(attn_mask)
    book_chunks.append(tokens[:token_num-1])

book_tokenized_size = [b.shape[0] for b in book_chunks]
print(f'{sum(col_vecs_size)=}')
print(f'{sum(book_tokenized_size)=}')
print(f'{book_tokenized['input_ids']}')






len(chunked_vecs)=40
chunked_vecs[0].shape=torch.Size([201, 1024])
book_tokenized={'input_ids': tensor([[    0,  9804, 13055,  ...,     1,     1,     1],
        [    0,    28,  1295,  ...,     1,     1,     1],
        [    0,   104,     4,  ...,     1,     1,     1],
        ...,
        [    0,    91,  2837,  ...,  4516,     4,     2],
        [    0, 26759,    18,  ...,  2480,    16,     2],
        [    0,   111, 15672,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}
sum(col_vecs_size)=225482
sum(book_tokenized_size)=225482
tensor([[    0,  9804, 13055,  ...,     1,     1,     1],
        [    0,    28,  1295,  ...,     1,     1,     1],
        [    0,   104,     4,  ...,     1,     1,     1],
        ...,
        [    0,    91,  2837,  ...,  4516,     4,     2],
 

In [152]:
all_chunks = [chunk_colbert_vecs(v) for v in vectors['colbert_vecs']]
all_chunks = [torch.tensor(chunk).mean(dim=0) for doc in all_chunks for chunk in doc]


print(f'{len(all_chunks)=}')
print(f'{all_chunks[0].shape=}')
sims = torch.asarray([1,2,3])
db = torch.stack(all_chunks)

query = "What is trinity site?"
vec= model.encode(query, return_colbert_vecs=True, return_dense=True)
vec = torch.tensor(vec['colbert_vecs'])
vec.shape

len(all_chunks)=1114
all_chunks[0].shape=torch.Size([1024])


  all_chunks = [torch.tensor(chunk).mean(dim=0) for doc in all_chunks for chunk in doc]


torch.Size([7, 1024])

In [153]:
q_ready = vec.mean(dim=0)
# cosine similarity
sim_scores = torch.cosine_similarity(db, q_ready, dim=1)
sim_max = torch.topk(sim_scores,3)
sim_index = torch.topk(sim_scores,3).indices
print(f'{sim_max=}')
sim_scores




sim_max=torch.return_types.topk(
values=tensor([0.2994, 0.2614, 0.2515]),
indices=tensor([   0, 1073, 1113]))


tensor([0.2994, 0.2006, 0.1894,  ..., 0.2055, 0.1914, 0.2515])

In [154]:
print(len(book_chunks[0]))

8035


In [155]:
book_chunks_new = torch.concat(book_chunks)
book_chunks_new = torch.chunk(book_chunks_new, chunks=len(all_chunks))
len(book_chunks_new)



1111

In [156]:
for idx in sim_index:
    if idx >= len(book_chunks_new):
        continue
    print(model.tokenizer.decode(book_chunks_new[idx]))

<s> ALSO BY JENNET CONANT Tuxedo Park:A Wall Street Tycoon and the Secret Palace of Science That Changed the Course of World War II SIMON & SCHUSTER Rockefeller Center 1230 Avenue of the Americas New York, NY 10020 Copyright © 2005 by Jennet Conant All rights reserved, including the right of reproduction in whole or in part in any form. SIMON & SCHUSTER and colophon are registered trademarks of Simon & Schuster, Inc. Library of Congress Cataloging-in-Publication Data Conant, Jennet. 109 East Palace : Robert Oppenheimer and the secret city of Los Alamos / Jennet Conant. p. cm. Includes bibliographical references. 1. Los Alamos Scientific Laboratory—History. 2. Manhattan Project (U.S.)—History. 3. Atomic bomb—United States—History. 4. McKibbin, Dorothy
New York. QUOTATIONS FROM NEWSPAPER ARTICLES AND MAGAZINES “Baggage, Babies and the Atom Bomb: The Unique 20 years of Dorothy McKibbin.” Los Alamos Scientific Laboratory News, June 28, 1963. Corbett, Peggy. “AEC Office in SF Closes.” The N

In [181]:
import nltk
nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [182]:
naive_chunks = nltk.sent_tokenize(book_content)
vectors= model.encode(naive_chunks,batch_size=16, return_dense=True)


Inference Embeddings: 100%|██████████| 364/364 [01:19<00:00,  4.60it/s]


In [189]:
query = "Relationship between albert einstein and oppenheimer's girlfriend?"
qv = model.encode(query, return_dense=True)
sims = torch.cosine_similarity(torch.tensor(vectors['dense_vecs']), torch.tensor(qv['dense_vecs']), dim=1)
torch.topk(sims,3)

torch.return_types.topk(
values=tensor([0.6450, 0.6426, 0.6313], dtype=torch.float16),
indices=tensor([3742, 2288, 1312]))

In [190]:
for idx in torch.topk(sims,5).indices:
    if idx >= len(naive_chunks):
        continue
    print('------')
    print(naive_chunks[idx])

------
Oppenheimer made her one of his famous martinis and introduced her around.
------
Oppenheimer ended up missing his train and stayed the night in her apartment.
------
Oppenheimer asked her to come with him and offered her a position on his personal staff.
------
Oppenheimer kept putting her off.
------
“The trouble with Oppenheimer is that he loves a woman who doesn’t love him—the United States Government,” Einstein observed of his Princeton colleague, with whom he was never particularly close.Oppenheimer had been humbled, but not destroyed.
