In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity

In [2]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
model = AutoModel.from_pretrained("colbert-ir/colbertv2.0").eval()

In [16]:
# Define your query
query = "What did Sam Altman say?"
query_tokens = tokenizer(query, return_tensors='pt', padding=True, truncation=True)
query_embeddings = model(**query_tokens).last_hidden_state
query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=-1)

In [17]:
def chunk_text(text, max_length):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    
    # Calculate the effective maximum chunk size, accounting for special tokens
    max_chunk_size = max_length - tokenizer.num_special_tokens_to_add()
    
    # Initialize variables
    chunked_tokens = []
    
    # Start chunking
    for i in range(0, len(tokens), max_chunk_size):
        chunk = tokens[i:i + max_chunk_size]
        chunked_tokens.append(chunk)
    
    return [" ".join(tokenizer.convert_tokens_to_string(chunk)) for chunk in chunked_tokens]

In [18]:
with open("paul_graham/paul_graham_essay.txt", "r", encoding="utf-8") as file:
    essay_text = file.read()

In [19]:
document_chunks = chunk_text(essay_text, 512)

In [20]:
# Initialize list to store scores
max_similarity_scores_all_docs = []

In [21]:
# Process each chunk
for chunk in document_chunks:
    # Tokenize and encode the chunk with necessary padding and truncation
    document_tokens = tokenizer(chunk, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    
    with torch.no_grad():
        document_embeddings = model(**document_tokens).last_hidden_state
        document_embeddings = torch.nn.functional.normalize(document_embeddings, p=2, dim=-1)

    # Compute similarity as before
    similarity_scores = cosine_similarity(query_embeddings.squeeze(0)[:, None, :], document_embeddings.squeeze(0)[None, :, :], dim=-1)
    max_similarity_scores = similarity_scores.max(dim=1).values
    document_score = max_similarity_scores.mean()
    max_similarity_scores_all_docs.append(document_score.item())

In [22]:
# Output the scores
for i, score in enumerate(max_similarity_scores_all_docs):
    print(f"Chunk {i+1}: Score = {score:.4f}")

Chunk 1: Score = 0.3063
Chunk 2: Score = 0.3079
Chunk 3: Score = 0.3097
Chunk 4: Score = 0.3096
Chunk 5: Score = 0.3166
Chunk 6: Score = 0.3116
Chunk 7: Score = 0.3202
Chunk 8: Score = 0.2948
Chunk 9: Score = 0.3120
Chunk 10: Score = 0.3208
Chunk 11: Score = 0.3069
Chunk 12: Score = 0.3036
Chunk 13: Score = 0.3007
Chunk 14: Score = 0.3041
Chunk 15: Score = 0.3048
Chunk 16: Score = 0.2970
Chunk 17: Score = 0.2954
Chunk 18: Score = 0.3174
Chunk 19: Score = 0.3175
Chunk 20: Score = 0.3043
Chunk 21: Score = 0.2968
Chunk 22: Score = 0.2987
Chunk 23: Score = 0.3068
Chunk 24: Score = 0.3034
Chunk 25: Score = 0.3030
Chunk 26: Score = 0.3058
Chunk 27: Score = 0.3105
Chunk 28: Score = 0.3022
Chunk 29: Score = 0.3058
Chunk 30: Score = 0.2949
Chunk 31: Score = 0.3235
Chunk 32: Score = 0.2984
Chunk 33: Score = 0.3090
Chunk 34: Score = 0.3139
