qdrant: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.kQJ6CWeOmR9fLLWs_AxbhaueiSSVH4xX1V6G1OxnxzU

In [None]:
from datasets import load_dataset

# Download (or reuse the cached copy of) the CNN/DailyMail corpus, config "3.0.0".
ds = load_dataset(
    path="abisee/cnn_dailymail",
    name="3.0.0",
)

In [None]:
# Take the training split and keep only its first 1000 articles.
# The CSV written from this dataset is named "..._1000_articles.csv", so the
# export is expected to hold 1000 rows rather than the whole split.
num_articles = 1000
full_train_split = ds["train"]
# Clamp so a split shorter than num_articles does not raise on select().
train_ds = full_train_split.select(range(min(num_articles, len(full_train_split))))

In [None]:
# Keep only the article body, exposed under the key "text"
def extract_article(example):
    """Return a one-field record holding ``example["article"]`` under "text".

    Raises ``KeyError`` when *example* has no "article" entry.
    """
    article_body = example["article"]
    return dict(text=article_body)

# Replace the raw "article"/"highlights" columns with the single "text"
# column produced by extract_article.
columns_to_drop = ["article", "highlights"]
train_ds = train_ds.map(extract_article, remove_columns=columns_to_drop)

# Persist the processed split as CSV (no index column) for the later cells.
train_ds.to_csv("cnn_dailymail_train_1000_articles.csv", index=False)

In [None]:
import torch
# load the processed dataset
import pandas as pd

# Read the exported articles back from disk into a DataFrame.
print("Loading the processed dataset...")
processed_csv_path = "cnn_dailymail_train_1000_articles.csv"
train_ds = pd.read_csv(processed_csv_path)
print(f"Loaded {len(train_ds)} articles.")

# make the embeddings of the data
from sentence_transformers import SentenceTransformer, util
# Dense-only baseline: embed every article with MiniLM, then rank the
# articles by cosine similarity against a single query.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded successfully.")

# Materialise the texts once; the same list is used for encoding and for
# looking up the winning articles by position afterwards.
articles = train_ds["text"].tolist()
embeddings = model.encode(articles, convert_to_tensor=True)
print("Embeddings created successfully.")

print("Starting similarity search...")
query = "How much money did Daniel Radcliffe gained access to?"
query_embedding = model.encode(query, convert_to_tensor=True)

# Row 0 of the similarity matrix holds the scores of the single query
# against every article.
cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]

# torch.topk raises when k exceeds the number of scores, so clamp k for
# corpora with fewer than five articles.
top_k = min(5, cosine_scores.shape[0])
top_results = torch.topk(cosine_scores, k=top_k)

# Print the best matches, highest score first.
for score, idx in zip(top_results.values, top_results.indices):
    print(f"Score: {score.item():.4f}, Article: {articles[idx.item()]}")

In [None]:
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi

# Hugging Face identifiers for the two downloads performed in this cell.
bm25_tokenizer_name = "bert-base-uncased"
colbert_checkpoint = "colbert-ir/colbertv2.0"

# BM25 comes from the rank_bm25 library; the BERT tokenizer is loaded next to it.
tokenizer = AutoTokenizer.from_pretrained(bm25_tokenizer_name)
# Alias for the BM25Okapi class itself -- it is instantiated only when needed.
SparseTextEmbedding = BM25Okapi

# ColBERT weights fetched from Hugging Face through the generic AutoModel class.
LateInteractionTextEmbedding = AutoModel.from_pretrained(colbert_checkpoint)

In [None]:
# Write the tokenizer and the ColBERT model to local directories.
save_targets = (
    (tokenizer, "bm25_tokenizer"),
    (LateInteractionTextEmbedding, "colbert_model"),
)
for artifact, target_dir in save_targets:
    artifact.save_pretrained(target_dir)

In [7]:
from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding 

In [8]:
# One fastembed model per retrieval signal: dense, sparse BM25, and
# late-interaction (ColBERT).
dense_embedding_model = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
bm25_embedding_model = SparseTextEmbedding(
    model_name="Qdrant/bm25",
)
late_interaction_embedding_model = LateInteractionTextEmbedding(
    model_name="colbert-ir/colbertv2.0",
)

In [None]:
import pandas as pd
import torch
import numpy as np
from tqdm.notebook import tqdm  # For progress tracking

# Load the article texts. Every embedding list built below has one entry per
# document, in the same order as this list.
documents = pd.read_csv("cnn_dailymail_train_1000_articles.csv")["text"].tolist()

# Create dense embeddings
print("Creating dense embeddings...")
dense_embeddings = list(dense_embedding_model.embed(documents))
print(f"Dense embeddings completed: {len(dense_embeddings)}")

# Create BM25 sparse embeddings
print("Creating BM25 sparse embeddings...")
bm25_embeddings = list(bm25_embedding_model.embed(documents))
print(f"BM25 embeddings completed: {len(bm25_embeddings)}")

# Process late interaction embeddings in batches to prevent OOM errors
print("Creating late interaction embeddings (ColBERT)...")
late_interaction_embeddings = []
failed_batches = []  # 1-based numbers of batches that fell back to placeholders
batch_size = 10  # Process in small batches
max_length = 512  # Cap applied by string slicing, i.e. in characters
# Shape of the zero placeholder stored for a document whose batch failed.
# NOTE(review): 128 is assumed to match the ColBERT vector width -- confirm.
placeholder_shape = (1, 128)

for i in tqdm(range(0, len(documents), batch_size)):
    batch_docs = documents[i:i + batch_size]
    # Truncate long documents to avoid memory issues
    truncated_docs = [doc[:max_length] for doc in batch_docs]
    try:
        batch_embeddings = list(late_interaction_embedding_model.embed(truncated_docs))
    except Exception as e:
        batch_number = i // batch_size + 1
        failed_batches.append(batch_number)
        print(f"Error in batch {batch_number}: {str(e)}")
        # Keep the list aligned with `documents`. Each placeholder is its own
        # array so the entries do not alias a single shared object.
        batch_embeddings = [np.zeros(placeholder_shape) for _ in batch_docs]
    late_interaction_embeddings.extend(batch_embeddings)

print(f"Late interaction embeddings completed: {len(late_interaction_embeddings)}")
if failed_batches:
    # Do not claim success when some documents only have zero placeholders.
    print(
        f"Warning: {len(failed_batches)} late interaction batch(es) failed and "
        f"were filled with zero placeholders: {failed_batches}"
    )
else:
    print("All embeddings created successfully.")

In [2]:
from qdrant_client.models import Distance, VectorParams, models
from qdrant_client import QdrantClient


import os

# Read the Qdrant Cloud API key from the environment instead of embedding it
# in the notebook, so the credential is never committed with the code.
# NOTE: the key that used to be hard-coded here must be treated as exposed
# and rotated in the Qdrant Cloud console.
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
if not qdrant_api_key:
    raise RuntimeError(
        "QDRANT_API_KEY is not set; export it before running this cell."
    )

# Create the client using the Qdrant cloud service. The cluster URL can be
# overridden through QDRANT_URL; the default is the cluster this notebook used.
client = QdrantClient(
    url=os.environ.get(
        "QDRANT_URL",
        "https://3954ad4f-f6a9-4e8a-9c7d-7d00cfe00fe9.us-east4-0.gcp.cloud.qdrant.io",
    ),
    api_key=qdrant_api_key,
)

# client.create_collection(
#     "hybrid-search4",
#     vectors_config={
#         "all-MiniLM-L6-v2": models.VectorParams(
#             size=len(dense_embeddings[0]),
#             distance=models.Distance.COSINE,
#         ),
#         "colbertv2.0": models.VectorParams(
#             size=len(late_interaction_embeddings[0][0]),
#             distance=models.Distance.COSINE,
#             multivector_config=models.MultiVectorConfig(
#                 comparator=models.MultiVectorComparator.MAX_SIM,
#             ),
#             hnsw_config=models.HnswConfigDiff(m=0)  #  Disable HNSW for reranking
#         ),
#     },
#     sparse_vectors_config={
#         "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF
#         )
#     }
# )

In [None]:
from qdrant_client.models import PointStruct
from tqdm.notebook import tqdm
import time

# Build one point per document. Each point carries the three named vectors
# (dense, sparse BM25, ColBERT multivector) and the document text as payload.
points = [
    PointStruct(
        id=idx,
        vector={
            "all-MiniLM-L6-v2": dense_embedding,
            "bm25": bm25_embedding.as_object(),
            "colbertv2.0": late_interaction_embedding,
        },
        payload={"document": doc},
    )
    for idx, (dense_embedding, bm25_embedding, late_interaction_embedding, doc) in enumerate(
        zip(dense_embeddings, bm25_embeddings, late_interaction_embeddings, documents)
    )
]

# Upload in batches with retry logic to handle timeouts
batch_size = 20  # Small batches keep each upsert request short
max_retries = 3
batches = [points[i:i + batch_size] for i in range(0, len(points), batch_size)]
failed_batches = []  # 1-based numbers of batches that exhausted all attempts

for i, batch in enumerate(tqdm(batches, desc="Uploading to Qdrant")):
    for attempt in range(1, max_retries + 1):
        try:
            client.upsert(
                collection_name="hybrid-search4",
                points=batch,
            )
        except Exception as e:
            print(f"Error with batch {i+1} (attempt {attempt}/{max_retries}): {str(e)}")
            # Back off only when another attempt will actually follow.
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff: 2, 4 seconds
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        else:
            print(f"Batch {i+1}/{len(batches)} uploaded successfully")
            # Small delay between batches so the server can process the upsert
            time.sleep(1)
            break
    else:
        # The loop ended without `break`: every attempt failed.
        failed_batches.append(i + 1)
        print(f"Failed to upload batch {i+1} after {max_retries} attempts. Continuing with next batch.")

if failed_batches:
    # Summarise so missing data is not overlooked in the per-batch log.
    print(f"{len(failed_batches)} batch(es) were not uploaded: {failed_batches}")

In [9]:
# Embed the search query once with each of the three models. Every
# query_embed call returns an iterator, and only its first item is taken.
query = "How much money did Daniel Radcliffe gained access to?"

dense_query_iter = dense_embedding_model.query_embed(query)
sparse_query_iter = bm25_embedding_model.query_embed(query)
late_query_iter = late_interaction_embedding_model.query_embed(query)

dense_vectors = next(dense_query_iter)
sparse_vectors = next(sparse_query_iter)
late_vectors = next(late_query_iter)

In [10]:
from qdrant_client.models import models

# First-stage candidate retrieval: up to 20 hits from the dense vector and
# up to 20 from the BM25 sparse vector.
dense_prefetch = models.Prefetch(
    query=dense_vectors,
    using="all-MiniLM-L6-v2",
    limit=20,
)
sparse_query = models.SparseVector(**sparse_vectors.as_object())
sparse_prefetch = models.Prefetch(
    query=sparse_query,
    using="bm25",
    limit=20,
)
prefetch = [dense_prefetch, sparse_prefetch]

In [11]:
# Query the collection with the ColBERT query vectors, passing the dense and
# BM25 prefetch stages, and return the ten best points with their payloads.
results = client.query_points(
    collection_name="hybrid-search4",
    query=late_vectors,
    using="colbertv2.0",
    prefetch=prefetch,
    limit=10,
    with_payload=True,
)

# Show id, score and stored document text for every returned point.
print("Top 10 results:")
for result in results.points:
    print(f"ID: {result.id}, Score: {result.score:.4f}, Document: {result.payload['document']}")

Top 10 results:
ID: 0, Score: 26.5570, Document: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one