### Importing Libraries

In [None]:
from datasets import load_dataset

# Load the "abisee/cnn_dailymail" dataset, configuration "3.0.0", via Hugging Face `datasets`.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [None]:
# Select the "train" split from the loaded dataset (no splitting is performed here)
train_ds = ds["train"]

In [20]:
from dotenv import load_dotenv
import os

# Load variables from .env file
load_dotenv()

# Read the API keys from the environment.
# os.getenv returns None when a variable is unset, so a missing key is not
# reported here; it only surfaces when a client is later built with it.
google_api_key = os.getenv("GOOGLE_API_KEY")
qdrant_api_key = os.getenv("QDRANT_API_KEY")

In [None]:
# Project a dataset record down to its article body
def extract_article(example):
    """Return a one-key mapping that holds the record's "article" value under "text"."""
    article_body = example["article"]
    return dict(text=article_body)

# Number of training articles to keep. The output file name promises 1000
# articles, and every later stage (embedding, upload) assumes a small corpus.
NUM_ARTICLES = 1000

# Start from ds["train"] instead of rebinding an already-mapped train_ds, so
# re-running this cell does not fail on the columns removed by a previous run.
train_ds = ds["train"].select(range(min(NUM_ARTICLES, len(ds["train"]))))
# Copy "article" into "text" and drop the original article/highlights columns
train_ds = train_ds.map(extract_article, remove_columns=["article", "highlights"])
# Save the processed subset to a CSV file
train_ds.to_csv("cnn_dailymail_train_1000_articles.csv", index=False)

In [None]:
import torch
import pandas as pd

print("Loading the processed dataset...")
train_ds = pd.read_csv("cnn_dailymail_train_1000_articles.csv")
print(f"Loaded {len(train_ds)} articles.")

# Encode every article with a sentence-transformers model
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded successfully.")
embeddings = model.encode(train_ds["text"].tolist(), convert_to_tensor=True)
print("Embeddings created successfully.")

print("Starting similarity search...")
query = "How much money did Daniel Radcliffe gained access to?"
query_embedding = model.encode(query, convert_to_tensor=True)
# Cosine similarity of the query against every article; [0] selects the row
# belonging to the single query.
cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
# torch.topk raises when k exceeds the number of scores, so cap k at the
# corpus size (at most the 5 most similar articles are shown).
top_k = min(5, len(train_ds))
top_results = torch.topk(cosine_scores, k=top_k)
for score, idx in zip(top_results.values, top_results.indices):
    # idx is a row position in the embeddings tensor, so index positionally
    # with .iloc rather than by index label.
    print(f"Score: {score.item():.4f}, Article: {train_ds['text'].iloc[idx.item()]}")

In [6]:
from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding 

In [7]:
# fastembed models for the three vector kinds used in this notebook:
# a dense text embedding, a sparse BM25 embedding, and a late-interaction
# (ColBERT v2) embedding.
dense_embedding_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
bm25_embedding_model = SparseTextEmbedding("Qdrant/bm25")
late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

In [None]:
import pandas as pd
import torch
import numpy as np
from tqdm.notebook import tqdm  # For progress tracking

# Load the article texts from the CSV written earlier, as a plain list of strings
documents = pd.read_csv("cnn_dailymail_train_1000_articles.csv")["text"].tolist()

# Create dense embeddings (full, untruncated article text)
print("Creating dense embeddings...")
dense_embeddings = list(dense_embedding_model.embed(doc for doc in documents))
print(f"Dense embeddings completed: {len(dense_embeddings)}")

# Create BM25 sparse embeddings (full, untruncated article text)
print("Creating BM25 sparse embeddings...")
bm25_embeddings = list(bm25_embedding_model.embed(doc for doc in documents))
print(f"BM25 embeddings completed: {len(bm25_embeddings)}")

# Process late interaction embeddings in batches to prevent OOM errors
print("Creating late interaction embeddings (ColBERT)...")
late_interaction_embeddings = []
batch_size = 10  # Process in small batches
max_length = 512  # Limit document length (applied below as a character count)

for i in tqdm(range(0, len(documents), batch_size)):
    batch_docs = documents[i:i+batch_size]
    # Truncate long documents to avoid memory issues.
    # NOTE: the slice keeps the first 512 *characters*, not tokens, and only the
    # ColBERT input is truncated; the dense and BM25 models above see the full text.
    truncated_docs = [doc[:max_length] for doc in batch_docs]
    try:
        batch_embeddings = list(late_interaction_embedding_model.embed(doc for doc in truncated_docs))
        late_interaction_embeddings.extend(batch_embeddings)
    except Exception as e:
        print(f"Error in batch {i//batch_size + 1}: {str(e)}")
        # Use zero placeholders for failed embeddings so the list stays aligned
        # with `documents`.
        # NOTE(review): the same array object is repeated for every document in
        # the failed batch; the (1, 128) shape presumably matches the ColBERT
        # per-token vector size - confirm against the model output.
        late_interaction_embeddings.extend([np.zeros((1, 128))] * len(batch_docs))
        
print(f"Late interaction embeddings completed: {len(late_interaction_embeddings)}")
# NOTE: this line is printed unconditionally, even if some batches fell back to placeholders.
print("All embeddings created successfully.")

In [21]:
from qdrant_client.models import Distance, VectorParams, models
from qdrant_client import QdrantClient



# Create the client using the Qdrant cloud service
client = QdrantClient(
    url="https://3954ad4f-f6a9-4e8a-9c7d-7d00cfe00fe9.us-east4-0.gcp.cloud.qdrant.io",
    api_key=qdrant_api_key
)

# Create the collection only when it does not exist yet. On a fresh cluster the
# upload and query cells below need it; on an existing one the guard skips
# creation (and never touches the embedding variables), so this cell can be
# re-run safely or run on its own when only querying.
if not client.collection_exists("hybrid-search4"):
    client.create_collection(
        "hybrid-search4",
        vectors_config={
            # Dense vectors: size taken from the first computed embedding
            "all-MiniLM-L6-v2": models.VectorParams(
                size=len(dense_embeddings[0]),
                distance=models.Distance.COSINE,
            ),
            # ColBERT multivectors: one vector per token, compared with MaxSim
            "colbertv2.0": models.VectorParams(
                size=len(late_interaction_embeddings[0][0]),
                distance=models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM,
                ),
                hnsw_config=models.HnswConfigDiff(m=0),  # Disable HNSW: used for reranking only
            ),
        },
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF),
        },
    )

In [None]:
from qdrant_client.models import PointStruct
from tqdm.notebook import tqdm
import time

# Build one point per document, carrying all three named vectors plus the
# article text as payload. The point id is the document's position in `documents`.
points = [
    PointStruct(
        id=idx,
        vector={
            "all-MiniLM-L6-v2": dense_embedding,
            "bm25": bm25_embedding.as_object(),
            "colbertv2.0": late_interaction_embedding,
        },
        payload={"document": doc},
    )
    for idx, (dense_embedding, bm25_embedding, late_interaction_embedding, doc) in enumerate(
        zip(dense_embeddings, bm25_embeddings, late_interaction_embeddings, documents)
    )
]

# Upload in batches with retry logic to handle timeouts
batch_size = 20  # Points per upsert request
max_retries = 3
batches = [points[i:i + batch_size] for i in range(0, len(points), batch_size)]
failed_batches = []  # 1-based numbers of the batches that never uploaded

for i, batch in enumerate(tqdm(batches, desc="Uploading to Qdrant")):
    success = False

    for attempt in range(1, max_retries + 1):
        try:
            client.upsert(
                collection_name="hybrid-search4",
                points=batch
            )
        except Exception as e:
            print(f"Error with batch {i+1} (attempt {attempt}/{max_retries}): {str(e)}")
            # Back off only when another attempt will actually follow; after the
            # final attempt there is nothing to wait for.
            if attempt < max_retries:
                wait_time = 2 ** attempt  # Exponential backoff: 2, 4 seconds
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
        else:
            print(f"Batch {i+1}/{len(batches)} uploaded successfully")
            success = True
            # Short pause between batches to give the server time to process
            time.sleep(1)
            break

    if not success:
        failed_batches.append(i + 1)
        print(f"Failed to upload batch {i+1} after {max_retries} attempts. Continuing with next batch.")

# Make partial uploads visible instead of leaving them buried in the log above
if failed_batches:
    print(f"{len(failed_batches)} batch(es) were not uploaded: {failed_batches}")

In [8]:
query = "I want to know about Nicaragua President?"

# Embed the query with each of the three fastembed models. query_embed(...) is
# consumed with next(), taking the first result (only one query is passed).
dense_vectors = next(dense_embedding_model.query_embed(query))
sparse_vectors = next(bm25_embedding_model.query_embed(query))
late_vectors = next(late_interaction_embedding_model.query_embed(query))

In [9]:
from qdrant_client.models import models

# Candidate retrieval stage: up to 20 hits from the dense vectors and up to 20
# from the BM25 sparse vectors. The vector names match those used when the
# points were built.
prefetch = [
        models.Prefetch(
            query=dense_vectors,
            using="all-MiniLM-L6-v2",
            limit=20,
        ),
        models.Prefetch(
            # The sparse embedding is unpacked into the client's SparseVector model
            query=models.SparseVector(**sparse_vectors.as_object()),
            using="bm25",
            limit=20,
        ),
    ]

In [10]:
# Hybrid query: the prefetch entries supply the candidates and the main query
# uses the ColBERT query vectors against the "colbertv2.0" field; the 10 best
# points are returned with their payload.
results = client.query_points(
         "hybrid-search4",
        prefetch=prefetch,
        query=late_vectors,
        using="colbertv2.0",
        with_payload=True,
        limit=10,
)

# Print the results (the full article text is printed for every hit)
print("Top 10 results:")
for result in results.points:
    print(f"ID: {result.id}, Score: {result.score:.4f}, Document: {result.payload['document']}")

Top 10 results:
ID: 473, Score: 20.5680, Document: (CNN) -- Nicaragua President Daniel Ortega said Thursday that the nation is breaking relations with Colombia "in solidarity with the Ecuadoran people." Nicaragua President Daniel Ortega said the nation is breaking diplomatic relations with Colombia. The move comes after the Organization of American States passed a resolution Wednesday in hopes of easing tensions stemming from an attack by Colombian military on a rebel camp in neighboring Ecuador on Saturday. Since that attack, Ecuador has broken off relations with Colombia, and Venezuela says it has moved troops to its border with Colombia. Ortega made his televised remarks in Managua, where he was flanked by Ecuadoran President Rafael Correa. Colombian forces killed at least 17 members of the leftist group Revolutionary Armed Forces of Colombia on Saturday. FARC is estimated to be holding at least 700 hostages in the jungles of Colombia and has been accused by the United States of bei

In [11]:
# Dense-only search: query the collection with a single dense vector and no prefetch.
from sentence_transformers import SentenceTransformer

# Initialize the Qdrant client
# NOTE: this rebuilds `client` with the same URL and key as the earlier cell,
# and relies on `QdrantClient` having been imported there.
client = QdrantClient(
    url="https://3954ad4f-f6a9-4e8a-9c7d-7d00cfe00fe9.us-east4-0.gcp.cloud.qdrant.io",
    api_key=qdrant_api_key
)


# Step 1: Generate a dense embedding for the query (`query` comes from an earlier cell)
model = SentenceTransformer('all-MiniLM-L6-v2')  # or another appropriate model
query_vector = model.encode(query).tolist()  # Dense vector for the query

# Step 2: Query the dense vector field of the collection
results = client.query_points(
    "hybrid-search4",  # Your collection name
    query=query_vector,  # Use query instead of query_vector
    using="all-MiniLM-L6-v2",  # Specify which vector field to use
    with_payload=True,
    limit=10
)

# Print the results
print("Top 10 results:")
for result in results.points:
    print(f"ID: {result.id}, Score: {result.score:.4f}, Document: {result.payload['document']}")

Top 10 results:
ID: 473, Score: 0.5142, Document: (CNN) -- Nicaragua President Daniel Ortega said Thursday that the nation is breaking relations with Colombia "in solidarity with the Ecuadoran people." Nicaragua President Daniel Ortega said the nation is breaking diplomatic relations with Colombia. The move comes after the Organization of American States passed a resolution Wednesday in hopes of easing tensions stemming from an attack by Colombian military on a rebel camp in neighboring Ecuador on Saturday. Since that attack, Ecuador has broken off relations with Colombia, and Venezuela says it has moved troops to its border with Colombia. Ortega made his televised remarks in Managua, where he was flanked by Ecuadoran President Rafael Correa. Colombian forces killed at least 17 members of the leftist group Revolutionary Armed Forces of Colombia on Saturday. FARC is estimated to be holding at least 700 hostages in the jungles of Colombia and has been accused by the United States of bein

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI  # For Google Gemini
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage
import os


# Create the Gemini chat model
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",  # Gemini 2.0 Flash model
    temperature=0.7,     # Controls randomness
    max_output_tokens=1000,  # Maximum tokens in response
    disable_streaming=False,     # False leaves streaming enabled; set to True to disable it
    api_key=google_api_key  # Explicitly pass the API key
)

# Function to search the Qdrant collection and return documents
def search_qdrant(query, limit=3, prefetch_limit=20):
    """Run a hybrid search on the "hybrid-search4" collection.

    Candidates are prefetched with the dense and BM25 vectors, and the main
    query then uses the ColBERT late-interaction vectors.

    Args:
        query: The search text.
        limit: Number of documents to return.
        prefetch_limit: Number of candidates fetched by each prefetch branch.
            It is raised to `limit` when smaller, so the candidate pool is
            never smaller than the number of requested results.

    Returns:
        A list of langchain `Document` objects whose `page_content` is the
        stored "document" payload and whose metadata holds the point id and score.
    """
    # Embed the query with the same fastembed models that produced the stored
    # vectors, so query and document vectors come from one encoder per field.
    query_dense = next(dense_embedding_model.query_embed(query)).tolist()
    query_sparse = next(bm25_embedding_model.query_embed(query))
    query_late = next(late_interaction_embedding_model.query_embed(query))

    # A pool no larger than the final limit would leave the ColBERT stage with
    # almost nothing to choose from.
    candidate_limit = max(prefetch_limit, limit)

    results = client.query_points(
        collection_name="hybrid-search4",
        query=query_late,  # Late-interaction vectors drive the main query
        using="colbertv2.0",
        prefetch=[
            models.Prefetch(
                query=query_dense,
                using="all-MiniLM-L6-v2",
                limit=candidate_limit,
            ),
            models.Prefetch(
                query=models.SparseVector(**query_sparse.as_object()),
                using="bm25",
                limit=candidate_limit,
            ),
        ],
        with_payload=True,
        limit=limit,
    )

    # Convert results to Document objects
    return [
        Document(
            page_content=point.payload["document"],
            metadata={"id": point.id, "score": point.score},
        )
        for point in results.points
    ]

def rag_chat(question, k=3):
    """Answer `question` with the chat model, grounded in retrieved articles.

    Retrieves `k` documents through `search_qdrant`, joins their text into one
    context string placed in the system message, and sends the question as the
    human message.

    Returns:
        A dict with the model's "answer", the retrieved "sources", and the
        "context_length" in characters.
    """
    # Retrieval
    sources = search_qdrant(question, limit=k)

    # Retrieved articles are separated by a blank line inside the prompt
    context = "\n\n".join(doc.page_content for doc in sources)

    # Prompt assembly; the pieces concatenate to the instruction text followed
    # by the context (note the space kept before the first line break).
    system_prompt = (
        "You are a helpful AI assistant. Use the following context to answer the user's question. \n"
        "If you can't find the answer in the context, say so clearly.\n"
        "\n"
        "Context:\n"
        f"{context}"
    )

    # Generation
    reply = chat_model.invoke(
        [SystemMessage(content=system_prompt), HumanMessage(content=question)]
    )

    return dict(
        answer=reply.content,
        sources=sources,
        context_length=len(context),
    )

# Test the RAG chatbot with one question, using the default number of retrieved documents
question = "Who is the current president of Nicaragua and what are some of his policies?"
result = rag_chat(question)

# Show the answer together with the size of the context it was given
print("Question:", question)
print("\nAnswer:", result["answer"])
print(f"\nContext length: {result['context_length']} characters")
print(f"Used {len(result['sources'])} source documents")

Question: Who is the current president of Nicaragua and what are some of his policies?

Answer: According to the article, the current president of Nicaragua is Daniel Ortega. One of his policies mentioned is breaking relations with Colombia "in solidarity with the Ecuadoran people." He stated this was due to "the terrorist policies that the government of [Colombian President] Alvaro Uribe is practicing."

Context length: 7605 characters
Used 3 source documents
