# Chapter 5

## Retrieval and Re-ranking

In [1]:
%load_ext autoreload
%autoreload 2

# Load environment variables from a local .env file before the clients below
# are created (presumably the W&B / Cohere API keys — confirm against the
# course setup instructions).
from dotenv import load_dotenv

load_dotenv()
# Jupyter already runs an event loop; nest_asyncio patches asyncio so the
# asyncio.run(...) calls later in this notebook can be made from inside it.
import nest_asyncio
nest_asyncio.apply()
import asyncio

import wandb
import weave
import pathlib
import pandas as pd
import json

In [2]:
# W&B destination shared by the tracking run and the Weave client.
WANDB_ENTITY = "rag-course"
WANDB_PROJECT = "dev"

# Opt in to the wandb-core SDK backend before any run is started.
wandb.require("core")

# Start the run that this chapter's artifact usage is recorded against;
# runs are grouped per chapter.
run = wandb.init(entity=WANDB_ENTITY, project=WANDB_PROJECT, group="Chapter 5")

# Point Weave at the same entity/project so evaluations are logged alongside the run.
weave_client = weave.init(f"{WANDB_ENTITY}/{WANDB_PROJECT}")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m ([33mrag-course[0m). Use [1m`wandb login --relogin`[0m to force relogin


Logged in as Weights & Biases user: parambharat.
View Weave data at https://wandb.ai/rag-course/dev/weave


In [3]:
# Reload the chunked documents produced in Chapter 3 from the W&B artifact store.
chunked_artifact = run.use_artifact(
    f"{WANDB_ENTITY}/{WANDB_PROJECT}/chunked_data:latest", type="dataset"
)
artifact_dir = chunked_artifact.download()
chunked_data_file = pathlib.Path(artifact_dir) / "documents.jsonl"

# Parse the JSONL file one record per physical line.
# - Iterating the file object splits only on real newlines; str.splitlines()
#   would also split on characters such as U+2028 / U+0085 / form feed, which
#   may legally appear unescaped inside a JSON string and would cut a record
#   in half.
# - The encoding is pinned to UTF-8 (the JSON interchange encoding) instead of
#   relying on the platform default.
# - Blank lines are skipped rather than passed to json.loads, which would raise.
with chunked_data_file.open(encoding="utf-8") as jsonl_file:
    chunked_data = [json.loads(line) for line in jsonl_file if line.strip()]

# Preview the first two chunks.
chunked_data[:2]

2024/07/24 14:37:51 [DEBUG] GET https://storage.googleapis.com/wandb-production.appspot.com/rag-course/dev/j8uh2i2o/artifact/961260984/wandb_manifest.json?Expires=1721815671&GoogleAccessId=gorilla-files-url-signer-man%40wandb-production.iam.gserviceaccount.com&Signature=L85mt%2BfEZ%2BQPimMZhhCXZ4cPBM8Owf8Wc5hFs%2BZFrAj29XqKF%2BmXyxKELLhYH46QYBoGnSwCjIdjtW4RLNNj9R0p3%2Flx2hEb%2FRPuyI0NYLvTBOWKUWH%2FlpzgFXzSCgA0ZCupQBWk%2F3dqtDwHZ3wMJNY0VZhsch07aKf8z4iznaqB858KKRHg7BwwqAJvhuYjBoVRunut0k3ybxPxvchQmLFPDFB%2FJ2B%2BaC%2B19wt27b%2FV7xLkl%2FCOTE1YZ3e9GeeFVX1JYuCLrpvt2iz%2Fbb%2Feo37zxx6CSiSYWJ4WCDll3gPTK3vjuLPkBb11nqLuhXl0K26BXbAkM8K7acVL%2Bllt5w%3D%3D


[{'cleaned_content': 'Anonymous Mode Are you publishing code that you want anyone to be able to run easily? Use Anonymous Mode to let someone run your code, see a W&B dashboard, and visualize results without needing to create a W&B account first. Allow results to be logged in Anonymous Mode with wandb.init(anonymous="allow") :::info Publishing a paper? Please cite W&B, and if you have questions about how to make your code accessible while using W&B, reach out to us at support@wandb.com.\n::: How does someone without an account see results? If someone runs your script and you have to set anonymous="allow":  Auto-create temporary account: W&B checks for an account that\'s already signed in. If there\'s no account, we automatically create a new anonymous account and save that API key for the session. Log results quickly: The user can run and re-run the script, and automatically see results show up in the W&B dashboard UI.\nThese unclaimed anonymous runs will be available for 7 days. Claim

In [None]:
# We'll improve the retriever by switching to dense retrieval with Cohere embeddings.
# We'll also use a reranker to improve the quality of the retrieved documents.

In [4]:
# The retriever and reranker implementations live in the course's `scripts` package.
from scripts.retriever import DenseRetriever, DenseRetrieverWithReranker
from scripts.reranker import CohereReranker
from scripts.utils import display_source

# Show each component before using it: the dense retriever, the Cohere
# reranker, and (going by its name) the retriever that combines the two.
# NOTE(review): display_source is defined in scripts.utils and is not visible
# here; presumably it renders the class's source code in the notebook — confirm.
display_source(DenseRetriever)
display_source(CohereReranker)
display_source(DenseRetrieverWithReranker)


In [None]:
# Build the reranking dense retriever with its default settings and index the
# chunked documents loaded earlier in this notebook.
# NOTE(review): index_data presumably embeds every chunk, so this cell may be
# slow and call a remote embedding API — confirm in scripts.retriever.
dense_retriever = DenseRetrieverWithReranker()
dense_retriever.index_data(chunked_data)

In [5]:
from scripts.retrieval_metrics import ALL_METRICS as RETRIEVAL_METRICS

In [6]:
# Resolve the evaluation dataset from its pinned Weave object reference, so the
# exact same dataset version is used on every run of the notebook.
eval_dataset_ref = weave.ref(
    "weave:///rag-course/dev/object/Dataset:9O0EmmPINmYjgbXW3kucVrDxlTUQJQs0fVZYJj2mtOk"
)
eval_dataset = eval_dataset_ref.get()

In [7]:
# Preview the first ten evaluation rows (each row carries a 'question' and an 'answer').
eval_dataset.rows[:10]

[WeaveDict({'question': 'How can I access the run object from the Lightning WandBLogger function?', 'answer': "In PyTorch Lightning, the `WandbLogger` is used to log metrics, model weights, and other data to Weights & Biases during training. To access the `wandb.Run` object from within a `LightningModule` when using `WandbLogger`, you can use the `Trainer.logger.experiment` attribute. This attribute provides direct access to the underlying `wandb.Run` object, allowing you to interact with the Weights & Biases API directly.\n\nHere's how you can access the `wandb.Run` object using `WandbLogger` in PyTorch Lightning:\n\n```python\nfrom pytorch_lightning import Trainer, LightningModule\nfrom pytorch_lightning.loggers import WandbLogger\n\nclass MyModel(LightningModule):\n    def training_step(self, batch, batch_idx):\n        # Your training logic here\n        loss = ...\n\n        # Log metrics\n        self.log('train_loss', loss)\n\n        # Access the wandb.Run object\n        run =

In [8]:
# Retrieval-only evaluation over the first ten rows of the evaluation dataset,
# scored with the metrics imported from scripts.retrieval_metrics.
# preprocess_model_input maps each row's "question" onto the retriever's
# "query" argument and fixes top_k=10 and top_n=5 for every call.
# NOTE(review): presumably top_k is the number of candidates retrieved and
# top_n the number kept after reranking — confirm against the retriever's signature.
retrieval_evaluation = weave.Evaluation(
    name="Retrieval_Evaluation",
    dataset=eval_dataset.rows[:10],
    scorers=RETRIEVAL_METRICS,
    preprocess_model_input=lambda x: {"query": x["question"], "top_k": 10, "top_n": 5},
)
# Scoring the dense retriever is left disabled here; this notebook only runs
# this evaluation against the hybrid retriever further down. Uncomment to score it:
# dense_retrieval_scores = asyncio.run(retrieval_evaluation.evaluate(dense_retriever))

In [None]:
# Using the query enhancer, response generator, and RAG pipeline from the previous chapter

import cohere
from scripts.query_enhancer import QueryEnhancer
from scripts.response_generator import QueryEnhanedResponseGenerator
from scripts.rag_pipeline import QueryEnhancedRAGPipeline

query_enhancer = QueryEnhancer()

# Let's add the new prompt. It is read inside a `with` block so the file handle
# is closed deterministically instead of being left for the garbage collector.
with open("prompts/query_enhanced_system.txt") as prompt_file:
    QUERY_ENHANCED_PROMPT = prompt_file.read()

# Response generator backed by Cohere's async client and the prompt loaded above.
response_generator = QueryEnhanedResponseGenerator(
    model="command-r-plus", prompt=QUERY_ENHANCED_PROMPT, client=cohere.AsyncClient()
)

# Wire the pipeline together: query enhancement -> retrieval with the reranking
# dense retriever built earlier in this notebook -> response generation.
rag_pipeline = QueryEnhancedRAGPipeline(
    query_enhancer=query_enhancer,
    retriever=dense_retriever,
    response_generator=response_generator,
)

In [None]:
from scripts.response_metrics import ALL_METRICS as RESPONSE_METRICS

# End-to-end answer evaluation on the first ten rows of the evaluation dataset.
# Each row's "question" is handed to the pipeline under the "query" keyword.
response_evaluations = weave.Evaluation(
    name="Response_Evaluation",
    dataset=eval_dataset.rows[:10],
    scorers=RESPONSE_METRICS,
    preprocess_model_input=lambda x: {"query": x["question"]},
)

# evaluate() is awaitable; build the pending evaluation first, then drive it to
# completion with asyncio.run.
pending_response_evaluation = response_evaluations.evaluate(rag_pipeline)
query_enhanced_response_scores = asyncio.run(pending_response_evaluation)

In [9]:
from scripts.retriever import HybridRetrieverReranker

# Instantiate the hybrid retriever with its default settings; it holds no
# documents until index_data is called on it.
# NOTE(review): going by the name, this combines several retrieval strategies
# with a reranking step — confirm in scripts.retriever.
hybrid_retriever = HybridRetrieverReranker()

In [10]:
# Index the chunked documents loaded earlier so the hybrid retriever can serve queries.
hybrid_retriever.index_data(chunked_data)


Split strings:   0%|          | 0/696 [00:00<?, ?it/s]

In [11]:
# Score the hybrid retriever with the retrieval evaluation defined earlier;
# asyncio.run drives the awaitable returned by evaluate() to completion.
hybrid_retrieval_scores = asyncio.run(retrieval_evaluation.evaluate(hybrid_retriever))

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]