 # Eval with Labels

## Setup

In [9]:
import os
import openai
from pathlib import Path
from pprint import pprint
import ray
from tqdm import tqdm

In [36]:
# Make modules in the parent directory importable from this notebook
import sys
sys.path.append("..")

# Suppress all warnings in cell output
import warnings
warnings.filterwarnings("ignore")

# Load environment variables via python-dotenv (the credentials cell reads os.environ)
from dotenv import load_dotenv
load_dotenv()

True

In [37]:
# Shared storage location and repository root (parent of the notebook's working directory)
EFS_DIR = Path("/efs/shared_storage/simon")
ROOT_DIR = Path.cwd().parent
print(ROOT_DIR)

/home/ray/default/llm-applications


In [38]:
# Credentials
ray.init(runtime_env={"env_vars": {
    "OPENAI_API_BASE": os.environ["OPENAI_API_BASE"],
    "OPENAI_API_KEY": os.environ["OPENAI_API_KEY"], 
    "ANYSCALE_API_BASE": os.environ["ANYSCALE_API_BASE"],
    "ANYSCALE_API_KEY": os.environ["ANYSCALE_API_KEY"],
    "DB_CONNECTION_STRING": os.environ["DB_CONNECTION_STRING"],
}})

2023-08-31 00:34:29,068	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.30.102:6379...
2023-08-31 00:34:29,078	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com [39m[22m
2023-08-31 00:34:29,082	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_591bfdbdda85a4ebb0a1c8ccab90c290.zip' (0.74MiB) to Ray cluster...
2023-08-31 00:34:29,084	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_591bfdbdda85a4ebb0a1c8ccab90c290.zip'.


0,1
Python version:,3.9.15
Ray version:,2.6.2
Dashboard:,http://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com


### Utils 

In [20]:
import json

def write_json(data, filename):
    """Serialize `data` as JSON with a 4-space indent into `filename`.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, 'w') as out_file:
        json.dump(data, fp=out_file, indent=4)

def read_json(filename):
    """Parse the JSON document stored in `filename` and return the resulting object."""
    with open(filename, 'r') as in_file:
        return json.load(in_file)

## Load corpus

In [22]:
# Load the evaluation corpus; later cells read the "source" and "text" keys of each entry
sections = read_json(Path(ROOT_DIR, "datasets/eval_full_corpus.json"))

## Load (question, source) labels 

In [13]:
import re
import json
from pathlib import Path

In [43]:
# The label file is JSON Lines: every line is parsed as its own JSON object
labels_path = Path(ROOT_DIR, "datasets/eval-dataset-v1.jsonl")
with open(labels_path, "r") as f:
    val_dataset = [json.loads(line) for line in f]

In [44]:
# Clean up
for row in val_dataset:
    row["source"] = row["source"].replace("https://docs.ray.io/en/latest/", "https://docs.ray.io/en/master/")

In [45]:
# Preview the first five (question, source) labels
val_dataset[:5]

[{'question': 'I’m struggling a bit with Ray Data type conversions when I do map_batches. Any advice?',
  'source': 'https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format'},
 {'question': 'How does autoscaling work in a Ray Serve application?',
  'source': 'https://docs.ray.io/en/master/serve/scaling-and-resource-allocation.html#autoscaling'},
 {'question': 'how do I get the address of a ray node',
  'source': 'https://docs.ray.io/en/master/ray-core/miscellaneous.html#node-information'},
 {'question': 'Does Ray support NCCL?',
  'source': 'https://docs.ray.io/en/master/ray-more-libs/ray-collective.html'},
 {'question': 'could you give me an example of using this library for data-parallel training of CNNs on Ray?',
  'source': 'https://docs.ray.io/en/master/ray-air/computer-vision.html#training-vision-models'}]

In [23]:
# Section per document (page) dict
# Every source URL maps to its text. Page-level URLs (those without a "#anchor")
# are then replaced by the page's own text joined with the text of all of its
# anchored sections, in corpus order.
section_texts = {section["source"]: section["text"] for section in sections}

# Group the *original* texts by page URL (everything before the first "#").
# Grouping on the exact page URL avoids pulling in unrelated pages that merely
# share a URL prefix, and reading from this snapshot (instead of the dict being
# overwritten) keeps the result stable when a page appears more than once.
texts_per_page = {}
for source, text in section_texts.items():
    texts_per_page.setdefault(source.split("#", 1)[0], []).append(text)

sections_per_doc = dict(section_texts)
for section in sections:
    page = section["source"]
    if "#" not in page:
        sections_per_doc[page] = "\n".join(texts_per_page[page])

In [26]:
from llama_index import Document

def to_doc(entry_dict):
    """Build a llama_index Document from a corpus entry.

    `entry_dict` must provide 'text' (used as the document text) and 'source'
    (stored under the 'source' metadata key).
    """
    body = entry_dict['text']
    source_metadata = {'source': entry_dict['source']}
    return Document(text=body, metadata=source_metadata)

In [27]:
# One Document per corpus section, in corpus order
docs = list(map(to_doc, sections))

## Build Index

In [24]:
from llama_index import VectorStoreIndex, Document, ServiceContext
from llama_index.embeddings import OpenAIEmbedding, LangchainEmbedding
from langchain.embeddings import HuggingFaceEmbeddings

In [25]:
def build_index(
    docs,
    chunk_size,
    embed_model='text-embedding-ada-002',
):
    """Create a VectorStoreIndex over `docs`.

    Args:
        docs: documents passed to `VectorStoreIndex.from_documents`.
        chunk_size: chunk size forwarded to `ServiceContext.from_defaults`.
        embed_model: 'text-embedding-ada-002' selects `OpenAIEmbedding`; any
            other value is used as the `model_name` of a `HuggingFaceEmbeddings`
            instance wrapped in `LangchainEmbedding`.

    Returns:
        The index built with progress bars enabled.
    """
    # Both embedding backends are configured with an embedding batch size of 100.
    if embed_model != 'text-embedding-ada-002':
        hf_embeddings = HuggingFaceEmbeddings(model_name=embed_model)
        embedder = LangchainEmbedding(hf_embeddings, embed_batch_size=100)
    else:
        embedder = OpenAIEmbedding(embed_batch_size=100)

    context = ServiceContext.from_defaults(chunk_size=chunk_size, embed_model=embedder)
    return VectorStoreIndex.from_documents(docs, service_context=context, show_progress=True)

In [46]:
def evaluate_index(
    dataset,
    index,
    top_k=5,
    verbose=False,
):
    """Check, for every labelled question, whether retrieval finds its source.

    Args:
        dataset: iterable of dicts with 'question' and 'source' keys.
        index: object exposing `as_retriever(similarity_top_k=...)`.
        top_k: number of nodes requested from the retriever per question.
        verbose: accepted but not used anywhere in the body.

    Returns:
        A list with one dict per dataset entry (in order) holding 'is_hit',
        'retrieved', 'expected' and 'query'.
    """
    retriever = index.as_retriever(similarity_top_k=top_k)

    def score_entry(entry):
        question = entry['question']
        label = entry['source']

        hits = retriever.retrieve(question)
        sources = [hit.node.metadata['source'] for hit in hits]

        # Exactly one relevant source is assumed per question, so a hit is an
        # exact match between the label and any retrieved source.
        return {
            'is_hit': label in sources,
            'retrieved': sources,
            'expected': label,
            'query': question,
        }

    return [score_entry(entry) for entry in tqdm(dataset)]

In [47]:
def evaluate(
    docs, 
    eval_dataset,
    chunk_size=1024,
    embed_model="text-embedding-ada-002",
    top_k=5,
    verbose=True,
):
    """Build an index over `docs` and evaluate retrieval on `eval_dataset`.

    `chunk_size` and `embed_model` are forwarded to `build_index`; `top_k` and
    `verbose` are forwarded to `evaluate_index`, whose result list is returned.
    """
    return evaluate_index(
        eval_dataset,
        build_index(docs, chunk_size, embed_model),
        top_k,
        verbose=verbose,
    )

## Standard approach

In [59]:
# pandas is imported in this cell because, in document order, no earlier cell
# imports it; without this a fresh "Restart & Run All" fails with NameError on `pd`.
import pandas as pd

# Baseline run: evaluate() defaults (chunk_size=1024, top_k=5) with a
# sentence-transformers embedding model.
experiment = {
    'embed_model': "sentence-transformers/all-mpnet-base-v2"
}
print(f'Running experiment with {experiment}')

val_result = evaluate(docs, val_dataset, **experiment)

# Hit rate = fraction of questions whose labelled source is among the retrieved sources
df = pd.DataFrame(val_result)
hit_rate = df['is_hit'].mean()
print(hit_rate)

Running experiment with {'embed_model': 'sentence-transformers/all-mpnet-base-v2'}


Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]


  0%|          | 0/179 [00:00<?, ?it/s][A
  1%|          | 1/179 [00:01<04:14,  1.43s/it][A
  1%|          | 2/179 [00:02<04:13,  1.43s/it][A
  2%|▏         | 3/179 [00:04<04:13,  1.44s/it][A
  2%|▏         | 4/179 [00:05<04:13,  1.45s/it][A
  3%|▎         | 5/179 [00:07<04:12,  1.45s/it][A
  3%|▎         | 6/179 [00:08<04:00,  1.39s/it][A
  4%|▍         | 7/179 [00:09<03:59,  1.39s/it][A
  4%|▍         | 8/179 [00:11<03:57,  1.39s/it][A
  5%|▌         | 9/179 [00:12<03:58,  1.41s/it][A
  6%|▌         | 10/179 [00:14<04:04,  1.45s/it][A
  6%|▌         | 11/179 [00:15<04:02,  1.44s/it][A
  7%|▋         | 12/179 [00:17<03:59,  1.43s/it][A
  7%|▋         | 13/179 [00:18<04:00,  1.45s/it][A
  8%|▊         | 14/179 [00:20<04:02,  1.47s/it][A
  8%|▊         | 15/179 [00:21<03:56,  1.44s/it][A
  9%|▉         | 16/179 [00:22<03:47,  1.40s/it][A
  9%|▉         | 17/179 [00:24<03:48,  1.41s/it][A
 10%|█         | 18/179 [00:25<03:47,  1.41s/it][A
 11%|█         | 19/179 [00:2

0.22346368715083798





## Sentence chunk approach

In [48]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.node_parser import SentenceWindowNodeParser

# Create the sentence window node parser: a window size of 3, with the window
# stored under the "window" metadata key and the original text under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Service context for this experiment: same sentence-transformers model name as
# the standard-approach run, but nodes come from the sentence window parser.
# NOTE(review): the HuggingFaceEmbeddings object is passed directly here, whereas
# build_index wraps it in LangchainEmbedding -- confirm both paths are equivalent.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    ),
    node_parser=node_parser,
)

In [49]:
# Build the index over the same docs, this time using the sentence-window service context
index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/59775 [00:00<?, ?it/s]

In [50]:
# Score retrieval against the labels with top_k=5 (evaluate_index accepts `verbose` but does not use it)
results = evaluate_index(val_dataset, index, top_k=5, verbose=True)

100%|██████████| 179/179 [26:54<00:00,  9.02s/it]


In [53]:
import pandas as pd
# One row per question; the mean of the boolean 'is_hit' column is the hit rate.
# Note: `df` and `hit_rate` reuse the names assigned in the standard-approach cell.
df = pd.DataFrame(results)
hit_rate = df['is_hit'].mean()
hit_rate

0.27932960893854747