# Label-Free Evaluation with Synthetic Data

## Setup

In [1]:
import os
from pathlib import Path
from pprint import pprint

import openai
import pandas as pd
import ray
from tqdm import tqdm

In [2]:
# Make the parent directory importable from this notebook.
import sys
sys.path.append("..")

# Suppress all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Load environment variables via python-dotenv; the call's result is the cell output.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Shared-storage location and the repository root (parent of the working directory).
EFS_DIR = Path("/efs/shared_storage/simon")
ROOT_DIR = Path.cwd().parent
print(ROOT_DIR)

/home/ray/default/llm-applications


In [None]:
# Credentials
# Forward the API endpoints/keys and the DB connection string from this
# process's environment into the Ray runtime environment. A missing variable
# raises KeyError here rather than later on a worker.
ray.init(runtime_env={
    "env_vars": {
        var_name: os.environ[var_name]
        for var_name in (
            "OPENAI_API_BASE",
            "OPENAI_API_KEY",
            "ANYSCALE_API_BASE",
            "ANYSCALE_API_KEY",
            "DB_CONNECTION_STRING",
        )
    }
})

### Utils 

In [157]:
import json

def write_json(data, filename):
    """Serialize `data` as JSON (4-space indent) and write it to `filename`."""
    with open(filename, 'w') as out_file:
        out_file.write(json.dumps(data, indent=4))

def read_json(filename):
    """Parse the JSON file at `filename` and return the decoded object."""
    with open(filename, 'r') as in_file:
        return json.load(in_file)

## Load data

In [162]:
# Full evaluation corpus, read from the repository's datasets directory.
sections = read_json(ROOT_DIR / "datasets/eval_full_corpus.json")

In [251]:
from llama_index import Document

def to_doc(entry_dict):
    """Build a `Document` from a corpus entry, carrying its 'source' as metadata."""
    text = entry_dict['text']
    source = entry_dict['source']
    return Document(text=text, metadata={'source': source})

In [252]:
# One Document per corpus section, in corpus order.
docs = list(map(to_doc, sections))

### Subsample data

In [15]:
import random 

In [82]:
# Fraction of the corpus used for the validation subset.
SAMPLING_RATIO = 0.01
n_samples = int(len(sections) * SAMPLING_RATIO)  # truncated toward zero
sampling_percentage = 100 * SAMPLING_RATIO       # for display only

In [83]:
# Draw the validation subset with a dedicated, seeded generator so that the
# sample (and every synthetic question / hit rate derived from it) is the same
# after a kernel restart. The global `random` state is left untouched.
SAMPLING_SEED = 42
val_corpus = random.Random(SAMPLING_SEED).sample(sections, n_samples)

In [84]:
# Report how many sections were drawn from the full corpus.
sampling_summary = (
    f'Sampled {sampling_percentage}% of full corpus '
    f'with {len(sections)} sections, got {len(val_corpus)} sections'
)
print(sampling_summary)

Sampled 1.0% of full corpus with 8944 sections, got 89 sections


## Generate synthetic evaluation data

In [138]:
import re
import uuid

from llama_index.schema import Document, TextNode
from llama_index.llms import OpenAI
from llama_index import PromptHelper
from llama_index.prompts import PromptTemplate

In [88]:
# Prompt used to ask the LLM for quiz-style questions about one chunk of text.
# Placeholders filled at format time:
#   {context_str}              - the (possibly truncated) section text
#   {num_questions_per_chunk}  - how many questions to request
# NOTE(review): the template ends with `provided."` — the trailing double quote
# has no opening partner and looks like a stray character; confirm before
# changing, since editing the prompt alters the generated dataset.
DEFAULT_QA_GENERATE_PROMPT_TMPL = PromptTemplate("""\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""
)

In [148]:
# generate queries as a convenience function
def generate_qa_embedding_pairs(
    docs,
    llm=None,
    qa_generate_prompt_tmpl=DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk=2,
) -> dict:
    """Generate synthetic questions for each corpus section with an LLM.

    Args:
        docs: Iterable of dicts, each with a 'text' and a 'source' key. It is
            traversed exactly once, so one-shot iterables are supported.
        llm: Object providing `complete(prompt)` and `metadata`; defaults to
            `OpenAI(model="gpt-3.5-turbo")`.
        qa_generate_prompt_tmpl: Prompt template formatted with `context_str`
            and `num_questions_per_chunk`.
        num_questions_per_chunk: Number of questions requested per section.

    Returns:
        A dict with three entries:
          - 'queries': question id (uuid4 string) -> question text
          - 'corpus': document id (uuid4 string) -> the original section dict
          - 'relevant_docs': question id -> one-element list holding the
            'source' of the section the question was generated from

        Note that 'relevant_docs' refers to sections by their 'source', not by
        the random ids used as keys of 'corpus'.
    """
    llm = llm or OpenAI(model="gpt-3.5-turbo")
    prompt_helper = PromptHelper.from_llm_metadata(llm.metadata)

    queries = {}
    relevant_docs = {}
    corpus = {}
    for doc in tqdm(docs):
        text = doc['text']
        source = doc['source']
        # Sections without any visible text yield no useful questions.
        if not text.strip():
            continue

        # truncate text to fit in LLM context window
        text = prompt_helper.truncate(qa_generate_prompt_tmpl, [text])[0]

        # generate hypothetical questions
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        # One question per non-empty response line. Each line is stripped
        # *before* the numbering prefix ("1.", "2)", "3 ") is removed, so that
        # indented list items lose their numbering as well.
        questions = []
        for line in str(response).strip().split("\n"):
            question = re.sub(r"^\d+[\).\s]", "", line.strip()).strip()
            if question:
                questions.append(question)

        doc_id = str(uuid.uuid4())
        corpus[doc_id] = doc
        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [source]

    return {
        'queries': queries,
        'corpus': corpus,
        'relevant_docs': relevant_docs,
    }

In [149]:
# Generate synthetic questions for the sampled sections (one LLM call per section).
val_dataset = generate_qa_embedding_pairs(docs=val_corpus)

100%|██████████| 89/89 [01:24<00:00,  1.05it/s]


In [163]:
# Persist the synthetic dataset to the repository's datasets directory.
write_json(val_dataset, ROOT_DIR / "datasets/eval_sample_p1_synthetic.json")

In [164]:
# Reload the synthetic dataset from disk.
val_dataset = read_json(ROOT_DIR / "datasets/eval_sample_p1_synthetic.json")

## Build Index

In [312]:
from llama_index import VectorStoreIndex, Document, ServiceContext
from llama_index.embeddings import OpenAIEmbedding, LangchainEmbedding
from langchain.embeddings import HuggingFaceEmbeddings

In [306]:
def build_index(
    docs,
    chunk_size,
    embed_model='text-embedding-ada-002',
):
    """Build a `VectorStoreIndex` over `docs`.

    Args:
        docs: Documents to index.
        chunk_size: Chunk size handed to `ServiceContext.from_defaults`.
        embed_model: 'text-embedding-ada-002' selects `OpenAIEmbedding`; any
            other value is used as the `model_name` of a `HuggingFaceEmbeddings`
            model wrapped in `LangchainEmbedding`.

    Returns:
        The index created by `VectorStoreIndex.from_documents`.
    """
    uses_openai = embed_model == 'text-embedding-ada-002'
    if uses_openai:
        embedder = OpenAIEmbedding(embed_batch_size=100)
    else:
        hf_embeddings = HuggingFaceEmbeddings(model_name=embed_model)
        embedder = LangchainEmbedding(hf_embeddings, embed_batch_size=100)

    context = ServiceContext.from_defaults(
        chunk_size=chunk_size,
        embed_model=embedder,
    )
    return VectorStoreIndex.from_documents(
        docs,
        service_context=context,
        show_progress=True,
    )

In [237]:
# Baseline index: full corpus, chunk size 1024, default embedding model.
index = build_index(docs=docs, chunk_size=1024)

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

In [238]:
# Query engine on top of the baseline index, created with similarity_top_k=5.
query_engine = index.as_query_engine(similarity_top_k=5)

In [239]:
# Smoke-test the query engine with a sample question; the response is stored
# in `response` but not displayed by this cell.
response = query_engine.query('What is the default batch size for map_batches?')

## Evaluate

In [266]:
def evaluate_index(
    dataset,
    index,
    top_k=5,
    verbose=False,
):
    """Measure whether retrieval over `index` finds each query's source.

    Args:
        dataset: Dict with 'queries' (query id -> query text) and
            'relevant_docs' (query id -> list of sources).
        index: Object whose `as_retriever(similarity_top_k=...)` returns a
            retriever with a `retrieve(query)` method.
        top_k: Number of nodes to retrieve per query.
        verbose: When true, show a tqdm progress bar over the queries.

    Returns:
        A list with one dict per query, in dataset order:
          - 'is_hit': whether the expected source is among the retrieved ones
          - 'retrieved': sources of the retrieved nodes
          - 'expected': the first source listed for the query
          - 'query': the query *id* (not its text)
    """
    queries = dataset['queries']
    relevant_docs = dataset['relevant_docs']

    retriever = index.as_retriever(similarity_top_k=top_k)

    query_items = list(queries.items())
    if verbose:
        query_items = tqdm(query_items)

    eval_results = []
    for query_id, query in query_items:
        retrieved_nodes = retriever.retrieve(query)
        retrieved_sources = [node.node.metadata['source'] for node in retrieved_nodes]
        # Only the first listed source is checked: assume 1 relevant doc.
        expected_source = relevant_docs[query_id][0]
        eval_results.append({
            'is_hit': expected_source in retrieved_sources,
            'retrieved': retrieved_sources,
            'expected': expected_source,
            'query': query_id,
        })
    return eval_results

In [272]:
def evaluate(
    docs, 
    eval_dataset,
    chunk_size=1024,
    embed_model="text-embedding-ada-002",
    top_k=5,
    verbose=True,
):
    """Build an index over `docs` with the given settings and score it.

    `chunk_size` and `embed_model` go to `build_index`; `top_k` and `verbose`
    go to `evaluate_index`, whose per-query results are returned unchanged.
    """
    built_index = build_index(docs, chunk_size=chunk_size, embed_model=embed_model)
    return evaluate_index(eval_dataset, built_index, top_k=top_k, verbose=verbose)

### Chunk size experiment

In [246]:
# One experiment per chunk size; each dict is splatted into `evaluate(...)`.
experiments = [
    {'chunk_size': size}
    for size in (128, 256, 512, 1024)
]

In [247]:
# Chunk-size sweep: rebuild the index for every configuration in `experiments`
# and keep both the per-query results and the aggregate hit rate.
# `pd` is provided by the notebook's import cell (`import pandas as pd`).
result_dfs = []
hit_rates = []
for experiment in experiments:
    print(f'Running experiment with {experiment}')
    val_result = evaluate(docs, val_dataset, **experiment)
    df = pd.DataFrame(val_result)
    result_dfs.append(df)
    # Stored as a plain float so `hit_rates` displays as bare numbers
    # regardless of how the installed numpy prints its scalar types.
    hit_rate = float(df['is_hit'].mean())
    hit_rates.append(hit_rate)
    print(hit_rate)

Running experiment with {'chunk_size': 128}


Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/70753 [00:00<?, ?it/s]

100%|██████████| 116/116 [08:23<00:00,  4.34s/it]


0.8103448275862069
Running experiment with {'chunk_size': 1024}


Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [05:43<00:00,  2.96s/it]

0.8103448275862069





In [250]:
hit_rates

[0.8189655172413793,
 0.8362068965517241,
 0.8103448275862069,
 0.8103448275862069]

### Embed model experiment

In [317]:
# One experiment per embedding model; each dict is splatted into `evaluate(...)`.
experiments = [
    {'embed_model': model_name}
    for model_name in (
        "BAAI/bge-large-en",
        "text-embedding-ada-002",
        "thenlper/gte-base",
        "sentence-transformers/all-mpnet-base-v2",
    )
]

In [308]:
# Embedding-model sweep: rebuild the index for every configuration in
# `experiments` and keep both the per-query results and the aggregate hit rate.
# `pd` is provided by the notebook's import cell (`import pandas as pd`).
result_dfs = []
hit_rates = []
for experiment in experiments:
    print(f'Running experiment with {experiment}')
    val_result = evaluate(docs, val_dataset, **experiment)
    df = pd.DataFrame(val_result)
    result_dfs.append(df)
    # Stored as a plain float so `hit_rates` displays as bare numbers
    # regardless of how the installed numpy prints its scalar types.
    hit_rate = float(df['is_hit'].mean())
    hit_rates.append(hit_rate)
    print(hit_rate)

Running experiment with {'embed_model': 'BAAI/bge-large-en'}


Downloading (…)b720e/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)364d9b720e/README.md:   0%|          | 0.00/78.9k [00:00<?, ?B/s]

Downloading (…)4d9b720e/config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)b720e/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)364d9b720e/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d9b720e/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [03:42<00:00,  1.92s/it]

0.8448275862068966
Running experiment with {'embed_model': 'text-embedding-ada-002'}





Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [05:42<00:00,  2.95s/it]


0.8275862068965517
Running experiment with {'embed_model': 'thenlper/gte-base'}


Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [02:55<00:00,  1.51s/it]

0.9224137931034483





### Top K Experiment

In [309]:
# One experiment per retrieval depth; each dict is splatted into `evaluate(...)`.
experiments = [
    {'top_k': k}
    for k in (2, 3, 4)
]

In [310]:
# Top-k sweep: rebuild the index for every configuration in `experiments`
# and keep both the per-query results and the aggregate hit rate.
# `pd` is provided by the notebook's import cell (`import pandas as pd`).
result_dfs = []
hit_rates = []
for experiment in experiments:
    print(f'Running experiment with {experiment}')
    val_result = evaluate(docs, val_dataset, **experiment)
    df = pd.DataFrame(val_result)
    result_dfs.append(df)
    # Stored as a plain float so `hit_rates` displays as bare numbers
    # regardless of how the installed numpy prints its scalar types.
    hit_rate = float(df['is_hit'].mean())
    hit_rates.append(hit_rate)
    print(hit_rate)

Running experiment with {'top_k': 2}


Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [05:39<00:00,  2.92s/it]

0.75
Running experiment with {'top_k': 3}





Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [05:42<00:00,  2.95s/it]

0.7758620689655172
Running experiment with {'top_k': 4}





Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9380 [00:00<?, ?it/s]

100%|██████████| 116/116 [05:49<00:00,  3.01s/it]

0.8017241379310345





### Sentence window approach

In [315]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.node_parser import SentenceWindowNodeParser

# LLM attached to the service context.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Sentence-window node parser: window_size=3, with the window and the original
# text stored under the "window" and "original_text" metadata keys.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Service context combining the LLM, an all-mpnet-base-v2 embedding model and
# the sentence-window parser.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    ),
    node_parser=node_parser,
)


In [316]:
# Index the full corpus using the sentence-window service context.
index = VectorStoreIndex.from_documents(
    docs,
    service_context=service_context,
    show_progress=True,
)

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/59775 [00:00<?, ?it/s]

In [None]:
# Score the sentence-window index on the synthetic dataset; the per-query
# results are this cell's output.
evaluate_index(dataset=val_dataset, index=index, top_k=5, verbose=True)

 34%|███▍      | 40/116 [06:17<12:19,  9.72s/it]

[2m[1m[36m(autoscaler +28h47m35s)[0m Adding 1 node(s) of type worker-node-type-0.


 41%|████▏     | 48/116 [07:40<11:23, 10.05s/it]

[2m[1m[36m(autoscaler +28h48m50s)[0m Resized to 32 CPUs, 2 GPUs.


 66%|██████▌   | 76/116 [12:05<06:23,  9.58s/it]