# Improve RAG performance with LlamaIndex

### Setup

In [1]:
import json
import os
from pathlib import Path
from pprint import pprint

import numpy as np
import openai
import ray
from tqdm import tqdm

In [2]:
# Add the parent directory to the module search path.
import sys
sys.path.append("..")

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Load variables from a .env file into the environment (credentials are read from os.environ below).
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Directories
EFS_DIR = Path("/efs/shared_storage/simon")
ROOT_DIR = Path(os.getcwd()).parent  # repository root: the parent of the notebook's working directory
# EXPERIMENTS_DIR is used by the response/evaluation cells below but was never
# defined, so a fresh "Restart & Run All" failed with NameError.
# NOTE(review): assumes experiment artifacts live in <ROOT_DIR>/experiments — confirm.
EXPERIMENTS_DIR = Path(ROOT_DIR, "experiments")
print(ROOT_DIR)

/home/ray/default/llm-applications


In [4]:
# Credentials: copy the API endpoints/keys and the DB connection string from this
# process's environment into Ray's runtime_env. A missing variable raises KeyError.
ray.init(
    runtime_env={
        "env_vars": {
            name: os.environ[name]
            for name in (
                "OPENAI_API_BASE",
                "OPENAI_API_KEY",
                "ANYSCALE_API_BASE",
                "ANYSCALE_API_KEY",
                "DB_CONNECTION_STRING",
            )
        }
    }
)

2023-09-07 19:27:48,713	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.28.181:6379...
2023-09-07 19:27:48,723	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at https://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com
2023-09-07 19:27:48,726	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_f2fb0c89dfb991b705c0704475ed37e2.zip' (0.41MiB) to Ray cluster...
2023-09-07 19:27:48,727	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_f2fb0c89dfb991b705c0704475ed37e2.zip'.


0,1
Python version:,3.9.15
Ray version:,2.6.2
Dashboard:,http://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com


### Load eval questions

In [108]:
# The eval dataset is JSON Lines: one JSON object per line. Iterate the file
# directly and skip blank lines so a trailing newline does not raise
# JSONDecodeError; decode explicitly as UTF-8 rather than the locale default.
with open(Path(ROOT_DIR, "datasets/eval-dataset-v1.jsonl"), "r", encoding="utf-8") as f:
    eval_data = [json.loads(line) for line in f if line.strip()]

In [134]:
# Work on the first 10 questions only; the evaluation cell at the end of the notebook sets num_samples = 10.
eval_data_sample = eval_data[:10]


Let's set up the evaluation in the same way as before so that we can compare against previous experiments.

Let's load the eval data with reference answers first.

In [136]:
import json

def write_json(data, filename):
    """Write `data` to `filename` as JSON with a 4-space indent.

    The payload is serialized before the file is opened, so a failure to
    serialize (e.g. `TypeError` for an unsupported object) leaves an existing
    file untouched instead of truncating it and writing partial JSON.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If `data` contains an object `json` cannot serialize.
    """
    payload = json.dumps(data, indent=4)
    with open(filename, 'w') as f:
        f.write(payload)

def read_json(filename):
    """Load and return the JSON document stored in `filename`.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized Python object (dict, list, ...).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [137]:
# Load the full evaluation corpus (entries are read for 'text' and 'source' below).
corpus_fp = Path(ROOT_DIR, "datasets/eval_full_corpus.json")
sections = read_json(corpus_fp)

In [138]:
from llama_index import Document

def to_doc(entry_dict):
    """Convert one corpus entry into a `Document`.

    The entry's 'text' becomes the document text and its 'source' is carried
    along in the document metadata under the key 'source'.
    """
    body = entry_dict['text']
    source_metadata = {'source': entry_dict['source']}
    return Document(text=body, metadata=source_metadata)

In [139]:
# One Document per corpus section, in the original order.
docs = list(map(to_doc, sections))

### Run inference (sentence window)

In [63]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context, VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Sentence-window node parser: each node keeps its original sentence under
# "original_text" and its surrounding window under the "window" metadata key.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Embed with a local sentence-transformers model and parse with the node parser above.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
service_context = ServiceContext.from_defaults(embed_model=embed_model, node_parser=node_parser)

index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

# The post-processor's target key must be the same key the node parser stores
# the window under ("window").
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
query_engine = index.as_query_engine(
    similarity_top_k=2,
    node_postprocessors=[window_postprocessor],
)

In [140]:
# Query the engine once per sampled question and record the answer together
# with the 'source' metadata of every retrieved node.
results = []
for sample in tqdm(eval_data_sample):
    prompt = sample['question']
    reply = query_engine.query(prompt)
    answer_text = reply.response
    cited_sources = [hit.metadata['source'] for hit in reply.source_nodes]
    results.append({'question': prompt, 'sources': cited_sources, 'answer': answer_text})

100%|██████████| 10/10 [00:45<00:00,  4.53s/it]


In [None]:
# Persist the sentence-window answers (plus a minimal config) so the evaluation
# step can load them from <ROOT_DIR>/<EXPERIMENTS_DIR>/responses/.
experiment_name = 'sentence_window'
responses_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")
responses_fp.parent.mkdir(parents=True, exist_ok=True)

config = {"experiment_name": experiment_name}
responses = {"config": config, "results": results}

# write_json dumps with indent=4, exactly like the inline json.dump it replaces.
write_json(responses, responses_fp)

### Run inference (normal) 

In [None]:
# Baseline: build the index without an explicit service context and query it
# with the engine's default settings. This rebinds `index` and `query_engine`
# from the sentence-window section above.
index = VectorStoreIndex.from_documents(docs, show_progress=True)
query_engine = index.as_query_engine()

In [None]:
# Same inference loop as for the sentence-window engine, now against the
# baseline `query_engine`; `results` is rebuilt from scratch.
results = []
for sample in tqdm(eval_data_sample):
    prompt = sample['question']
    reply = query_engine.query(prompt)
    answer_text = reply.response
    cited_sources = [hit.metadata['source'] for hit in reply.source_nodes]
    results.append(
        {
            'question': prompt,
            'sources': cited_sources,
            'answer': answer_text,
        }
    )

In [None]:
# Persist the baseline answers next to the sentence-window ones.
experiment_name = 'normal'
responses_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")
responses_fp.parent.mkdir(parents=True, exist_ok=True)

config = {"experiment_name": experiment_name}
responses = {"config": config, "results": results}

# write_json dumps with indent=4, exactly like the inline json.dump it replaces.
write_json(responses, responses_fp)

### Evaluate

In [126]:
from app.config import EMBEDDING_DIMENSIONS, MAX_CONTEXT_LENGTHS
# from app.evaluate import evaluate_responses

In [121]:
# LLM used to grade the generated answers, and the file holding the reference answers.
EVALUATOR = "gpt-4"
REFERENCE_LOC = str(Path(ROOT_DIR) / EXPERIMENTS_DIR / "references" / "gpt-4.json")

In [148]:
def get_retrieval_score(references, generated):
    """Return the fraction of references whose source page was retrieved.

    Entry ``i`` of `references` is compared with entry ``i`` of `generated`.
    Only the part of each source before the first ``#`` is compared, so
    different sections of the same page count as a match. A reference with an
    empty source counts as matched.

    Args:
        references: Sequence of dicts, each with a ``"source"`` string.
        generated: Sequence of dicts, each with a ``"sources"`` list of strings;
            must be at least as long as `references`.

    Returns:
        A float in [0, 1]; 0.0 when `references` is empty (instead of NaN and a
        NumPy RuntimeWarning).
    """
    if len(references) == 0:
        return 0.0
    matches = np.zeros(len(references))
    for i, reference in enumerate(references):
        reference_source = reference["source"].split("#")[0]
        if not reference_source:
            matches[i] = 1
            continue
        # sections don't have to perfectly match; stop at the first page-level hit
        if any(reference_source == source.split("#")[0] for source in generated[i]["sources"]):
            matches[i] = 1
    return float(np.mean(matches))

In [132]:
def evaluate_responses(
    experiment_name,
    reference_loc,
    response_loc,
    evaluator,
    temperature,
    max_context_length,
    system_content,
    assistant_content="",
    num_samples=None,
):
    """Grade generated answers against reference answers with an evaluator LLM.

    For every (reference, generated) pair the evaluator is sent the question,
    the generated answer and the reference answer. The first line of its reply
    is parsed as the numeric score and the rest is kept as the reasoning. The
    per-question results, the run configuration, the retrieval score and the
    mean quality score are written to
    ``<ROOT_DIR>/<EXPERIMENTS_DIR>/evaluations/<experiment_name>_<evaluator_name>.json``.

    NOTE(review): relies on `set_credentials`, `generate_response`,
    `clear_output`, `display` and `JSON` already being in the kernel namespace;
    none of them is imported in the visible notebook — confirm where they come from.

    Args:
        experiment_name: Prefix of the output file name; also stored in the config.
        reference_loc: Path of a JSON file containing a list of reference
            entries (read here for ``"question"`` and ``"answer"``).
        response_loc: Path of a JSON file whose ``"results"`` list holds the
            generated entries (``"question"``, ``"answer"``, ``"sources"``).
        evaluator: Evaluator LLM name; the part after the last ``/``,
            lower-cased, is used in the output file name.
        temperature: Forwarded to `generate_response`.
        max_context_length: Budget from which the length of
            ``system_content + assistant_content`` is subtracted to cap the
            user content.
        system_content: System prompt forwarded unchanged to `generate_response`.
        assistant_content: Assistant prompt forwarded unchanged to `generate_response`.
        num_samples: Evaluate only the first `num_samples` entries of each
            file; ``None`` evaluates all of them.

    Raises:
        AssertionError: If a reference and a generated entry disagree on the question.
        ValueError: If the first line of the evaluator's reply is not a number.
    """
    # Set credentials
    set_credentials(llm=evaluator)

    # Load answers
    with open(Path(reference_loc), "r") as f:
        references = [item for item in json.load(f)][:num_samples]
    with open(Path(response_loc), "r") as f:
        generated = [item for item in json.load(f)["results"]][:num_samples]

    # Quality score
    results = []
    # The cap is applied with a string slice below, i.e. it is counted in characters.
    context_length = max_context_length - len(system_content + assistant_content)
    for ref, gen in tqdm(zip(references, generated), total=len(references)):
        assert ref["question"] == gen["question"]
        user_content = str(
            {
                "question": gen["question"],
                "generated_answer": gen["answer"],
                "reference_answer": ref["answer"],
            }
        )[:context_length]

        # Generate response
        response = generate_response(
            llm=evaluator,
            temperature=temperature,
            system_content=system_content,
            assistant_content=assistant_content,
            user_content=user_content,
        )

        # Extract from response: first line is the score, the remainder is the
        # reasoning. partition() also accepts a reply that contains only the
        # score line (reasoning is then empty), where split("\n", 1) failed to unpack.
        score, _, reasoning = response.partition("\n")
        result = {
            "question": gen["question"],
            "generated_answer": gen["answer"],
            "reference_answer": ref["answer"],
            "score": float(score),
            "reasoning": reasoning.lstrip("\n"),
            "sources": gen["sources"],
        }
        results.append(result)
        # Replace the previous cell output with the latest result.
        clear_output(wait=True)
        display(JSON(json.dumps(result, indent=2)))

    # Save to file
    evaluator_name = evaluator.split("/")[-1].lower()
    evaluation_fp = Path(ROOT_DIR, EXPERIMENTS_DIR, "evaluations", f"{experiment_name}_{evaluator_name}.json")
    evaluation_fp.parent.mkdir(parents=True, exist_ok=True)
    config = {
        "experiment_name": experiment_name,
        "reference_loc": reference_loc,
        "response_loc": response_loc,
        "evaluator": evaluator,
        "temperature": temperature,
        "max_context_length": max_context_length,
        "system_content": system_content,
        "assistant_content": assistant_content,
    }
    evaluation = {
        "config": config,
        "retrieval_score": get_retrieval_score(references, generated),
        # Entries with a falsy score (0) or an empty reference answer are left out of the mean.
        "quality_score": np.mean([item["score"] for item in results if (item["score"] and item["reference_answer"])]),
        "results": results,
    }
    with open(evaluation_fp, "w") as fp:
        json.dump(evaluation, fp, indent=4)


In [149]:
# System prompt for the evaluator LLM. The string is passed to evaluate_responses
# as-is: no .format() call is visible in this notebook, so the {generated_answer},
# {query} and {reference_answer} markers reach the model as literal text.
# The reply format requested here (score alone on the first line, reasoning after
# it) is what evaluate_responses parses.
evaluation_system_content = """
    Your job is to rate the quality of our generated answer {generated_answer}
    given a query {query} and a reference answer {reference_answer}.
    Your score has to be between 1 and 5.
    You must return your response in a line with only the score.
    Do not return answers in any other format.
    On a separate line provide your reasoning for the score as well.
    """

# Evaluate the saved 'sentence_window' responses; num_samples matches the
# 10-question sample taken with eval_data[:10].
experiment_name = 'sentence_window'
num_samples = 10

evaluate_responses(
    experiment_name=experiment_name,
    reference_loc=REFERENCE_LOC, 
    response_loc=str(Path(ROOT_DIR, EXPERIMENTS_DIR, "responses", f"{experiment_name}.json")),
    evaluator=EVALUATOR, 
    temperature=0.0, 
    max_context_length=MAX_CONTEXT_LENGTHS[EVALUATOR],
    system_content=evaluation_system_content,
    num_samples=num_samples
)

<IPython.core.display.JSON object>

100%|██████████| 10/10 [01:04<00:00,  6.45s/it]
