# Implementing RAG

## Loading RAG Data into ChromaDB Vector Store

In [1]:
import os
import json

def load_json_files(directory):
    """
    Load all JSON files from the specified directory.

    Args:
        directory (str): The path to the directory containing the JSON files.

    Returns:
        list: A list of dictionaries containing the data from all JSON files.
    """
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                data.append(json.load(f))
    return data

def extract_text(data):
    """
    Extract text data from the JSON structure, supporting two formats.

    Args:
        data (list): A list of dictionaries containing JSON data.

    Returns:
        list: A list of dictionaries with URLs or file paths and their corresponding text chunks.
    """
    documents = []
    for entry in data:
        for key, value in entry.items():
            if isinstance(value, list):  # First format with URL keys
                for chunk in value:
                    documents.append({"source": key, "text": chunk})
            elif isinstance(value, dict):  # Second format with file paths as keys
                if "text" in value:
                    documents.append({"source": key, "text": value["text"]})
    return documents

# Directory containing the JSON files
directory = "RAG_data"

# Load and preprocess data
json_data = load_json_files(directory)
documents = extract_text(json_data)

# Example output
print(f"Loaded {len(documents)} documents.")
print("Sample document:", documents[0])

Loaded 30011 documents.
Sample document: {'source': 'https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model', 'text': 'Skip to content Navigation Menu Toggle navigation Sign in Product GitHub Copilot Write better code with AI Security Find and fix vulnerabilities Actions Automate any workflow Codespaces Instant dev environments Issues Plan and track work Code Review Manage code changes Discussions Collaborate outside of code Code Search Find more, search less Explore All features Documentation GitHub Skills Blog Solutions By company size Enterprises Small and medium teams Startups By use case DevSecOps DevOps CI/CD View all use cases By industry Healthcare Financial services Manufacturing Government View all industries View all solutions Resources Topics AI DevOps Security Software Development View all Explore Learning Pathways White papers, Ebooks, Webinars Customer Stories Partners Open Source GitHub Sponsors Fund open sou

## Generating Embeddings

We will be utilizing the text-embedding-3-large embedding model through the OpenAI API

In [2]:
import json
import os
import openai
import timeit
from scipy.spatial.distance import cosine
import time
import pandas as pd
import numpy as np
from tiktoken import encoding_for_model

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
openai.api_key = os.getenv('OPENAI_API_KEY')

In [5]:
client = openai.OpenAI()

In [6]:
def get_embedding(text, model="text-embedding-3-large", max_tokens=8192):
    tokenizer = encoding_for_model(model)  
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        print(f"Truncating text to {max_tokens} tokens: {text[:100]}...")  # Log truncation
        tokens = tokens[:max_tokens]
        text = tokenizer.decode(tokens)
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Failed to encode text: {e}")
        return None

In [7]:
def get_embeddings(texts, model="text-embedding-3-large", max_tokens=8192, batch_size=100):
    embeddings = []
    tokenizer = encoding_for_model(model)  # Get the tokenizer for the model
   
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        # Truncate texts in the batch that exceed the token limit
        truncated_batch = []
        for text in batch:
            tokens = tokenizer.encode(text)
            if len(tokens) > max_tokens:
                print(f"Truncating text to {max_tokens} tokens: {text[:100]}...")  # Log truncation
                tokens = tokens[:max_tokens]
                text = tokenizer.decode(tokens)
            truncated_batch.append(text)

        try:
            # Generate embeddings for the batch
            response = client.embeddings.create(input=truncated_batch, model=model)
            batch_embeddings = [item.embedding for item in response.data]
            if i % 5000 == 0:
                print(f"Generated embeddings for batch {i}-{i + batch_size}")
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"Unexpected error for batch {i}-{i + batch_size}: {e}")
            embeddings.extend([None] * len(batch))  # Append None for unexpected errors
        
    return embeddings

In [10]:
texts = [doc["text"] for doc in documents]
embeddings = get_embeddings(texts, batch_size=1000)

Generated embeddings for batch 0-1000
Truncating text to 8192 tokens: An In-Depth Guide to the VR Universe Article Top 10 Career Opportunities in Artificial Intelligence ...
Generated embeddings for batch 5000-6000
Generated embeddings for batch 10000-11000
Truncating text to 8192 tokens: R Python library(h2o) # Start the H2O cluster (locally) h2o.init() # Import a sample binary outcome ...
Generated embeddings for batch 15000-16000
Truncating text to 8192 tokens: All rights reserved Products ignio AIOps Redefining IT Operations with AI and Automation ignio Obser...
Generated embeddings for batch 20000-21000
Generated embeddings for batch 25000-26000
Truncating text to 8192 tokens: Topics Alphabetic H2O Wiki Algorithms Activation Function Confusion Matrix Convolutional Neural Netw...
Truncating text to 8192 tokens: By acrastt • Oct 18, 2023 Building Your First Kubeflow Pipeline: A Comprehensive Guide By turhancan9...
Truncating text to 8192 tokens: Media center Investors InvestorsCaree

## Loading Data into ChromaDB Vector Store

We first need to initialize the ChromaDB vector store. The data will persist in the "chroma" directory

In [8]:
import chromadb
from chromadb.config import Settings

In [9]:
persistent_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
collection = persistent_client.get_or_create_collection("llm_tutor_collection")

Now we can load the documents and generated embeddings into the llm_tutor_collection in our ChromaDB instance

In [11]:
batch_size = 500
batch_ids = []
batch_texts = []
batch_metadata = []
batch_embeddings = []

id = 0
for doc, embedding in zip(documents, embeddings):
    # Add data to the current batch
    batch_ids.append(str(id))
    batch_texts.append(doc["text"])
    batch_metadata.append({"source": doc["source"]})
    batch_embeddings.append(embedding)
    id += 1

    # Check if the batch is ready for uploading
    if len(batch_ids) == batch_size:
        # Upload the batch to ChromaDB
        collection.add(
            ids=batch_ids,
            documents=batch_texts,
            metadatas=batch_metadata,
            embeddings=batch_embeddings
        )
        # Clear the batch lists
        batch_ids = []
        batch_texts = []
        batch_metadata = []
        batch_embeddings = []

# Upload any remaining data in the last batch
if batch_ids:
    collection.add(
        ids=batch_ids,
        documents=batch_texts,
        metadatas=batch_metadata,
        embeddings=batch_embeddings
    )

In [25]:
collection.count()

30011

## Loading in Generative LLM Candidates

### OpenAI GPT-4o-mini

In [21]:
gpt_4o_base = "gpt-4o-mini"
gpt_4o_finetuned = "ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT"

## Setting Up RAG methods

In [22]:
def retrieve_relevant_documents(query, n_results=5):
    """
    Retrieve the most relevant documents from the ChromaDB vector store.
    
    Args:
        query (str): The user's question or query.
        collection (Collection): The ChromaDB collection object.
        n_results (int): Number of results to retrieve.
    
    Returns:
        str: Concatenated text of the top retrieved documents.
    """
    # Generate embedding for the query using Gemini model
    query_embedding = get_embedding(query)

    # Retrieve top documents
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )

    # Combine text from the retrieved documents
    retrieved_text = " ".join(doc[0] for doc in results["documents"])
    return retrieved_text

In [23]:
def run_rag(model, query):
    # Retrieve relevant documents
    retrieved_text = retrieve_relevant_documents(query)

    # Generate response using OpenAI Model
    all_messages = [
        {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
        {"role": "user", "content": f"Generate an answer to the following question using the given context:\n\n {query}\n\n {"="*50}\n\nCONTEXT: {retrieved_text}\n\n"}
    ]
    response = client.chat.completions.create(
        model=model,
        messages=all_messages,
        max_tokens=1500,
    )
    model_response_text = response.choices[0].message.content
    return model_response_text

### Querying RAG Pipeline

In [24]:
question = "I want to build a deep learning model for image classification. What are some best practices for training deep learning models?"
response = run_rag(gpt_4o_base ,question)
print(response)

When training a deep learning model for image classification, there are several best practices you should consider to optimize performance and efficiency:

1. **Image Size**: Use a consistent image size for your dataset that is appropriate for the model architecture you choose. Common sizes for models trained on ImageNet are 224x224 or 299x299 pixels. Starting with a smaller size (e.g., 64 pixels) can help in quickly testing your pipeline before scaling up.

2. **Validation Set Size**: The size of your validation set should be proportionate to your overall dataset. While a common practice is to allocate around 20% for validation, it’s essential to ensure that the validation set is large enough to provide reliable metrics. If results are fluctuating significantly across training runs, it may indicate that the validation set is too small.

3. **Data Preprocessing**: Make sure to preprocess your images appropriately. Consider aspects like normalization, resizing, and potentially data augm

## Evaluating RAG with Embedding Similarity

In [26]:
with open('./Test Data/test_data.json', "r") as json_file:
    test_data = json.load(json_file)

In [27]:
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

In [28]:
def eval_semantic_sim(model):
    # Evaluate semantic similarity
    similarities = []
    start = timeit.default_timer()
    # last_pause = timeit.default_timer()
    for i,example in enumerate(test_data):
        try:
            input_text = example['input']
            expected_output = example['output']
            # Generate model response with RAG
            model_response_text = run_rag(model, input_text)
            # Generate embeddings for expected and actual responses
            expected_embedding = get_embedding(expected_output)
            model_response_embedding = get_embedding(model_response_text)

            # Calculate similarity
            similarity = cosine_similarity(expected_embedding, model_response_embedding)
            similarities.append({'question':input_text ,'expected_output':expected_output,'model_output':model_response_text,'similarities':similarity})
        except Exception as e:
            print(i,e)
            # break
        if (i%5==0)&(i!=0):
            end = timeit.default_timer()
            print(f"{i} - Time Spent: {end-start}, Number of Errors: {i + 1 - len(similarities)}")
            start = timeit.default_timer()
            # print(f"Sleeping for {60-(timeit.default_timer()-last_pause)} seconds")
            # if (60-(timeit.default_timer()-last_pause))>0:
            #     time.sleep(60-(timeit.default_timer()-last_pause))
            # last_pause = timeit.default_timer()

    return similarities

In [29]:
gpt_4o_base_similarities = pd.DataFrame(eval_semantic_sim(gpt_4o_base))

5 - Time Spent: 26.1099309999845, Number of Errors: 0
10 - Time Spent: 21.710568415990565, Number of Errors: 0
15 - Time Spent: 27.2744399170042, Number of Errors: 0
20 - Time Spent: 13.717908041959163, Number of Errors: 0
25 - Time Spent: 21.97741016600048, Number of Errors: 0
30 - Time Spent: 20.57787149999058, Number of Errors: 0
35 - Time Spent: 21.043680500006303, Number of Errors: 0
40 - Time Spent: 12.65108495898312, Number of Errors: 0
45 - Time Spent: 21.59139362501446, Number of Errors: 0
50 - Time Spent: 17.194179290963802, Number of Errors: 0
55 - Time Spent: 16.18291400000453, Number of Errors: 0
60 - Time Spent: 18.01677191699855, Number of Errors: 0
65 - Time Spent: 18.471013915957883, Number of Errors: 0
70 - Time Spent: 17.91276012500748, Number of Errors: 0
75 - Time Spent: 18.473722791997716, Number of Errors: 0
80 - Time Spent: 15.040522042021621, Number of Errors: 0
85 - Time Spent: 22.812194750003982, Number of Errors: 0
90 - Time Spent: 16.849638499959838, Number

In [30]:
print(f"Mean Similarity for GPT-4o Base: {gpt_4o_base_similarities['similarities'].mean()}")
print(f"Standard Deviation Similarity for GPT-4o Base: {gpt_4o_base_similarities['similarities'].std()}")

Mean Similarity for GPT-4o Base: 0.7553352984661562
Standard Deviation Similarity for GPT-4o Base: 0.1290664422819455


In [31]:
gpt_4o_base_similarities.to_csv('./Eval Results/gpt_4o_base_similarities.csv', index=False)

In [32]:
gpt_4o_finetuned_similarities = pd.DataFrame(eval_semantic_sim(gpt_4o_finetuned))

5 - Time Spent: 12.490113665990066, Number of Errors: 0
10 - Time Spent: 10.02567000000272, Number of Errors: 0
15 - Time Spent: 8.636986374971457, Number of Errors: 0
20 - Time Spent: 9.38813083304558, Number of Errors: 0
25 - Time Spent: 9.052979041996878, Number of Errors: 0
30 - Time Spent: 9.079808665963355, Number of Errors: 0
35 - Time Spent: 9.375906957953703, Number of Errors: 0
40 - Time Spent: 12.305499832960777, Number of Errors: 0
45 - Time Spent: 9.344452834047843, Number of Errors: 0
50 - Time Spent: 10.134344165970106, Number of Errors: 0
55 - Time Spent: 8.606583042012062, Number of Errors: 0
60 - Time Spent: 11.603924499999266, Number of Errors: 0
65 - Time Spent: 10.00196408398915, Number of Errors: 0
70 - Time Spent: 9.119711124978494, Number of Errors: 0
75 - Time Spent: 10.540409542038105, Number of Errors: 0
80 - Time Spent: 8.988165374961682, Number of Errors: 0
85 - Time Spent: 9.236099208006635, Number of Errors: 0
90 - Time Spent: 9.532862166000996, Number of

In [33]:
print(f"Mean Similarity for GPT-4o Finetuned: {gpt_4o_finetuned_similarities['similarities'].mean()}")
print(f"Standard Deviation Similarity for GPT-4o Finetuned: {gpt_4o_finetuned_similarities['similarities'].std()}")

Mean Similarity for GPT-4o Finetuned: 0.7709224707018043
Standard Deviation Similarity for GPT-4o Finetuned: 0.1529790971241027


In [34]:
gpt_4o_base_similarities.to_csv('./Eval Results/gpt_4o_finetuned_similarities.csv', index=False)

## Evaluating Chain-of-Thought Prompting with RAG and Fine-Tuned Model

In [35]:
def run_rag_CoT(model, query):
    # Retrieve relevant documents
    retrieved_text = retrieve_relevant_documents(query)

    # Generate response using OpenAI Model
    all_messages = [
        {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
        {"role": "user", "content": f"Please answer the following question step-by-step using the given context:\n\n {query}\n\n {"="*50}\n\nCONTEXT: {retrieved_text}\n\n"},
        {"role": "user", "content": f"Remember to break down the question step by step before giving me the final answer"}
    ]
    response = client.chat.completions.create(
        model=model,
        messages=all_messages,
        max_tokens=1500,
    )
    model_response_text = response.choices[0].message.content
    return model_response_text

In [36]:
def eval_semantic_sim_w_CoT(model):
    # Evaluate semantic similarity
    similarities = []
    start = timeit.default_timer()
    # last_pause = timeit.default_timer()
    for i,example in enumerate(test_data):
        try:
            input_text = example['input']
            expected_output = example['output']
            # Generate model response with RAG
            model_response_text = run_rag_CoT(model, input_text)
            # Generate embeddings for expected and actual responses
            expected_embedding = get_embedding(expected_output)
            model_response_embedding = get_embedding(model_response_text)

            # Calculate similarity
            similarity = cosine_similarity(expected_embedding, model_response_embedding)
            similarities.append({'question':input_text ,'expected_output':expected_output,'model_output':model_response_text,'similarities':similarity})
        except Exception as e:
            print(i,e)
            # break
        if (i%5==0)&(i!=0):
            end = timeit.default_timer()
            print(f"{i} - Time Spent: {end-start}, Number of Errors: {i + 1 - len(similarities)}")
            start = timeit.default_timer()
            # print(f"Sleeping for {60-(timeit.default_timer()-last_pause)} seconds")
            # if (60-(timeit.default_timer()-last_pause))>0:
            #     time.sleep(60-(timeit.default_timer()-last_pause))
            # last_pause = timeit.default_timer()

    return similarities

In [38]:
gpt_4o_finetuned_similarities_CoT = pd.DataFrame(eval_semantic_sim_w_CoT(gpt_4o_finetuned))

5 - Time Spent: 14.387737584009301, Number of Errors: 0
10 - Time Spent: 13.807271208032034, Number of Errors: 0
15 - Time Spent: 10.472942082968075, Number of Errors: 0
20 - Time Spent: 10.761664875026327, Number of Errors: 0
25 - Time Spent: 9.523952791991178, Number of Errors: 0
30 - Time Spent: 11.038625208020676, Number of Errors: 0
35 - Time Spent: 15.334635375009384, Number of Errors: 0
40 - Time Spent: 9.237739333009813, Number of Errors: 0
45 - Time Spent: 12.800850333005656, Number of Errors: 0
50 - Time Spent: 13.619311250047758, Number of Errors: 0
55 - Time Spent: 11.571749124967027, Number of Errors: 0
60 - Time Spent: 9.933544334024191, Number of Errors: 0
65 - Time Spent: 11.162098000000697, Number of Errors: 0
70 - Time Spent: 9.509636915987357, Number of Errors: 0
75 - Time Spent: 9.943166040990036, Number of Errors: 0
80 - Time Spent: 10.245050583034754, Number of Errors: 0
85 - Time Spent: 9.213901833980344, Number of Errors: 0
90 - Time Spent: 9.930051999981515, Nu

In [39]:
print(f"Mean Similarity for GPT-4o Finetuned with CoT: {gpt_4o_finetuned_similarities_CoT['similarities'].mean()}")
print(f"Standard Deviation Similarity for GPT-4o Finetuned with CoT: {gpt_4o_finetuned_similarities_CoT['similarities'].std()}")

Mean Similarity for GPT-4o Finetuned with CoT: 0.16310978334545692
Standard Deviation Similarity for GPT-4o Finetuned with CoT: 0.21078976289427062


In [40]:
gpt_4o_finetuned_similarities_CoT.to_csv('./Eval Results/gpt_4o_finetuned_similarities_CoT.csv', index=False)

## Evaluating Self-Consistency with RAG and Fine-Tuned Model

In [43]:
def eval_semantic_sim_w_SelfConsistency(model):
    similarities = []
    start = timeit.default_timer()
    num_samples = 5  # Number of responses to generate per example

    for i, example in enumerate(test_data):
        try:
            input_text = example['input']
            expected_output = example['output']

            # Collect multiple responses
            self_consistency_responses = []
            consistency_prompt = f"The question is: {input_text}\n\nThe expected answer is: {expected_output}\n\nHere are the generated responses:"
            for j in range(num_samples):
                response = run_rag(model, input_text)
                self_consistency_responses.append(response)
                consistency_prompt+= f"\n {j}. {response}"
            consistency_prompt+= "\nWhich response best matches the expected answer? Please answer text of the best response"
            evaluation_prompt = [
                {"role": "system", "content": "You are an evaluator for answers to a question about a class called Introduction to Deep Learning and LLM-based Generative AI Systems. Your task is to pick the best response to a question."},
                {"role": "user", "content": consistency_prompt}
            ]
            response = openai.ChatCompletion.create(
                model="ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT",
                max_tokens=50,
                messages=evaluation_prompt
            )
            aggregated_response = response['choices'][0]['message']['content']

            # Generate embeddings for expected and aggregated responses
            expected_embedding = get_embedding(expected_output)
            aggregated_embedding = get_embedding(aggregated_response)

            # Calculate similarity
            similarity = cosine_similarity(expected_embedding, aggregated_embedding)
            
            # Store results
            similarities.append({
                'question': input_text,
                'expected_output': expected_output,
                'aggregated_output': aggregated_response,
                'individual_responses': self_consistency_responses,
                'similarity': similarity
            })

        except Exception as e:
            print(f"Error at example {i}: {e}")

        if (i % 5 == 0) and (i != 0):
            end = timeit.default_timer()
            print(f"{i} - Time Spent: {end - start}, Number of Errors: {i + 1 - len(similarities)}")
            start = timeit.default_timer()
    
    return similarities


In [None]:
gpt_4o_finetuned_similarities_SelfConsistency = pd.DataFrame(eval_semantic_sim_w_SelfConsistency(gpt_4o_finetuned))

In [None]:
print(f"Mean Similarity for GPT-4o Finetuned with Self-Consistency: {gpt_4o_finetuned_similarities_SelfConsistency['similarity'].mean()}")
print(f"Standard Deviation Similarity for GPT-4o Finetuned with Self-Consistency: {gpt_4o_finetuned_similarities_SelfConsistency['similarity'].std()}")

In [None]:
gpt_4o_finetuned_similarities_SelfConsistency.to_csv('./Eval Results/gpt_4o_finetuned_similarities_SelfConsistency.csv', index=False)