# Implementing RAG

## Loading RAG Data into ChromaDB Vector Store

In [1]:
import os
import json

def load_json_files(directory):
    """
    Load all JSON files from the specified directory.

    Args:
        directory (str): The path to the directory containing the JSON files.

    Returns:
        list: A list of dictionaries containing the data from all JSON files.
    """
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                data.append(json.load(f))
    return data

def extract_text(data):
    """
    Extract text data from the JSON structure, supporting two formats.

    Args:
        data (list): A list of dictionaries containing JSON data.

    Returns:
        list: A list of dictionaries with URLs or file paths and their corresponding text chunks.
    """
    documents = []
    for entry in data:
        for key, value in entry.items():
            if isinstance(value, list):  # First format with URL keys
                for chunk in value:
                    documents.append({"source": key, "text": chunk})
            elif isinstance(value, dict):  # Second format with file paths as keys
                if "text" in value:
                    documents.append({"source": key, "text": value["text"]})
    return documents

# Directory containing the JSON files
directory = "RAG_data"

# Load and preprocess data
json_data = load_json_files(directory)
documents = extract_text(json_data)

# Example output
print(f"Loaded {len(documents)} documents.")
print("Sample document:", documents[0])

Loaded 30011 documents.
Sample document: {'source': 'https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model', 'text': 'Skip to content Navigation Menu Toggle navigation Sign in Product GitHub Copilot Write better code with AI Security Find and fix vulnerabilities Actions Automate any workflow Codespaces Instant dev environments Issues Plan and track work Code Review Manage code changes Discussions Collaborate outside of code Code Search Find more, search less Explore All features Documentation GitHub Skills Blog Solutions By company size Enterprises Small and medium teams Startups By use case DevSecOps DevOps CI/CD View all use cases By industry Healthcare Financial services Manufacturing Government View all industries View all solutions Resources Topics AI DevOps Security Software Development View all Explore Learning Pathways White papers, Ebooks, Webinars Customer Stories Partners Open Source GitHub Sponsors Fund open sou

## Generating Embeddings

We will be utilizing the text-embedding-3-large embedding model through the OpenAI API

In [2]:
import json
import os
import openai
import timeit
from scipy.spatial.distance import cosine
import time
import pandas as pd
import numpy as np
from tiktoken import encoding_for_model

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
openai.api_key = os.getenv('OPENAI_API_KEY')

In [5]:
client = openai.OpenAI()

In [6]:
def get_embedding(text, model="text-embedding-3-large", max_tokens=8192):
    tokenizer = encoding_for_model(model)  
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        print(f"Truncating text to {max_tokens} tokens: {text[:100]}...")  # Log truncation
        tokens = tokens[:max_tokens]
        text = tokenizer.decode(tokens)
    try:
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Failed to encode text: {e}")
        return None

In [14]:
def get_embeddings(texts, model="text-embedding-3-large", max_tokens=8192, batch_size=100):
    embeddings = []
    tokenizer = encoding_for_model(model)  # Get the tokenizer for the model
   
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        # Truncate texts in the batch that exceed the token limit
        truncated_batch = []
        for text in batch:
            tokens = tokenizer.encode(text)
            if len(tokens) > max_tokens:
                print(f"Truncating text to {max_tokens} tokens: {text[:100]}...")  # Log truncation
                tokens = tokens[:max_tokens]
                text = tokenizer.decode(tokens)
            truncated_batch.append(text)

        try:
            # Generate embeddings for the batch
            response = client.embeddings.create(input=truncated_batch, model=model)
            batch_embeddings = [item.embedding for item in response.data]
            if i % 5000 == 0:
                print(f"Generated embeddings for batch {i}-{i + batch_size}")
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"Unexpected error for batch {i}-{i + batch_size}: {e}")
            embeddings.extend([None] * len(batch))  # Append None for unexpected errors
        
    return embeddings

In [15]:
texts = [doc["text"] for doc in documents]
embeddings = get_embeddings(texts, batch_size=1000)

Generated embeddings for batch 0-1000
Truncating text to 8192 tokens: An In-Depth Guide to the VR Universe Article Top 10 Career Opportunities in Artificial Intelligence ...
Generated embeddings for batch 5000-6000
Generated embeddings for batch 10000-11000
Truncating text to 8192 tokens: R Python library(h2o) # Start the H2O cluster (locally) h2o.init() # Import a sample binary outcome ...
Generated embeddings for batch 15000-16000
Truncating text to 8192 tokens: All rights reserved Products ignio AIOps Redefining IT Operations with AI and Automation ignio Obser...
Generated embeddings for batch 20000-21000
Generated embeddings for batch 25000-26000
Truncating text to 8192 tokens: Topics Alphabetic H2O Wiki Algorithms Activation Function Confusion Matrix Convolutional Neural Netw...
Truncating text to 8192 tokens: By acrastt • Oct 18, 2023 Building Your First Kubeflow Pipeline: A Comprehensive Guide By turhancan9...
Truncating text to 8192 tokens: Media center Investors InvestorsCaree

## Loading Data into ChromaDB Vector Store

We first need to initialize the ChromaDB vector store. The data will persist in the "chroma" directory

In [16]:
import chromadb
from chromadb.config import Settings

In [17]:
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("llm_tutor_collection")

Now we can load the documents and generated embeddings into the llm_tutor_collection in our ChromaDB instance

In [18]:
for doc, embedding in zip(documents, embeddings):
        collection.add(
            ids=[doc["source"]],
            documents=[doc["text"]],
            metadatas=[{"source": doc["source"]}],
            embeddings=[embedding]
        )

Add of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Insert of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Add of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Insert of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Add of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Insert of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Add of existing embedding ID: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token

In [19]:
print(documents[0])

{'source': 'https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model', 'text': 'Skip to content Navigation Menu Toggle navigation Sign in Product GitHub Copilot Write better code with AI Security Find and fix vulnerabilities Actions Automate any workflow Codespaces Instant dev environments Issues Plan and track work Code Review Manage code changes Discussions Collaborate outside of code Code Search Find more, search less Explore All features Documentation GitHub Skills Blog Solutions By company size Enterprises Small and medium teams Startups By use case DevSecOps DevOps CI/CD View all use cases By industry Healthcare Financial services Manufacturing Government View all industries View all solutions Resources Topics AI DevOps Security Software Development View all Explore Learning Pathways White papers, Ebooks, Webinars Customer Stories Partners Open Source GitHub Sponsors Fund open source developers The ReadME Project GitHub 

## Loading in Generative LLM Candidates

### OpenAI GPT-4o-mini

In [20]:
gpt_4o_base = "gpt-4o-mini"
gpt_4o_finetuned = "ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT"

## Setting Up RAG methods

In [21]:
def retrieve_relevant_documents(query, n_results=5):
    """
    Retrieve the most relevant documents from the ChromaDB vector store.
    
    Args:
        query (str): The user's question or query.
        collection (Collection): The ChromaDB collection object.
        n_results (int): Number of results to retrieve.
    
    Returns:
        str: Concatenated text of the top retrieved documents.
    """
    # Generate embedding for the query using Gemini model
    query_embedding = get_embedding(query)

    # Retrieve top documents
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )

    # Combine text from the retrieved documents
    retrieved_text = " ".join(doc[0] for doc in results["documents"])
    return retrieved_text

In [28]:
def run_rag(model, query):
    # Retrieve relevant documents
    retrieved_text = retrieve_relevant_documents(query)

    # Generate response using OpenAI Model
    all_messages = [
        {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
        {"role": "user", "content": f"Generate an answer to the following question using the given context:\n\n {query}\n\n {"="*50}\n\nCONTEXT: {retrieved_text}\n\n"}
    ]
    response = client.chat.completions.create(
        model=model,
        messages=all_messages,
        max_tokens=1500,
    )
    model_response_text = response.choices[0].message.content
    return model_response_text

### Querying RAG Pipeline

In [29]:
question = "I want to build a deep learning model for image classification. What are some best practices for training deep learning models?"
response = run_rag(gpt_4o_base ,question)
print(response)

When building a deep learning model for image classification, here are some best practices to consider:

1. **Leverage Transfer Learning**: Utilize pre-trained models that have been trained on large datasets like ImageNet. This allows you to take advantage of the learned features in these models, which can significantly improve your performance, especially when you have a small dataset.

2. **Model Selection**: Choose a model from a model zoo that suits your needs. Both TensorFlow and PyTorch offer repositories of pre-trained models that you can easily incorporate into your project.

3. **Modify the Classifier**: Replace the last layer of the model, which is typically designed for a specific number of classes (like 1000 in ImageNet), with a new randomly initialized layer that reflects the number of classes in your dataset.

4. **Fine-tuning Strategy**: Initially, train only the new classification layer. This allows you to utilize the frozen feature extraction layers from the pre-traine

## Evaluating RAG with Embedding Similarity

In [30]:
with open('./Test Data/test_data.json', "r") as json_file:
    test_data = json.load(json_file)

In [31]:
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

In [32]:
def eval_semantic_sim(model):
    # Evaluate semantic similarity
    similarities = []
    start = timeit.default_timer()
    # last_pause = timeit.default_timer()
    for i,example in enumerate(test_data):
        try:
            input_text = example['input']
            expected_output = example['output']
            # Generate model response with RAG
            model_response_text = run_rag(model, input_text)
            # Generate embeddings for expected and actual responses
            expected_embedding = get_embedding(expected_output)
            model_response_embedding = get_embedding(model_response_text)

            # Calculate similarity
            similarity = cosine_similarity(expected_embedding, model_response_embedding)
            similarities.append({'question':input_text ,'expected_output':expected_output,'model_output':model_response_text,'similarities':similarity})
        except Exception as e:
            print(i,e)
            # break
        if (i%5==0)&(i!=0):
            end = timeit.default_timer()
            print(f"{i} - Time Spent: {end-start}, Number of Errors: {i + 1 - len(similarities)}")
            start = timeit.default_timer()
            # print(f"Sleeping for {60-(timeit.default_timer()-last_pause)} seconds")
            # if (60-(timeit.default_timer()-last_pause))>0:
            #     time.sleep(60-(timeit.default_timer()-last_pause))
            # last_pause = timeit.default_timer()

    return similarities

In [None]:
gpt_4o_base_similarities = pd.DataFrame(eval_semantic_sim(gpt_4o_base))

5 - Time Spent: 16.413978665950708, Number of Errors: 0
10 - Time Spent: 17.447905000008177, Number of Errors: 0
15 - Time Spent: 18.266000291972887, Number of Errors: 0
20 - Time Spent: 11.97698695899453, Number of Errors: 0
25 - Time Spent: 16.13057416601805, Number of Errors: 0
30 - Time Spent: 10.926576166995801, Number of Errors: 0
35 - Time Spent: 13.710724624979775, Number of Errors: 0
40 - Time Spent: 10.657069500011858, Number of Errors: 0
45 - Time Spent: 19.755867750034668, Number of Errors: 0
50 - Time Spent: 13.03439366701059, Number of Errors: 0
55 - Time Spent: 12.324554249993525, Number of Errors: 0
60 - Time Spent: 15.299843166954815, Number of Errors: 0
65 - Time Spent: 12.222618040977977, Number of Errors: 0
70 - Time Spent: 17.312169749988243, Number of Errors: 0
75 - Time Spent: 15.756793124950491, Number of Errors: 0
80 - Time Spent: 12.351058083004318, Number of Errors: 0
85 - Time Spent: 18.340523791965097, Number of Errors: 0
90 - Time Spent: 14.913129209016915

In [None]:
print(f"Mean Similarity for GPT-4o Base: {gpt_4o_base_similarities['similarities'].mean()}")
print(f"Standard Deviation Similarity for GPT-4o Base: {gpt_4o_base_similarities['similarities'].std()}")

In [None]:
gpt_4o_finetuned_similarities = pd.DataFrame(eval_semantic_sim(gpt_4o_finetuned))

In [None]:
print(f"Mean Similarity for GPT-4o Finetuned: {gpt_4o_finetuned_similarities['similarities'].mean()}")
print(f"Standard Deviation Similarity for GPT-4o Finetuned: {gpt_4o_finetuned_similarities['similarities'].std()}")

In [None]:
def evaluate_responses(dataset):
    """
    Evaluate the RAG application responses against the evaluation dataset.
    
    Args:
        dataset (list): List of dictionaries with 'input' and 'output' keys.
        model_response_function (function): A function that takes an input and generates a response.
        
    Returns:
        dict: A dictionary containing evaluation metrics.
    """
    rouge = Rouge()
    smoothing_function = SmoothingFunction().method1

    exact_matches = []
    bleu_scores = []
    rouge_scores = {"rouge-1": [], "rouge-2": [], "rouge-l": []}

    for qa in dataset:
        time.sleep(10) #Try to avoid rate limiting
        # Get input and expected output
        input_text = qa["input"]
        expected_output = qa["output"]

        # Generate model response
        generated_output = query(input_text)

        # Exact Match
        exact_matches.append(int(generated_output.strip() == expected_output.strip()))

        # BLEU Score
        bleu_score = sentence_bleu(
            [expected_output.split()], generated_output.split(), smoothing_function=smoothing_function
        )
        bleu_scores.append(bleu_score)

        # ROUGE Scores
        rouge_score = rouge.get_scores(generated_output, expected_output, avg=True)
        rouge_scores["rouge-1"].append(rouge_score["rouge-1"]["f"])
        rouge_scores["rouge-2"].append(rouge_score["rouge-2"]["f"])
        rouge_scores["rouge-l"].append(rouge_score["rouge-l"]["f"])

    # Compute averages
    metrics = {
        "Exact Match": sum(exact_matches) / len(exact_matches),
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-1": sum(rouge_scores["rouge-1"]) / len(rouge_scores["rouge-1"]),
        "ROUGE-2": sum(rouge_scores["rouge-2"]) / len(rouge_scores["rouge-2"]),
        "ROUGE-L": sum(rouge_scores["rouge-l"]) / len(rouge_scores["rouge-l"]),
    }

    return metrics

In [None]:
evaluation_dataset_path = "Fine Tuning Data/all_q_and_a_docs_final_v2.json"

In [None]:
with open(evaluation_dataset_path, "r") as f:
    dataset = json.load(f)

In [None]:
metrics = evaluate_responses(dataset)

ValueError: Cannot get the response text.
Cannot get the Candidate text.
Response candidate content has no parts (and thus no text). The candidate is likely blocked by the safety filters.
Content:
{}
Candidate:
{
  "finish_reason": "SAFETY",
  "safety_ratings": [
    {
      "category": "HARM_CATEGORY_HATE_SPEECH",
      "probability": "NEGLIGIBLE",
      "probability_score": 0.20019531,
      "severity": "HARM_SEVERITY_LOW",
      "severity_score": 0.23632812
    },
    {
      "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
      "probability": "MEDIUM",
      "blocked": true,
      "probability_score": 0.75390625,
      "severity": "HARM_SEVERITY_MEDIUM",
      "severity_score": 0.43554688
    },
    {
      "category": "HARM_CATEGORY_HARASSMENT",
      "probability": "NEGLIGIBLE",
      "probability_score": 0.27734375,
      "severity": "HARM_SEVERITY_LOW",
      "severity_score": 0.22949219
    },
    {
      "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "probability": "NEGLIGIBLE",
      "probability_score": 0.09033203,
      "severity": "HARM_SEVERITY_NEGLIGIBLE",
      "severity_score": 0.045410156
    }
  ]
}
Response:
{
  "candidates": [
    {
      "finish_reason": "SAFETY",
      "safety_ratings": [
        {
          "category": "HARM_CATEGORY_HATE_SPEECH",
          "probability": "NEGLIGIBLE",
          "probability_score": 0.20019531,
          "severity": "HARM_SEVERITY_LOW",
          "severity_score": 0.23632812
        },
        {
          "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
          "probability": "MEDIUM",
          "blocked": true,
          "probability_score": 0.75390625,
          "severity": "HARM_SEVERITY_MEDIUM",
          "severity_score": 0.43554688
        },
        {
          "category": "HARM_CATEGORY_HARASSMENT",
          "probability": "NEGLIGIBLE",
          "probability_score": 0.27734375,
          "severity": "HARM_SEVERITY_LOW",
          "severity_score": 0.22949219
        },
        {
          "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
          "probability": "NEGLIGIBLE",
          "probability_score": 0.09033203,
          "severity": "HARM_SEVERITY_NEGLIGIBLE",
          "severity_score": 0.045410156
        }
      ]
    }
  ],
  "usage_metadata": {
    "prompt_token_count": 564,
    "candidates_token_count": 33,
    "total_token_count": 597
  },
  "model_version": "gemini-1.5-pro-001"
}

In [None]:
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")