## Diagnose-a-thon LLM RAG pipeline

In [11]:
#Install the necessary packages
#pip install -q langchain langchain-community openai python-dotenv pandas numpy chromadb langchain-huggingface langchain-chroma langchain_openai sentence-transformers grpcio google-generativeai llamaapi

In [2]:
# Confirm a CUDA device is visible before running the embedding cells (they request device="cuda")
import torch

cuda_ready = torch.cuda.is_available()
print(cuda_ready)  # True on a GPU runtime, False otherwise

True


### Check the total number of medical books available (to be used as knowledge base)

In [None]:
""" UNCOMMENT IF NEEDED - TO PREPARE THE CREATION THE VECTOR STORE FROM SCRATCH
# Load the medical book text files (books previously converted from PDF, images included, to text)
import os
from langchain_community.document_loaders import TextLoader

medbooks_folder = "/content/drive/MyDrive/LLM_RAG_MED/dat_rag/medbooks_txt"
# Accumulates the Document objects of every loaded book
all_medbooks = []

# Walk the folder and load every .txt file
for file_name in os.listdir(medbooks_folder):
    # Only plain-text files are part of the knowledge base
    if file_name.endswith(".txt"):
        file_path = os.path.join(medbooks_folder, file_name)
        loader = TextLoader(file_path)
        medbooks = loader.load()  # load() returns a list of Document objects
        all_medbooks.extend(medbooks)

# Report corpus size. len(page_content) counts characters, so the total is a
# character count (the previous label called it "words").
total_characters = sum(len(doc.page_content) for doc in all_medbooks)
print(f"{len(all_medbooks)} med books, with a total of {total_characters} characters")
"""

62 med books, with a total of 184482601 words


### Split medical books content into chunks (up to 1000 characters each)

In [None]:
# Disabled cell: the body below is wrapped in a string literal, so nothing runs until
# the surrounding triple quotes are removed.
# When enabled it splits every Document in `all_medbooks` (built by the loading cell)
# with chunk_size=1000 and no overlap, producing `medbooks_splits`.
# NOTE(review): no length_function is passed, so the splitter's default applies; per the
# LangChain docs that default counts characters, not tokens - confirm if token-sized
# chunks were intended.
""" UNCOMMENT IF NEEDED - TO PREPARE THE CREATION THE VECTOR STORE FROM SCRATCH
#Splitting chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap  = 0)
medbooks_splits = text_splitter.split_documents(all_medbooks)
print("Splitting completed!")
"""

Splitting completed!


### Convert med books chunks into vectors using an embedding model (Sentence Transformer).
### Then save the vectors into chromaDB and persist the vector store in a local folder

In [19]:
# Disabled cell: the body below is wrapped in a string literal, so nothing runs until
# the surrounding triple quotes are removed.
#
# When enabled, one run embeds ONE of seven slices of `medbooks_splits` and stores it in
# the persistent ChromaDB collection "medbooks_vec" under ./chroma_db:
#   * `batch_number` (1..7) is edited by hand before each run; the last batch also takes
#     the remainder left by the integer division.
#   * Each chunk's ID is its global index in `medbooks_splits` as a string, so IDs are only
#     stable if `medbooks_splits` is rebuilt in the same order for every batch run.
#   * A chunk whose ID is already in the collection is skipped, which makes a batch
#     restartable after an interruption.
#   * Chunks are looked up, embedded and added one at a time.
# NOTE(review): the collection is created without any metadata/configuration, so the
# distance metric is whatever ChromaDB uses by default - confirm before interpreting
# query distances as cosine distances.
# NOTE(review): `Chroma` from langchain.vectorstores is imported but not used in this cell.
""" UNCOMMENT IF NEEDED - TO CREATE THE VECTOR STORE FROM SCRATCH
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
import chromadb
from tqdm import tqdm
import logging

# Suppress ChromaDB warnings by setting the logging level to ERROR
logging.getLogger("chromadb").setLevel(logging.ERROR)

# Initialize a pretrained Sentence Transformer model with GPU support
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2", device="cuda")

# Define the directory for persistent storage
my_chromadb_directory = "./chroma_db"

# Initialize ChromaDB client with the default configuration
client = chromadb.PersistentClient(path=my_chromadb_directory)

# Check if collection already exists, otherwise create it
collection = client.get_or_create_collection("medbooks_vec")

# Specify the batch manually (e.g., "batch 1", "batch 2", etc)
batch_number = 7  # Change this as needed

# Divide the documents into seven batches
total_docs = len(medbooks_splits)
batch_size = total_docs // 7
start_idx = (batch_number - 1) * batch_size
end_idx = start_idx + batch_size if batch_number < 7 else total_docs

# Slice the batch for this script run
current_batch = medbooks_splits[start_idx:end_idx]

# Track progress for embedding and adding documents
for i, doc in tqdm(
    enumerate(current_batch, start=start_idx),
    desc=f"Processing Batch {batch_number}",
    total=len(current_batch),
    unit="doc"
):
    # Generate unique ID for the document
    doc_id = str(i)

    # Check if the document is already in the store
    existing_doc = collection.get(ids=[doc_id])
    if existing_doc["ids"]:  # If the ID exists, skip this document
        continue

    # Compute embedding for the document
    embedding = embedding_model.encode(doc.page_content)

    # Add the document to the vector store
    collection.add(
        documents=[doc.page_content],
        metadatas=[doc.metadata],
        embeddings=[embedding],
        ids=[doc_id]  # Provide the unique ID
    )

print(f"Batch {batch_number} documents have been processed and stored in './chroma_db'.")

"""
# VECTOR STORE SUCCESSFULLY CREATED IN BATCH
# Batch 1: 100% - 30557/30557 [50:53<00:00, 10.01doc/s]
# Batch 2: 100% - 30557/30557 [07:31<00:00, 67.73doc/s]
# Batch 3: 100% - 30557/30557 [35:34<00:00, 14.32doc/s]
# Batch 4: 100% - 30557/30557 [1:33:33<00:00,  5.44doc/s]
# Batch 5: 100% - 30557/30557 [2:27:25<00:00,  3.45doc/s]
# Batch 6: 100% - 30557/30557 [10:18<00:00, 49.37doc/s]
# Batch 7: 100% - 30559/30559 [1:00:07<00:00,  8.47doc/s]



### Implementing the search function to test if the local vector store is properly set up

In [None]:
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
import chromadb

# Embed the query with the same model that was used to build the store; vectors from a
# different model would not be comparable with the stored ones.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2", device="cuda")

# Folder where the persistent vector store was written
persist_directory = "./chroma_db"

# Re-open the persistent ChromaDB client and the existing collection
client = chromadb.PersistentClient(path=persist_directory)
collection = client.get_collection(name="medbooks_vec")

# Prepare the query and its embedding
query_text = "What are all the different types of cancer?"
query_embedding = embedding_model.encode(query_text)

# Nearest-neighbour search with a pre-computed query vector (plain list of floats)
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=3,  # number of top matches to retrieve
)

# Inspect the results: document text, metadata and distance to the query vector.
# The value returned by ChromaDB is a DISTANCE (lower = more similar). The collection was
# created without an explicit `hnsw:space`, so ChromaDB's default metric applies rather
# than cosine distance; `1 - distance` is therefore not a cosine similarity and the raw
# distance is reported instead.
print("Top Matches:")
for i, (docs, metadatas, distances) in enumerate(
    zip(results["documents"], results["metadatas"], results["distances"])
):
    for j, (doc, metadata, distance) in enumerate(zip(docs, metadatas, distances)):
        print(f"\nMatch {i + 1}.{j + 1}:")  # query index . rank within that query
        print(f"Document: {doc}")
        print(f"Metadata: {metadata}")
        print(f"Distance: {distance:.4f}")
# REMINDER: the smaller the distance between two vectors, the more similar their documents.

### Check GPT-3.5 and GPT-4o answer without RAG

In [None]:
from dotenv import load_dotenv
import os
import re
from langchain_openai import ChatOpenAI
from pprint import pprint

# Read the OpenAI credentials from the .env file
env_path = '/.env'  # Update with your .env file path
load_dotenv(env_path)
api_key = os.getenv('OPENAI_API_KEY')

# Question sent to the bare model (no retrieved context)
query_text = "What are all the types of cancer?"

# Chat model under test; switch model_name to "gpt-3.5-turbo" to compare with GPT-3.5
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, openai_api_key=api_key)

# Send the raw question straight to the model
response = llm.invoke(query_text)

# An AIMessage carries its text in .content; anything else is stringified
if hasattr(response, 'content'):
    response_text = response.content
else:
    response_text = str(response)

# Plain strings are printed as-is; any other payload is shown together with a warning
if not isinstance(response_text, str):
    print("Unexpected response format:", response)
else:
    print(response_text)


### Implementing Reciprocal RAG with GPT-3.5 or GPT-4o
Rather than passing all relevant documents as context to the LLM, Reciprocal RAG employs a ranking mechanism to retain only the most relevant documents from the vector store results and pass them to the LLM.
For details on Reciprocal RAG, refer to: https://doi.org/10.1017/nlp.2024.53

In [None]:
from dotenv import load_dotenv
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from pprint import pprint

# Load the .env file to access the API key
env_path = './env'  # Update with your .env file path
load_dotenv(env_path)
api_key = os.getenv('OPENAI_API_KEY')

# Folder holding the persisted vector store
persist_directory = "./chroma_db"

# Same embedding model as the one used to build the store
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the existing Chroma collection through LangChain
vector_store = Chroma(persist_directory=persist_directory, collection_name="medbooks_vec", embedding_function=embedding_function )

# Define the user query
query_text = "What are all the types of cancer?"

# Retrieve the 7 nearest chunks together with their scores
results = vector_store.similarity_search_with_score(query_text, k=7)

# Keep (id, score, metadata, text) for every hit
document_with_scores = [(doc.id, score, doc.metadata, doc.page_content) for doc, score in results ]

# The score returned by Chroma is a DISTANCE: lower means more similar. Sort ascending so
# that truncating the list keeps the most relevant chunks (sorting descending, as before,
# discarded the best matches and kept the worst ones).
document_with_scores.sort(key=lambda x: x[1])

# Keep the 5 most relevant documents
top_5_documents = document_with_scores[:5]

# Build the context string: document ID, distance, metadata and content
top_documents_context = "\n\n".join([
        f"Document ID: {doc_id}\nDistance (lower = more similar): {score:.4f}\nMetadata: {metadata}\nContent: {content}"
        for doc_id, score, metadata, content in top_5_documents ])

# Initialize the LLM (ChatOpenAI)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)

# System prompt carrying the retrieved context
system_prompt = (
    "Use the entire given context to answer the question. "
    "Context:\n{context}" )

# Chat prompt: system message with the context, human message with the question
retrieval_qa_chat_prompt = ChatPromptTemplate.from_messages([ ("system", system_prompt), ("human", "{input}") ])

# Fill the template
formatted_input = retrieval_qa_chat_prompt.format_messages(context=top_documents_context, input=query_text)

# Generate the response
response = llm.invoke(formatted_input)

# Display both the context and the response
print("\n--- Context passed to the LLM ---")
print(top_documents_context)

print("\n--- LLM Response ---")
if hasattr(response, "content"):
    formatted_response = response.content.replace("\\n", "\n")  # Handle escaped newlines
    print("\n".join(formatted_response.splitlines()))
else:
    print(response)


### Get RAG LLM response and Base LLM response and add output to CSV file

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma

# Load the .env file to access the API key
env_path = './env'  # Update with your .env file path
load_dotenv(env_path)
api_key = os.getenv('OPENAI_API_KEY')

# Folder holding the persisted vector store
persist_directory = "./chroma_db"

# Same embedding model as the one used to build the store
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the existing Chroma collection through LangChain
vector_store = Chroma(
    persist_directory=persist_directory,
    collection_name="medbooks_vec",
    embedding_function=embedding_function
)

# Initialize the LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, openai_api_key=api_key)

# System prompt carrying the retrieved context
system_prompt = (
    "Use the entire given context to answer the question. "
    "Context:\n{context}"
)

# Chat prompt: system message with the context, human message with the question
retrieval_qa_chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

# Load the input data
df_queries = pd.read_csv("./dat_results.csv", encoding= 'latin-1')

# Row range to process, INCLUSIVE on both ends (same convention as the Gemini and Llama
# batch cells). Previously end_row was exclusive, which silently skipped the last row.
start_row = 0  # Modify as needed
end_row = 213    # Modify as needed (inclusive)

gpt_output_file = "rag_gpt4o.csv"
output_columns = ["p_prompt", "llm_rag_response", "llm_response", "top_document_1", "top_document_2", "top_document_3", "top_document_4"]
# Accumulates one output row per processed query
gpt_output_data = []

# Process each row in the specified range
for index, row in df_queries.iloc[start_row:end_row + 1].iterrows():
    query_text = row['p_prompt']

    # Retrieve the 7 nearest chunks together with their scores
    results = vector_store.similarity_search_with_score(query_text, k=7)

    # The score returned by Chroma is a DISTANCE: lower means more similar. Sort ascending
    # so that truncation keeps the most relevant chunks (the former descending sort kept
    # the least relevant ones).
    document_with_scores = [(doc.id, score, doc.metadata, doc.page_content) for doc, score in results]
    document_with_scores.sort(key=lambda x: x[1])

    # Keep the 4 most relevant documents
    top_4_documents = document_with_scores[:4]

    # Build the context string
    top_documents_context = "\n\n".join([
        f"Document ID: {doc_id}\nDistance (lower = more similar): {score:.4f}\nMetadata: {metadata}\nContent: {content}"
        for doc_id, score, metadata, content in top_4_documents
    ])

    # Fill the prompt template
    formatted_input = retrieval_qa_chat_prompt.format_messages(context=top_documents_context, input=query_text)

    # Response WITH retrieved context; a failure is recorded in the row instead of aborting the run
    try:
        gpt_rag_response = llm.invoke(formatted_input)
        rag_response_text = gpt_rag_response.content if hasattr(gpt_rag_response, "content") else str(gpt_rag_response)
    except Exception as e:
        print(f"Error generating response for query '{query_text}': {e}")
        rag_response_text = "Error generating response"

    # Response WITHOUT context (baseline)
    try:
        gpt_response = llm.invoke(query_text)
        response_text = gpt_response.content if hasattr(gpt_response, "content") else str(gpt_response)
    except Exception as e:
        print(f"Error generating response without context for query '{query_text}': {e}")
        response_text = "Error generating response"

    # Output row: query, both answers, then the text of the 4 documents given to the model
    # (padded with empty strings if fewer than 4 were retrieved)
    output_row = [
        query_text,
        rag_response_text,
        response_text
    ] + [top_4_documents[i][3] if i < len(top_4_documents) else "" for i in range(4)]

    gpt_output_data.append(output_row)

    # Rewrite the whole CSV after every row so partial results survive an interruption.
    # NOTE: the file is overwritten, so it only ever contains the rows of the current run.
    output_df = pd.DataFrame(gpt_output_data, columns=output_columns)
    output_df.to_csv(gpt_output_file, index=False)
    print(f"Row {index + 1} processed of {len(df_queries)}.")


In [6]:
import pandas as pd
# The results file is written by DataFrame.to_csv with its default UTF-8 encoding, so it
# must be read back as UTF-8; decoding it as latin-1 garbles every non-ASCII character.
df_output = pd.read_csv("./rag_gpt4o.csv", encoding='utf-8')
len(df_output)

214

### Prompt Gemini-1.5 pro without RAG

In [None]:
from dotenv import load_dotenv
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
from pprint import pprint
import google.generativeai as genai  # Gemini SDK

# Read the Gemini credentials from the .env file and hand them to the SDK
env_path = './env'  # Update with your .env file path
load_dotenv(env_path)
gemini_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=gemini_key)

# Model under test (no retrieved context is supplied in this cell)
model = genai.GenerativeModel(model_name='gemini-1.5-pro')

# Multi-statement prompt sent verbatim to the model
my_query = """ I"ve been finding it harder to see any positive outcomes lately, and it feels like the future is just
blank. 
2. I think it would be better for everyone if I wasn't around anymore. 
3. I have a way to make it happen. 
4. I'm going to do it now """

response = model.generate_content(my_query)
print(response.text)

### Implement Reciprocal RAG with Gemini 1.5 pro


In [None]:
import os
import textwrap
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
from pprint import pprint
import google.generativeai as genai

# Load the .env file to access the API key
env_path = './env'
load_dotenv(env_path)
gemini_key = os.getenv('GEMINI_API_KEY')

# Folder holding the persisted vector store
persist_directory = "./chroma_db"

# Same embedding model as the one used to build the store
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the existing Chroma collection through LangChain
vector_store = Chroma(persist_directory=persist_directory, collection_name="medbooks_vec", embedding_function=embedding_function)

# Define the user query
query_text = """ I"ve been finding it harder to see any positive outcomes lately, and it feels like the future is just
blank. 
2. I think it would be better for everyone if I wasn't around anymore. 
3. I have a way to make it happen. 
4. I'm going to do it now """

# Retrieve the 7 nearest chunks together with their scores
results = vector_store.similarity_search_with_score(query_text, k=7)

# The score returned by Chroma is a DISTANCE: lower means more similar. Sort ascending so
# that truncation keeps the most relevant chunks (the former descending sort kept the
# least relevant ones).
document_with_scores = [(doc.id, score, doc.metadata, doc.page_content) for doc, score in results]
document_with_scores.sort(key=lambda x: x[1])

# Keep the 4 most relevant documents
top_4_documents = document_with_scores[:4]

# Build the context string
top_documents_context = "\n\n".join([
    f"Document ID: {doc_id}\nDistance (lower = more similar): {score:.4f}\nMetadata: {metadata}\nContent: {content}"
    for doc_id, score, metadata, content in top_4_documents
])

# Initialize the Gemini Pro model
genai.configure(api_key=gemini_key)
model = genai.GenerativeModel('gemini-1.5-pro')

# Prompt template: system message with the context, human message with the question
system_prompt = "Use the entire given context to answer the question. Do not refer documents. Do not include document ID. Context:\n{context}"
retrieval_qa_chat_prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

# Fill the template
formatted_input = retrieval_qa_chat_prompt.format_messages(context=top_documents_context, input=query_text)

# generate_content() is given a single string here, so the system message (instructions +
# context) and the human message (the question) are concatenated. Sending only
# formatted_input[0], as before, meant the model never saw the question.
prompt_text = "\n\n".join(message.content for message in formatted_input)

# Generate the response. Reading `.text` stays inside the try block because it can itself
# fail when the reply contains no text (e.g. the request was blocked).
try:
    response = model.generate_content(prompt_text)
    response_text = response.text
except Exception as e:
    print(f"Error generating response: {e}")
    response = None
    response_text = None

# Display context and formatted response
print("\n--- Context passed to the LLM ---")
print(top_documents_context)

print("\n--- LLM Response ---")
if response_text:
    # Wrap to 80 columns for readable notebook output
    print("\n".join(textwrap.wrap(response_text, width=80)))

### Get RAG Gemini and base Gemini responses and add to CSV file

In [27]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_chroma import Chroma
import google.generativeai as genai

# Load the .env file to access the API key
env_path = './env'
load_dotenv(env_path)
gemini_key = os.getenv('GEMINI_API_KEY')

# Folder holding the persisted vector store
persist_directory = "./chroma_db"

# Same embedding model as the one used to build the store
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the existing Chroma collection through LangChain
vector_store = Chroma(
    persist_directory=persist_directory,
    collection_name="medbooks_vec",
    embedding_function=embedding_function
)

# Input queries and output file
queries_csv_path = "./dat_results.csv"  # Replace with actual path
gemini_output_file = "./rag_gemini.csv"  # Replace with actual path

# A missing input file raises FileNotFoundError and stops this cell only; calling exit()
# inside a notebook would shut the whole kernel down.
queries_df = pd.read_csv(queries_csv_path, encoding='latin-1')

# Initialize Gemini Pro model
genai.configure(api_key=gemini_key)
model = genai.GenerativeModel('gemini-1.5-pro')

# Prompt template: system message with the context, human message with the question
system_prompt = "Use the entire given context to answer the question without showing document ID or table numbers. Context:\n{context}"
retrieval_qa_chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

# Row range to process (inclusive on both ends)
start_row = 200  # Adjust as needed
end_row = 213  # Adjust as needed (inclusive)

# Validate row range
if start_row < 0 or end_row >= len(queries_df):
    raise ValueError("Error: Row range is out of bounds.")

output_columns = ["p_prompt", "llm_rag_response", "llm_response", "top_document_1", "top_document_2", "top_document_3", "top_document_4"]

# The header is needed when the file is missing OR present but still empty
output_csv_exists = os.path.exists(gemini_output_file) and os.path.getsize(gemini_output_file) > 0

# Append row by row so results already written survive an interruption
with open(gemini_output_file, mode='a', encoding='utf-8', newline='') as file:
    for row_index in range(start_row, end_row + 1):
        query_text = queries_df.at[row_index, "p_prompt"]

        print(f"Processing row {row_index + 1} of {len(queries_df)}")

        # Retrieve the 7 nearest chunks together with their scores
        results = vector_store.similarity_search_with_score(query_text, k=7)

        # The score returned by Chroma is a DISTANCE: lower means more similar. Sort
        # ascending so that truncation keeps the most relevant chunks (the former
        # descending sort kept the least relevant ones).
        document_with_scores = [(doc.id, score, doc.metadata, doc.page_content) for doc, score in results]
        document_with_scores.sort(key=lambda x: x[1])

        # Keep the 4 most relevant documents
        top_4_documents = document_with_scores[:4]

        # Build the context string
        top_documents_context = "\n\n".join([
            f"Document ID: {doc_id}\nDistance (lower = more similar): {score:.4f}\nMetadata: {metadata}\nContent: {content}"
            for doc_id, score, metadata, content in top_4_documents
        ])

        # Fill the prompt template
        formatted_input = retrieval_qa_chat_prompt.format_messages(context=top_documents_context, input=query_text)

        # generate_content() is given a single string, so the system message (instructions
        # + context) and the human message (the question) are concatenated. Sending only
        # formatted_input[0], as before, meant the model never saw the question.
        rag_prompt_text = "\n\n".join(message.content for message in formatted_input)

        # Response WITH retrieved context; a failure is recorded in the row instead of aborting
        try:
            rag_gemini_response = model.generate_content(rag_prompt_text)
            rag_response_text = rag_gemini_response.text if rag_gemini_response else ""
        except Exception as e:
            print(f"Error generating response for query '{query_text}': {e}")
            rag_response_text = "Error generating response"

        # Response WITHOUT context (baseline)
        try:
            gemini_response = model.generate_content(query_text)
            gemini_response_text = gemini_response.text if gemini_response else ""
        except Exception as e:
            print(f"Error generating base response for query '{query_text}': {e}")
            gemini_response_text = "Error generating base response"

        # Output row: query, both answers, then the text of the 4 documents given to the
        # model (padded with empty strings if fewer than 4 were retrieved)
        output_row = [
            query_text,
            rag_response_text,
            gemini_response_text
        ] + [top_4_documents[i][3] if i < len(top_4_documents) else "" for i in range(4)]

        # One-row DataFrame so pandas takes care of CSV quoting
        output_df = pd.DataFrame([output_row], columns=output_columns)

        # Write through the already-open handle and flush so the row reaches disk immediately
        output_df.to_csv(file, index=False, header=not output_csv_exists)
        file.flush()
        output_csv_exists = True  # Ensure header is only written once

print(f"Responses appended to '{gemini_output_file}'")




Processing row 7 of 214
Processing row 8 of 214
Processing row 9 of 214
Processing row 10 of 214
Processing row 11 of 214
Processing row 12 of 214
Processing row 13 of 214
Processing row 14 of 214
Processing row 15 of 214
Processing row 16 of 214
Processing row 17 of 214
Processing row 18 of 214
Processing row 19 of 214
Processing row 20 of 214
Processing row 21 of 214
Responses appended to './rag_gemini.csv'


In [28]:
import pandas as pd
# The Gemini results file is written with encoding='utf-8', so it must be read back as
# UTF-8; decoding it as latin-1 garbles every non-ASCII character.
gemini_csv = pd.read_csv("./rag_gemini.csv", encoding="utf-8")
len(gemini_csv)

21

### Prompting LLama3-8B without RAG

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file to access the API key
env_path = './env'
load_dotenv(env_path)
llama_key = os.getenv('LLAMA_API_KEY')

# OpenAI-compatible client pointed at the Llama API endpoint. It is named `llama_client`
# (as in the RAG cells) so it does not overwrite the ChromaDB `client` created earlier.
llama_client = OpenAI(api_key=llama_key, base_url="https://api.llama-api.com")

# Baseline request: the user question only. The former system message told the model it
# was "trained by OpenAI", which is wrong for Llama and differs from the no-context
# baseline used in the batch cell (user message only).
my_response = llama_client.chat.completions.create(
    model="llama3-8b",
    messages=[
        {"role": "user", "content": "What are the types of cancer"}
    ],
)

#print(my_response)
#print(my_response.model_dump_json(indent=2))
print(my_response.choices[0].message.content)

### Implementing RAG with LLama3-8B

In [None]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from pprint import pprint
from openai import OpenAI

# Load the .env file to access the API key
env_path = './env'
load_dotenv(env_path)
llama_key = os.getenv('LLAMA_API_KEY')

# Folder holding the persisted vector store
persist_directory = "./chroma_db"

# Same embedding model as the one used to build the store
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the existing Chroma collection through LangChain
vector_store = Chroma(persist_directory=persist_directory, collection_name="medbooks_vec", embedding_function=embedding_function)

# Define the user query
query_text = """
Hello, I’m experiencing persistent headaches along with a mild fever that’s lasted for a few days.
I sometimes feel tired and a bit lightheaded as well. The headache seems to get worse in the evenings, and I’ve noticed some nasal congestion too.
I think it might just be a common cold or sinus infection, but I wanted to check if there could be any other causes I should be aware of.
Can you help me understand what this might be and whether I need to see a doctor?"""

# Retrieve the 7 nearest chunks together with their scores
results = vector_store.similarity_search_with_score(query_text, k=7)

# The score returned by Chroma is a DISTANCE: lower means more similar. Sort ascending so
# that truncation keeps the most relevant chunks (the former descending sort kept the
# least relevant ones).
document_with_scores = [(doc.id, score, doc.metadata, doc.page_content) for doc, score in results]
document_with_scores.sort(key=lambda x: x[1])

# Build the context string from the 4 most relevant documents
top_documents_context = "\n\n".join([
    f"Document ID: {doc_id}\nDistance (lower = more similar): {score:.4f}\nMetadata: {metadata}\nContent: {content}"
    for doc_id, score, metadata, content in document_with_scores[:4]])

# System prompt carrying the retrieved context
system_prompt = "Use the entire given context to answer the question. Context:\n{context}"

# OpenAI-compatible client pointed at the Llama API endpoint
llama_client = OpenAI(api_key=llama_key, base_url="https://api.llama-api.com")

# Generate the response using the Llama model
try:
    response = llama_client.chat.completions.create(
        model="llama3-8b",
        messages=[
            {"role": "system", "content": system_prompt.format(context=top_documents_context)},
            {"role": "user", "content": query_text}
        ],
    )

    # Log full raw response for debugging
    #pprint(response)

    # Print only the assistant's text
    assistant_message = response.choices[0].message.content
    print(assistant_message)
except Exception as e:
    print(f"Error generating response: {e}")

### Getting RAG Llama and base Llama responses and adding to CSV file

In [15]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from openai import OpenAI

# Load the .env file to access the API key
env_path = './env'
load_dotenv(env_path)
llama_key = os.getenv('LLAMA_API_KEY')

# Define paths
llama_input_csv = "./dat_results.csv"  # Input CSV file
llama_output_csv = "./rag_llama.csv"  # Output CSV file

# Folder holding the persisted vector store
persist_directory = "./chroma_db"

# Same embedding model as the one used to build the store
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Open the existing Chroma collection through LangChain
vector_store = Chroma(
    persist_directory=persist_directory,
    collection_name="medbooks_vec",
    embedding_function=embedding_function
)

# Load the input queries
user_queries = pd.read_csv(llama_input_csv, encoding='latin-1')

# OpenAI-compatible client pointed at the Llama API endpoint
llama_client = OpenAI(api_key=llama_key, base_url="https://api.llama-api.com")

# Rows produced by this run (kept in memory for inspection after the cell finishes)
llama_output_data = []

# Row range to process (inclusive on both ends)
start_row = 201  # Adjust as needed
end_row = 213  # Adjust as needed (inclusive)

# Validate row range. Raising stops this cell only; calling exit() inside a notebook
# would shut the whole kernel down.
if start_row < 0 or end_row >= len(user_queries):
    raise ValueError("Error: Row range is out of bounds.")

output_columns = ["p_prompt", "llm_rag_response", "llm_response", "top_document_1", "top_document_2", "top_document_3", "top_document_4"]

# System prompt carrying the retrieved context (identical for every row)
system_prompt = "Use the entire given context to answer the question without referencing the documents. Context:\n{context}"

# The header is needed when the file is missing OR present but still empty
output_csv_exists = os.path.exists(llama_output_csv) and os.path.getsize(llama_output_csv) > 0

# Append row by row through ONE open handle (the file was previously opened here and then
# reopened by path on every row)
with open(llama_output_csv, mode='a', encoding='utf-8', newline='') as file:
    for row_index in range(start_row, end_row + 1):
        query_text = user_queries.at[row_index, "p_prompt"]

        # Retrieve the 7 nearest chunks together with their scores
        results = vector_store.similarity_search_with_score(query_text, k=7)

        # The score returned by Chroma is a DISTANCE: lower means more similar. Sort
        # ascending so that truncation keeps the most relevant chunks (the former
        # descending sort kept the least relevant ones).
        document_with_scores = [(doc.id, score, doc.metadata, doc.page_content) for doc, score in results]
        document_with_scores.sort(key=lambda x: x[1])

        # Text of the 4 most relevant documents; only the content is given to the model
        retrieved_texts = [content for _, _, _, content in document_with_scores[:4]]
        top_documents_content = "\n\n".join(retrieved_texts)

        # Response WITH retrieved context (RAG); a failure is recorded in the row
        try:
            response_rag = llama_client.chat.completions.create(
                model="llama3-8b",
                messages=[
                    {"role": "system", "content": system_prompt.format(context=top_documents_content)},
                    {"role": "user", "content": query_text}
                ],
            )
            llama_rag_response = response_rag.choices[0].message.content if response_rag.choices else "No response"
        except Exception as e:
            llama_rag_response = f"Error: {e}"

        # Response WITHOUT context (baseline)
        try:
            response_base = llama_client.chat.completions.create(
                model="llama3-8b",
                messages=[
                    {"role": "user", "content": query_text}
                ],
            )
            llama_base_response = response_base.choices[0].message.content if response_base.choices else "No response"
        except Exception as e:
            llama_base_response = f"Error: {e}"

        # Output row: query, both answers, then the retrieved texts (padded to 4 entries)
        output_row = [
            query_text,
            llama_rag_response,
            llama_base_response
        ] + [retrieved_texts[i] if i < len(retrieved_texts) else "" for i in range(4)]

        llama_output_data.append(output_row)
        print(f"Processed row {row_index}")  # Debugging output

        # Write the row through the open handle and flush so it reaches disk immediately
        llama_output_df = pd.DataFrame([output_row], columns=output_columns)
        llama_output_df.to_csv(file, index=False, header=not output_csv_exists)
        file.flush()

        # Header is written at most once
        output_csv_exists = True

print(f"Processing complete. Results for rows {start_row} to {end_row} appended to {llama_output_csv}")



Processed row 201
Processed row 202
Processed row 203
Processed row 204
Processed row 205
Processed row 206
Processed row 207
Processed row 208
Processed row 209
Processed row 210
Processed row 211
Processed row 212
Processed row 213
Processing complete. Results for rows 201 to 213 appended to ./rag_llama.csv


### Validate CSV successfully created

In [16]:
import pandas as pd
# The Llama results file is written as UTF-8, so it must be read back as UTF-8; decoding
# it as latin-1 garbles every non-ASCII character.
dfx_output = pd.read_csv("./rag_llama.csv", encoding='utf-8')
len(dfx_output)

214