# Initialization

## Load libraries

In [1]:
import os
import json
from tqdm import tqdm
import hashlib
from dotenv import load_dotenv
from pinecone import Pinecone
from concurrent.futures import ThreadPoolExecutor, as_completed
from huggingface_hub import hf_hub_download
import transformers
from langchain.chains import RetrievalQA
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

## Constants

In [2]:
namespace="the_gale_encyclopedia_of_medicine" # Needs to be updated, each document should have a different namespace

## Config Manager set up

In [3]:
class ConfigManager:
    def __init__(self, json_config_path: str):
        self.json_config_path = json_config_path
        self.json_config = None

        self._load_config_json()

    def _load_config_json(self):
        try:
            with open(self.json_config_path, "r") as file_obj:
                self.json_config = json.load(file_obj)
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found: {self.json_config_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Error decoding JSON file: {e}")

    def get_config(self, keys: str | list, default=None):
        if isinstance(keys, str):
            keys = [keys]

        data = self.json_config
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, default)
            else:
                return default
        return data

In [4]:
config_manager = ConfigManager("config.json")

## Load environment varibales

In [5]:
_ = load_dotenv()

# Load LLM model

In [6]:
model_path = os.path.join(config_manager.get_config("llm_model_directory"), config_manager.get_config("llm_model_name"))

if not os.path.exists(model_path):
    model_path = hf_hub_download(
        repo_id=config_manager.get_config("llm_model_repository_name"), 
        filename=config_manager.get_config("llm_model_name"),
        local_dir=config_manager.get_config("llm_model_directory"),
        token=os.getenv("HUGGINGFACE_TOKEN")
    )

# Process RAG documents

## Load documents

In [7]:
loader = DirectoryLoader(
    config_manager.get_config("data_directory"),
    glob="*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True
)

In [8]:
extracted_data = loader.load()

100%|██████████| 1/1 [00:09<00:00,  9.21s/it]


## Split into chunks

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config_manager.get_config(["text_split", "chunk_size"]), 
    chunk_overlap=config_manager.get_config(["text_split", "chunk_overlap"])
)

data_chunks = text_splitter.split_documents(extracted_data)

In [10]:
print(f"Amount of created data chunks: {len(data_chunks)}")

Amount of created data chunks: 5961


# Load documents into VectorDB (Pinecone)

## Load embedding model

In [11]:
embeddings = HuggingFaceEmbeddings(model_name=config_manager.get_config("embedding_model_name"))

## Set up Pinecone connection

In [12]:
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
)

In [13]:
index = pc.Index(config_manager.get_config("pinecone_index_name"))

## Validate duplicated vectors

In [14]:
namespace_exists = index.query(vector=embeddings.embed_query(data_chunks[0].page_content), top_k=1, namespace=namespace).get("matches")

In [15]:
if namespace_exists:
    def check_duplication(chunk):
        duplicated = index.query(vector=embeddings.embed_query(chunk.page_content), top_k=1, namespace=namespace).get("matches")[0].get("score") >= 0.99
        return chunk.page_content if not duplicated else None

    deduplicated_data_chunks = list()

    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(check_duplication, chunk): chunk for chunk in data_chunks}

        for future in tqdm(as_completed(futures), total=len(data_chunks), desc="Validating duplicated data chunks"):
            result = future.result()
            if result:
                deduplicated_data_chunks.append(result)
else:
    deduplicated_data_chunks = [chunk.page_content for chunk in data_chunks]

Validating duplicated data chunks: 100%|██████████| 5961/5961 [00:51<00:00, 115.49it/s]


In [16]:
print(f"{len(deduplicated_data_chunks)} not duplicated vectors will be added.")

0 not duplicated vectors will be added.


## Upload Documents

In [17]:
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [18]:
def generate_id(content):
    return hashlib.md5(content.encode()).hexdigest()

In [19]:
data_chunks_ids = [generate_id(chunk) for chunk in deduplicated_data_chunks]

In [20]:
_ = vector_store.add_texts(texts=deduplicated_data_chunks, ids=data_chunks_ids, namespace=namespace)

# Test question

In [35]:
prompt="""
Use the following pieces of information to answer the user's question.
If you don't have enough information to answer them please answer "I don't have enough information to answer that question."
Only return the helpful answer below and nothing else.

Question: {question}

Context: 
{context}
"""

In [36]:
prompt_template = PromptTemplate(
    template=prompt, 
    input_variables=["context", "question"]
)

chain_type_kwargs={"prompt": prompt_template}

# Load LLM model

In [37]:
llm = CTransformers(
    model=model_path,
    model_type="llama",
    config={
        "max_new_tokens": 512,
        "temperature": 0.8
    }
)

# Retrieval Question Answering

In [38]:
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"namespace": namespace}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [39]:
response = qa.invoke({"query": "What is Cancer?"}, verbose=True)

In [33]:
print(response.get("result"))

Breast year old lady with information about
I don'
The most common types of
You have enough information provided
Breast chance (For patientswith highlitely cancer cells in a)
I can you are many of HP
You seem to be foundational Health care providers provide an infection
The most people with
The most common.
To diagnosis of information is
Breastasis,
Breast year, I don'
The
The
The two pieces of breast cancerous mammuncontrolled by the
I have enough information provided
Breast
BreastrophyesYou don’ The HIV, although some forms or click heres can provide additional information about.
Yes,
I don'
I don'
I don'
I don'
The type=
I don'
I don'
Breast cancer cells are there are cancer originates are cancer can give themic information that helpfull breast cancer treatment for answer.
I have enough information to answer:
The
The
The
The
The most common types of course,
I don'
I don'
I don'
To perform a
The patient with more... The type 
Breast cancer cells in the pro-A biops belowHelpful answer