# Initialization

## Load libraries

In [1]:
import os
import json
import torch
from tqdm import tqdm
import hashlib
from pprint import pprint
from dotenv import load_dotenv
from pinecone import Pinecone
from concurrent.futures import ThreadPoolExecutor, as_completed
from huggingface_hub import hf_hub_download
import transformers
from langchain.chains import RetrievalQA
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

## Config Manager set up

In [2]:
class ConfigManager:
    def __init__(self, json_config_path: str):
        self.json_config_path = json_config_path
        self.json_config = None

        self._load_config_json()

    def _load_config_json(self):
        try:
            with open(self.json_config_path, "r") as file_obj:
                self.json_config = json.load(file_obj)
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found: {self.json_config_path}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Error decoding JSON file: {e}")

    def get_config(self, keys: str | list, default=None):
        if isinstance(keys, str):
            keys = [keys]

        data = self.json_config
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, default)
            else:
                return default
        return data

In [3]:
# Parse "config.json" (path relative to the working directory) once; the
# following cells read their settings through this object.
config_manager = ConfigManager("config.json")

## Load environment variables

In [4]:
# Load environment variables with python-dotenv (presumably from a local .env
# file — confirm). The return value is bound to `_` so the cell shows no output.
_ = load_dotenv()

# Load LLM model

## Save model

In [5]:
# Fetch the causal language model from the repository named in the config,
# loading the weights as bfloat16. HUGGINGFACE_TOKEN is read from the
# environment; os.getenv returns None when it is unset.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the repository is trusted.
model = transformers.AutoModelForCausalLM.from_pretrained(
    config_manager.get_config("llm_model_repository_name"), 
    torch_dtype=torch.bfloat16, 
    trust_remote_code=True,
    token=os.getenv("HUGGINGFACE_TOKEN"),
)

# Write the model to the configured local directory; the text-generation
# pipeline built later loads the model from this same directory.
_ = model.save_pretrained(config_manager.get_config("llm_model_directory"))

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

KeyboardInterrupt: 

## Save tokenizer

In [6]:
# Fetch the tokenizer from the same repository as the model, using the same
# access token from the environment.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    config_manager.get_config("llm_model_repository_name"), 
    trust_remote_code=True,
    token=os.getenv("HUGGINGFACE_TOKEN")
)

# Save the tokenizer into the same local directory the model was saved to.
_ = tokenizer.save_pretrained(config_manager.get_config("llm_model_directory"))

## Load LLM to be used

In [7]:
# Generation settings, all read from the "model_params" section of the config.
# get_config returns None for a missing key, so an absent early_stopping or
# do_sample entry becomes False after the bool() coercion.
# NOTE(review): both max_length and max_new_tokens are supplied; presumably
# max_new_tokens takes precedence inside transformers — confirm.
generation_config = transformers.GenerationConfig(
    max_length=config_manager.get_config(["model_params", "max_length"]),           
    max_new_tokens=config_manager.get_config(["model_params", "max_new_tokens"]),  
    temperature=config_manager.get_config(["model_params", "temperature"]),
    repetition_penalty=config_manager.get_config(["model_params", "repetition_penalty"]),
    no_repeat_ngram_size=config_manager.get_config(["model_params", "no_repeat_ngram_size"]),
    early_stopping=bool(config_manager.get_config(["model_params", "early_stopping"])),
    do_sample=bool(config_manager.get_config(["model_params", "do_sample"])),
    num_beams=config_manager.get_config(["model_params", "num_beams"])
)

# Text-generation pipeline that loads the model from the local directory
# written by the "Save model" cell and reuses the tokenizer object created in
# the "Save tokenizer" cell.
pipeline = transformers.pipeline(
    "text-generation",
    model=config_manager.get_config("llm_model_directory"),
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    generation_config=generation_config,
)

In [8]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be passed as `llm` to the RetrievalQA chain built later.
llm = HuggingFacePipeline(pipeline=pipeline, verbose=True)

## Load embedding model

In [9]:
# Embedding model named in the config. It is used both to embed queries sent
# to the Pinecone index and as the embedding function of the vector store.
embeddings = HuggingFaceEmbeddings(
    model_name=config_manager.get_config("embedding_model_name")
)

# Process RAG documents

## Load documents

In [10]:
# Loader for the files in the configured data directory that match "*.pdf";
# each matched file is parsed with PyPDFLoader and progress is displayed.
loader = DirectoryLoader(
    config_manager.get_config("data_directory"),
    glob="*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True
)

In [11]:
# Read the PDFs into documents. The recorded run took about two minutes for
# two files.
extracted_data = loader.load()

100%|██████████| 2/2 [01:58<00:00, 59.48s/it]


## Split into chunks

In [12]:
# Splitter whose chunk size and overlap come from the "text_split" section of
# the config.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config_manager.get_config(["text_split", "chunk_size"]), 
    chunk_overlap=config_manager.get_config(["text_split", "chunk_overlap"])
)

# Break the loaded documents into chunks.
data_chunks = text_splitter.split_documents(extracted_data)

In [13]:
# Keep only the text of each chunk; from here on the chunks are plain strings.
content_data_chunks = [chunk.page_content for chunk in data_chunks]

In [14]:
def clean_and_format(strings):
    """Normalize whitespace and separators in every string of ``strings``.

    Each run of whitespace (spaces, tabs, newlines) collapses to one space and
    surrounding whitespace is dropped. Bullet characters ('•') become '-' and
    semicolons become ','. A new list is returned in the original order.
    """
    def _normalize(raw_text):
        # split() with no argument breaks on any whitespace run and drops empties.
        collapsed = ' '.join(raw_text.split())
        # Standardize separators (bullet points and semicolons).
        return collapsed.replace('•', '-').replace(';', ',')

    return [_normalize(raw_text) for raw_text in strings]

In [15]:
# Whitespace- and separator-normalized chunk texts (still plain strings).
formatted_data_chunks = clean_and_format(content_data_chunks)

# Load documents into VectorDB (Pinecone)

## Set up Pinecone connection

In [16]:
# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable (os.getenv returns None when it is unset).
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
)

In [17]:
# Handle to the index named in the config.
# NOTE(review): presumably the index must already exist — confirm.
index = pc.Index(config_manager.get_config("pinecone_index_name"))

## Validate duplicated vectors

In [18]:
# Probe the index with an arbitrary query ("test"). An empty match list is
# taken to mean the index holds no vectors yet.
empty_db = (
    index.query(vector=embeddings.embed_query("test"), top_k=1).get("matches")
    == []
)

In [19]:
if not empty_db:
    def check_duplication(chunk):
        """Return ``chunk`` when the index has no near-identical vector, else ``None``.

        ``chunk`` is one of the cleaned text strings. It is embedded and the
        single closest stored vector is fetched; a score of 0.99 or above is
        treated as "already stored".
        NOTE(review): the 0.99 threshold assumes a similarity-style score where
        higher means closer — confirm against the index metric.
        """
        matches = index.query(vector=embeddings.embed_query(chunk), top_k=1).get("matches")
        # No match at all means nothing similar is stored, so the chunk is new
        # (indexing matches[0] unconditionally would raise IndexError here).
        duplicated = bool(matches) and matches[0].get("score") >= 0.99
        # The chunks are plain strings (page_content was extracted earlier), so
        # return the string itself; `chunk.page_content` would raise
        # AttributeError for every non-duplicated chunk.
        return chunk if not duplicated else None

    deduplicated_data_chunks = list()

    # Each check is an independent network round-trip, so run them on threads.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(check_duplication, chunk) for chunk in formatted_data_chunks]

        for future in tqdm(as_completed(futures), total=len(formatted_data_chunks), desc="Validating duplicated data chunks"):
            result = future.result()
            # None marks a duplicate; empty strings are skipped as well.
            if result:
                deduplicated_data_chunks.append(result)
else:
    # Nothing is stored yet, so every chunk is new.
    deduplicated_data_chunks = formatted_data_chunks

Validating duplicated data chunks: 100%|██████████| 15684/15684 [02:10<00:00, 120.41it/s]


In [20]:
# Report how many chunks passed the duplicate check and will be uploaded.
print(f"{len(deduplicated_data_chunks)} not duplicated vectors will be added.")

0 not duplicated vectors will be added.


## Upload Documents

In [21]:
# LangChain vector-store wrapper over the Pinecone index, using the same
# embedding model as the duplicate check.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [22]:
def generate_id(content):
    """Return the hexadecimal MD5 digest of ``content``.

    ``content`` is encoded with ``str.encode()``'s default encoding before
    hashing, so identical text always produces the same identifier.
    """
    digest = hashlib.md5()
    digest.update(content.encode())
    return digest.hexdigest()

In [23]:
# One ID per chunk, derived from the chunk text itself (MD5 hex digest), so
# identical text always maps to the same ID.
data_chunks_ids = [generate_id(chunk) for chunk in deduplicated_data_chunks]

In [24]:
# Embed and upload the new chunks under their content-derived IDs; the return
# value is bound to `_` so the cell shows no output.
_ = vector_store.add_texts(texts=deduplicated_data_chunks, ids=data_chunks_ids)

# Test question

In [25]:
# Prompt text for the QA chain. `{question}` and `{context}` are placeholders
# filled in through the PromptTemplate built from this string. The trailing
# "___" / "Answer:" marker is what the answer-extraction cell splits on, so
# keep it unchanged.
prompt = """
You are a medical expert, and you have to answer the questions provided by patients below.
In addition to the question, you have access to the context which is additional information which could be helpful to answer the question.
Take into account:
- If you don't have enough information to answer them please answer: I don't have enough information to answer that question.
- The answer should be clear, complete taking into account the context and short.
- The answer needs to be easy to understand and well structured.
- The maximum amount of characters is 1500
- Only return the helpful answer below and nothing else.

Question: 
{question}

Context: 
{context}

___
Answer:
"""

In [26]:
# Template over the prompt string; "context" and "question" are the two
# placeholders that get filled in.
prompt_template = PromptTemplate(
    template=prompt, 
    input_variables=["context", "question"]
)

# Extra keyword arguments handed to the chain constructor so it uses the
# custom prompt.
chain_type_kwargs={"prompt": prompt_template}

# Retrieval Question Answering

In [27]:
# Retrieval QA chain: documents come from the Pinecone vector store and the
# answer is generated by the local LLM with the custom prompt.
# NOTE(review): search_kwargs={"k": 3} presumably limits retrieval to the three
# closest chunks, and "stuff" presumably inserts them all into one prompt —
# confirm against the LangChain documentation.
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

## Test

In [None]:
# Ask the chain a sample question. The original cell text was corrupted (it was
# not valid Python and never defined `response`); this call is reconstructed
# from the following cell, which reads `response.get("result")`, and from the
# question echoed in the recorded output.
response = qa.invoke({"query": "Can you explain to me what arthritis is?"})

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [29]:
# Generated text returned by the chain (None if the "result" key is absent).
result = response.get("result")

In [30]:
# Show the raw text. In the recorded run it begins with the filled-in prompt,
# i.e. the pipeline output includes the prompt as well as the answer.
print(result)


You are a medical expert, and you have to answer the questions provided by patience below.
In addition to the question, you have access to the context which is additional information which could be helpful to answer the question.
Take into account:
- If you don't have enough information to answer them please answer: I don't have enough information to answer that question.
- The answer should be clear, complete taking into account the context and short.
- The answer needs to be easy to understand and well structured.
- The maximum amount of characters is 1500
- Only return the helpful answer below and nothing else.

Question: 
Can you explain to me what arthritis is?

Context: 
rheumatism A popular term for any disorder that causes pain and stiffness in muscles and joints. The term is used to refer both to minor aches and twinges as well as to disorders such as rheumatoid arthritis , osteoarthritis, and polymyalgia rheumatica. rheumatoid arthritis A type of arthritis (joint inflammatio

In [31]:
# Token count of the full returned text (prompt plus answer in the recorded
# run), measured with the model's tokenizer.
print(f"Response token amount: {len(tokenizer.encode(result))}.")

Response token amount: 738.


In [32]:
# Keep only what follows the last "__\nAnswer:" marker, i.e. the text generated
# after the prompt. When the marker is absent the whole text is kept.
answer = result.rpartition("__\nAnswer:")[-1]
print(answer)


Arthritis is an inflammatory disease of the synovial joints. It is caused by an autoimmune response against the cartilage and/or bone of the joint. It can affect any joint in the body, but is most commonly seen in the hands, feet, spine, and knees. There are two main types: Osteoarthri- tis (OA) is the most common form of arthritis. It occurs when the protective cartilage that covers the ends of the bones wears away. This causes the bones to rub against each other, causing pain, stiffness, and loss of function. RA is an autoimmune disease that causes the body’s immune system to attack its own tissues. This leads to inflammation and damage to the joints and surrounding tissues. Symptoms of arthritis include pain, tenderness, and swelling in the affected joints. Other symptoms may include fatigue, loss of appetite, weight loss, and fever. Treatment for arthritis depends on the type and severity of the disease. For OA, non-steroidal anti-inflammatory drugs (NSAIDs) such as ibuprofen or n

In [None]:
# Token count of the extracted answer only, measured with the model's tokenizer.
print(f"Answer token amount: {len(tokenizer.encode(answer))}.")

Answer token amount: 281.


: 