In [3]:
# Print the kernel's current working directory before changing it.
!pwd

/home/dephinate/ASU/DL/MedicalChatBot/research


In [4]:
import os
from pathlib import Path

In [5]:
# Step up to the parent directory only while the kernel is still inside
# `research/`, so re-running this cell does not keep climbing the tree.
if Path.cwd().name == "research":
    os.chdir("../")

In [6]:
# Confirm the working directory after the chdir.
!pwd

/home/dephinate/ASU/DL/MedicalChatBot


In [7]:
# !pip install python-box
# !pip install ensure
# !pip install -e .

In [8]:
# Confirm the key is configured WITHOUT echoing the secret into the saved
# notebook output (a displayed key is committed along with the notebook).
os.getenv('PINECONE_API_KEY') is not None

True

Check configurations

In [9]:
from medicalChatBot.config.configurations import ConfigurationManager

In [10]:
# Build the project's configuration manager; the log line in this cell's
# output reports that it created the `artifacts` directory.
configurationManager = ConfigurationManager()

[2024-04-12 18:39:15,491,INFO,common,created directory at: artifacts]


In [11]:
# Fetch one config object per pipeline stage (loader, splitter, vectorizer, model).
dataloader_config, datasplitter_config, vectorization_config, model_config = (
    configurationManager.get_dataloader_config(),
    configurationManager.get_datasplitter_config(),
    configurationManager.get_vectorization_config(),
    configurationManager.get_model_config(),
)

In [12]:
# Echo every stage's configuration, one per line, so the values used in this
# run are visible in the output.
print(
    dataloader_config,
    datasplitter_config,
    vectorization_config,
    model_config,
    sep="\n",
)

DataLoaderConfig(data_path='artifacts/data', file_types='*.pdf')
SplitterConfig(chunk_size=500, chunk_overlap=50)
VectorizationConfig(encoder_platform='HuggingFace', encoder_name='sentence-transformers/all-MiniLM-L6-v2', model_name='sentence-transformers/all-MiniLM-L6-v2', index_name='medical-chatbot', namespace='medicalChatBot', num_of_documnets=3)
ModelConfig(implementation='LlamaCpp', model_path='artifacts/model/llama-2-7b-chat.Q5_K_M.gguf', model_type='llama', n_gpu_layers=32, n_batch=512, n_ctx=1024, f16_kv=True, temperature=0.8, max_new_tokens=512)


Data Loader

In [13]:
# Loader
import pypdf
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from medicalChatBot.entity  import DataLoaderConfig


class DataLoader:
    """Loads documents from the directory described by a ``DataLoaderConfig``."""

    def __init__(self, config: DataLoaderConfig) -> None:
        # `load_pdf` reads `data_path` and `file_types` from this config.
        self.config = config

    def load_pdf(self):
        """Load every file under ``config.data_path`` matching ``config.file_types``.

        Each matching file is handed to ``PyPDFLoader`` and a progress bar is
        shown while the directory is processed.

        Returns:
            The result of ``DirectoryLoader.load()`` (the loaded documents).
        """
        return DirectoryLoader(
            path=self.config.data_path,
            glob=self.config.file_types,
            loader_cls=PyPDFLoader,
            show_progress=True,
        ).load()

In [14]:
# Instantiate the loader with the data-loader configuration.
dataLoader = DataLoader(config=dataloader_config)

In [15]:
# Load the PDFs from the configured data directory (progress bar is shown in the output).
extracted_data = dataLoader.load_pdf()

100%|██████████| 1/1 [00:08<00:00,  8.49s/it]


Data Splitter

In [16]:
# Data Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from medicalChatBot.config.configurations import ConfigurationManager
from medicalChatBot.entity  import SplitterConfig

class Splitter:
    """Splits loaded documents into overlapping chunks using a ``SplitterConfig``."""

    def __init__(self, config: SplitterConfig) -> None:
        # `split_recursive` reads `chunk_size` and `chunk_overlap` from this config.
        self.config = config

    def split_recursive(self, extracted_data: list) -> list:
        """Split documents with ``RecursiveCharacterTextSplitter``.

        Args:
            extracted_data: The documents to split (for example the output of
                the PDF loader).

        Returns:
            The chunks produced by ``split_documents``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            # Separators from coarsest to finest: paragraph, line, sentence, clause.
            separators=['\n\n', '\n', '.', ','],
        )
        return splitter.split_documents(extracted_data)

In [17]:
# Instantiate the splitter with the chunk size / overlap from the configuration.
splitter = Splitter(config=datasplitter_config)

In [18]:
# Split the loaded documents into chunks for embedding.
chunks = splitter.split_recursive(extracted_data=extracted_data)

Vectorization

In [59]:

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from medicalChatBot.entity import VectorizationConfig
from medicalChatBot.utils.common import load_env



class Vectorizer:
    """Embedding and Pinecone helpers driven by a ``VectorizationConfig``.

    Methods that accept ``index_name``, ``namespace`` or ``model_name`` fall
    back to the matching field on ``self.config`` when the argument is omitted
    (or falsy).
    """

    def __init__(self,
                 config : VectorizationConfig = None) -> None:
        self.config = config

    def download_embeddings_from_huggingface(self,model_name:str = None):
        """Build a ``HuggingFaceEmbeddings`` object.

        Args:
            model_name: Model identifier; defaults to ``config.encoder_name``.

        Returns:
            The constructed ``HuggingFaceEmbeddings`` instance.
        """
        model_name = model_name or self.config.encoder_name
        return HuggingFaceEmbeddings(model_name=model_name)

    def create_pinecone_instance(self,env_file_path:str = None):
        """Create a ``Pinecone`` client from the ``PINECONE_API_KEY`` env var.

        Args:
            env_file_path: Forwarded to ``load_env`` before the variable is
                read (presumably a dotenv-style file; confirm in
                ``medicalChatBot.utils.common``).

        Returns:
            A ``Pinecone`` client authenticated with the key.

        Raises:
            ValueError: If ``PINECONE_API_KEY`` is unset or empty.
        """
        load_env(env_file_path=env_file_path)
        api_key = os.getenv('PINECONE_API_KEY')
        # Fail fast: string-formatting a missing key would hand the literal
        # text "None" to Pinecone and only surface later as an auth error.
        if not api_key:
            raise ValueError(
                "PINECONE_API_KEY is not set; add it to the environment or to "
                f"the env file ({env_file_path!r})."
            )
        return Pinecone(api_key=api_key)

    def check_pinecone_index_status(self,db_instance, index_name = None):
        """Return ``describe_index_stats()`` for the index.

        Args:
            db_instance: Pinecone client exposing ``Index(name)``.
            index_name: Index to inspect; defaults to ``config.index_name``.
        """
        index_name = index_name or self.config.index_name
        index = db_instance.Index(index_name)
        return index.describe_index_stats()

    def create_pinecone_vectorstore_instance(self,db_instance=None,namespace=None,index_name=None,embeddings=None):
        """Wrap an existing Pinecone index in a ``PineconeVectorStore``.

        Args:
            db_instance: Pinecone client exposing ``Index(name)``.
            namespace: Namespace to use; defaults to ``config.namespace``.
            index_name: Index to use; defaults to ``config.index_name``.
            embeddings: Embedding object passed to the store as ``embedding``.

        Returns:
            The constructed ``PineconeVectorStore``.
        """
        index_name = index_name or self.config.index_name
        namespace = namespace or self.config.namespace

        vectorstore = PineconeVectorStore(
            index=db_instance.Index(index_name),
            embedding=embeddings,
            namespace=namespace,
            index_name=index_name,
        )
        return vectorstore

    def clean_pinecone_db(self, db_instance,index_name:str=None,namespace:str=None):
        """Delete ALL vectors in one namespace of the index.

        Args:
            db_instance: Pinecone client exposing ``Index(name)``.
            index_name: Index to clean; defaults to ``config.index_name``.
            namespace: Namespace to wipe; defaults to ``config.namespace``.
        """
        index_name = index_name or self.config.index_name
        namespace = namespace or self.config.namespace
        # Destructive: `delete_all=True` removes every record in the namespace.
        db_instance.Index(index_name).delete(delete_all=True,namespace=namespace)

    def add_records_pinecone_db(self,vectorstore_instance, chunks):
        """Add the text of every chunk to the vector store.

        Args:
            vectorstore_instance: Store exposing ``add_texts(texts=...)``.
            chunks: Iterable of objects with a ``page_content`` attribute.
        """
        # Only `page_content` is sent; any other chunk attributes are not stored.
        vectorstore_instance.add_texts(texts=[t.page_content for t in chunks])


    

In [60]:
# Show the vectorization settings, then initialize the Vectorizer from them.
print(vectorization_config)
vectorizer = Vectorizer(config=vectorization_config)


VectorizationConfig(encoder_platform='HuggingFace', encoder_name='sentence-transformers/all-MiniLM-L6-v2', model_name='sentence-transformers/all-MiniLM-L6-v2', index_name='medical-chatbot', namespace='medicalChatBot', num_of_documnets=3)


In [None]:
# Build the embedding object from the configured encoder.
embeddings = vectorizer.download_embeddings_from_huggingface()
# Instantiate the Pinecone client (reads PINECONE_API_KEY after loading `.env`),
# then list the available indexes and fetch stats for the configured index.
pc = vectorizer.create_pinecone_instance(env_file_path=".env")
index_list = pc.list_indexes()
index_status = vectorizer.check_pinecone_index_status(db_instance=pc)

# Print all three so the connection can be checked at a glance.
print("embeddings :",embeddings)
print("\nindex_list:", index_list)
print("\nindex_status :",index_status)



In [49]:
# Create the vector store on top of the existing index and run a smoke-test query.
vector_store = vectorizer.create_pinecone_vectorstore_instance(db_instance=pc, embeddings=embeddings)

query = "What is ACne ?"

# Take the number of results from the vectorization config instead of a
# hard-coded literal, so the notebook stays in step with the project settings.
chunks_retrieved = vector_store.similarity_search(
    query,  # our search query
    k=vectorization_config.num_of_documnets,  # field name (sic) as declared on VectorizationConfig
)
chunks_retrieved

[Document(page_content='Nancy J. Nordenson\nAcid reflux seeHeartburn\nAcidosis seeRespiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is'),
 Document(page_content='The goal of treating moderate acne is to decrease\ninflammation and prevent new comedone formation. Oneeffective treatment is topical tretinoin along with a topical\nGALE ENCYCLOPEDIA OF MEDICINE 2 25Acne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(page_content='ent purposes. For example, lotions, soaps, gels, andcr

In [None]:
# Testing cleanup and check status
# WARNING: destructive. `clean_pinecone_db` deletes every vector in the
# configured namespace (delete_all=True); re-run the db-loading cell afterwards.
vectorizer.clean_pinecone_db(db_instance=pc)

# Fetch the index stats again to confirm the cleanup.
index_status = vectorizer.check_pinecone_index_status(db_instance=pc)
index_status

In [66]:
# Testing db loading: send the text of every chunk to the vector store via `add_texts`.
vectorizer.add_records_pinecone_db(vectorstore_instance=vector_store,chunks=chunks)

In [67]:
# Test status: fetch the index stats again to confirm the records were stored.
index_status = vectorizer.check_pinecone_index_status(db_instance=pc)
index_status

{'dimension': 384,
 'index_fullness': 0.07093,
 'namespaces': {'medicalChatBot': {'vector_count': 7093}},
 'total_vector_count': 7093}

Load Model

In [69]:
from langchain.llms import CTransformers
from langchain_community.llms import LlamaCpp
from medicalChatBot.entity import ModelConfig

class LoadModel:
    """Builds a LangChain LLM from a ``ModelConfig``.

    Every loader argument left as ``None`` falls back to the matching field on
    ``self.config``. Explicit falsy values such as ``temperature=0``,
    ``n_gpu_layers=0`` or ``f16_kv=False`` are honoured rather than being
    replaced by the configured value.
    """

    def __init__(self,
                 config:ModelConfig = None) -> None:
        self.config = config

    def _default(self, value, field):
        """Return ``value`` unless it is ``None``, else ``self.config.<field>``.

        The config is only touched when a fallback is actually needed, so a
        loader can be called with every argument explicit and no config.
        """
        return getattr(self.config, field) if value is None else value

    def load_model_from_ctransformers(self, model_path = None, model_type=None,max_new_tokens=None,temperature=None):
        """Load the model through ``CTransformers``.

        Args:
            model_path: Path to the model file; defaults to ``config.model_path``.
            model_type: Model family; defaults to ``config.model_type``.
            max_new_tokens: Generation cap; defaults to ``config.max_new_tokens``.
            temperature: Sampling temperature; defaults to ``config.temperature``.

        Returns:
            The constructed ``CTransformers`` LLM.
        """
        model_path = self._default(model_path, 'model_path')
        model_type = self._default(model_type, 'model_type')
        max_new_tokens = self._default(max_new_tokens, 'max_new_tokens')
        temperature = self._default(temperature, 'temperature')

        return CTransformers(
            model=model_path,
            model_type=model_type,
            config={'max_new_tokens': max_new_tokens,
                    'temperature': temperature},
        )

    def load_model_from_llamacpp(self,model_path:str=None, n_gpu_layers:int=None, n_batch:int=None, n_ctx:int=None, f16_kv:bool=None, temperature:float=None):
        """Load the model through ``LlamaCpp``.

        Args:
            model_path: Path to the model file; defaults to ``config.model_path``.
            n_gpu_layers: Defaults to ``config.n_gpu_layers``.
            n_batch: Defaults to ``config.n_batch``.
            n_ctx: Defaults to ``config.n_ctx``.
            f16_kv: Defaults to ``config.f16_kv``.
            temperature: Sampling temperature; defaults to ``config.temperature``.

        Returns:
            The constructed ``LlamaCpp`` LLM.
        """
        return LlamaCpp(
            model_path=self._default(model_path, 'model_path'),
            n_gpu_layers=self._default(n_gpu_layers, 'n_gpu_layers'),
            n_batch=self._default(n_batch, 'n_batch'),
            n_ctx=self._default(n_ctx, 'n_ctx'),
            f16_kv=self._default(f16_kv, 'f16_kv'),
            temperature=self._default(temperature, 'temperature'),
        )

    def load_model(self):
        """Load the model with the backend named by ``config.implementation``.

        The match is case-insensitive: a value containing ``ctransformers``
        selects CTransformers, a value containing ``llama`` (which includes
        ``llamacpp``) selects llama.cpp.

        Returns:
            The LLM built by the selected loader.

        Raises:
            ValueError: If ``config.implementation`` names no supported backend.
        """
        # `.lower()` must be called; testing membership against the bound
        # method object raises TypeError.
        implementation = self.config.implementation.lower()
        if 'ctransformers' in implementation:
            return self.load_model_from_ctransformers()
        if 'llama' in implementation:
            return self.load_model_from_llamacpp()
        raise ValueError(
            f"Unsupported model implementation: {self.config.implementation!r} "
            "(expected 'CTransformers' or 'LlamaCpp')."
        )