In [2]:
!pwd

/home/dephinate/ASU/DL/MedicalChatBot/research


In [3]:
import os
from pathlib import Path

In [4]:
# The notebook lives in <repo>/research; move up to the repo root so relative
# paths such as "artifacts/..." resolve. The guard makes the cell idempotent:
# re-running it no longer climbs one more directory each time.
if Path.cwd().name == "research":
    os.chdir("../")

In [5]:
!pwd

/home/dephinate/ASU/DL/MedicalChatBot


In [6]:
# One-time environment setup: uncomment and run only if these packages are missing.
# !pip install python-box
# !pip install ensure
# !pip install -e .

Check configurations

In [7]:
from medicalChatBot.config.configurations import ConfigurationManager

In [8]:
configurationManager = ConfigurationManager()

[2024-04-11 19:24:42,057,INFO,common,created directory at: artifacts]


In [9]:
# Pull one config object per pipeline stage from the shared manager.
dataloader_config = configurationManager.get_dataloader_config()
datasplitter_config = configurationManager.get_datasplitter_config()
vectorization_config = configurationManager.get_vectorization_config()
model_config = configurationManager.get_model_config()

In [10]:
# Show every resolved config, one per line.
print(
    dataloader_config,
    datasplitter_config,
    vectorization_config,
    model_config,
    sep="\n",
)

DataLoaderConfig(data_path='artifacts/data', file_types='*.pdf')
SplitterConfig(chunk_size=500, chunk_overlap=50)
VectorizationConfig(encoder_platform='HuggingFace', encoder_name='sentence-transformers/all-MiniLM-L6-v2', model_name='sentence-transformers/all-MiniLM-L6-v2', index_name='medical-chatbot', namespace='medicalChatBot', num_of_documnets=3)
ModelConfig(implementation='LlamaCpp', model_path='artifacts/model/llama-2-7b-chat.Q5_K_M.gguf', model_type='llama', n_gpu_layers=32, n_batch=512, n_ctx=1024, f16_kv=True, temperature=0.8, max_new_tokens=512)


Data Loader

In [11]:
# Loader
import pypdf
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from medicalChatBot.entity  import DataLoaderConfig


class DataLoader:
    """Loads documents from the directory described by a ``DataLoaderConfig``."""

    def __init__(self, config: DataLoaderConfig) -> None:
        # ``load_pdf`` reads ``data_path`` and ``file_types`` from this object.
        self.config = config

    def load_pdf(self):
        """Load every file under ``config.data_path`` matching ``config.file_types``.

        Files are discovered with ``DirectoryLoader`` and each one is parsed
        with ``PyPDFLoader``; a progress bar is shown while loading.

        Returns:
            The value returned by ``DirectoryLoader.load()``.
        """
        loader_kwargs = dict(
            path=self.config.data_path,
            glob=self.config.file_types,
            loader_cls=PyPDFLoader,
            show_progress=True,
        )
        return DirectoryLoader(**loader_kwargs).load()

In [12]:
dataLoader = DataLoader(config=dataloader_config)

In [13]:
extracted_data = dataLoader.load_pdf()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:13<00:00, 13.20s/it]


Data Splitter

In [21]:
# Data Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from medicalChatBot.config.configurations import ConfigurationManager
from medicalChatBot.entity  import SplitterConfig

class Splitter:
    """Splits loaded documents into overlapping chunks using a ``SplitterConfig``."""

    def __init__(self,
                 config:SplitterConfig) -> None:
        self.config = config

    def split_recursive(self, extracted_data: list):
        """Split documents recursively on paragraph, line, sentence and clause marks.

        Args:
            extracted_data: The documents to split (e.g. the output of
                ``DataLoader.load_pdf``). Previously annotated as ``None``,
                which wrongly described the argument as always being ``None``.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``,
            built with ``config.chunk_size`` and ``config.chunk_overlap``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            # NOTE(review): unlike the library default, this list has no " " / ""
            # fallback, so text containing none of these separators may stay
            # longer than chunk_size; confirm this is intended.
            separators=['\n\n', '\n', '.', ','],
        )
        return splitter.split_documents(extracted_data)

In [22]:
splitter = Splitter(config=datasplitter_config)

In [23]:
chunks = splitter.split_recursive(extracted_data=extracted_data)

Vectorization

In [30]:

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from medicalChatBot.entity import VectorizationConfig
from medicalChatBot.utils.common import load_env



class Vectorizer:
    """Helpers for embedding text and storing it in a Pinecone index.

    Optional arguments fall back to the matching ``VectorizationConfig`` field
    (``model_name``, ``index_name``, ``namespace``) when they are omitted.
    """

    def __init__(self,
                 config : VectorizationConfig) -> None:
        self.config = config

    def download_embeddings_from_huggingface(self,model_name = None):
        """Build a ``HuggingFaceEmbeddings`` object.

        Args:
            model_name: Model identifier; defaults to ``config.model_name``.

        Returns:
            The constructed ``HuggingFaceEmbeddings`` instance.
        """
        model_name = model_name or self.config.model_name
        return HuggingFaceEmbeddings(model_name=model_name)

    def create_pinecone_instance(self,env_file_path = None):
        """Create a ``Pinecone`` client.

        The value returned by ``load_env(env_file_path=...)`` is passed as the
        client's ``api_key`` (presumably the key read from an env file; see
        ``medicalChatBot.utils.common.load_env``).

        Args:
            env_file_path: Forwarded unchanged to ``load_env``.

        Returns:
            The ``Pinecone`` client.
        """
        return Pinecone(
            api_key = load_env(env_file_path=env_file_path),
        )

    def check_pinecone_index_status(self,db_instance, index_name = None):
        """Fetch statistics for a Pinecone index.

        Args:
            db_instance: Object exposing ``Index(name)`` (e.g. the client from
                ``create_pinecone_instance``).
            index_name: Index to inspect; defaults to ``config.index_name``.

        Returns:
            The result of ``describe_index_stats()``. The original version
            fetched the stats but discarded them and returned ``None``.
        """
        index_name = index_name or self.config.index_name
        index = db_instance.Index(index_name)
        return index.describe_index_stats()

    def create_pinecone_vectorstore_instance(self,index=None,namespace=None,index_name=None,embeddings=None):
        """Create a ``PineconeVectorStore``.

        Args:
            index: Optional index object, passed through as ``index``.
            namespace: Namespace to use; defaults to ``config.namespace``.
            index_name: Index name; defaults to ``config.index_name``.
            embeddings: Embedding model used by the vector store.

        Returns:
            The constructed ``PineconeVectorStore``.
        """
        # Fix: the fallback used to read ``index`` instead of ``index_name``,
        # so an explicit index_name was ignored and an index object could be
        # passed along as the name.
        index_name = index_name or self.config.index_name
        namespace = namespace or self.config.namespace

        return PineconeVectorStore(
            index=index,
            # Fix: the constructor's keyword is ``embedding`` (singular);
            # ``embeddings=`` raised TypeError.
            embedding=embeddings,
            namespace=namespace,
            index_name=index_name,
        )

    def clean_pinecone_db(self, db_instance,namespace):
        """Delete every record in ``namespace``.

        Args:
            db_instance: Object exposing ``delete(delete_all=..., namespace=...)``
                (presumably a Pinecone index object; confirm with the caller).
            namespace: Namespace whose records are removed.
        """
        db_instance.delete(delete_all=True,namespace=namespace)

    def add_records_pinecone_db(self,vectorstore_instance, chunks):
        """Add the ``page_content`` of each chunk to the vector store.

        Only the text is stored; chunk metadata is not forwarded.

        Args:
            vectorstore_instance: Object exposing ``add_texts(texts=...)``.
            chunks: Iterable of objects with a ``page_content`` attribute.

        Returns:
            Whatever ``add_texts`` returns (previously discarded).
        """
        return vectorstore_instance.add_texts(texts=[t.page_content for t in chunks])


    

In [29]:
# Scratch check: `a or b` evaluates to its first truthy operand, so the
# `value or fallback` pattern yields `value` whenever it is non-empty.
model_name = "abc"
model_name_b = "pqrs"
model_name or model_name_b

'abc'