In [2]:
!pwd

/home/dephinate/ASU/DL/MedicalChatBot/research


In [3]:
import os
from pathlib import Path

In [4]:
os.chdir("../")

In [5]:
!pwd

/home/dephinate/ASU/DL/MedicalChatBot


In [6]:
# !pip install python-box
# !pip install ensure
# !pip install -e .
# !pip install unstructured
# !pip install pdf2image
# !pip install pdfminer
# !pip uninstall pdfminer
# !pip install pillow_heif
# !pip install opencv-python
# !pip install unstructured-inference
# !pip install pytesseract
# !pip install pikepdf

In [6]:
os.getenv('PINECONE_API_KEY')

'ce80aa7c-c98e-467c-a100-b4b7e6a07c05'

Check configurations

In [7]:
from medicalChatBot.config.configurations import ConfigurationManager

In [8]:
configurationManager = ConfigurationManager()

[2024-04-16 18:54:05,828,INFO,common,created directory at: artifacts]


In [9]:
dataloader_config = configurationManager.get_dataloader_config()
datasplitter_config = configurationManager.get_datasplitter_config()
vectorization_config =  configurationManager.get_vectorization_config()
model_config = configurationManager.get_model_config()

In [10]:
print(dataloader_config)
print(datasplitter_config)
print(vectorization_config)
print(model_config)

DataLoaderConfig(data_path='artifacts/data', file_types='*.pdf')
SplitterConfig(chunk_size=500, chunk_overlap=50)
VectorizationConfig(encoder_platform='HuggingFace', encoder_name='sentence-transformers/all-MiniLM-L6-v2', model_name='sentence-transformers/all-MiniLM-L6-v2', index_name='medical-chatbot', namespace='medicalChatBot', num_of_documnets=3)
ModelConfig(implementation='LlamaCpp', model_path='artifacts/model/llama-2-7b-chat.Q5_K_M.gguf', model_type='llama', n_gpu_layers=32, n_batch=512, n_ctx=1024, f16_kv=True, temperature=0.8, max_new_tokens=512)


Data Loader

In [11]:
# Loader
import pypdf
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from medicalChatBot.entity  import DataLoaderConfig


class DataLoader:
    def __init__(self,
                 config : DataLoaderConfig)->None :
        self.config = config

    # Extract data from the pdf
    def load_pdf(self):
        loader = DirectoryLoader(   # To load all pdfs from a directory
            path=self.config.data_path,
            glob=self.config.file_types,
            loader_cls=PyPDFLoader,
            show_progress=True
        )
        documents = loader.load()
        return documents

In [12]:
dataLoader = DataLoader(config=dataloader_config)

In [13]:
extracted_data = dataLoader.load_pdf()

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:08<00:00,  9.00s/it]


Data Splitter

In [14]:
# Data Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from medicalChatBot.config.configurations import ConfigurationManager
from medicalChatBot.entity  import SplitterConfig

class Splitter:
    def __init__(self,
                 config:SplitterConfig) -> None:
        self.config = config

    # function to impement recursive text splitting 
    def split_recursive(self, extracted_data:None):
        splitter = RecursiveCharacterTextSplitter(chunk_size = self.config.chunk_size  , chunk_overlap = self.config.chunk_overlap, separators=['\n\n', '\n', '.', ','])
        chunks = splitter.split_documents(extracted_data)
        return chunks

In [15]:
splitter = Splitter(config=datasplitter_config)

In [16]:
chunks = splitter.split_recursive(extracted_data=extracted_data)

In [21]:
print(len(chunks))
chunks[0:10]


7093


[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'artifacts/data/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': 'artifacts/data/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and', metadata={'source': 'artifacts/data/T

Vectorization

In [105]:

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from medicalChatBot.entity import VectorizationConfig
from medicalChatBot.utils.common import load_env



class Vectorizer:
    def __init__(self,
                 config : VectorizationConfig = None) -> None:
        self.config = config

    def download_embeddings_from_huggingface(self,model_name:str = None):
        model_name = model_name or self.config.encoder_name
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        return embeddings
    
    def create_pinecone_instance(self,env_file_path:str = None):
        load_env(env_file_path=env_file_path)
        print("Key:",os.getenv('PINECONE_API_KEY'))
        pc = Pinecone(
            api_key = f"{os.getenv('PINECONE_API_KEY')}"
        )
        return pc
    
    def check_pinecone_index_status(self,db_instance, index_name = None):
        index_name = index_name or self.config.index_name
        index = db_instance.Index(index_name)
        return index.describe_index_stats()

    def create_pinecone_vectorstore_instance(self,db_instance=None,namespace=None,index_name=None,embeddings=None):
        index_name = index_name or self.config.index_name
        namespace = namespace or self.config.namespace
        
        vectorstore = PineconeVectorStore(
        index=db_instance.Index(index_name),
        embedding=embeddings,
        namespace=namespace,
        index_name=index_name
        )
        return vectorstore
    
    def clean_pinecone_db(self, db_instance,index_name:str=None,namespace:str=None):
        index_name = index_name or self.config.index_name
        namespace = namespace or self.config.namespace
        db_instance.Index(index_name).delete(delete_all=True,namespace=namespace)

    def add_records_pinecone_db(self,vectorstore_instance, chunks):
        vectorstore_instance.add_texts(texts=[t.page_content for t in chunks])


    

In [78]:
# initialize Vectorizer
print(vectorization_config)
vectorizer = Vectorizer(config=vectorization_config)


VectorizationConfig(encoder_platform='HuggingFace', encoder_name='sentence-transformers/all-MiniLM-L6-v2', model_name='sentence-transformers/all-MiniLM-L6-v2', index_name='medical-chatbot', namespace='medicalChatBot', num_of_documnets=3)


In [112]:
# embeddings
embeddings = vectorizer.download_embeddings_from_huggingface()
# Instantiate vectordb
pc = vectorizer.create_pinecone_instance(env_file_path=".env")
index_list = pc.list_indexes()
index_status = vectorizer.check_pinecone_index_status(db_instance=pc)

print("embeddings :",embeddings)
print("\nindex_list:", index_list)
print("\nindex_status :",index_status)



[2024-04-15 04:00:48,716,INFO,SentenceTransformer,Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2]
[2024-04-15 04:00:49,835,INFO,SentenceTransformer,Use pytorch device_name: cuda]
Key: ce80aa7c-c98e-467c-a100-b4b7e6a07c05
embeddings : client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False

index_list: {'indexes': [{'dimension': 384,
              'host': 'medical-chatbot-v5y6nq5.svc.gcp-starter.pinecone.io',
              'metric': '

In [113]:
# Create vectorstore and test
vector_store = vectorizer.create_pinecone_vectorstore_instance(db_instance= pc,embeddings=embeddings)


In [118]:

query = "Hairfall treatment ?"

chunks_retrieved = vector_store.similarity_search(
    query,  # our search query
    k=5  # return 3 most relevant docs
)
chunks_retrieved

[Document(page_content='Some chemotherapy drugs cause hair loss, but it is'),
 Document(page_content='tone and the scalp is cleaned.'),
 Document(page_content='lowing treatment:'),
 Document(page_content='Medical treatments include:'),
 Document(page_content='Many disorders are associated with celiac disease, though the nature of the connection is unclear. One type of epilepsy is linked to celiac disease. Once their celiac disease is successfully treated, a significant number of these patients have fewer or no seizures. Patients with alopecia areata, a condition where hair loss occurs in sharply defined areas, have been shown to have a higher risk of celiac disease than the general population. There appears to be a higher percentage of celiac disease among people with Down syndrome, but the link between the conditions is unknown.')]

In [107]:
# # Testing cleanup and check status
# vectorizer.clean_pinecone_db(db_instance=pc)

# index_status = vectorizer.check_pinecone_index_status(db_instance=pc)
# index_status

In [1]:
# # Testing db loading
# vectorizer.add_records_pinecone_db(vectorstore_instance=vector_store,chunks=chunks)

In [None]:
# Test status
index_status = vectorizer.check_pinecone_index_status(db_instance=pc)
index_status

Load Model

In [None]:
from langchain.llms import CTransformers
from langchain_community.llms import LlamaCpp
from medicalChatBot.entity import ModelConfig

class LoadModel:
    def __init__(self,
                 config:ModelConfig = None) -> None:
        self.config = config

    def load_model_from_ctransformers(self, model_path = None, model_type=None,max_new_tokens=None,n_ctx:int=None,temperature=None):
        model_path = model_path or self.config.model_path
        model_type = model_type or self.config.model_type
        max_new_tokens = max_new_tokens or self.config.max_new_tokens
        context_length = n_ctx or self.config.n_ctx

        temperature = temperature or self.config.temperature

        print(model_path)
        print(model_type)
        print(max_new_tokens)
        print(temperature)
        print(context_length)

        llm=CTransformers(model=model_path,
                        model_type=model_type,
                        config={'max_new_tokens':max_new_tokens,
                                'temperature':temperature,
                                'context_length': context_length})
        
        return llm
    
    def load_model_from_llamacpp(self,model_path:str=None, n_gpu_layers:int=None, n_batch:int=None, n_ctx:int=None, f16_kv:bool=None, temperature:int=None):
        model_path = model_path or self.config.model_path 
        n_gpu_layers = n_gpu_layers or self.config.n_gpu_layers 
        n_batch = n_batch or self.config.n_batch
        n_ctx = n_ctx or self.config.n_ctx
        f16_kv = f16_kv or self.config.f16_kv
        temperature = temperature or self.config.temperature
        
        print(model_path)
        print(n_gpu_layers)
        print(n_batch)
        print(n_ctx)
        print(f16_kv)
        print(temperature)
        
        lcpp_llm = None
        lcpp_llm = LlamaCpp(
            model_path=model_path,
            n_gpu_layers=n_gpu_layers,
            n_batch=n_batch,
            n_ctx=n_ctx,
            f16_kv=f16_kv, 
            temperature = temperature
            )
        return lcpp_llm
    
    def load_model(self):
        implementation_lower = self.config.implementation.lower()
        print(implementation_lower)
        if 'ctransformers' in implementation_lower:
            llm = self.load_model_from_ctransformers()
        elif 'llama' in implementation_lower or 'llamacpp' in implementation_lower:
            llm = self.load_model_from_llamacpp()
        return llm        

Prompting

In [None]:
from importlib import import_module
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from medicalChatBot.prompts import *


class PromptQueryHandler:

    def __init__(self) -> None:
        pass

    def load_prompt(self, template_name: str = None):
        # Dynamically import the module containing the prompt templates
        prompt_module = import_module("medicalChatBot.prompts")
        default_template_name = "default_template"

        # Check if template_name is None or if the template is not found
        if not template_name:
            template = getattr(prompt_module, default_template_name)
            print(f"No template name provided. Using default template: '{default_template_name}'.")
        else:
            try:
                # Get the template from the module based on the template name
                template = getattr(prompt_module, template_name)
            except AttributeError:
                # If template not found, use a default template``
                template = getattr(prompt_module, default_template_name)
                print(f"Template '{template_name}' not found. Using default template: '{default_template_name}'.")

        # Create a PromptTemplate object from the retrieved template
        prompt = PromptTemplate.from_template(template)
        return prompt
    
    def initialize_chain_with_retrievalqa(self,llm,vector_store,return_source_documents:bool,prompt=None,k:int=3):
        prompt = prompt or self.load_prompt()
        print("prompt: ",prompt)
        qa=RetrievalQA.from_chain_type(
            llm=llm, 
            chain_type="stuff", 
            retriever=vector_store.as_retriever(search_kwargs={'k': k}),
            return_source_documents=return_source_documents, 
            chain_type_kwargs={"prompt": prompt})
        return qa


Test Load model

In [None]:
model_config = configurationManager.get_model_config()
model_config

In [None]:
loadModel = LoadModel(config=model_config)

In [None]:
# Default loader
model_default = loadModel.load_model()

In [None]:
# CTransformers loader
model_ctran = loadModel.load_model_from_ctransformers()

In [None]:
model_ctran.invoke("How are you?")

In [None]:
# LammaCpp loader
model_llamacpp = loadModel.load_model_from_llamacpp()

In [None]:
model_llamacpp.invoke("How are you?")

Test Prompting

In [None]:
query = "How to treat hairfall?"

In [None]:
promptQueryHandler = PromptQueryHandler()

In [None]:
# Using CTransfprmers
qa_ct = promptQueryHandler.initialize_chain_with_retrievalqa(llm=model_ctran,vector_store=vector_store,return_source_documents=True)

In [None]:
response_ct = qa_ct.invoke({"query":query})

In [None]:
#Using LLamaCPP
qa_llama = promptQueryHandler.initialize_chain_with_retrievalqa(llm=model_default,vector_store=vector_store,return_source_documents=True)

In [None]:
response_llama = qa_llama.invoke({"query":"What is Caffiene?"})

In [None]:
response_llama['query']

In [None]:
response_llama['result']

In [None]:
response_llama['source_documents']

Speech to Text

In [None]:
# !pip install openai-whisper
# !pip install pytube

In [11]:
import tempfile
import whisper
from pytube import YouTube



In [13]:
!pwd

/home/dephinate/ASU/DL/MedicalChatBot


In [21]:
# Download model to a directory
whisper_model = whisper.load_model(name="medium",download_root="artifacts/model")

100%|█████████████████████████████████████| 1.42G/1.42G [07:13<00:00, 3.52MiB/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 3.80 GiB of which 10.12 MiB is free. Including non-PyTorch memory, this process has 3.77 GiB memory in use. Of the allocated memory 3.67 GiB is allocated by PyTorch, and 10.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [19]:
# Load model from a directory
whisper_model1 = whisper.load_model(name="artifacts/model/whisper_base.pt")

In [None]:
# whisper_model1

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-5): 6 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=512, out_features=512, bias=True)
          (key): Linear(in_features=512, out_features=512, bias=False)
          (value): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (attn_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (mlp_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((512,), eps=1e-05,

In [None]:
# Transcribe audio data
# Let's do this only if we haven't created the transcription file yet.
if not os.path.exists("artifacts/data/transcription.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()

    # Let's load the base model. This is not the most accurate
    # model but it's fast.
    with tempfile.TemporaryDirectory() as tmpdir:
        file = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(file, fp16=False)["text"].strip()

        with open("transcription.txt", "w") as file:
            file.write(transcription)