### Import libraries


In [26]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain.chains import RetrievalQA

from langchain.vectorstores import Pinecone as PineconeLang
from pinecone import Pinecone as PineconeClient
from pinecone import ServerlessSpec

import os
import sys

import warnings
warnings.filterwarnings('ignore')

### Extract data from the PDF


In [2]:
%pwd

'c:\\Users\\44787\\Desktop\\Medical-Chatbot-LLM\\notebooks'

In [3]:
import os 

os.chdir("../")
%pwd

'c:\\Users\\44787\\Desktop\\Medical-Chatbot-LLM'

In [4]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)

    documents = loader.load()

    return documents

In [5]:
extracted_data = load_pdf("data")

In [6]:
extracted_data[1]

Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1})

### Create text chunks


In [7]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [8]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


### Download embedding model


In [9]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2")

    return embeddings

In [10]:
embeddings = download_hugging_face_embeddings()
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [12]:
# query_result

### Pinecone section


In [13]:
from dotenv import load_dotenv

load_dotenv()

True

In [14]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
CLOUD = os.environ.get('CLOUD')
REGION = os.environ.get('REGION')

#### create the index


In [16]:
pc = PineconeClient(api_key=PINECONE_API_KEY)
spec = ServerlessSpec(cloud=CLOUD,
                      region=REGION)

In [17]:
index_name = 'mchatbot'

In [18]:
# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,  # dimensionality of text-embedding-ada-002
        metric='cosine',
        spec=spec
    )
# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()
     

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

#### storing in pinecone

In [19]:
#Creating Embeddings for Each of The Text Chunks & storing
vectorstore=PineconeLang.from_texts([t.page_content for t in text_chunks],
                                    embeddings,
                                    index_name=index_name)

In [20]:
print(vectorstore)

<langchain_community.vectorstores.pinecone.Pinecone object at 0x00000250009A3E50>


In [21]:
#If we already have an index we can load it like this
vectorstore=PineconeLang.from_existing_index(index_name, embeddings)

query = "What are Allergies"
docs=vectorstore.similarity_search(query, k=3)

print("Result", docs)

Result [Document(page_content="GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histamine.HistaminePollen grains\nLymphocyte\nFIRST EXPOSURE"), Document(page_content='allergens are the following:\n• plant pollens\n• animal fur and dander\n• body parts from house mites (microscopic creatures\nfound in all houses)\n• house dust• mold spores• cigarette smoke• solvents• cleaners\nCommon food allergens include the following:\n• nuts, especially peanuts, walnuts, and brazil nuts\n• fish, mollusks, and shellfish• eggs• wheat• milk• food additives and preservatives\nThe following types of drugs commonly ca

### Prompt template creation

In [22]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [23]:
PROMPT = PromptTemplate(template=prompt_template,
                        input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

### creating LLM Model

In [24]:
MODEL_PATH = 'src/model/llama-2-7b-chat.ggmlv3.q4_0.bin'

llm = CTransformers(model=MODEL_PATH,
                    model_type="llama",
                    config={'max_new_tokens': 512,
                            'temperature': 0.8})

In [25]:
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=vectorstore.as_retriever(
                                     search_kwargs={'k': 2}),
                                 return_source_documents=True,
                                 chain_type_kwargs=chain_type_kwargs)

In [27]:
while True:
  user_input = input(f"Input Prompt: ")
  if user_input == 'exit':
    print('Exiting')
    sys.exit()
  if user_input == '':
    continue
  result = qa({'query': user_input})
  print(f"Answer: {result['result']}")

Answer: Acne is a skin disease that occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. It can affect various parts of the body including the face, chest, and back.
Exiting


SystemExit: 