In [None]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate

In [None]:
# `os` is imported here because this cell runs before the later `import os`
# cell; without it a fresh kernel fails with NameError on `os.environ`.
import os

from dotenv import load_dotenv

# python-dotenv: load variables from a local .env file (if present) into the
# process environment so the lookups below can see them.
load_dotenv()

# Pinecone credentials come from the environment rather than being hard-coded.
# `.get` yields None when a variable is not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

In [None]:
# Extract data from pdf

def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and each one is read
            with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
import os

# Move the working directory one level up so the relative paths used by the
# other cells ("Dataset/", the model file) resolve.
# The target is resolved to an absolute path only once per kernel session:
# a plain `os.chdir("../")` climbs one more directory every time the cell is
# re-run, which silently breaks every relative path that follows.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../")
os.chdir(PROJECT_ROOT)

In [None]:
# Read the PDFs from the Dataset/ folder (relative to the current working directory).
extracted_data = load_pdf(data="Dataset/")

In [None]:
#extracted_data

In [None]:
# Create text chunks for extracted data

def chunk_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks ready for embedding.

    Args:
        extracted_data: Documents to split, in the form accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
    return splitter.split_documents(extracted_data)

In [None]:
# Chunk the loaded documents, then display how many chunks were produced.
text_chunks = chunk_split(extracted_data=extracted_data)
len(text_chunks)

The extracted data was split into 7020 text chunks. Next, we embed each chunk to convert the text into vectors.

In [None]:
# Download_embedding model

def Download_embedding_model():
    """Create the embedding wrapper used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )



In [None]:
# Build the embedding object once; the cells below reuse it for the sample
# query and for the Pinecone vector store.
embedding_model = Download_embedding_model()

In [None]:
# Display the embedding object's repr as a quick sanity check.
embedding_model

In [None]:
# Embed a sample query and show the length of the returned value
# (presumably the embedding dimensionality — confirm against the model card).
result = embedding_model.embed_query("chat bot")
len(result)

In [None]:
# Connect the Pinecone client with the credentials read from the environment.
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)
index_name = "medical-chatbot"

# Embed every chunk and store it in the index.
# `from_documents` sends each chunk's metadata along with its text; the former
# `from_texts([t.page_content ...])` call uploaded the text only, so documents
# retrieved from the index came back without their metadata.
# NOTE(review): each run of this cell uploads all chunks again — skip it when
# the index is already populated and attach to the existing index instead.
docsearch = Pinecone.from_documents(text_chunks,
                                    embedding=embedding_model,
                                    index_name=index_name)


In [None]:
# When the index already exists, attach to it instead of uploading again.
index_name = "medical-chatbot"
docsearch = Pinecone.from_existing_index(
    embedding=embedding_model,
    index_name=index_name,
)

In [None]:


# Retrieval smoke test: ask the vector store for the 3 closest matches to a
# sample query and print them.
query = "symptoms of cancer"
doc = docsearch.similarity_search(query, k=3)
print("Result", doc)

In [None]:
# Prompt text for the question-answering chain. `{context}` and `{question}`
# are the two placeholders declared as input variables when this string is
# wrapped in a PromptTemplate.
# NOTE(review): the wording contains a typo ("dont't"); it is left untouched
# here because the text is sent to the model verbatim.
template = """
Use the following pieces of information to answer the user's question.
if you dont't know the answer, just say i don't have knowledge about it, don't try to make up an answer

Context : {context}
Question: {question}

Only return the helpful answer below and nothing else.
Answer :
"""

In [None]:
# Wrap the raw template so its two placeholders can be filled in, and package
# it in the kwargs dict that is handed to the QA chain.
Prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
chain_type_kwargs = dict(prompt=Prompt)

In [None]:
# Load the Llama 2 chat model file from the local "model" directory through
# CTransformers.
# The path is written with "/" on purpose: the earlier backslash form contained
# the invalid escape sequence "\l" (a warning on current Python versions) and
# only resolved as a path separator on Windows.
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={'max_new_tokens': 512,
            'temperature': 0.8},
)

In [None]:
# Assemble the retrieval QA chain: the retriever supplies the top 2 matches
# from the vector store, the "stuff" chain type is used with the custom
# prompt, and the source documents are returned alongside the answer.
QnA = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    chain_type='stuff',
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
# Interactive question/answer loop.
# Enter "exit" or "quit" (or submit an empty line) to stop. End-of-input and a
# kernel interrupt at the prompt also end the loop cleanly, so the cell no
# longer runs forever with no way to finish.
while True:
    try:
        user_input = input("Input Prompt:")
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.strip().lower() in {"", "exit", "quit"}:
        break
    # The chain takes the question under the "query" key and returns a dict
    # whose "result" entry holds the generated answer.
    result = QnA({"query": user_input})
    print("Response:", result['result'])