In [None]:
import os
# Move the working directory up one level so the relative paths used later
# in the notebook (e.g. 'Data/') resolve from the parent directory.
# NOTE(review): not idempotent — re-running this cell moves up another level.
os.chdir('../')

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain.vectorstores import Pinecone as PineconeVectorStore
from pinecone import Pinecone,ServerlessSpec
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.llms import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA



In [None]:
# Load data
def load_pdf(path):
    """Load the PDF files found in a directory.

    Args:
        path: Directory handed to ``DirectoryLoader``. Files matching
            ``"*.pdf"`` are read with ``PyPDFLoader``.

    Returns:
        Whatever the loader's ``load()`` call returns for that directory.
    """
    pdf_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs under 'Data/' (resolved relative to the current working directory).
extracted_data = load_pdf('Data/')

In [None]:
# Split text
def split_text(data, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping text chunks.

    Args:
        data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this function previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this function previously hard-coded.

    Returns:
        The chunks produced by ``split_documents`` for ``data``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [None]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = split_text(extracted_data)
print(len(text_chunks))

In [None]:
# Download the embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Build the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        'sentence-transformers/all-MiniLM-L6-v2' model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

In [None]:
# Instantiate the embedding model once; the same object is passed to the
# vector store for both indexing and retrieval further down.
embeddings = download_hugging_face_embeddings()

In [None]:
# Sanity check: embed a sample string and print the vector length.
# The Pinecone index is created with dimension=384 later in the notebook,
# so the printed length is expected to match that value.
query_results = embeddings.embed_query("Hello world")
print(len(query_results))

In [None]:
from dotenv import load_dotenv

# load_dotenv() runs first so that values from a .env file, if one is
# present, are visible to the os.getenv lookups below.
# os.getenv returns None for an unset variable, so a missing key is not
# reported here.
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

In [None]:
# Create the database in Pinecone

from pinecone import Pinecone,ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Pinecone rejects create_index for a name that already exists, so only
# create the index when it is absent. This keeps the cell safe to re-run
# (e.g. under Restart Kernel -> Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`
        # (all-MiniLM-L6-v2 yields 384-dimensional vectors).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )






In [None]:
# Embed every chunk in `text_chunks` with `embeddings` and store the
# vectors in the Pinecone index named by `index_name`.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again — confirm whether that leaves duplicate vectors in the index.

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Connect to the existing index. `index_name` and `pc` are re-declared
# here with the same values as in the creation cell — presumably so the
# notebook can be resumed from this point without re-creating the index
# (confirm). `index` is not referenced again in the cells visible here.

index_name = "medicalbot"

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)

In [None]:
# Wrap the existing Pinecone index as a LangChain vector store, then expose
# it as a similarity-search retriever configured with k=3 (three documents
# per query).
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})


In [None]:
# Smoke-test the retriever on its own and print the documents it returns.
# NOTE(review): "Ance" looks like a typo for "Acne" — confirm the intended query.
retrieved_documents = retriever.invoke("what is Ance?")
print(retrieved_documents)

In [None]:
# LLM accessed through the Hugging Face Hub wrapper, authenticated with the
# HUGGINGFACE_API_KEY value read from the environment earlier.
# Generation settings (temperature, max_length) are passed via model_kwargs.
llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    task="text-generation",
    model_kwargs={"temperature": 0.3, "max_length": 200},
    huggingfacehub_api_token=HUGGINGFACE_API_KEY,
)

In [None]:
# Create a QA chain. The "stuff" chain type places all supplied documents
# into a single prompt for the LLM.
qa_chain = load_qa_chain(llm=llm, chain_type="stuff" )

# Create Retrieval-QA pipeline: `retriever` supplies the documents and
# `qa_chain` combines them into an answer.
qa = RetrievalQA(combine_documents_chain=qa_chain, retriever=retriever)


# Question text for the pipeline.
# NOTE(review): "Ance" looks like a typo for "Acne" — confirm.
query = "what is Ance"


def extract_answer(result):
    """Return the text that follows the first "Answer:" marker.

    Args:
        result: Raw text to search.

    Returns:
        The stripped text between the first "Answer:" marker and the next
        one, or up to the end of the string when there is no second marker.
        When ``result`` contains no marker at all, the whole string is
        returned stripped.
    """
    marker = "Answer:"
    _, found, tail = result.partition(marker)
    if not found:
        return result.strip()
    # Cut at the next marker, if any; otherwise `tail` is kept whole.
    first_answer, _, _ = tail.partition(marker)
    return first_answer.strip()


# Send the question through the Retrieval-QA pipeline. RetrievalQA takes
# its input under the "query" key and returns a dict whose generated text
# is stored under "result". Previously `result` was never assigned, so
# this cell raised NameError on a fresh kernel.
result = qa.invoke({"query": query})["result"]

# Keep only the text following the first "Answer:" marker in the output.
answer = extract_answer(result)
print(answer)