In [46]:
print("Hello World")

Hello World


In [7]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from langchain_community.llms import CTransformers


In [8]:
load_dotenv()
# Set your Pinecone API key
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
index_name = "mbot"

In [6]:
# Initialize Pinecone connection
pc = Pinecone(api_key=PINECONE_API_KEY)

In [51]:
# Check if the index exists; if not, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Adjust based on the embedding model used
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [52]:
# Extract data from PDFs
def load_pdf(data_directory):
    loader = DirectoryLoader(data_directory, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

extracted_data = load_pdf("Data/")

In [53]:
# Split the documents into chunks for better embedding
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)
print("Length of chunks:", len(text_chunks))

Length of chunks: 7486


In [54]:
# Load Hugging Face Embeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [55]:
# Create a Pinecone vector store from the text embeddings
docsearch = LangchainPinecone.from_texts(
    [t.page_content for t in text_chunks],
    embeddings,
    index_name=index_name
)

In [56]:
query = "What is Cancer?"
docs = docsearch.similarity_search(query, k=3)

# Print the results
print("Result", docs)

Result [Document(metadata={}, page_content='Ellen S. Weber, MSN\nBreast cancer\nDefinition\nBreast cancer is caused by the development of\nmalignant cells in the breast. The malignant cells origi-nate in the lining of the milk glands or ducts of the breast(ductal epithelium), defining this malignancy as a cancer.Cancer cells are characterized by uncontrolled divisionleading to abnormal growth and the ability of these cellsto invade normal tissue locally or to spread throughoutthe body, in a process called metastasis.\nDescription'), Document(metadata={}, page_content='Ellen S. Weber, MSN\nBreast cancer\nDefinition\nBreast cancer is caused by the development of\nmalignant cells in the breast. The malignant cells origi-nate in the lining of the milk glands or ducts of the breast(ductal epithelium), defining this malignancy as a cancer.Cancer cells are characterized by uncontrolled divisionleading to abnormal growth and the ability of these cellsto invade normal tissue locally or to sprea

In [57]:
prompt_template="""
Use following piece of instruction to answer the  user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:  
"""

In [58]:
PROMPT=PromptTemplate(template=prompt_template, input_variable=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [59]:
llm=CTransformers(model="Model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [61]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),  # Corrected dictionary format
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)


In [None]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query":user_input})
    print("Response:", result["result"])

Response: Abortion is a medical procedure that ends a pregnancy. It is typically performed before the fetus can live independently, usually between 10-24 weeks of gestation. The purpose of an abortion is to end a pregnancy when there is a compelling reason, such as hardship, health concerns, or severe abnormalities in the developing fetus. Abortions are safest when performed within the first trimester of pregnancy.
Response: Actinomycosis is a type of bacterial infection that can affect various parts of the body, including the skin, lungs, and abdomen. It is caused by the bacterium Actinomyces israelii, which is typically found in soil and decaying organic matter. Symptoms of actinomycosis can include fever, fatigue, weight loss, and swelling in the affected area. Treatment typically involves antibiotics and surgical drainage of any abscesses or collections of pus.
Response: Autoimmune disorders are conditions in which a person's immune system attacks the body's own cells, causing tiss