In [1]:
import os
os.chdir("../")

In [3]:
%pwd

'C:\\Users\\Administrator\\Desktop\\Internship\\Research\\NLP\\Version 2 MedicalBot\\Backend'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [6]:
#Extract Data From the PDF File
def load_pdf_file(data):
    loader= DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents


In [7]:
extracted_data=load_pdf_file(data='Data/')


In [11]:
#Spliat the Data into Text Chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [13]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))


Length of Text Chunks 135


In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

In [17]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings


In [19]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [20]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [21]:
from dotenv import load_dotenv
load_dotenv()

True

In [22]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

In [28]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "mentalhealth3"


pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws", 
        region="us-east-1"
    ) 
) 

In [27]:
index_name = "medicalbot"

In [29]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [30]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [34]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [36]:
retrieved_docs = retriever.invoke("What is Acne?")

In [38]:
retrieved_docs

[Document(id='479631e4-8b52-42b0-a7a5-4a436ee0c4aa', metadata={'page': 26.0, 'source': 'Data\\CampusCare Mental Health Chatbot Training Data.pdf'}, page_content='learnmore?"\n"Let\'sexploresomesimpleself-careactivitiesyoucantrytoday:\nTakingawalkaroundcampus'),
 Document(id='a6455636-f74a-4ab3-8098-69dd290d384f', metadata={'page': 34.0, 'source': 'Data\\CampusCare Mental Health Chatbot Training Data.pdf'}, page_content='issues,likeanxiety,depression,orgrief.Theyoftenhavealistontheir\nwebsite.'),
 Document(id='04e5d5ec-8e39-4e3e-b250-66e716daf781', metadata={'page': 7.0, 'source': 'Data\\Introduction to Machine Learning Notes.pdf'}, page_content='Linear models can also be applied to binary classification  problems. In this case, the line produced\nby the model separates the two classes: It defines where the decision changes from one class value\nto the other. Such a line is often referred to as the decision boundary. Figure 3.2 shows a decision\nboundary for the iris data that separates

In [40]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

In [42]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [44]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [46]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [48]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [58]:
response = rag_chain.invoke({"input": "What is the statistic of people living with mental health issues globally. "})
print(response["answer"])


Chatbot: According to the World Health Organization, approximately 1 in 4 people globally 
will experience a mental health issue at some point in their lives. This includes conditions 
such as anxiety, depression, and grief. However, it is important to note that these numbers 
may vary depending on the source and definition of mental health issues.


In [69]:
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])



I'm sorry, I am not able to answer that question as it is not related to the provided context about a complete blood count. Is there something else I can assist you with?
