In [None]:
import os
# Move the working directory one level up (presumably to the project root — confirm),
# so that relative paths used later, such as 'Data/', resolve from there.
# NOTE(review): not idempotent — each re-run of this cell climbs one more directory level.
os.chdir("../")
# Show the resulting working directory.
%pwd

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF documents found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``. Files matching
            ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Load every PDF matched in the relative 'Data/' directory (resolved against the current working directory).
extracted_data=load_pdf_file(data='Data/')

In [None]:
# Preview only the first few loaded documents; displaying the whole list
# floods the notebook output and bloats the saved file.
extracted_data[:3]

In [6]:
# Recursive Character Text Splitter - splits large documents into smaller text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split, passed to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20,
            the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [None]:
# Split the loaded documents into chunks and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

In [None]:
# Preview only the first few chunks; displaying all of them floods the
# notebook output and bloats the saved file.
text_chunks[:3]

In [9]:

from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
# Download the embeddings model from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the ``HuggingFaceEmbeddings`` object used throughout the notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded.
            The Pinecone index in this notebook is created with
            ``dimension=384``, so a different model must produce vectors
            of that length (or the index dimension must change with it).

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [None]:
# Instantiate the embeddings model once; later cells reuse this object for the test query and the vector store.
embeddings = download_hugging_face_embeddings()

In [None]:
# Test case: embed a short query to check that the embeddings model works.
# The printed length should match the dimension the Pinecone index is created with (384 in the create_index cell).

query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))
query_result

In [None]:
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file — confirm) so that
# the API keys read from os.environ in the next cell are available.
load_dotenv()

In [14]:
import os

# Read the API keys and fail early with a clear message when one is missing.
# The previous version wrote os.environ.get(...) straight back into os.environ,
# which raises an opaque "str expected, not NoneType" TypeError for an unset key.
_missing_keys = [
    key for key in ("PINECONE_API_KEY", "GROQ_API_KEY") if not os.environ.get(key)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# Both values are already present in os.environ, so no write-back is needed.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

In [15]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "meditrain"

# Create the index only when it does not exist yet, so that re-running this
# cell (or the whole notebook) does not fail once the index has been created.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length (the "Length" printed by the
        # embedding test cell earlier in this notebook).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [16]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import Pinecone

# Build the vector store from the text chunks, using `embeddings` and the index named by `index_name`.
docsearch = Pinecone.from_documents(documents=text_chunks, embedding=embeddings, index_name=index_name)

In [17]:
# Load the existing Pinecone index

from langchain_pinecone import Pinecone
# Connect to the index that already exists instead of embedding and upserting the chunks again.
docsearch = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [None]:
# Display the vector store object as a quick check that it was created.
docsearch

In [19]:
# Similarity-search retriever over the vector store; "k" is set to 5
# (presumably the number of chunks returned per query — confirm).
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [None]:
# Smoke-test the retriever with a sample question and display what comes back.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

In [21]:
from langchain_groq import ChatGroq
model_name = "llama3-8b-8192"

# Groq-hosted chat model that serves as the LLM for the chain.
# NOTE(review): confirm this model id is still offered by Groq before relying on it.
llm = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model_name=model_name,
    temperature=0.4,
    max_tokens=500,
)


In [36]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Define System Prompt
# Adjacent string literals are concatenated verbatim, so every sentence must
# carry its own terminating punctuation and trailing space; without them the
# instructions fuse together (e.g. "necessaryUse the following ...").
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use a professional tone to answer and use medical terms with their meaning when necessary. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, give relevant information and explanation about the question. "
    "Keep the answer concise with points and sub-points. "
    "Provide the answer in the format of: explain the answer, mention the related disease and its symptoms "
    "with the cure and precaution steps, and conclude on a positive note. "
    "Give precise answers and sound relevant; if any answer doesn't have the above mentioned points, "
    "leave them out and give a relevant and significant answer. "
    "Sound encouraging and positive with a human touch. "
    "Conclude the answers strictly within 100-150 words."
    "\n\n"
    # Placeholder that the prompt template fills with the retrieved context.
    "{context}"
)

# Define Prompt Template
# The system message carries the instructions (including the {context} placeholder);
# the human message carries the user's question via {input}.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])


In [37]:

# Create the Chain
# First build the question-answer chain from the LLM and the prompt, then
# combine it with the retriever to form the retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [None]:
# Ask a sample question through the RAG chain and print the "answer" field of the response.
response = rag_chain.invoke({"input": "what is the reason for my back pain"})
print(response["answer"])