In [None]:
# NOTE(review): hardcoded absolute, machine-specific path; the notebook only
# runs as-is on a machine where this directory exists. The relative paths used
# later (e.g. 'Data/') are resolved against this working directory.
%cd /Users/i.g/Documents/Medecin-AI-Chatbot

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAI

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.chains import create_retrieval_chain

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# Load environment variables via python-dotenv (by default from a .env file).
load_dotenv()

# API keys are read from the environment; os.getenv returns None when a
# variable is unset, so a missing key is not reported at this point.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible cells;
# presumably the OpenAI client picks the key up from the environment itself.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# LLM used by the question-answering chain further down.
# NOTE(review): temperature=0.9 looks high for a medical assistant that is told
# to answer only from the retrieved context; confirm this is intended.
MeinLLM = OpenAI(temperature=0.9, max_tokens=500)

In [None]:
def loadPdfDoc(data):
    """Load the PDF files found under the directory ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the files matching
        the glob pattern ``'*.pdf'``, each read with ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader).load()

In [None]:
# Load the PDFs from the 'Data/' directory (relative to the working directory).
extractedData = loadPdfDoc('Data/')

In [None]:
# Split the files into smaller chunks:

def textSplit(extractedData):
    """Split the loaded documents into smaller chunks.

    Args:
        extractedData: Documents to split (as returned by ``loadPdfDoc``).

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` with
        ``chunk_size=1000`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    return splitter.split_documents(extractedData)

In [None]:
# Chunk the loaded documents and report how many chunks were produced.
textChunks = textSplit(extractedData)
print("Length of text Chunks", len(textChunks))

In [None]:
# Download an embedding model from HuggingFace:
def downloadHuggingFace_Embeddings():
    """Create the HuggingFace embedding model used for the document chunks.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the model
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Instantiate the embedding model once; it is reused by the vector-store cells below.
embeddings = downloadHuggingFace_Embeddings()

In [None]:
# Create the Pinecone index that stores the chunk embeddings:

pc = Pinecone(api_key=PINECONE_API_KEY)

# Only create the index when it does not exist yet. An unconditional
# create_index call fails on every run after the first one, which breaks
# "Restart Kernel -> Run All".
if "medicin-chat-bot" not in pc.list_indexes().names():
    pc.create_index(
        name="medicin-chat-bot",  # author's note: uppercase letters are not allowed in Pinecone index names
        dimension=384,  # must equal the output dimension of the HuggingFace embedding model; adjust if the model changes
        metric="cosine",  # other metrics are available, e.g. euclidean
        spec=ServerlessSpec(
            # author's note: these are the only specs available for free
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# Embed the chunks: they become vectors, which need a vector store (database) --> PineconeVectorStore
# NOTE(review): this call runs on every execution of the cell; presumably each
# run uploads all chunks again, which would leave duplicate vectors in the
# index. Confirm, and skip this cell once the index is populated.

docSearch = PineconeVectorStore.from_documents(
    documents=textChunks,
    index_name="medicin-chat-bot",
    embedding=embeddings
)

In [None]:
# Load the existing Pinecone index:
# NOTE(review): this rebinds docSearch, replacing the object assigned by the
# from_documents cell when both cells are run.

docSearch = PineconeVectorStore.from_existing_index(
    index_name="medicin-chat-bot",
    embedding=embeddings
)

In [None]:
# Retriever over the vector store using similarity search; "k": 3 is passed as
# a search kwarg (presumably the number of chunks returned per query).
retriever = docSearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

# The OpenAI LLM is used below to produce sensible, correct and targeted answers:

In [None]:
# System message for the chat prompt. Python concatenates the adjacent string
# literals without inserting anything between them, so every sentence carries
# its own trailing space (previously the text read "field.Use" and "know.Keep").
# "{context}" is a template placeholder, presumably filled with the retrieved
# documents by the documents chain.
System_Prompt = (
    "You are a medical assistant for question-answering tasks related to the medical field. "
    "Use only the following pieces of retrieved context to answer the questions you get asked "
    "and nothing else. If you don't know the answer, say that you don't know. "
    "Keep the answers concise."
    "\n\n"
    "{context}"
)

# Chat prompt made of two messages: the system instructions and the user's
# question, which is substituted for the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", System_Prompt), ("human", "{input}")]
)

In [None]:
# Chain that passes the retrieved documents and the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=MeinLLM, prompt=prompt)
# Full RAG chain: retrieve documents first, then run the question-answering chain on them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Run the RAG chain on a sample question; the question is passed under the
# "input" key and the generated text is read from the "answer" key of the result.
Answer = rag_chain.invoke({"input":"my fever is high, i am coughing the hole time, my nose is closed and i keep spitting mucus, what do i have and what medecine should i take"}) 
print(Answer["answer"])