In [34]:
# !pip install "langchain" 
# !pip install "langchain-ibm" 
# !pip install "langchain-community" 
# !pip install pydantic-core 
# !pip install "langchain-chroma"
# !pip install langchain-pymupdf4llm
# !pip install langchain_text_splitters

Name: langchain-text-splitters
Version: 1.0.0
Summary: LangChain text splitting utilities
Home-page: https://docs.langchain.com/
Author: 
Author-email: 
License: MIT
Location: /home/smiley/anaconda3/lib/python3.12/site-packages
Requires: langchain-core
Required-by: langchain-classic


In [12]:
from getpass import getpass
from ibm_watsonx_ai import Credentials
import os

key_path = "./PRIVATE/key.txt"
key = ""
project_path = "./PRIVATE/pID.txt"
project = ""
try:
    with open(key_path, 'r') as file:
        key = file.read().strip()
except FileNotFoundError:
        print(f"Error: The file '{key_path}' was not found.")
except Exception as e:
    print(f"An error occured: {e}")
try:
    with open(project_path, 'r') as file:
        project = file.read().strip()
except FileNotFoundError:
        print(f"Error: The file '{key_path}' was not found.")
except Exception as e:
    print(f"An error occured: {e}")

# just stop everything if the key or project id's not loaded here
if not key or not project:
    raise ValueError("Stopping execution: API Key or Project ID is missing/empty.")

os.environ["WATSONX_APIKEY"] = key

In [39]:
# load pdf 
loader = PyMuPDF4LLMLoader("")
raw_docs = loader.load()

# split text 
# we use a large chunk size (1000) to keep medical concepts together
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
splits = text_splitter.split_documents(raw_docs)

vectorstore = Chroma.from_documents(documents = splits, embedding = embeddings)
retriever = vectorstore.as_retriever(search_kwargs = {"k": 4})

# the citation formater
def format_docs_with_sources(docs):
    formatted_text = ""
    for doc in docs:
        # extract metadata
        soruce = doc.metadata.get("source", "Unknown File")
        page = doc.metadata.get("page", "Unkown Page")

        # modify the content to physically include source ID like [Source: medical_study.pdf, Page 2]
        content = doc.page_content.replace("\n", " ")
        formatted_text += f"Content: {content}\nSource: [{source}, Page {page}]\n\n"
    return formatted_text

ValueError: File path  is not a valid file or url

In [37]:
import os 
from langchain_ibm import WatsonxEmbeddings, ChatWatsonx
from langchain_chroma import Chroma
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

embeddings = WatsonxEmbeddings(
    model_id = "ibm/granite-embedding-278m-multilingual",
    url="https://us-south.ml.cloud.ibm.com",
    project_id=project,
    params={"decoding_method": "greedy"} 
)

parameters = {
    "temperature": 0.6,
    "max_tokens": 1000, #since medical answers and synthesis tend to be on the longer side
}

chat = ChatWatsonx(
    model_id = "ibm/granite-3-3-8b-instruct",
    url = "https://us-south.ml.cloud.ibm.com",
    project_id = project,
    params = parameters,
)

In [18]:
from langchain_core.prompts import ChatPromptTemplate

# how granite would react to your messages and such
system_instruct = """You are an expert research assistant. Your task is to answer the user's question by synthesizing information from provided documents.

Strict Rules:
1. Grounding: Base your answer exclusively on the information contained in the "Provided Documents" section. Do not use any external knowledge or make assumptions.
2. Citation: You must cite the source for every fact or claim in your answer. Use the format [DOC_ID] at the end of the sentence or paragraph that uses the information.
3. Synthesis: If the question requires information from multiple documents, you must synthesize these pieces of information into a single, coherent answer.
4. Unknown Information: If the documents do not contain enough information to answer the question, you must state: "The provided documents do not contain information to answer this question." Do not try to guess."""

# for the template we're gonna get the context (which would be the docs) and the {question} (the user input)
pompt = ChatPromptTemplate.from_template(system_instruct)

# this makes it so it has the context, docs with sources 
rag_chain = (
    {"context": retriever | format_docs_with_sources, "questions": RunnablePassthrough()}
    | prompt
    | chat
)


In [19]:
#this is a mock area for testing
doc1 = "[DOC_1] The Granite model is developed by IBM"
doc2 = "[DOC_2] IBM's headquarters are located in Armonk, New York"
doc3 = "[DOC_3] Python's a programming language released in 1991"

context_text = f"{doc1}\n{doc2}\n{doc3}"

question_text = "who developed Granite and where are they located?" #should use doc 1 and 2 (hopefully)

print(f"---INPUT---\nContext:\n{context_text}\n\nQuestion: {question_text}\n")

response = rag_chain.invoke({
    "context" : context_text,
    "question" : question_text
})

print(f"---OUTPUT---\nb{response.content}")


---INPUT---
Context:
[DOC_1] The Granite model is developed by IBM
[DOC_2] IBM's headquarters are located in Armonk, New York
[DOC_3] Python's a programming language released in 1991

Question: who developed Granite and where are they located?

---OUTPUT---
bThe Granite model was developed by IBM. The location of IBM's headquarters is Armonk, New York [DOC_1, DOC_2]. Unfortunately, there is no information provided about the specific location where the Granite model was developed within IBM.
