<h1>Using Langchain</h1>

In [6]:
!pip install langchain_huggingface langchain_core transformers torch pypdf langchain_community faiss-cpu

You should consider upgrading via the '/Users/alphonsusteow/Desktop/Alph/DrugWise/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [7]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import LanceDB
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [8]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [9]:
import os
from langchain.document_loaders import DirectoryLoader, PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Directory containing PDF and CSV files
data_directory = "corpus"

# Initialize a list to store all documents
documents = []

# Process PDF files
print("Loading PDF documents...")
pdf_directory = os.path.join(data_directory, "pdf")
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_directory, filename)
        loader = PyPDFLoader(file_path)  # Load PDF with metadata (filename, page number)
        documents.extend(loader.load())

# Process CSV files
print("Loading CSV documents...")
csv_directory = os.path.join(data_directory, "csv")
for filename in os.listdir(csv_directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(csv_directory, filename)
        loader = CSVLoader(file_path)  # Load CSV with metadata (filename, row number)
        documents.extend(loader.load())

# Split documents into chunks
print("Splitting documents into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50,
    length_function=len,
)
processed_docs = text_splitter.split_documents(documents)

print(f"Processed {len(processed_docs)} document chunks.")

Loading PDF documents...
Done with Drug-DrugInteractionsDatabases-Webservers.pdf
Done with ClinicalPharminDrugDev.pdf
Done with The EffectofCytochromeP450MetabolismonDrugResponseinteractionsandAdverse Effects.pdf
Done with polypharmacy_medicines_optismisation_2013.pdf
Loading CSV documents...
Done with ddinter_downloads_code_R.csv
Done with ddinter_downloads_code_D.csv
Done with ddinter_downloads_code_P.csv
Done with ddinter_downloads_code_B.csv
Done with ddinter_downloads_code_V.csv
Done with ddinter_downloads_code_A.csv
Done with ddinter_downloads_code_L.csv
Done with ddinter_downloads_code_H.csv
Splitting documents into chunks...
Processed 223151 document chunks.


In [None]:
from langchain.vectorstores.chroma import Chroma
import shutil
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

CHROMA_PATH = 'chroma'
def save_to_chroma(chunks: list[Document]):
    if os.path.exsists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_document(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )

    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [10]:
# Initialize LanceDB
from langchain_community.vectorstores import FAISS
vector_store = FAISS.from_documents(processed_docs, embeddings)


In [11]:
# Create retriever
# Top-k is the number of chunks to retrieve
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 2})

retriever.invoke("What is the dataset prepared to aid in drug-drug interactions?")

[Document(id='25672d2e-b486-4b5c-874a-ef3b1e2e2a47', metadata={'producer': 'Acrobat Distiller 23.0 (Windows)', 'creator': 'LaTeX with hyperref package', 'creationdate': '2024-01-06T10:39:39+05:30', 'author': '', 'keywords': '', 'moddate': '2024-01-06T10:39:39+05:30', 'pdfversion': '1.5', 'subject': 'AcademicSubjects/SCI01060, DOI: 10.1093/bib/bbad445, Briefings in Bioinformatics, 25(1), 00, 27 11 2023. Abstract: In clinical treatment, two or more drugs (i.e. drug combination) are simultaneously or successively used for therapy with the purpose of primarily enhancing the therapeutic efficacy or reducing drug side effects. However, inappropriate drug combination may not only fail to improve efficacy, but even lead to adverse reactions. Therefore, according to the basic principle of improving the efficacy and/or reducing adverse reactions, we should study drug–drug interactions (DDIs) comprehensively and thoroughly so as to reasonably use drug combination. In this review, we first introdu

In [12]:
model_id = "Qwen/Qwen2.5-0.5B"
#model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"


print("\nInitializing model and tokenizer...")
print(
        "This might take a few minutes on first run as the model needs to be downloaded."
    )

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto") ##trust_remote_code=True, cache_dir="model_cache", Cache the model for future use


print("\nCreating pipeline...")
# Create pipeline
pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.1,
        do_sample=True,
    )

print("Model initialization complete!")

# Create LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)


Initializing model and tokenizer...
This might take a few minutes on first run as the model needs to be downloaded.


Device set to use mps



Creating pipeline...
Model initialization complete!


In [13]:
print("Creating RAG chain...")
# Initialize LLM (OpenAI)
# Create prompt template
template = """Use the following pieces of context to answer the question. If you don't know the answer, just say that you don't know.

Context: {context}
Question: {question}

Answer:"""
prompt = PromptTemplate.from_template(template)
# Format documents function
def format_docs(docs):
    return "\n\n".join(
        f"{doc.page_content}\n(Source: {doc.metadata['source']}, Page: {doc.metadata['page']})"
        for doc in docs
    )
# Create the RAG chain
# RunnablePassthrough() is used to pass the question through the chain unchanged. It means that the question firstly pass through the retriever
# then format the context. And than the question would be passed with the retrieved context to the prompt
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

Creating RAG chain...


In [14]:
question = 'Isoflurane is a drug i am currently using. What are the drugs it interacts with and the level of interactions?'
response = rag_chain.invoke(question)

KeyError: 'page'

In [None]:
print(response)

Use the following pieces of context to answer the question. If you don't know the answer, just say that you don't know.

Context: drug entries including 2740 approved small molecule drugs, 1577
approved biologics, 134 nutraceuticals and over 6717 experimen-
tal drugs.
Drug–drug interaction
In clinical treatment, to cure or ameliorate symptoms of diseases
or medical condition, two or more kinds of drugs are usually
needed [8]. The fundamental reason of drug combination lies
in the evidence that combination therapy tends to have higher
cure rates than monotherapy [9]. For instance, the combination
therapy of doxorubicin, cyclophosphamide, vincristine and pred-
nisone is commonly implemented in cancer chemotherapy regi-
(Source: Corpus/pdf/Drug-DrugInteractionsDatabases-Webservers.pdf, Page: 0)

Drug–drug interaction prediction | 3
Table 1: The function and URL of databases as well as web servers
Databases or
web servers
Function URL
DrugBank Recording 15 451 drugs and providing more than

In [None]:
rag_chain_withoutpromptoutput = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm.bind(skip_prompt=True)
)

In [None]:
question = 'Isoflurane is a drug i am currently using. What are the drugs it interacts with and the level of interactions?'
response = rag_chain_withoutpromptoutput.invoke(question)
print(response)

 based on the calculation principle of the models, we simply divided the models into three categories: tradi-

tional machine learning-based models, deep learning-based models and score function-based models.

The performance of the baseline model for detecting drug-drug interactions can be evaluated using various metrics such as precision, recall, F1-score, and ROC-AUC. In this case, the performance of the baseline model can be assessed using the Precision (P) and Recall (R) scores. 

Precision measures the ability of a positive test to detect a true positive, while Recall measures the ability of a negative test to correctly identify all false negatives. A high Precision indicates that the model is able to accurately predict drug-drug interactions, while a high Recall indicates that the model is not only identifying potential interactions but also correctly classifying them as non-interactions.

In this case, the Precision (P) value would be calculated as follows:

P = TP / (TP + FP)


<h1>File processing done using FAISS</h1>

In [None]:
file_path = "csv/ddinter_downloads_code_A.csv"
data = pd.read_csv(file_path)

data.head(5)


Unnamed: 0,DDInterID_A,Drug_A,DDInterID_B,Drug_B,Level
0,DDInter1263,Naltrexone,DDInter1,Abacavir,Moderate
1,DDInter1,Abacavir,DDInter1348,Orlistat,Moderate
2,DDInter58,Aluminum hydroxide,DDInter582,Dolutegravir,Major
3,DDInter112,Aprepitant,DDInter582,Dolutegravir,Minor
4,DDInter138,Attapulgite,DDInter582,Dolutegravir,Major


In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(file_path)
docs = loader.load_and_split()

In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI,OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
index = faiss.IndexFlatL2(len(OpenAIEmbeddings().embed_query(" ")))
vector_store = FAISS(
    embedding_function=OpenAIEmbeddings(),
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [None]:
vector_store.add_documents(documents=docs)

['ac5cdb7a-a0a8-4a5d-8645-e52630b19c67',
 'eed20877-f452-404c-8eb2-09413386ff3d',
 'e12cc993-6d32-46bb-adc5-fecd756f53fc',
 '8d61be8e-9444-4d23-ab34-d716f8771b1d',
 '8c53771f-3cb5-463b-9cf2-3d247adbc0a8',
 'fa260bdc-21a8-48ac-a3d0-d381a638befe',
 '1a423854-4b94-4e88-b464-491fd5452117',
 '133b20d3-a24a-4439-b169-1ba322e66a63',
 '2d9c8942-baf0-4eae-8879-f5f48c5f0564',
 '7b3717d2-a296-4fe5-be29-f4df3e5d5c36',
 '3a08b2a0-1546-4cf9-9229-69e32be371fa',
 'aa8f4c96-944a-4cd2-b46c-44374ad815a5',
 'd36dfcd7-eaae-44f1-8d92-acba33891e0b',
 '83ef4771-839f-4e1f-b78e-09b43692348e',
 '8e9828fc-cb73-40f4-b778-8ac06e1fe885',
 '2f455a07-7aea-447f-80f0-cbdbf993830d',
 '31c0ec56-6fd8-4598-a0f2-53b38c4c452e',
 'ba16ed0c-0ed3-4f0a-adfa-b88e281e9978',
 '40f3bddc-f42d-4f97-88dc-70250f8a6edc',
 'd84874ef-8eb0-44b7-8962-9340cbe0d7b3',
 '0f01bec0-2957-433a-98aa-54762b64d135',
 '70327642-eba7-48ef-a516-42f6eb42de2b',
 '798cbe52-6c62-4f1f-ae89-b1224992adbf',
 'c9407c6b-3a9c-4116-924d-b53832a5567c',
 'f9eb3bd2-27a9-

In [None]:
print(vector_store.index.ntotal)

56367


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

retriever = vector_store.as_retriever()

# Set up system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),

])

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
answer= rag_chain.invoke({"input": "What are all the drugs Asparaginase Escherichia coli interact with? What is the level of interaction or each of the listed drugs? Provide the response in the following JSON format: Level: <Level of Interaction>, Drug_B: <Interacting Drug>, DDInterID_B: <Interaction ID>"})

print(answer['answer'])

{
    "Level": "Moderate",
    "Drug_B": "Empagliflozin",
    "DDInterID_B": "DDInter636"
}
