In [1]:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAI


In [2]:
from Bio import Entrez
import time
import pandas as pd


In [3]:
# Contact address attached to the Entrez requests made later in this notebook.
# NOTE(review): this is a personal address committed in the notebook; consider
# loading it from an environment variable or a local config file instead.
Entrez.email = "mohd.fauzan78692@gmail.com"  


In [4]:
# PubMed search term passed to Entrez.esearch below.
# NOTE(review): the name `query` is reused later for the questions asked of the
# vector store and the QA chain, so its value depends on which cell ran last.
query = "COPD treatment"  
# Upper bound on the number of PubMed IDs requested (sent as `retmax`).
max_results = 20


In [5]:
# Search PubMed for `query`, requesting at most `max_results` IDs.
handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when parsing the response fails.
    handle.close()

# "IdList" holds the PMIDs that were returned. Because the request is limited
# by `retmax`, the number printed below is the number of IDs retrieved.
pmids = record["IdList"]
print(f"Found {len(pmids)} articles.")


Found 20 articles.


In [6]:
# Download the text record for every PMID, one request per ID.
# The value stored under "Abstract" is the whole text record returned by
# efetch (the preview below starts with the journal citation), not only the
# abstract paragraph.
abstracts = []

for pmid in pmids:
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
    try:
        abstract = handle.read()
    finally:
        # Close the handle even if reading the response raises.
        handle.close()
    abstracts.append({"PMID": pmid, "Abstract": abstract})
    time.sleep(0.5)  # pause between requests to keep the request rate low

df = pd.DataFrame(abstracts)
df.head()


Unnamed: 0,PMID,Abstract
0,40779701,1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
1,40779049,1. Ann Hematol. 2025 Aug 8. doi: 10.1007/s0027...
2,40778913,1. Ter Arkh. 2025 Jul 31;97(7):538-544. doi: 1...
3,40778820,1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
4,40778628,1. Ther Adv Respir Dis. 2025 Jan-Dec;19:175346...


In [7]:
# Write the fetched records to a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("copd_pubmed_abstracts.csv", index=False)
print("Saved abstracts to 'copd_pubmed_abstracts.csv'")


Saved abstracts to 'copd_pubmed_abstracts.csv'


In [8]:
import pandas as pd

# Reload the saved records from disk; this rebinds `df`, replacing the
# DataFrame built from the live Entrez responses.
df = pd.read_csv("copd_pubmed_abstracts.csv")
print("Total abstracts:", len(df))
# Preview the first rows.
df.head()


Total abstracts: 20


Unnamed: 0,PMID,Abstract
0,40779701,1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
1,40779049,1. Ann Hematol. 2025 Aug 8. doi: 10.1007/s0027...
2,40778913,1. Ter Arkh. 2025 Jul 31;97(7):538-544. doi: 1...
3,40778820,1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
4,40778628,1. Ther Adv Respir Dis. 2025 Jan-Dec;19:175346...


In [9]:
import re

def clean_text(text):
    """Normalise one PubMed text record for chunking.

    Every square-bracketed span (for example ``[1]`` or
    ``[Article in Russian]``) is removed, then each run of whitespace,
    including newlines and tabs, is collapsed to a single space, and leading
    and trailing whitespace is stripped.

    Brackets are removed *before* whitespace is collapsed so that the gap left
    behind by a removed span does not survive as a double space.

    Args:
        text: The raw record. Non-string values (such as the NaN that pandas
            uses for an empty CSV cell) are treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\[[^\]]*\]', '', text)  # Drop any [...] span
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs to one space
    return text.strip()

# Run clean_text over every record and keep the result in a 'cleaned' column.
df['cleaned'] = df['Abstract'].map(clean_text)
df['cleaned'].head()


0    1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
1    1. Ann Hematol. 2025 Aug 8. doi: 10.1007/s0027...
2    1. Ter Arkh. 2025 Jul 31;97(7):538-544. doi: 1...
3    1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
4    1. Ther Adv Respir Dis. 2025 Jan-Dec;19:175346...
Name: cleaned, dtype: object

In [10]:
def chunk_text(text, max_tokens=150):
    """Split ``text`` into consecutive, non-overlapping chunks of words.

    The text is split on whitespace, so ``max_tokens`` is a limit on
    whitespace-separated words per chunk rather than on model tokens.

    Args:
        text: The string to split.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        The last chunk may be shorter. Text with no words gives ``[]``.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1. A negative value
            would otherwise produce an empty list and silently drop the text.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")
    words = text.split()
    return [
        " ".join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

# Split every cleaned record into chunks and emit one row per chunk,
# tagged with the PMID of the record it was cut from.
all_chunks = [
    {"pmid": abstract_row["PMID"], "chunk": piece}
    for _, abstract_row in df.iterrows()
    for piece in chunk_text(abstract_row['cleaned'])
]

chunk_df = pd.DataFrame(all_chunks)
print("Total chunks created:", len(chunk_df))
chunk_df.head()


Total chunks created: 65


Unnamed: 0,pmid,chunk
0,40779701,1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
1,40779701,"settings, and concerns regarding sustainabilit..."
2,40779701,interest. A. Murphy has received funding for r...
3,40779049,1. Ann Hematol. 2025 Aug 8. doi: 10.1007/s0027...
4,40779049,"retrospective single-center study, we analyzed..."


In [11]:
# Write the chunk table (columns: pmid, chunk) to a CSV file in the current
# working directory, without the row index.
chunk_df.to_csv("copd_chunks.csv", index=False)
print("Saved chunked text to 'copd_chunks.csv'")


Saved chunked text to 'copd_chunks.csv'


In [12]:
import pandas as pd

# Reload the chunk table from disk into a new DataFrame.
df_chunks = pd.read_csv("copd_chunks.csv")
# Preview the first rows.
df_chunks.head()


Unnamed: 0,pmid,chunk
0,40779701,1. Chron Respir Dis. 2025 Jan-Dec;22:147997312...
1,40779701,"settings, and concerns regarding sustainabilit..."
2,40779701,interest. A. Murphy has received funding for r...
3,40779049,1. Ann Hematol. 2025 Aug 8. doi: 10.1007/s0027...
4,40779049,"retrospective single-center study, we analyzed..."


In [13]:
from langchain_core.documents import Document

# Wrap every chunk in a LangChain Document, keeping the PMID as metadata so
# each chunk stays linked to the record it came from.
docs = []
for _, chunk_row in df_chunks.iterrows():
    docs.append(
        Document(
            page_content=chunk_row['chunk'],
            metadata={"pmid": chunk_row["pmid"]},
        )
    )

print("Total documents prepared for embedding:", len(docs))


Total documents prepared for embedding: 65


In [14]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorise the chunks ("all-MiniLM-L6-v2").
# NOTE(review): the captured output shows this line emitting a warning;
# check it against the installed langchain version.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from langchain_community.vectorstores import Chroma

# Create or load local DB folder.
# Each document gets a deterministic ID built from its PMID and its position
# in `docs`. Without explicit IDs every run of this cell stores the same
# chunks again under fresh auto-generated IDs, so searches return duplicates.
# With stable IDs a re-run is expected to overwrite the existing entries
# instead (NOTE(review): confirm against the installed Chroma version).
# A ./chroma_db folder filled by earlier runs still contains the old
# auto-ID entries; delete the folder once to get rid of them.
doc_ids = [
    f"{doc.metadata['pmid']}-{position}"
    for position, doc in enumerate(docs)
]

db = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    ids=doc_ids,
    persist_directory="./chroma_db"
)

# Persist the vector DB
# NOTE(review): the captured output shows a warning on this call; it is kept
# so the notebook behaves the same on versions that still need it.
db.persist()
print("✅ Vector DB saved to ./chroma_db")


✅ Vector DB saved to ./chroma_db


  db.persist()


In [16]:
# Sanity-check retrieval: fetch the three chunks closest to the question and
# show the first 500 characters of each.
query = "What are the current treatments for COPD?"
results = db.similarity_search(query, k=3)

for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:500]
    print(f"\n🔹 Result {rank}:")
    print(preview)



🔹 Result 1:
1. Chronic Obstr Pulm Dis. 2025 Aug 6. doi: 10.15326/jcopdf.2024.0599. Online ahead of print. Diagnosing Type 2 Inflammation in COPD: Comparison of Blood and Sputum Eosinophil Assessment in the University of California Los Angeles COPD Phenotyping Study. LeMaster WB(1), Ingersoll SA(2), Phee H(2), Wen R(3), Bai J(4), Belperio JA(3), Buhr RG(3)(4)(5), Phillips JE(2), Palchevskiy V(3), Bina T(3), Tashkin DP(3), Cooper CB(3)(6), Barjaktarevic IZ(3). Author information: (1)Division of Allergy, Pulmo

🔹 Result 2:
1. Chronic Obstr Pulm Dis. 2025 Aug 6. doi: 10.15326/jcopdf.2024.0599. Online ahead of print. Diagnosing Type 2 Inflammation in COPD: Comparison of Blood and Sputum Eosinophil Assessment in the University of California Los Angeles COPD Phenotyping Study. LeMaster WB(1), Ingersoll SA(2), Phee H(2), Wen R(3), Bai J(4), Belperio JA(3), Buhr RG(3)(4)(5), Phillips JE(2), Palchevskiy V(3), Bina T(3), Tashkin DP(3), Cooper CB(3)(6), Barjaktarevic IZ(3). Author information: (1

In [17]:
import os
from getpass import getpass

# The API key must not be written into the notebook: anyone who can read the
# file can use it. The key that used to be hard-coded here should be treated
# as leaked and revoked.
# Use GOOGLE_API_KEY from the environment when it is already set; otherwise
# prompt for it without echoing and keep it only in this process.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")


In [18]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Rebinds `embedding_model` to a new instance of the same model name that was
# used when the store was built.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Open the Chroma store kept in ./chroma_db, using the embedding model above
# as its embedding function.
# NOTE(review): the captured output shows a warning on this constructor.
vectordb = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_model
)


  vectordb = Chroma(


In [19]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model with temperature 0.2.
# NOTE(review): `llm` is assigned again in a later cell with the same model
# name plus convert_system_message_to_human=True.
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.2)


In [20]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain: the retriever is asked for 4 chunks (k=4) from
# `vectordb`, and the chain is built with the `llm` passed in here.
# return_source_documents=True is what makes the response expose
# 'source_documents' alongside 'result' when the chain is invoked later.
# NOTE(review): the chain receives the `llm` object that exists when this cell
# runs; rebinding the name `llm` afterwards does not change this chain.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": 4}),
    return_source_documents=True
)


In [21]:
import os
from getpass import getpass

import google.generativeai as genai

# The API key must not be written into the notebook; the key that used to be
# hard-coded here should be treated as leaked and revoked.
# Take the key from GOOGLE_API_KEY, prompting without echo when it is unset,
# so the google.generativeai client uses the same key as the environment.
# NOTE(review): the LangChain chat model presumably takes its key from
# GOOGLE_API_KEY (or an explicit argument) rather than from this call —
# confirm against the langchain-google-genai documentation.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


In [22]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Recreate the chat model, this time with convert_system_message_to_human=True.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.2,
    convert_system_message_to_human=True
)

# Rebuild the QA chain so it uses the model created above. The existing
# `qa_chain` was constructed with the previous `llm` object, and rebinding the
# name `llm` alone would leave the chain calling the old model.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": 4}),
    return_source_documents=True
)


In [23]:
# Ask the QA chain one question and print the generated answer.
query = "What are the recent treatment methods for COPD?"
response = qa_chain.invoke(query)

print("🔍 Question:", query)
print("\n💡 Answer:\n", response['result'])

# Show source chunks (optional): first 300 characters of each retrieved chunk
print("\n📚 Source Docs:")
for source_doc in response['source_documents']:
    snippet = source_doc.page_content[:300]
    print("—", snippet, "\n")


ChatGoogleGenerativeAIError: Invalid argument provided to Gemini: 400 API Key not found. Please pass a valid API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API Key not found. Please pass a valid API key."
]