In [1]:
import os
from tqdm.auto import tqdm
from dotenv import load_dotenv
from pinecone import ServerlessSpec
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone_text.sparse import BM25Encoder
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_cohere import CohereRerank
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq
from pinecone import Pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [3]:
load_dotenv()

True

In [5]:
print("Loading and splitting documents...")
loader = PyPDFLoader("Carbon_adsorption.pdf")
documents = loader.load()

Loading and splitting documents...


In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
print(f"Loaded and split {len(docs)} chunks.")

Loaded and split 41 chunks.


In [8]:
# --- 2. Initialize Pinecone and Create Index ---
print("Initializing Pinecone...")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY not found in environment variables.")

index_name = "langchain-test-index"  # change if desired
pc = Pinecone(api_key=pinecone_api_key)
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

Initializing Pinecone...


In [9]:
gemini_api_key = os.getenv("GEMINI_API_KEY")

# embeddings = GoogleGenerativeAIEmbeddings(
#     model="models/embedding-001",
#     google_api_key=gemini_api_key,
#     max_retries=3,
#     timeout=60,
# )

from langchain_cohere import CohereEmbeddings

embeddings = CohereEmbeddings(model="embed-english-v3.0")
# embeddings = HuggingFaceEmbeddings(model_name="/home/rohan/models")
bm25_encoder = BM25Encoder().default()
print("Embeddings and sparse encoder initialized.")

[nltk_data] Downloading package punkt_tab to /home/pk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /home/pk/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Embeddings and sparse encoder initialized.


In [10]:
# how many chunks produced locally
print("Num doc chunks:", len(docs))

# pinecone index stats (replace `index` with your Pinecone index object if different)
print("Index stats:", index.describe_index_stats())


Num doc chunks: 41
Index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {'': {'vector_count': 41}},
 'total_vector_count': 41,
 'vector_type': 'dense'}


In [11]:
# --- 4. Upsert Documents to Pinecone (Hybrid Search) ---
print("Upserting documents to Pinecone with hybrid search...")
batch_size = 100
for i in tqdm(range(0, len(docs), batch_size)):
    i_end = min(i + batch_size, len(docs))
    docs_batch = docs[i:i_end]
    
    # Get page content
    docs_content = [doc.page_content for doc in docs_batch]

    # Create sparse vectors
    sparse_embeds = bm25_encoder.encode_documents(docs_content)

    # Create dense vectors
    dense_embeds = embeddings.embed_documents(docs_content)

    # Create metadata
    metadata = [
        {
            "text": doc.page_content,
            "source": doc.metadata.get("source", "Unknown"),
            "page": doc.metadata.get("page", 0),
        }
        for doc in docs_batch
    ]

    # Create IDs
    ids = [f"doc_{i+j}" for j in range(len(docs_batch))]
    
    # Create vectors for upsert
    vectors_to_upsert = []
    for doc_id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, metadata):
        vectors_to_upsert.append({
            "id": doc_id,
            "sparse_values": sparse,
            "values": dense,
            "metadata": meta,
        })

    # Upsert to Pinecone
    index.upsert(vectors=vectors_to_upsert)

print("Documents upserted to Pinecone.")

Upserting documents to Pinecone with hybrid search...


  0%|          | 0/1 [00:00<?, ?it/s]

Documents upserted to Pinecone.


In [12]:
# --- 5. Initialize Retriever with Hybrid Search ---
print("Initializing retriever with hybrid search...")
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings, sparse_encoder=bm25_encoder, index=index, text_key = "text", top_k=20
)
print("Retriever initialized.")

Initializing retriever with hybrid search...
Retriever initialized.


In [13]:
groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(
    model = "llama-3.3-70b-versatile",
    temperature = 0.2,
    max_tokens = 32000
)


In [14]:
# --- 6. Initialize Reranker ---
print("Initializing Cohere reranker...")
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise ValueError("COHERE_API_KEY not found in environment variables.")
reranker = CohereRerank(model="rerank-english-v3.0")
print("Reranker initialized.")

Initializing Cohere reranker...
Reranker initialized.


In [15]:
# --- 7. Create Conversational RAG Chain ---
print("Creating Conversational RAG Chain...")

# Use the default chain without custom prompt to avoid validation errors
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    condense_question_llm=llm,
    return_source_documents=True
)
print("Conversational RAG Chain created successfully.")

Creating Conversational RAG Chain...
Conversational RAG Chain created successfully.


In [17]:
# --- 8. Run the RAG Agent ---
print("\n--- RAG Agent is Ready ---")
chat_history = []
while True:
    query = input("You: ")
    if query.lower() in ["exit", "quit"]:
        break
    
    result = qa_chain({"question": query, "chat_history": chat_history})
    
    # Rerank the retrieved documents
    reranked_docs = reranker.compress_documents(
        documents=result['source_documents'],
        query=query
    )
    
    print("\n--- Answer ---")
    print(result['answer'])
    print("\n--- Reranked Sources ---")
    for doc in reranked_docs:
        print(f"Source: {doc.metadata['source']}, Page: {doc.metadata['page']}")
        print(f"Content: {doc.page_content[:200]}...")
        print("-" * 20)
        
    chat_history.append((query, result["answer"]))


--- RAG Agent is Ready ---

--- Answer ---
Here are the abstracts:

1. Abstract: Carbon dioxide (CO2) capture technology is a prominent way to mitigate global climate change originating from the excessive emission of greenhouse gas CO2. The structural modification of adsorbents with amine is a new attractive strategy to enhance their CO2 adsorption efficiency under low pressure. The current work is looking to boost the CO2 uptake performance of Zeolitic imidazolate framework -8 (ZIF -8) impregnated with aminoethylethanolamine (AEEA) inside the porous network of crystalline ZIF-8 nanoparticles via the wet functionalization process due to the enormous surface area and remarkable thermally and chemical stability of ZIF -8.

2. Abstract: The rapid increase in atmospheric CO2 concentrations, driven by human activities, has become a critical factor in global climate change, posing severe risks to sustainable development. Addressing this challenge necessitates substantial CO2 removal, which 

UnauthorizedException: (401)
Reason: Unauthorized
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 23 Aug 2025 04:32:29 GMT', 'Content-Type': 'text/plain', 'Content-Length': '12', 'Connection': 'keep-alive', 'x-pinecone-auth-rejected-reason': 'Malformed domain', 'www-authenticate': 'Malformed domain', 'server': 'envoy'})
HTTP response body: Unauthorized


In [18]:
if pc.has_index(index_name):
    pc.delete_index(index_name)

In [1]:
import pandas as pd
data = pd.read_csv('co2_adsorption_properties_20250823_210213.csv')
data

Unnamed: 0,Abstract_ID,Page_Number,Source,Property,Value
0,A2,0,Carbon_adsorption.pdf,Adsorbent Material,Zeolitic imidazolate framework -8 (ZIF -8) imp...
1,A2,0,Carbon_adsorption.pdf,Pressure Range,Not specified
2,A2,0,Carbon_adsorption.pdf,Temperature Range,Not specified
3,A2,0,Carbon_adsorption.pdf,Maximum CO2 Adsorption Capacity,Not specified
4,A2,0,Carbon_adsorption.pdf,Adsorption Enhancement,Not specified
...,...,...,...,...,...
225,A21,8,Carbon_adsorption.pdf,Adsorbent Material,MPS-TEPA-30
226,A21,8,Carbon_adsorption.pdf,Pressure Range,<0.01 bar to 10 mbar
227,A21,8,Carbon_adsorption.pdf,Temperature Range,"Not specified, but stable up to 473 K"
228,A21,8,Carbon_adsorption.pdf,Maximum CO2 Adsorption Capacity,1.5 mmol g−1


In [8]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("Carbon_adsorption1.pdf")
documents = loader.load()
documents

[Document(metadata={'producer': 'LibreOffice 24.2', 'creator': 'Writer', 'creationdate': '2025-08-23T21:25:38+05:30', 'author': 'Kritesh Gupta', 'source': 'Carbon_adsorption1.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Sn\no. \nText data\n1. Abstract:  Carbon  dioxide  (CO2)  capture  technology  is  a  prominent  way  to \nmitigate  global  climate  change  originating  from  the  excessive  emission  of \ngreenhouse gas CO2. The structural modification of adsorbents with amine is a \nnew attractive strategy to enhance their CO2 adsorption efficiency under low  \npressure. The current work is looking to boost the CO2 uptake performance of  \nZeolitic  imidazolate  framework-8  (ZIF-8)  impregnated  with  \naminoethylethanolamine (AEEA) inside the porous network of crystalline ZIF-8  \nnanoparticles via the wet functionalization process due to the enormous surface \narea and remarkable thermally and chemical stability of ZIF-8. The parent ZIF-8  \nand  amine  

In [9]:
import pandas as pd
df = pd.DataFrame(documents)
df

Unnamed: 0,0,1,2,3
0,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, Sn\no. \nText data\n1. Abstract...","(type, Document)"
1,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, highlight the potential of micr...","(type, Document)"
2,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, forces, hydrogen bonds and Lewi...","(type, Document)"
3,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, the uptake was 3.16 ± 0.01 mmol...","(type, Document)"
4,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, 6. Abstract:\nCyanophyta blooms...","(type, Document)"
5,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, adsorption capacity of 2.35 mmo...","(type, Document)"
6,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, his study greenly and efficient...","(type, Document)"
7,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, impacting both the surface area...","(type, Document)"
8,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, the initial adsorption after 7 ...","(type, Document)"
9,"(id, None)","(metadata, {'producer': 'LibreOffice 24.2', 'c...","(page_content, adsorption capacity of 2.77 mmo...","(type, Document)"


KeyError: 10