In [1]:
    # Mount Google Drive at /content/drive; later cells read and write files
    # under /content/drive/MyDrive.
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required packages
!pip install pypdf==5.6.0
!pip install PyMuPDF==1.26.1
!pip install python-dotenv==1.1.0
!pip install langchain-community==0.3.25
!pip install langchain_openai==0.3.23
!pip install rank_bm25==0.2.2
!pip install faiss-cpu==1.11.0
!pip install deepeval==3.1.0

Collecting pypdf==5.6.0
  Downloading pypdf-5.6.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.6.0-py3-none-any.whl (304 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.2/304.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.6.0
Collecting PyMuPDF==1.26.1
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1
Collecting python-dotenv==1.1.0
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
  Attempting uninstall: python-dotenv
    Found existing installation: python-do

In [3]:
import sys

# Make modules stored in the Drive "Colab Notebooks" folder importable.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
COLAB_NOTEBOOKS_DIR = '/content/drive/MyDrive/Colab Notebooks'
if COLAB_NOTEBOOKS_DIR not in sys.path:
    sys.path.append(COLAB_NOTEBOOKS_DIR)

In [4]:
import os
import sys
from dotenv import load_dotenv
from google.colab import userdata



# Load environment variables from a .env file (this may already provide OPENAI_API_KEY)
load_dotenv()

# Resolve the OpenAI API key (comment out if not using OpenAI).
# Lookup order: Colab secret -> value already in the environment (.env) -> hidden prompt.
# userdata.get() raises when the secret does not exist or the notebook has not
# been granted access to it, so the call is guarded to keep the fallbacks reachable.
from getpass import getpass

try:
    openai_api_key = userdata.get('OPENAI_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    openai_api_key = None

if not openai_api_key:
    openai_api_key = os.environ.get("OPENAI_API_KEY")

if not openai_api_key:
    # getpass does not echo the key, so it is not stored in the notebook output
    openai_api_key = getpass("Please enter your OpenAI API key: ")

os.environ["OPENAI_API_KEY"] = openai_api_key

# Local modules (helper_functions, evalute_rag) are imported via the Drive folder appended to sys.path in an earlier cell

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from helper_functions import (EmbeddingProvider,
                              retrieve_context_per_question,
                              replace_t_with_space,
                              get_langchain_embedding_provider,
                              show_context)

from evalute_rag import evaluate_rag

from langchain.vectorstores import FAISS


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from helper_functions import (EmbeddingProvider,


In [5]:
# Download required data files
# Create a local `data` directory for downloaded files.
# exist_ok=True makes this a no-op when the directory already exists.
import os
os.makedirs('data', exist_ok=True)

In [6]:
# Download the PDF document used in this notebook
!wget -O /content/drive/MyDrive/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/AnSwati/LLM101/main/Understanding_Climate_Change.pdf
!wget -O /content/drive/MyDrive/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/AnSwati/LLM101/main/Understanding_Climate_Change.pdf

--2025-10-28 22:23:36--  https://raw.githubusercontent.com/AnSwati/LLM101/main/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206372 (202K) [application/octet-stream]
Saving to: ‘/content/drive/MyDrive/Understanding_Climate_Change.pdf’


2025-10-28 22:23:36 (5.49 MB/s) - ‘/content/drive/MyDrive/Understanding_Climate_Change.pdf’ saved [206372/206372]

--2025-10-28 22:23:36--  https://raw.githubusercontent.com/AnSwati/LLM101/main/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 

In [7]:

# Location of the PDF on the mounted Drive; passed to encode_pdf below.
path = "/content/drive/MyDrive/Understanding_Climate_Change.pdf"

In [8]:
from langchain.document_loaders import PyPDFLoader
from pypdf import PdfReader
import os

def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Encode a PDF into a FAISS vector store using OpenAI embeddings.

    The file is first opened with ``PdfReader`` as a validity check, then
    loaded with ``PyPDFLoader``, split into overlapping chunks, optionally
    cleaned with ``replace_t_with_space`` (only if that helper is defined in
    the notebook's globals), and finally embedded into a FAISS index.

    Args:
        path: Filesystem path of the PDF to index.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the loader returns no documents, or if splitting yields
            no text chunks (e.g. the PDF has no extractable text).
        Exception: Any error raised while validating, loading, splitting or
            embedding is printed and then re-raised unchanged.
    """
    try:
        # Fail fast with a clear message before handing the path to a PDF library
        if not os.path.exists(path):
            raise FileNotFoundError(f"PDF file not found at {path}")

        # Open the file with pypdf first so an unreadable PDF is reported up front
        try:
            pdf_reader = PdfReader(path)
            num_pages = len(pdf_reader.pages)
            print(f"PDF validated. Total pages: {num_pages}")
        except Exception as pdf_validate_error:
            print(f"PDF validation failed: {pdf_validate_error}")
            raise

        # Load PDF documents
        loader = PyPDFLoader(path)
        documents = loader.load()

        if not documents:
            raise ValueError("No documents could be loaded from the PDF")

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )
        texts = text_splitter.split_documents(documents)

        print(f"Total text chunks created: {len(texts)}")

        # An empty chunk list cannot be indexed; stop here with an explicit
        # error instead of failing later inside the embedding/vector-store step.
        if not texts:
            raise ValueError("No text chunks could be extracted from the PDF")

        # Optional cleaning step, applied only when the helper has been imported
        cleaned_texts = replace_t_with_space(texts) if 'replace_t_with_space' in globals() else texts

        # Create embeddings
        embeddings = get_langchain_embedding_provider(EmbeddingProvider.OPENAI)

        # Create vector store
        vectorstore = FAISS.from_documents(cleaned_texts, embeddings)

        return vectorstore

    except Exception as e:
        # Report the failure in the cell output, then let the caller see the original error
        print(f"Error processing PDF: {e}")
        raise

In [9]:
# Build the vector store from the PDF. The chunk arguments repeat
# encode_pdf's defaults (1000 / 200) and are spelled out for visibility.
chunks_vector_store = encode_pdf(path, chunk_size=1000, chunk_overlap=200)

PDF validated. Total pages: 33
Total text chunks created: 97


In [10]:

# Wrap the vector store as a retriever; search_kwargs k=2 is forwarded to the
# store's search (presumably the number of chunks returned per query — confirm).
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 2})

In [11]:
# Smoke-test retrieval with a sample question using the imported helpers:
# retrieve_context_per_question is given the question and the retriever, and
# its result is handed to show_context for display (helper internals are not
# visible here — see helper_functions).
test_query = "What is the main cause of climate change?"
context = retrieve_context_per_question(test_query, chunks_query_retriever)
show_context(context)

  docs = chunks_query_retriever.get_relevant_documents(question)


Context 1:
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous 
oxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential 
for life on Earth, as it keeps the planet warm enough to support life. However, human 
activities have intensified this natural process, leading to a warmer climate. 
Fossil Fuels 
Burning fossil fuels for energy releases large amounts of CO2. This includes coal, oil, and 
natural gas used for electricity, heating, and transportation. The industrial revolution marked 
the beginning of a significant increase in fossil fuel consumption, which continues to rise 
today. 
Coal


Context 2:
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch,

In [12]:
# Note: this currently works with OpenAI only.
# Runs the imported evaluate_rag helper against the retriever; the cell
# displays whatever the helper returns.
evaluate_rag(chunks_query_retriever)

{'questions': ['1. **Multiple Choice: Causes of Climate Change**',
  '   - What is the primary cause of the current climate change trend?',
  '     A) Solar radiation variations',
  '     B) Natural cycles of the Earth',
  '     C) Human activities, such as burning fossil fuels',
  '     D) Volcanic eruptions',
  '',
  '2. **True or False: Impact on Biodiversity**',
  '   - True or False: Climate change does not have any significant impact on the migration patterns and extinction rates of various species.',
  '',
  '3. **Short Answer: Mitigation Strategies**',
  '   - Describe two effective strategies that could be implemented to mitigate the effects of climate change on a global scale.',
  '',
  '4. **Matching: Climate Change Effects**',
  '   - Match the following effects of climate change (1-4) with their likely consequences (A-D).',
  '     1. Rising sea levels',
  '     2. Increased frequency of extreme weather events',
  '     3. Melting polar ice caps',
  '     4. Ocean acidific