In [6]:
# ====================================
# 🔹 1. Install Dependencies
# ====================================
!pip install -q langchain langchain-community langchain-pinecone langchain-google-genai tavily-python pypdf sentence-transformers

# ====================================
# 🔹 2. Imports
# ====================================
import hashlib
import os
from google.colab import userdata, files

# LangChain core
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# LLM + embeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceEmbeddings

# Vector store (Pinecone)
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# PDF loader
from langchain_community.document_loaders import PyPDFLoader

# Tavily search
from langchain_community.tools.tavily_search import TavilySearchResults

In [2]:

# ====================================
# 🔹 3. Load API Keys
# ====================================
api_key = userdata.get("GOOGLE_API_KEY")        # Gemini
pinecone_api_key = userdata.get("PINECONE_API_KEY")
tavily_api_key = userdata.get("TAVILY_API_KEY")

# Environment variable name -> secret value read above.
required_secrets = {
    "GOOGLE_API_KEY": api_key,
    "PINECONE_API_KEY": pinecone_api_key,
    "TAVILY_API_KEY": tavily_api_key,
}

# Export only the keys that actually have a value; writing "" into the
# environment would hide a missing secret until a later cell fails.
os.environ.update({name: value for name, value in required_secrets.items() if value})

print("Gemini key loaded:", bool(api_key))
print("Pinecone key loaded:", bool(pinecone_api_key))
print("Tavily key loaded:", bool(tavily_api_key))

# Stop here with an actionable message instead of failing later inside a client.
missing_secrets = [name for name, value in required_secrets.items() if not value]
if missing_secrets:
    raise RuntimeError(
        "Missing or empty Colab secrets: " + ", ".join(missing_secrets)
    )

# ====================================
# 🔹 4. Initialize LLM + Embeddings
# ====================================
# temperature=0 is passed to the Gemini chat model for every call made through `llm`.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=api_key
)

# Sentence-transformers model used to embed both document chunks and queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)  # dim = 384


Gemini key loaded: True
Pinecone key loaded: True
Tavily key loaded: True


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:

# ====================================
# 🔹 5. Upload & Process PDF
# ====================================
uploaded = files.upload()   # Upload course catalog PDF
if not uploaded:
    # A cancelled/empty upload would otherwise surface as an opaque IndexError.
    raise ValueError("No file was uploaded; please upload the course catalog PDF.")
pdf_path = next(iter(uploaded))  # first uploaded file name

loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"Loaded {len(documents)} pages from {pdf_path}")

# Split into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
docs = splitter.split_documents(documents)

print(f"Split into {len(docs)} chunks")

# ====================================
# 🔹 6. Setup Pinecone Vector Store
# ====================================
pc = Pinecone(api_key=pinecone_api_key)
index_name = "courses-index"

# Create index if missing
if index_name not in [i["name"] for i in pc.list_indexes()]:
    # Measure the embedding size from the model itself so the index dimension
    # cannot drift from the configured embedding model.
    embedding_dim = len(embeddings.embed_query("dimension probe"))
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print("Created new Pinecone index:", index_name)

# Deterministic ids (file name + chunk position + chunk text) make this cell
# idempotent: re-running it overwrites the same vectors instead of inserting
# duplicates that would crowd the top-k results with identical chunks.
chunk_ids = [
    hashlib.sha256(
        f"{pdf_path}|{position}|{doc.page_content}".encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(docs)
]

# Store documents
vector_store = PineconeVectorStore.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids
)
print("Documents stored in Pinecone ✅")


Saving University Course Catalog.pdf to University Course Catalog.pdf
Loaded 2 pages from University Course Catalog.pdf
Split into 5 chunks
Documents stored in Pinecone ✅


In [4]:

# ====================================
# 🔹 7. Prompt Template + Chain
# ====================================
# Template for the PDF-grounded answer: the model is told to reply
# "I don't know" when the supplied context does not contain the answer.
prompt_template = """
You are an assistant who helps answer course-related questions.

If the answer is not in the context, say "I don't know".

Context:
{context}

Question:
{question}

Answer:
"""

# The two placeholders above are declared explicitly as the prompt's inputs.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the Gemini model.
chain = LLMChain(prompt=prompt, llm=llm)

# Web-search tool, constructed with its default settings.
# NOTE(review): the recorded cell output echoes this line and the LLMChain
# line as warnings — presumably deprecation notices; confirm before upgrading.
tavily = TavilySearchResults()

# ====================================
# 🔹 8. Smart Q&A Function
# ====================================
def smart_answer(question: str, k: int = 3, score_threshold: float = 0.35):
    """Answer a question from the indexed PDF, falling back to a Tavily web search.

    Args:
        question: The user's question; must contain non-whitespace text.
        k: Number of chunks to retrieve from the vector store.
        score_threshold: Minimum score the first retrieved chunk must reach
            for the PDF context to be used.

    Returns:
        The answer text: the "text" field of the RAG chain output when the
        PDF match is strong enough, otherwise the LLM's summary of the web
        search results.

    Raises:
        ValueError: If ``question`` is empty or only whitespace.
        RuntimeError: If the web search yields an error string instead of results.
    """
    if not question or not question.strip():
        raise ValueError("question must be a non-empty string")

    # 1) Search in Pinecone with scores
    results = vector_store.similarity_search_with_score(question, k=k)

    if results:
        top_score = results[0][1]
        # Only use the PDF if the first hit's score reaches the threshold;
        # all k retrieved chunks are then passed as context.
        if top_score >= score_threshold:
            context = "\n\n".join(doc.page_content for doc, _score in results)
            print(f"✅ Answered from PDF (RAG) | score={top_score:.2f}")
            return chain.invoke({"context": context, "question": question})["text"]

    # 2) If weak or no match → Web search
    search_results = tavily.invoke({"query": question})

    # NOTE(review): the Tavily tool appears to return the error's repr as a
    # plain string when the search fails (instead of raising) — confirm
    # against the installed version. Summarizing that string would yield a
    # made-up answer, so surface the failure explicitly.
    if isinstance(search_results, str):
        raise RuntimeError(f"Tavily search failed: {search_results}")

    # Announce the source only once the search has actually succeeded.
    print("🌐 Answered from Web (Tavily)")

    summary_prompt = f"""
    Summarize the following web search results into a clear, concise answer:

    Question: {question}

    Results:
    {search_results}
    """
    return llm.invoke(summary_prompt).content




  chain = LLMChain(llm=llm, prompt=prompt)
  tavily = TavilySearchResults()


In [5]:
# ====================================
# 🔹 9. Test Queries
# ====================================
# One course-catalog question and one general question, asked in that order.
for _question in (
    "List all the undergraduate programs.",   # should hit PDF if available
    "What is new in Python 3.12?",            # should fallback to Web
):
    print(smart_answer(_question))


✅ Answered from PDF (RAG) | score=0.56
Undergraduate Programs:
*   Bachelor of Arts in English
*   Bachelor of Arts in History
*   Bachelor of Fine Arts in Visual Arts
🌐 Answered from Web (Tavily)
Python 3.12 introduces several new features and improvements, including:

*   **New Type Parameter Syntax (PEP 695):** This enhances the ergonomics of using generic types and type aliases with static type checkers.
*   **Buffer Protocol Support (PEP 688):** Adds support for the buffer protocol directly in Python code.
*   **Improved Error Messages:** Enhances the clarity and helpfulness of error messages.
*   **More Powerful F-Strings:** Expands the capabilities of f-strings.
*   **Performance Enhancements:** Includes a faster Python runtime due to more specializations.
