In [60]:
!pip install langchain==0.0.  # use a version matching your environment (or latest)
!pip install faiss-cpu           # or faiss-gpu if available
!pip install sentence-transformers
!pip install openai               # if using OpenAI embeddings
!pip install tiktoken             # optional

[31mERROR: Invalid requirement: 'langchain==0.0.': Expected end or semicolon (after version specifier)
    langchain==0.0.
             ~~~~~^[0m[31m


In [61]:
# langchain-huggingface provides the HuggingFacePipeline wrapper imported by the RAG cell.
!pip install -U langchain-huggingface



In [62]:
# langchain-community provides PyPDFLoader and HuggingFaceEmbeddings, both imported in later cells.
!pip install -U langchain-community



In [63]:
# NOTE(review): this is the same command as cell In [61]; running it again is redundant.
!pip install -U langchain-huggingface



In [64]:
# pypdf is the PDF parsing backend needed by PyPDFLoader in the PDF loading cell.
!pip install pypdf



In [65]:
# transformers supplies the pipeline() factory used to run the local LLM in the RAG cell.
!pip install transformers



In [71]:
from langchain_community.document_loaders import PyPDFLoader
import os

# Specify the directory where you uploaded your PDF files
pdf_folder_path = "/content/docs" # You can change this if your PDFs are in a different folder

# Start from an empty list on every run so that `docs` is always defined (also
# when the folder had to be created) and never keeps entries left over from an
# earlier execution of this cell.
docs = []

if not os.path.isdir(pdf_folder_path):
    # exist_ok covers the folder appearing between the check and this call.
    os.makedirs(pdf_folder_path, exist_ok=True)
    print(f"Created directory: {pdf_folder_path}")
    print("Please upload your PDF files into this folder.")
else:
    print(f"Looking for PDF files in: {pdf_folder_path}")
    # Match the extension case-insensitively so "REPORT.PDF" is picked up too,
    # and sort the names: os.listdir() returns entries in arbitrary order, which
    # would make the order of loaded documents differ between runs.
    pdf_files = sorted(
        f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf')
    )

    if not pdf_files:
        print("No PDF files found in the specified directory.")
        print("Please upload your PDF files into the folder and run this cell again.")
    else:
        print(f"Found {len(pdf_files)} PDF file(s). Loading...")
        for pdf_file in pdf_files:
            file_path = os.path.join(pdf_folder_path, pdf_file)
            try:
                loader = PyPDFLoader(file_path)
                docs.extend(loader.load())
                print(f"Successfully loaded: {pdf_file}")
            except Exception as e:
                # Deliberately best-effort: report the unreadable file and keep
                # loading the remaining ones.
                print(f"Error loading {pdf_file}: {e}")

        print(f"Total documents loaded: {len(docs)}")

Looking for PDF files in: /content/docs
Found 1 PDF file(s). Loading...
Successfully loaded: langchain-retrieval-augmented-generation-white-paper.pdf
Total documents loaded: 23


In [80]:
# rag_demo.py
# Simple RAG demo using LangChain + FAISS
# Two options for embeddings:
#  - OpenAIEmbeddings (requires OPENAI_API_KEY)
#  - HuggingFaceEmbeddings (local sentence-transformers model, no API key required)

from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline # Changed import
from transformers import pipeline # Import pipeline
import os
from google.colab import userdata # Import userdata to access secrets

# ---------- CONFIG ----------
# Embedding backend switch: False uses the local HuggingFace model named in
# HF_EMBEDDING_MODEL, True uses OpenAIEmbeddings.
USE_OPENAI_EMBEDDINGS = False
# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # only relevant when USE_OPENAI_EMBEDDINGS is True
# Model name handed to HuggingFaceEmbeddings when USE_OPENAI_EMBEDDINGS is False.
HF_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Model name handed to the transformers text2text-generation pipeline that
# produces the answers.
LOCAL_LLM_MODEL = "google/flan-t5-small"
# ----------------------------

# Sample documents (in practice: load files, pdfs, etc.)
# raw_texts = [ # Removed hardcoded text
#     ("doc1", "LangChain is a framework for building applications with LLMs. It provides chains, agents, and integrations."),
#     ("doc2", "FAISS is a library for efficient similarity search and clustering of dense vectors."),
#     ("doc3", "Retrieval-Augmented Generation augments model context with retrieved documents to improve factual grounding."),
# ]

# Create Document objects with metadata - Using docs variable populated from PDF loading cell
# docs = [Document(page_content=t[1], metadata={"source": t[0]}) for t in raw_texts] # Removed creation from hardcoded text

# Guard: stop early unless the PDF loading cell has produced a non-empty `docs`
if not ('docs' in locals() and docs):
    raise ValueError("No documents loaded. Please run the PDF loading cell first.")


# 1) Split every loaded document into overlapping chunks.
#    Each chunk keeps its parent's metadata plus a per-document "chunk" index.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = [
    Document(page_content=piece, metadata={**source_doc.metadata, "chunk": position})
    for source_doc in docs
    for position, piece in enumerate(text_splitter.split_text(source_doc.page_content))
]

# 2) Create embeddings
if USE_OPENAI_EMBEDDINGS:
    # The OPENAI_API_KEY constant is commented out in the CONFIG section, so
    # reading it directly raised NameError on this branch. Use the constant if
    # it has been re-enabled, otherwise fall back to the environment variable.
    openai_api_key = globals().get("OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("Set OPENAI_API_KEY in environment to use OpenAI embeddings.")
    # OpenAIEmbeddings() is created without arguments, so the key is exported
    # through the environment for it to pick up.
    os.environ["OPENAI_API_KEY"] = openai_api_key
    embeddings = OpenAIEmbeddings()
else:
    # Local embedding model; no API key involved.
    embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL)

# 3) Embed the chunks into a FAISS index and write it to disk
index_path = "faiss_index"

vector_store = FAISS.from_documents(split_docs, embeddings)
vector_store.save_local(index_path)  # persists the index together with its metadata

print("Index built and saved to", index_path)

# 4) Wire a local LLM and the FAISS retriever into a RetrievalQA chain

# Local generation pipeline; generation is capped at 64 new tokens per answer.
pipe = pipeline(
    "text2text-generation",
    model=LOCAL_LLM_MODEL,
    max_new_tokens=64,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Similarity search that hands the 3 best-matching chunks to the chain.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

# 5) Demo query loop
def ask(question, chain=None):
    """Print a question followed by the chain's answer to it.

    Args:
        question: Question text; it is sent to the chain under the "query"
            input key.
        chain: Optional object exposing ``invoke(dict)`` that returns a mapping
            with a "result" entry. Defaults to the notebook-level ``qa`` chain.

    Returns:
        None. The question and the answer are written to stdout.
    """
    if chain is None:
        # Resolved at call time so the cell can rebuild `qa` without redefining ask().
        chain = qa
    print("\n> QUESTION:", question)
    # Chain.run() is deprecated in LangChain; invoke() is the supported entry
    # point. RetrievalQA reads its input from "query" and puts the answer in "result".
    res = chain.invoke({"query": question})["result"]
    print("\n> ANSWER:", res)

if __name__ == "__main__":
    # Sample questions about the loaded white paper, asked in this order
    demo_questions = (
        "What is the LangChain white paper content about?",
        "What is LangChain?",
        "What is LLM?",
        "What is Retrieval-Augmented Generation?",
    )
    for demo_question in demo_questions:
        ask(demo_question)




Index built and saved to faiss_index


Device set to use cpu



> QUESTION: What is the LangChain white paper content about?

> ANSWER: Embedding Documents

> QUESTION: What is LangChain?

> ANSWER: LangChain can be used to create “chains” or pipelines that automatically process an input query, search for relevant documentation, and assemble prompts that combine the query with context that can be used for generating an accurate response

> QUESTION: What is LLM?

> ANSWER: The LLM forms the core of the RAG pipeline, taking the assembled input prompt and generating a response. Most LLMs available from HuggingFace Hub can be used in the RAG pipeline. The main design parameter is model size (i.e., number of parameters): a

> QUESTION: What is Retrieval-Augmented Generation?

> ANSWER: a method for augmenting a LLM’s knowledge with specific data beyond the general dataset it was trained on
