In [1]:
# ==============================================================================
# 1. INSTALLATION
# ==============================================================================
# Install all the required libraries. The '-q' flag is for a quiet installation.
!pip install -q langchain torch transformers sentence-transformers datasets faiss-cpu ipywidgets langchain-huggingface


In [2]:
# ==============================================================================
# 2. IMPORTS
# ==============================================================================
# Core LangChain and data loading components
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# Hugging Face specific components
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings

In [3]:
# ==============================================================================
# 3. DATA LOADING AND PREPARATION
# ==============================================================================
print("Step 3: Loading and preparing data...")

# Load the dataset from Hugging Face Hub
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

# Split the loaded documents into smaller, manageable chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

print("Data loaded and split into chunks.")

Step 3: Loading and preparing data...
Data loaded and split into chunks.


In [4]:
# ==============================================================================
# 4. EMBEDDING MODEL AND VECTOR STORE
# ==============================================================================
print("\nStep 4: Setting up embedding model and vector store...")

# Define the embedding model to convert text chunks into numerical vectors
model_path = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Create the FAISS vector store to efficiently search the document embeddings
# This process may take a few minutes as it embeds all document chunks
db = FAISS.from_documents(docs, embeddings)

print("FAISS vector store created successfully.")


Step 4: Setting up embedding model and vector store...

FAISS vector store created successfully.


In [5]:
# ==============================================================================
# 5. RETRIEVER, LLM, AND RAG CHAIN SETUP
# ==============================================================================
print("\nStep 5: Setting up the Retriever, LLM, and RAG chain...")

# Create a retriever from the vector store to fetch relevant documents
retriever = db.as_retriever(search_kwargs={"k": 4})

# Set up the Language Model (LLM) pipeline
# We use a 'text2text-generation' model which is suitable for question-answering based on context
# The 'device=-1' argument ensures the model runs on the CPU
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_length=512,
    device=-1
)

llm = HuggingFacePipeline(pipeline=pipe)

# Create the final Retrieval-Augmented Generation (RAG) chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # 'stuff' chain type passes all retrieved chunks to the LLM
    retriever=retriever,
    return_source_documents=False
)

print("RAG chain is ready.")


Step 5: Setting up the Retriever, LLM, and RAG chain...


Device set to use cpu


RAG chain is ready.


In [6]:
# ==============================================================================
# 6. EXECUTE QUERIES
# ==============================================================================
print("\nStep 6: Executing queries...")

# --- Query 1 ---
question_1 = "Who is Hamlet?"
result_1 = qa_chain.invoke({"query": question_1})
print(f"\nQuestion: {question_1}")
print(f"Answer: {result_1['result']}")
print("-" * 50)

# --- Query 2 ---
question_2 = "When did Virgin Australia start operating?"
result_2 = qa_chain.invoke({"query": question_2})
print(f"Question: {question_2}")
print(f"Answer: {result_2['result']}")
print("-" * 50)

# --- Query 3 ---
question_3 = "What is cheesemaking?"
result_3 = qa_chain.invoke({"query": question_3})
print(f"Question: {question_3}")
print(f"Answer: {result_3['result']}")
print("-" * 50)

Token indices sequence length is longer than the specified maximum sequence length for this model (650 > 512). Running this sequence through the model will result in indexing errors



Step 6: Executing queries...

Question: Who is Hamlet?
Answer: Shakespeare
--------------------------------------------------
Question: When did Virgin Australia start operating?
Answer: 31 August 2000
--------------------------------------------------
Question: What is cheesemaking?
Answer: control the spoiling of milk into cheese
--------------------------------------------------
