In [1]:
from langchain_community.document_loaders import PyMuPDFLoader
import os

# Get first PDF file
pdf_folder = "pdf"
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
print(f"PDFs found: {pdf_files}")

# Load first PDF
first_pdf_path = os.path.join(pdf_folder, pdf_files[0])
loader = PyMuPDFLoader(first_pdf_path)
docs = loader.load()

print(f"\nLoaded {len(docs)} pages from {pdf_files[0]}")
print(f"First page metadata: {docs[0].metadata}")
print(f"\nFirst 500 chars:\n{docs[0].page_content[:500]}")

  from .autonotebook import tqdm as notebook_tqdm


PDFs found: ['Core Components In RAG.pdf', 'Data Ingestion And Parsing Techniques.pdf', 'intro_to_rag.pdf', 'Vector Embeddings And Vector Databases.pdf', 'Vector Stores Vs Vector Databases.pdf']

Loaded 7 pages from Core Components In RAG.pdf
First page metadata: {'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-10-19T20:24:26+03:00', 'source': 'pdf\\Core Components In RAG.pdf', 'file_path': 'pdf\\Core Components In RAG.pdf', 'total_pages': 7, 'format': 'PDF 1.7', 'title': '', 'author': 'Abdullah Salah', 'subject': '', 'keywords': '', 'moddate': '2025-10-19T20:24:26+03:00', 'trapped': '', 'modDate': "D:20251019202426+03'00'", 'creationDate': "D:20251019202426+03'00'", 'page': 0}

First 500 chars:
Section 2: Core Components in RAG 
Course: Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith 
Section Number: 2 
Total Videos: 2 
Date Created: 2024 
 
Video 1: Data Ingestion and Parsing 
Video Order: 1/2 
Topi

In [7]:
docs

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-10-19T20:54:21+03:00', 'source': 'pdf\\Data Ingestion And Parsing Techniques.pdf', 'file_path': 'pdf\\Data Ingestion And Parsing Techniques.pdf', 'total_pages': 36, 'format': 'PDF 1.7', 'title': '', 'author': 'Abdullah Salah', 'subject': '', 'keywords': '', 'moddate': '2025-10-19T20:54:21+03:00', 'trapped': '', 'modDate': "D:20251019205421+03'00'", 'creationDate': "D:20251019205421+03'00'", 'page': 0}, page_content='Section 3: Data Ingestion and Data Parsing Techniques \nCourse: Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith \nSection Number: 3 \nTotal Videos: 9 \nDate Created: 2024 \n \nVideo 1: Document Structure in LangChain \nVideo Order: 1/9 \nTopics: LangChain document structure, Page content vs. metadata, Document loaders, \nText splitters, Project setup notes, Why metadata matters \nDifficulty: Beginner \nContent \nHello guys

In [2]:
# Check how content flows across pages
print("="*50)
for i in range(min(3, len(docs))):  # First 3 pages
    print(f"\nPage {i+1} - Last 200 chars:")
    print(docs[i].page_content[-200:])
    print(f"\nPage {i+1} - First 200 chars:")
    print(docs[i].page_content[:200])
    print("-"*30)

# Look for video boundaries
full_text = docs[0].page_content
if "Video 2:" in full_text or "Video Order: 2" in full_text:
    print("\n✓ Multiple videos found in same page")
else:
    print("\n✗ Video 2 not in first page")


Page 1 - Last 200 chars:
’ll break 
down what data we ingest, how we clean and split it, how we embed it into vectors, and 
how we store it in a vector database so that later phases (query + generation) can work 
effectively.

Page 1 - First 200 chars:
Section 2: Core Components in RAG 
Course: Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith 
Section Number: 2 
Total Videos: 2 
Date Created: 2024 
 
Video 1: Data Ingestion and Parsing
------------------------------

Page 2 - Last 200 chars:
ine. 
 
The Ingestion Pipeline: Step by Step 
Step 1: Load the Documents 
Use appropriate loaders per file type to extract text and attach preliminary metadata (e.g., 
source, filename, section, url).

Page 2 - First 200 chars:
What is Document Ingestion and Pre-processing? 
To power the retriever, we first need a vector database filled with vectors that represent 
our knowledge. This knowledge can come from multiple sources
------------------------------

Page 3 - Last 200 chars:
ma

In [3]:
# Find all video markers across all pages
print("Searching for video markers across all pages...\n")

for i, doc in enumerate(docs):
    if "Video Order:" in doc.page_content:
        # Find the video header
        lines = doc.page_content.split('\n')
        for j, line in enumerate(lines[:10]):  # Check first 10 lines
            if "Video" in line and "Order:" in lines[j+1] if j+1 < len(lines) else False:
                print(f"Page {i+1}: Found video marker")
                print(f"  {line}")
                print(f"  {lines[j+1]}")
                break
    
# Check total content length
total_content = " ".join([doc.page_content for doc in docs])
video_count = total_content.count("Video Order:")
print(f"\nTotal 'Video Order:' found: {video_count}")

# Find where Video 2 starts
if "Video 2:" in total_content:
    video2_index = total_content.find("Video 2:")
    print(f"\nVideo 2 found at character position: {video2_index}")
    print("Context around Video 2:")
    print(total_content[video2_index-50:video2_index+200])

Searching for video markers across all pages...

Page 1: Found video marker
  Video 1: Data Ingestion and Parsing 
  Video Order: 1/2 
Page 5: Found video marker
  Video 2: Query Processing and Output Generation Phase 
  Video Order: 2/2 

Total 'Video Order:' found: 2

Video 2 found at character position: 5882
Context around Video 2:
nto the LLM for the Generation Phase. 
Thank you! Video 2: Query Processing and Output Generation Phase 
Video Order: 2/2 
Topics: Query processing, Query embedding, Similarity search, Retrieval, Context 
enrichment, Generation, LLMs (OpenAI, Llama, 


In [4]:
# Quick scan of all PDFs to understand their different structures
print("SCANNING ALL PDFs FOR STRUCTURE PATTERNS\n")
print("="*60)

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    
    print(f"\n📄 {pdf_file}")
    print(f"   Pages: {len(docs)}")
    
    # Check for video markers
    full_text = " ".join([doc.page_content for doc in docs])
    video_count = full_text.count("Video Order:")
    
    print(f"   Videos found: {video_count}")
    
    # Check first 300 chars to see structure
    print(f"   Structure preview:")
    print(f"   {docs[0].page_content[:200].replace(chr(10), ' ')}")
    
    # Look for common patterns
    patterns = {
        "Has Section:": "Section" in full_text[:500],
        "Has Video:": "Video" in full_text[:500], 
        "Has Topics:": "Topics:" in full_text[:500],
        "Has Difficulty:": "Difficulty:" in full_text[:500]
    }
    print(f"   Patterns: {[k for k,v in patterns.items() if v]}")
    print("-"*60)

SCANNING ALL PDFs FOR STRUCTURE PATTERNS


📄 Core Components In RAG.pdf
   Pages: 7
   Videos found: 2
   Structure preview:
   Section 2: Core Components in RAG  Course: Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith  Section Number: 2  Total Videos: 2  Date Created: 2024    Video 1: Data Ingestion and Parsing
   Patterns: ['Has Section:', 'Has Video:', 'Has Topics:', 'Has Difficulty:']
------------------------------------------------------------

📄 Data Ingestion And Parsing Techniques.pdf
   Pages: 36
   Videos found: 9
   Structure preview:
   Section 3: Data Ingestion and Data Parsing Techniques  Course: Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith  Section Number: 3  Total Videos: 9  Date Created: 2024    Video 1: Docume
   Patterns: ['Has Section:', 'Has Video:', 'Has Topics:', 'Has Difficulty:']
------------------------------------------------------------

📄 intro_to_rag.pdf
   Pages: 20
   Videos found: 4
   Structure preview:
   Section 1: In

In [5]:
# Document Quality Analysis
import re

print("DOCUMENT QUALITY ANALYSIS\n")
print("="*60)

for pdf_file in pdf_files[:2]:  # Start with first 2 PDFs
    pdf_path = os.path.join(pdf_folder, pdf_file)
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    full_text = " ".join([doc.page_content for doc in docs])
    
    print(f"\n📄 {pdf_file}")
    
    # Quality metrics
    print("\n📊 Quality Metrics:")
    print(f"   Total characters: {len(full_text):,}")
    print(f"   Total words: {len(full_text.split()):,}")
    print(f"   Avg words per page: {len(full_text.split())//len(docs)}")
    
    # Check for common issues
    print("\n⚠️  Potential Issues:")
    
    # 1. Broken sentences across pages?
    broken_sentences = 0
    for doc in docs[:-1]:
        if doc.page_content.strip()[-1] not in '.!?':
            broken_sentences += 1
    print(f"   Pages ending mid-sentence: {broken_sentences}/{len(docs)}")
    
    # 2. Code blocks?
    code_indicators = full_text.count('```') + full_text.count('import ') + full_text.count('def ')
    print(f"   Possible code blocks: {code_indicators > 0}")
    
    # 3. Special characters/encoding issues?
    weird_chars = len(re.findall(r'[^\x00-\x7F]+', full_text))
    print(f"   Non-ASCII characters: {weird_chars}")
    
    # 4. Excessive whitespace?
    double_spaces = full_text.count('  ')
    triple_newlines = full_text.count('\n\n\n')
    print(f"   Double spaces: {double_spaces}, Triple newlines: {triple_newlines}")
    
    # 5. Timestamps in transcript?
    timestamps = len(re.findall(r'\d{1,2}:\d{2}', full_text))
    print(f"   Timestamp patterns: {timestamps}")
    
    print("-"*60)

DOCUMENT QUALITY ANALYSIS


📄 Core Components In RAG.pdf

📊 Quality Metrics:
   Total characters: 10,320
   Total words: 1,646
   Avg words per page: 235

⚠️  Potential Issues:
   Pages ending mid-sentence: 1/7
   Possible code blocks: False
   Non-ASCII characters: 58
   Double spaces: 0, Triple newlines: 0
   Timestamp patterns: 0
------------------------------------------------------------

📄 Data Ingestion And Parsing Techniques.pdf

📊 Quality Metrics:
   Total characters: 58,233
   Total words: 8,912
   Avg words per page: 247

⚠️  Potential Issues:
   Pages ending mid-sentence: 13/36
   Possible code blocks: True
   Non-ASCII characters: 444
   Double spaces: 0, Triple newlines: 0
   Timestamp patterns: 0
------------------------------------------------------------


In [6]:
# Deep dive into ONE document to understand the issues
pdf_path = os.path.join(pdf_folder, "Data Ingestion And Parsing Techniques.pdf")
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

print("DEEP DIVE: Data Ingestion And Parsing Techniques.pdf")
print("="*60)

# 1. Find the broken sentence issue
print("\n1️⃣ BROKEN SENTENCE EXAMPLE:")
for i in range(len(docs)-1):
    if not docs[i].page_content.strip().endswith(('.', '!', '?', ':')):
        print(f"\nPage {i+1} ends with:")
        print(f"...{docs[i].page_content.strip()[-100:]}")
        print(f"\nPage {i+2} starts with:")
        print(f"{docs[i+1].page_content.strip()[:100]}...")
        break

# 2. Find code blocks
print("\n2️⃣ CODE BLOCK EXAMPLE:")
full_text = " ".join([doc.page_content for doc in docs])
if "import " in full_text:
    idx = full_text.find("import ")
    print(f"Found at position {idx}:")
    print(full_text[idx-50:idx+200])

# 3. Find non-ASCII characters
print("\n3️⃣ NON-ASCII CHARACTERS:")
import re
for i, doc in enumerate(docs[:3]):  # Check first 3 pages
    non_ascii = re.findall(r'[^\x00-\x7F]+', doc.page_content)
    if non_ascii:
        print(f"\nPage {i+1} has: {set(non_ascii[:5])}")  # Show first 5 unique
        break

DEEP DIVE: Data Ingestion And Parsing Techniques.pdf

1️⃣ BROKEN SENTENCE EXAMPLE:

Page 2 ends with:
... embedded and searched." 
• 
metadata: a dictionary like: 
o source: "example.txt" 
o page_number: 1

Page 3 starts with:
o author: "Krish" 
o date_created: "2024-01-01" 
o (any other relevant fields you want) 
If you prin...

2️⃣ CODE BLOCK EXAMPLE:
Found at position 2086:
 reading different kinds of data. For this, we’ll import some libraries. 
If a library like pandas is missing, install it (for example: uv add pandas or your chosen 
package manager). As we add libraries, your pyproject.toml (or requirements) will up

3️⃣ NON-ASCII CHARACTERS:

Page 1 has: {'“', '→', '”—', '’'}


In [10]:
# Check semantic boundaries - How topics flow
print("Hello guys 👋")   # <-- added line
print("SEMANTIC FLOW CHECK")
print("="*60)

pdf_path = os.path.join(pdf_folder, "intro_to_rag.pdf")
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

# Check if topics are self-contained or reference each other
full_text = " ".join([doc.page_content for doc in docs[:5]])

# Look for references to other parts
references = {
    "Forward refs": len(re.findall(r'(later|next video|upcoming|we will see)', full_text, re.I)),
    "Back refs": len(re.findall(r'(earlier|previous|as mentioned|we saw)', full_text, re.I)),
    "Cross refs": len(re.findall(r'(see also|refer to|check out)', full_text, re.I))
}

print(f"\nCross-references found:")
for ref_type, count in references.items():
    print(f"  {ref_type}: {count}")

# Check topic density - are there clear topic shifts?
print(f"\nTopic markers found:")
topic_markers = ['Introduction', 'Overview', 'Summary', 'Conclusion', 'Step', 'Phase']
for marker in topic_markers:
    count = full_text.count(marker)
    if count > 0:
        print(f"  '{marker}': {count} times")


Hello guys 👋
SEMANTIC FLOW CHECK

Cross-references found:
  Forward refs: 1
  Back refs: 0
  Cross refs: 0

Topic markers found:
  'Introduction': 2 times
  'Overview': 1 times
  'Summary': 1 times


In [11]:
import os, re
from langchain_community.document_loaders import PyMuPDFLoader

# Define folder
pdf_folder = "pdf"

# Greeting & closing patterns (expandable)
greeting_patterns = [
    r"(?i)\bhello\s+guys[.! ]*", 
    r"(?i)\bhi\s+(everyone|guys|folks)[.! ]*",
    r"(?i)\bhey\s+(everyone|guys)[.! ]*"
]

closing_patterns = [
    r"(?i)\bthank\s+you[.! ]*",
    r"(?i)\bthanks\s+(for\s+watching|everyone|guys)[.! ]*",
    r"(?i)\bsee\s+you\s+(in\s+the\s+next\s+video|soon)[.! ]*",
    r"(?i)\bthat'?s\s+it\s+for\s+(this|today'?s)\s+(video|lesson)[.! ]*"
]

def clean_text(text):
    """Remove greetings and closings."""
    for pattern in greeting_patterns + closing_patterns:
        text = re.sub(pattern, '', text)
    # Normalize extra spaces
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text

# Loop through all PDFs and clean their text
cleaned_docs = {}

for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        loader = PyMuPDFLoader(pdf_path)
        docs = loader.load()

        cleaned_texts = [clean_text(doc.page_content) for doc in docs]
        cleaned_docs[pdf_file] = " ".join(cleaned_texts)

        print(f"✅ Cleaned {pdf_file} ({len(docs)} pages)")


✅ Cleaned Core Components In RAG.pdf (7 pages)
✅ Cleaned Data Ingestion And Parsing Techniques.pdf (36 pages)
✅ Cleaned intro_to_rag.pdf (20 pages)
✅ Cleaned Vector Embeddings And Vector Databases.pdf (38 pages)
✅ Cleaned Vector Stores Vs Vector Databases.pdf (6 pages)


## Creating Custom Loader

In [19]:
import re
import os
from langchain.schema import Document
from langchain_community.document_loaders import PyMuPDFLoader

pdf_folder = "pdf"

def clean_text(text):
    """Remove greetings, closings, normalize whitespace."""
    patterns = [
        r"(?i)\bhello\s+guys[.! ]*",
        r"(?i)\bhi\s+(everyone|guys|folks)[.! ]*",
        r"(?i)\bthank\s+you[.! ]*",
        r"(?i)\bthanks\s+(for\s+watching|everyone|guys)[.! ]*",
        r"(?i)\bsee\s+you\s+(in\s+the\s+next\s+video|soon)[.! ]*",
        r"(?i)\bthat'?s\s+it\s+for\s+(this|today'?s)\s+(video|lesson)[.! ]*",
    ]
    for p in patterns:
        text = re.sub(p, "", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text


def extract_section_metadata(text):
    """Extract metadata at the section level."""
    section_meta = {}
    patterns = {
        "section_name": r"Section\s*\d*:\s*(.*?)(?:Course|Section\s*Number|Total|Date|$)",
        "section_number": r"Section\s*Number:\s*(\d+)",
        "total_videos": r"Total\s*Videos:\s*(\d+)",
        "course_name": r"Course:\s*(.*?)(?:Section|$)",
        "date_created": r"Date\s*Created:\s*(\d+)",
    }
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        if match:
            section_meta[key] = match.group(1).strip()
    return section_meta


def extract_video_metadata(text):
    """Extract metadata inside a single video block."""
    meta = {}
    patterns = {
        "video_title": r"Video\s*\d+:\s*([^\n]*)",
        "video_order": r"Video\s*Order:\s*([^\n]*)",
        "topics": r"Topics:\s*([^\n]*)",
        "difficulty": r"Difficulty:\s*([^\n]*)",
    }
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            meta[key] = match.group(1).strip()
    return meta


def strip_metadata_from_content(text):
    """
    Remove any leading metadata section (Video title, Topics, Difficulty, etc.)
    including the 'Content' label if present.
    """
    # Find the 'Content' keyword and remove everything before it
    content_match = re.search(r"(?i)\bcontent\b[:\-]?", text)
    if content_match:
        text = text[content_match.end():]
    else:
        # Fallback: remove any lines starting with metadata-like keywords
        lines = text.splitlines()
        cleaned_lines = []
        skip_keywords = ["video", "video order", "topics", "difficulty"]
        for line in lines:
            if any(re.match(fr"(?i)^{kw}", line.strip()) for kw in skip_keywords):
                continue
            cleaned_lines.append(line)
        text = "\n".join(cleaned_lines)
    return text.strip()


def load_pdf_videos(pdf_path):
    """Extract structured Document objects from PDF (only videos)."""
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    full_text = " ".join([doc.page_content for doc in docs])
    section_meta = extract_section_metadata(full_text)
    section_meta["source_file"] = os.path.basename(pdf_path)

    # Split by video markers and skip the first (section header)
    video_blocks = re.split(r"(?=Video\s*\d+:)", full_text)[1:]

    video_docs = []
    for block in video_blocks:
        if not block.strip():
            continue
        meta = extract_video_metadata(block)
        text = clean_text(strip_metadata_from_content(block))
        metadata = {**section_meta, **meta, "doc_type": "video"}
        video_docs.append(Document(page_content=text, metadata=metadata))

    return video_docs


# --- PROCESS ALL PDFs ---
all_video_docs = []
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        video_docs = load_pdf_videos(pdf_path)
        all_video_docs.extend(video_docs)
        print(f"✅ {pdf_file}: {len(video_docs)} videos extracted")


# --- PREVIEW SAMPLE DOCUMENTS ---
print("\n📘 SAMPLE DOCUMENTS PREVIEW\n", "="*70)
by_source = {}
for doc in all_video_docs:
    src = doc.metadata["source_file"]
    by_source.setdefault(src, []).append(doc)

for src, docs in by_source.items():
    print(f"\n=== {src} ===")
    for d in docs[:3]:
        print(f"\n🎞️ Video Title: {d.metadata.get('video_title', 'N/A')}")
        print(f"   Video Order: {d.metadata.get('video_order', 'N/A')}")
        print(f"   Topics: {d.metadata.get('topics', 'N/A')}")
        print(f"   Difficulty: {d.metadata.get('difficulty', 'N/A')}")
        print(f"   Section: {d.metadata.get('section_name', 'N/A')}")
        print(f"   Section Number: {d.metadata.get('section_number', 'N/A')}")
        print(f"   Total Videos: {d.metadata.get('total_videos', 'N/A')}")
        print(f"   Date Created: {d.metadata.get('date_created', 'N/A')}")
        print(f"   Source File: {d.metadata.get('source_file', 'N/A')}")
        print(f"   Characters: {len(d.page_content)}")
        print(f"\n📝 Content Preview:\n{d.page_content[:300]}...")
        print("-"*60)


✅ Core Components In RAG.pdf: 2 videos extracted
✅ Data Ingestion And Parsing Techniques.pdf: 9 videos extracted
✅ intro_to_rag.pdf: 4 videos extracted
✅ Vector Embeddings And Vector Databases.pdf: 5 videos extracted
✅ Vector Stores Vs Vector Databases.pdf: 0 videos extracted

📘 SAMPLE DOCUMENTS PREVIEW

=== Core Components In RAG.pdf ===

🎞️ Video Title: Data Ingestion and Parsing
   Video Order: 1/2
   Topics: Document ingestion, Pre-processing, Chunking, Embeddings, Vector databases,
   Difficulty: Beginner
   Section: Core Components in RAG
   Section Number: 2
   Total Videos: 2
   Date Created: 2024
   Source File: Core Components In RAG.pdf
   Characters: 5357

📝 Content Preview:
So we are going to continue the discussion of Retrieval-Augmented Generation (RAG). In this specific video, we’ll dive into the core components of a RAG pipeline. By now, you already have an intuition for how RAG works at a high level: we have a Large Language Model (LLM), we augment it with externa...


In [20]:
all_video_docs

[Document(metadata={'section_name': 'Core Components in RAG', 'section_number': '2', 'total_videos': '2', 'course_name': 'Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith', 'date_created': '2024', 'source_file': 'Core Components In RAG.pdf', 'video_title': 'Data Ingestion and Parsing', 'video_order': '1/2', 'topics': 'Document ingestion, Pre-processing, Chunking, Embeddings, Vector databases,', 'difficulty': 'Beginner', 'doc_type': 'video'}, page_content='So we are going to continue the discussion of Retrieval-Augmented Generation (RAG). In this specific video, we’ll dive into the core components of a RAG pipeline. By now, you already have an intuition for how RAG works at a high level: we have a Large Language Model (LLM), we augment it with external knowledge stored in a vector database, and the LLM uses retrieved context from that database to generate better answers. At a glance, when I provide an input to a plain LLM, it just generates an output from its internal know

## Chunking

### Splitting the documents into Chunks

In [22]:
len(all_video_docs)

20

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define chunking parameters
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", ".", " ", ""]
)


In [24]:
# Split all video documents
chunked_docs = splitter.split_documents(all_video_docs)

print(f"✅ Total chunks created: {len(chunked_docs)}")


✅ Total chunks created: 226


In [25]:
# --- PREVIEW SAMPLE CHUNKS ---
print("\n🧩 SAMPLE CHUNKS PREVIEW\n", "="*70)
for i, doc in enumerate(chunked_docs[:5]):
    print(f"\n🔹 Chunk {i+1}")
    print(f"Section: {doc.metadata.get('section_name')}")
    print(f"Video: {doc.metadata.get('video_title')}")
    print(f"Chunk length: {len(doc.page_content)} chars")
    print(f"Metadata keys: {list(doc.metadata.keys())}")
    print(f"\nContent preview:\n{doc.page_content[:300]}...")
    print("-"*60)



🧩 SAMPLE CHUNKS PREVIEW

🔹 Chunk 1
Section: Core Components in RAG
Video: Data Ingestion and Parsing
Chunk length: 982 chars
Metadata keys: ['section_name', 'section_number', 'total_videos', 'course_name', 'date_created', 'source_file', 'video_title', 'video_order', 'topics', 'difficulty', 'doc_type']

Content preview:
So we are going to continue the discussion of Retrieval-Augmented Generation (RAG). In this specific video, we’ll dive into the core components of a RAG pipeline. By now, you already have an intuition for how RAG works at a high level: we have a Large Language Model (LLM), we augment it with externa...
------------------------------------------------------------

🔹 Chunk 2
Section: Core Components in RAG
Video: Data Ingestion and Parsing
Chunk length: 769 chars
Metadata keys: ['section_name', 'section_number', 'total_videos', 'course_name', 'date_created', 'source_file', 'video_title', 'video_order', 'topics', 'difficulty', 'doc_type']

Content preview:
. From the archi

### Cheking the overalp

In [26]:
print("\n🔁 OVERLAP CHECK\n", "="*70)

for i in range(2):
    chunk_a = chunked_docs[i].page_content[-200:]
    chunk_b = chunked_docs[i+1].page_content[:200]
    print(f"\nBetween Chunk {i+1} and {i+2}:")
    print("-"*60)
    print("🔹 End of Chunk A:\n", chunk_a)
    print("\n🔹 Start of Chunk B:\n", chunk_b)
    print("-"*60)



🔁 OVERLAP CHECK

Between Chunk 1 and 2:
------------------------------------------------------------
🔹 End of Chunk A:
 ontext to the LLM, and then the LLM generates a summarized, grounded output using that context. From the architectural diagram we discussed earlier, there are three main phases in a full RAG system: 1

🔹 Start of Chunk B:
 . From the architectural diagram we discussed earlier, there are three main phases in a full RAG system: 1. Document Ingestion Phase 2. Query Processing Phase 3. Generation Phase This video focuses on
------------------------------------------------------------

Between Chunk 2 and 3:
------------------------------------------------------------
🔹 End of Chunk A:
 rs that represent our knowledge. This knowledge can come from multiple sources: company policies, internal documents, PDFs, Word docs, CSVs, websites, databases, images (with extracted text), and more

🔹 Start of Chunk B:
 . This knowledge can come from multiple sources: company policies, 

In [27]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")  # or "text-embedding-3-large"
total_tokens = 0

for doc in chunked_docs:
    total_tokens += len(encoding.encode(doc.page_content))

print(f"🔢 Total tokens across all chunks: {total_tokens:,}")
print(f"Average tokens per chunk: {total_tokens // len(chunked_docs)}")


🔢 Total tokens across all chunks: 44,347
Average tokens per chunk: 196


## Embedding + Vector Store Setup

In [30]:
# --- Step 1: Load Environment Variables ---
from dotenv import load_dotenv
import os

load_dotenv()  # loads your .env file automatically
openai_api_key = os.getenv("OPENAI_API_KEY")
print("✅ OpenAI API key loaded:", bool(openai_api_key))

✅ OpenAI API key loaded: True


In [32]:
# --- Step 2: Initialize OpenAI Embedding Model ---
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",  # accurate + cost-efficient
    openai_api_key=openai_api_key
)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000022BFFF2F3E0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000022BFFF565D0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [34]:
# --- Step 3: Create / Persist ChromaDB Store ---
from langchain_community.vectorstores import Chroma

persist_directory = "chroma_store"

# Create or connect to existing store
vectorstore = Chroma.from_documents(
    documents=chunked_docs,      # your 226 chunks
    embedding=embeddings,
    persist_directory=persist_directory
)

In [35]:
# Save to disk so you can reload later
vectorstore.persist()
print("✅ ChromaDB store created and persisted at:", persist_directory)

✅ ChromaDB store created and persisted at: chroma_store


  vectorstore.persist()


In [36]:
# --- Step 4: Test Retrieval ---
query = "What are the main phases in a RAG system?"
results = vectorstore.similarity_search(query, k=3)

results

[Document(metadata={'course_name': 'Ultimate RAG Bootcamp Using LangChain, LangGraph and LangSmith', 'difficulty': 'Beginner', 'total_videos': '2', 'source_file': 'Core Components In RAG.pdf', 'video_title': 'Data Ingestion and Parsing', 'date_created': '2024', 'topics': 'Document ingestion, Pre-processing, Chunking, Embeddings, Vector databases,', 'section_number': '2', 'section_name': 'Core Components in RAG', 'doc_type': 'video', 'video_order': '1/2'}, page_content='. From the architectural diagram we discussed earlier, there are three main phases in a full RAG system: 1. Document Ingestion Phase 2. Query Processing Phase 3. Generation Phase This video focuses on Phase 1: Document Ingestion and Pre-processing. We’ll break down what data we ingest, how we clean and split it, how we embed it into vectors, and how we store it in a vector database so that later phases (query + generation) can work effectively. What is Document Ingestion and Pre-processing? To power the retriever, we fir

## Create the Full RAG Chain

In [37]:
# --- STEP 1: Create the Retriever ---
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}   # number of chunks to retrieve
)

# Test the retriever alone (no LLM yet)
query = "What are the three main phases in a RAG system?"
retrieved_docs = retriever.get_relevant_documents(query)

print(f"✅ Retrieved {len(retrieved_docs)} relevant documents\n")
for i, doc in enumerate(retrieved_docs, start=1):
    print(f"Result {i}: {doc.metadata.get('video_title')} ({doc.metadata.get('section_name')})")
    print(doc.page_content[:300], "...\n")


  retrieved_docs = retriever.get_relevant_documents(query)


✅ Retrieved 3 relevant documents

Result 1: Data Ingestion and Parsing (Core Components in RAG)
. From the architectural diagram we discussed earlier, there are three main phases in a full RAG system: 1. Document Ingestion Phase 2. Query Processing Phase 3. Generation Phase This video focuses on Phase 1: Document Ingestion and Pre-processing. We’ll break down what data we ingest, how we clean  ...

Result 2: Some Examples and Advantages Of Using RAG (Introduction to RAG)
. So these are the three main important things with respect to any RAG architecture. Why Does RAG Actually Matter? Now let me ask one very important question. And this question is just like everybody will be probably thinking about it. Why does RAG actually matter? You know, why is RAG really helpfu ...

Result 3: Introduction to RAG (Introduction to RAG)
So I'm super excited to start the series of videos on RAG. In this video and in the upcoming series of videos, we are going to understand everything about retrieval a

In [47]:
from langchain.memory import ConversationBufferMemory

# Rebuild memory with the correct keys
memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="question",   # <- matches ConversationalRetrievalChain input
    output_key="answer",    # <- the field we want stored
    return_messages=True
)

# Optional quick test (use matching keys!)
memory.save_context(
    {"question": "What are the main components of a RAG pipeline?"},
    {"answer": "Retrieval, augmentation, and generation."}
)
print("🧠 Memory ok. Messages:", memory.load_memory_variables({})["chat_history"])


🧠 Memory ok. Messages: [HumanMessage(content='What are the main components of a RAG pipeline?', additional_kwargs={}, response_metadata={}), AIMessage(content='Retrieval, augmentation, and generation.', additional_kwargs={}, response_metadata={})]


In [48]:
# --- STEP 3: Build Prompt Template ---
from langchain.prompts import PromptTemplate

template = """
You are an expert study assistant for the 'Ultimate RAG Bootcamp'.
Use ONLY the information from the provided context to answer the student's question.
If the answer is not found in the context, say you don't know.

Context:
{context}

Chat history:
{chat_history}

Question:
{question}

Answer in a clear, educational, and concise way.
"""

rag_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question", "chat_history"]
)

print("✅ Prompt template created successfully.")


✅ Prompt template created successfully.


In [49]:
# --- STEP 4: Initialize the LLM ---
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o-mini",   # you can also try "gpt-4-turbo"
    temperature=0,         # 0 = factual, deterministic answers
    openai_api_key=openai_api_key
)

print("✅ LLM initialized successfully.")


✅ LLM initialized successfully.


In [50]:
# --- STEP 5 (fixed): Build Conversational Retrieval Chain ---
from langchain.chains import ConversationalRetrievalChain

rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": rag_prompt},
    return_source_documents=True,
    output_key="answer"   # ✅ tell it to store only the answer in memory
)

print("✅ RAG Conversational Chain created successfully (with memory fix).")


✅ RAG Conversational Chain created successfully (with memory fix).


In [52]:
query = "How to load pdf files?"
response = rag_chain.invoke({"question": query})

print("🧠 QUESTION:", query)
print("\n💬 ANSWER:\n", response["answer"])

print("\n📚 SOURCES:")
for i, doc in enumerate(response["source_documents"][:2]):
    print(f"\n--- Source {i+1} ---")
    print(f"From: {doc.metadata.get('source_file')}")
    print(f"Section: {doc.metadata.get('section_name')}")
    print(f"Video: {doc.metadata.get('video_title')}")
    print(f"Content preview: {doc.page_content[:250]}...")


🧠 QUESTION: How to load pdf files?

💬 ANSWER:
 You can load PDF files using the following methods:

1. **PyPDFLoader**:
   - Create a loader with the file path (e.g., `data/pdf/attention.pdf`).
   - Call `.load()` to get a list of Document objects, one for each page.
   - Each Document contains `page_content` (extracted text) and `metadata` (fields like page number, source, creator, etc.).

2. **PyMuPDFLoader**:
   - You may need to install the `pymupdf` library first.
   - Create the loader with the same file and call `.load()`.
   - Inspect the returned Documents and metadata.
   - This method is generally fast, robust in text extraction, and supports image extraction.

3. **UnstructuredPDFLoader** (to be discussed later):
   - Useful for complex layouts, PDFs with images, tables, and heuristic segmentation before chunking.

After loading, you should print diagnostics, including the number of pages, a preview of the first page, and the metadata captured.

📚 SOURCES:

--- Source 1 ---