In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [4]:
from langchain_core.documents import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

print("All libraries imported!")

All libraries imported!


### Understanding Document Structure in LangChain

In [None]:
## Create a simple document
doc = Document(
    page_content="This is the main text content that will be embedded and searched.",
    metadata={
        "source":"example.txt",
        "page":1,
        "author":"Abdullah Khan",
        "date_created":"2025-01-01",
        "custom_field":"any_value"
    }
)

In [34]:
print("Document Structure")

print(f"Content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")

# Why metadata matters:
print("\n📝 Metadata is crucial for:")
print("- Filtering search results")
print("- Tracking document sources")
print("- Providing context in responses")
print("- Debugging and auditing")

Document Structure
Content: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems


    
Metadata: {'source': 'data/text_files/machine_learning.txt'}

📝 Metadata is crucial for:
- Filtering search results
- Tracking document sources
- Providing context in responses
- Debugging and auditing


In [33]:
type(doc)

langchain_core.documents.base.Document

## Text files (.txt)

### TextLoader - Read single file

In [12]:
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

## Loading python_intro file
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")

loader

<langchain_community.document_loaders.text.TextLoader at 0x1088bddc0>

In [14]:
documents = loader.load()
print(type(documents))
print(documents)

<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [15]:
print(f"Loaded: {len(documents)} documents")
print(f"Content preview: {documents[0].page_content[:100]}...")
print(f"Metadata: {documents[0].metadata}")

Loaded: 1 documents
Content preview: Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
Metadata: {'source': 'data/text_files/python_intro.txt'}


### DirectoryLoader - Multiple text files

In [35]:
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True
)

documents = dir_loader.load()

print(f"Loaded {len(documents)} documents")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f"  Length: {len(doc.page_content)} characters")

100%|██████████| 2/2 [00:00<00:00, 350.28it/s]

Loaded 2 documents

Document 1:
  Source: data/text_files/python_intro.txt
  Length: 489 characters

Document 2:
  Source: data/text_files/machine_learning.txt
  Length: 575 characters





In [36]:
print("\n📊 DirectoryLoader Characteristics:")
print("✅ Advantages:")
print("  - Loads multiple files at once")
print("  - Supports glob patterns")
print("  - Progress tracking")
print("  - Recursive directory scanning")

print("\n❌ Disadvantages:")
print("  - All files must be same type")
print("  - Limited error handling per file")
print("  - Can be memory intensive for large directories")


📊 DirectoryLoader Characteristics:
✅ Advantages:
  - Loads multiple files at once
  - Supports glob patterns
  - Progress tracking
  - Recursive directory scanning

❌ Disadvantages:
  - All files must be same type
  - Limited error handling per file
  - Can be memory intensive for large directories


## Text Splitting Strategies

In [17]:
### Different Splitting techniques
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

In [23]:
### Method 1 - Character Text Splitter
text = documents[0].page_content
print("1. CharacterTextSplitter")
char_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

char_chunks = char_splitter.split_text(text)
print(f"Create {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}")

1. CharacterTextSplitter
Create 3 chunks
First chunk: Python Programming Introduction
Python is a high-level, interpreted programming language known for i


In [24]:
print(char_chunks[0])
print("_________")
print(char_chunks[1])

Python Programming Introduction
Python is a high-level, interpreted programming language known for its simplicity and readability.
_________
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.
Key Features:
- Easy to learn and use
- Extensive standard library


In [28]:
### Method 2 - Recursive Character Text Splitter
text = documents[0].page_content
print("2. RecursiveCharacterTextSplitter")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n", " ", ""],
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

char_chunks = recursive_splitter.split_text(text)
print(f"Create {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}")

2. RecursiveCharacterTextSplitter
Create 3 chunks
First chunk: Python Programming Introduction

Python is a high-level, interpreted programming language known for 


In [29]:
print(char_chunks[0])
print("_________")
print(char_chunks[1])
print("_________")
print(char_chunks[2])

Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
_________
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
_________
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.


In [None]:
## Create text without natural breakpoints
text = documents[0].page_content
print("2. RecursiveCharacterTextSplitter")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=[" "],
    chunk_size=80,
    chunk_overlap=20,
    length_function=len
)

char_chunks = recursive_splitter.split_text(text)
for i in range(len(char_chunks) - 1):
    print(f"Chunk {i+1}: '{char_chunks[i]}'")
    print(f"Chunk {i+2}: '{char_chunks[i+1]}'")
    print()
    

2. RecursiveCharacterTextSplitter
Chunk 1: 'Python Programming Introduction

Python is a high-level, interpreted programming'
Chunk 2: 'programming language known for its simplicity and readability.
Created by Guido'

Chunk 2: 'programming language known for its simplicity and readability.
Created by Guido'
Chunk 3: 'by Guido van Rossum and first released in 1991, Python has become one of the'

Chunk 3: 'by Guido van Rossum and first released in 1991, Python has become one of the'
Chunk 4: 'become one of the most popular
programming languages in the world.

Key'

Chunk 4: 'become one of the most popular
programming languages in the world.

Key'
Chunk 5: 'in the world.

Key Features:
- Easy to learn and use
- Extensive standard'

Chunk 5: 'in the world.

Key Features:
- Easy to learn and use
- Extensive standard'
Chunk 6: 'Extensive standard library
- Cross-platform compatibility
- Strong community'

Chunk 6: 'Extensive standard library
- Cross-platform compatibility
- Strong community'

In [31]:
### TokenTextSplitter
text = documents[0].page_content
token_splitter = TokenTextSplitter(
    chunk_size=50,
    chunk_overlap=10
)

char_chunks = token_splitter.split_text(text)

print(char_chunks[0])
print(char_chunks[1])

Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in
 one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web


In [37]:
# 📊 Comparison
print("\n📊 Text Splitting Methods Comparison:")
print("\nCharacterTextSplitter:")
print("  ✅ Simple and predictable")
print("  ✅ Good for structured text")
print("  ❌ May break mid-sentence")
print("  Use when: Text has clear delimiters")

print("\nRecursiveCharacterTextSplitter:")
print("  ✅ Respects text structure")
print("  ✅ Tries multiple separators")
print("  ✅ Best general-purpose splitter")
print("  ❌ Slightly more complex")
print("  Use when: Default choice for most texts")

print("\nTokenTextSplitter:")
print("  ✅ Respects model token limits")
print("  ✅ More accurate for embeddings")
print("  ❌ Slower than character-based")
print("  Use when: Working with token-limited models")


📊 Text Splitting Methods Comparison:

CharacterTextSplitter:
  ✅ Simple and predictable
  ✅ Good for structured text
  ❌ May break mid-sentence
  Use when: Text has clear delimiters

RecursiveCharacterTextSplitter:
  ✅ Respects text structure
  ✅ Tries multiple separators
  ✅ Best general-purpose splitter
  ❌ Slightly more complex
  Use when: Default choice for most texts

TokenTextSplitter:
  ✅ Respects model token limits
  ✅ More accurate for embeddings
  ❌ Slower than character-based
  Use when: Working with token-limited models
