In [10]:
import os
from typing import List,Dict,Any
import pandas as pd

In [11]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)



In [12]:
# Build a Document by hand: the text goes in `page_content` and arbitrary
# key/value pairs describing it go in `metadata`.
sample_metadata = {
    "source": "example.txt",
    "page": 1,
    "author": "Achyut",
    "date_created": "2024-06-15",
    "custom_field": "any value",
}
doc = Document(
    page_content="This is a sample document to demonstrate text splitting.",
    metadata=sample_metadata,
)

# Show both parts of the Document, one labelled line each.
print("Document structure")
for label, value in (("Content", doc.page_content), ("Metadata", doc.metadata)):
    print(f"{label} : {value}")


Document structure
Content : This is a sample document to demonstrate text splitting.
Metadata : {'source': 'example.txt', 'page': 1, 'author': 'Achyut', 'date_created': '2024-06-15', 'custom_field': 'any value'}


In [13]:
# Display the concrete class of `doc` as the cell's output.
type(doc)

langchain_core.documents.base.Document

In [14]:
### Text files (.txt) - The simplest Case (#2-text-files)
# Create the folder that will hold the sample text files. `exist_ok=True`
# keeps the cell re-runnable. `os` is already imported in the notebook's
# first cell, so it is not imported again here.
os.makedirs("data/text_files", exist_ok=True)

In [15]:
# Sample corpus: each key is the path of a file to create, each value is the
# exact text written into it. Whitespace inside the strings is significant
# (the loaders below report 844 and 585 characters respectively).
sample_text = {
    "data/text_files/python_intro.txt":"""Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

1. dollar uipsum dolor sit amet, consectetur adipiscing elit.
2. dollar uipsum dolor sit amet, consectetur adipiscing elit.
3. dollar uipsum dolor sit amet, consectetur adipiscing elit.
4. dollar uipsum dolor sit amet, consectetur adipiscing elit.
5. dollar uipsum dolor sit amet, consectetur adipiscing elit.

     Additional Context: dolor uipsum dolor sit amet, consectetur adipiscing elit. 

 """,
    "data/text_files/machine_learning.txt":"""Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data. It combines aspects of statistics, computer science, and domain expertise to analyze and interpret complex data sets.
Data science is widely used in various industries, including finance, healthcare, marketing, and technology, to make informed decisions and drive innovation.
Key components of data science include data collection, data cleaning, data analysis, machine learning, and data visualization.
 """,
}
for filepath, content in sample_text.items():
    # Create the parent folder on demand so this cell does not fail with
    # FileNotFoundError when the directory-creation cell has not been run.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
print("Sample text files created.")
                       

Sample text files created.


In [24]:
### Text loader - reading a single file
# Import TextLoader from langchain_community only. The legacy
# `langchain.document_loaders` path was a deprecated alias whose binding was
# overwritten by this import anyway, and it is not available in newer
# LangChain releases.
from langchain_community.document_loaders import TextLoader

## Loading a single text file
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()

# Show the container type, the document count, and a short preview of the
# first document's text and metadata.
print(type(documents))
print(f"Number of documents loaded: {len(documents)}")
print(f"Content Preview:{documents[0].page_content[:100]}...")
print(f"MetaData: {documents[0].metadata}")




<class 'list'>
Number of documents loaded: 1
Content Preview:Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore ...
MetaData: {'source': 'data/text_files/python_intro.txt'}


In [26]:
### Directory Loader - multiple text files
# Import from langchain_community only; the legacy `langchain.document_loaders`
# path was a deprecated alias whose binding was overwritten immediately.
# TextLoader is imported here as well so the cell does not rely on an
# earlier cell having brought it into scope.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file under data/text_files (searched recursively via the
# "**/" glob), reading each one with TextLoader as UTF-8.
DIR_LOADER = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)
documents = DIR_LOADER.load()
print(f"Total Documents Loaded: {len(documents)}")

# The loop variable is deliberately not called `doc`, so the sample `doc`
# created earlier in the notebook is not silently replaced by the last
# loaded file.
for doc_number, loaded_doc in enumerate(documents, start=1):
    print(f"\nDocument {doc_number} Content Preview: {loaded_doc.page_content[:100]}...")
    print(f"Document {doc_number} Metadata: {loaded_doc.metadata}")
    print(f"length of document {doc_number}: {len(loaded_doc.page_content)} characters")


100%|██████████| 2/2 [00:00<00:00, 1446.31it/s]

Total Documents Loaded: 2

Document 1 Content Preview: Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore ...
Document 1 Metadata: {'source': 'data/text_files/python_intro.txt'}
length of document 1: 844 characters

Document 2 Content Preview: Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and s...
Document 2 Metadata: {'source': 'data/text_files/machine_learning.txt'}
length of document 2: 585 characters





In [28]:
### Text splitting strategies

# CharacterTextSplitter, RecursiveCharacterTextSplitter and TokenTextSplitter
# are already imported from `langchain_text_splitters` in the imports cell at
# the top of the notebook. The legacy `langchain.text_splitter` path only
# re-exported the same classes and is not available in newer LangChain
# releases, so it is not imported here.
print(f"List of Document before splitting:{documents}")

List of Document before splitting:[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Lorem ipsum dolor sit amet, consectetur adipiscing elit.\nSed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.\nDuis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.\nExcepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n\n1. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n2. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n3. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n4. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n5. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n\n     Additional Context: dolor uipsum dolor sit amet, consectetur adipiscing elit. \n\n '), Document(metadata={'sour

In [29]:
### Method 1 - Character Text splitter
# Select the lorem-ipsum sample by its source path instead of by position.
# The order of `documents` comes from directory globbing, which is not
# guaranteed to be the same on every platform, so `documents[0]` could be
# the other file. If no source matches, fall back to the first document,
# which is what this cell used before.
text = next(
    (
        loaded.page_content
        for loaded in documents
        if str(loaded.metadata.get("source", "")).endswith("python_intro.txt")
    ),
    documents[0].page_content,
)
text

'Lorem ipsum dolor sit amet, consectetur adipiscing elit.\nSed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.\nDuis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.\nExcepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n\n1. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n2. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n3. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n4. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n5. dollar uipsum dolor sit amet, consectetur adipiscing elit.\n\n     Additional Context: dolor uipsum dolor sit amet, consectetur adipiscing elit. \n\n '

In [31]:
print("--- Character Text Splitter ---")

# Split on single newlines only; chunks target 200 characters (measured with
# len) and share a 20-character overlap.
char_splitter_options = dict(
    separator="\n",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
char_splitter = CharacterTextSplitter(**char_splitter_options)
char_chunks = char_splitter.split_text(text)

print(f"Number of chunks created: {len(char_chunks)}")
# Report each chunk with a 1-based number, a preview of up to 100 characters,
# and its full length.
for chunk_number, piece in enumerate(char_chunks, start=1):
    preview = piece[:100]
    print(f"\nChunk {chunk_number} Content Preview: {preview}...")
    print(f"Chunk {chunk_number} Length: {len(piece)} characters")

--- Character Text Splitter ---
Number of chunks created: 6

Chunk 1 Content Preview: Lorem ipsum dolor sit amet, consectetur adipiscing elit....
Chunk 1 Length: 56 characters

Chunk 2 Content Preview: Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nos...
Chunk 2 Length: 174 characters

Chunk 3 Content Preview: Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatu...
Chunk 3 Length: 102 characters

Chunk 4 Content Preview: Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id es...
Chunk 4 Length: 172 characters

Chunk 5 Content Preview: 2. dollar uipsum dolor sit amet, consectetur adipiscing elit.
3. dollar uipsum dolor sit amet, conse...
Chunk 5 Length: 185 characters

Chunk 6 Content Preview: 5. dollar uipsum dolor sit amet, consectetur adipiscing elit.
     Additional Context: dolor uipsum ...
Chunk 6 Length: 144 characters


In [32]:
### Method 2 - Recursive Character Text Splitter
print("\n--- Recursive Character Text Splitter ---")

# Separators are listed from coarsest to finest: paragraph break, line break,
# space, and finally the empty string.
separator_ladder = ["\n\n", "\n", " ", ""]
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=separator_ladder,
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
recursive_chunks = recursive_splitter.split_text(text)

print(f"Number of chunks created: {len(recursive_chunks)}")
# Report each chunk with a 1-based number, a preview of up to 100 characters,
# and its full length.
for chunk_number, piece in enumerate(recursive_chunks, start=1):
    preview = piece[:100]
    print(f"\nChunk {chunk_number} Content Preview: {preview}...")
    print(f"Chunk {chunk_number} Length: {len(piece)} characters")


--- Recursive Character Text Splitter ---
Number of chunks created: 7

Chunk 1 Content Preview: Lorem ipsum dolor sit amet, consectetur adipiscing elit....
Chunk 1 Length: 56 characters

Chunk 2 Content Preview: Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nos...
Chunk 2 Length: 174 characters

Chunk 3 Content Preview: Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatu...
Chunk 3 Length: 102 characters

Chunk 4 Content Preview: Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id es...
Chunk 4 Length: 110 characters

Chunk 5 Content Preview: 1. dollar uipsum dolor sit amet, consectetur adipiscing elit.
2. dollar uipsum dolor sit amet, conse...
Chunk 5 Length: 185 characters

Chunk 6 Content Preview: 4. dollar uipsum dolor sit amet, consectetur adipiscing elit.
5. dollar uipsum dolor sit amet, conse...
Chunk 6 Length: 123 characters

Chunk 7

In [33]:
### Recursive text splitter with a proper example
print("\n--- Recursive Character Text Splitter on Full Document ---")

# Four-line lorem-ipsum sample, assembled line by line and joined with "\n".
simple_text = "\n".join(
    (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
        "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.",
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",
    )
)

# A single-space separator is the only one supplied here, with 80-character
# chunks and a 20-character overlap.
word_level_options = {
    "separators": [" "],
    "chunk_size": 80,
    "chunk_overlap": 20,
    "length_function": len,
}
recursive_splitter = RecursiveCharacterTextSplitter(**word_level_options)
recursive_chunks = recursive_splitter.split_text(simple_text)

print(f"Number of chunks created: {len(recursive_chunks)}")
# Report each chunk with a 1-based number, a preview of up to 100 characters,
# and its full length.
for chunk_number, piece in enumerate(recursive_chunks, start=1):
    preview = piece[:100]
    print(f"\nChunk {chunk_number} Content Preview: {preview}...")
    print(f"Chunk {chunk_number} Length: {len(piece)} characters")


--- Recursive Character Text Splitter on Full Document ---
Number of chunks created: 8

Chunk 1 Content Preview: Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor...
Chunk 1 Length: 78 characters

Chunk 2 Content Preview: do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim...
Chunk 2 Length: 79 characters

Chunk 3 Content Preview: Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut...
Chunk 3 Length: 74 characters

Chunk 4 Content Preview: laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in...
Chunk 4 Length: 73 characters

Chunk 5 Content Preview: aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu...
Chunk 5 Length: 74 characters

Chunk 6 Content Preview: cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non...
Chunk 6 Length: 77 characters

Chunk 7 Content Preview: cupidatat non proident, sunt in culpa qui officia deserunt mollit anim 

In [34]:
### Method 3 - Token Text Splitter
print("\n--- Token Text Splitter ---")

# No separator or length function is passed here, only a chunk size of 50 and
# an overlap of 10. The lengths printed below are character counts of each
# resulting chunk.
token_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=10)
token_chunks = token_splitter.split_text(text)

print(f"Number of chunks created: {len(token_chunks)}")
# Report each chunk with a 1-based number, a preview of up to 100 characters,
# and its full length.
for chunk_number, piece in enumerate(token_chunks, start=1):
    preview = piece[:100]
    print(f"\nChunk {chunk_number} Content Preview: {preview}...")
    print(f"Chunk {chunk_number} Length: {len(piece)} characters")


--- Token Text Splitter ---
Number of chunks created: 8

Chunk 1 Content Preview: Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore ...
Chunk 1 Length: 134 characters

Chunk 2 Content Preview:  magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex...
Chunk 2 Length: 144 characters

Chunk 3 Content Preview: .
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla paria...
Chunk 3 Length: 132 characters

Chunk 4 Content Preview: 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id e...
Chunk 4 Length: 144 characters

Chunk 5 Content Preview: . dollar uipsum dolor sit amet, consectetur adipiscing elit.
2. dollar uipsum dolor sit amet, consec...
Chunk 5 Length: 137 characters

Chunk 6 Content Preview: cing elit.
3. dollar uipsum dolor sit amet, consectetur adipiscing elit.
4. dollar uipsum dolor sit ...
Chunk 6 