### Introduction To Data Ingestion

In [5]:
import os
from typing import List, Dict, Any
import pandas as pd


In [6]:
# Core LangChain pieces used in this notebook: the Document container and
# the three text-splitter classes that are compared further down.
from langchain_core.documents import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter

print("Set up completed")


Set up completed


#### Understanding Document Structure in LangChain

In [7]:
## Create a simple Document: the text payload plus a free-form metadata dict.
doc = Document(
    # page_content is the text that gets embedded and searched.
    page_content="This is the main text content that will be embedded and searched",
    # metadata travels with the content; the keys below are illustrative.
    metadata={
        "source":"example.txt",
        "page":1,
        "author":"Aditya",
        "custom_field":"any value"
    }
)
print("Document Structure")

# Show the two attributes that make up a Document.
print(f"content :{doc.page_content}")
print(f"metadata :{doc.metadata}")

Document Structure
content :This is the main text content that will be embedded and searched
metadata :{'source': 'example.txt', 'page': 1, 'authon': 'Aditya', 'custom_field': 'any value'}


### Text Files (.txt)

In [8]:
# Make sure the folder for the sample text files exists.
# exist_ok=True keeps the cell from failing when it is re-run.
import os
os.makedirs(name="data/text_file", exist_ok=True)

In [11]:
# Maps each target file path to the text that should be written there.
sample_text={
    "data/text_file/python_intro.txt":"Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more. Key features of Python: Easy to learn and use, Large standard library, Cross-platform compatibility, Strong community support. Example: print('Hello, World!')",
    "data/text_file/ml_intro.txt":"Machine Learning (ML) is a field of artificial intelligence that enables computers to learn from data and make predictions or decisions without being explicitly programmed. ML is used in applications such as image recognition, natural language processing, and recommendation systems. Key concepts include supervised learning, unsupervised learning, and reinforcement learning. Example: Training a model to classify emails as spam or not spam."
}

# Write every entry to disk as UTF-8; mode "w" overwrites a file that already exists.
for filepath in sample_text:
    content = sample_text[filepath]
    with open(filepath, mode="w", encoding="utf-8") as f:
        f.write(content)

print("Sample file has been created")

Sample file has been created


### TextLoader - Read Single File

In [13]:
# Import TextLoader once, from langchain_community. The legacy
# `langchain.document_loaders` path is a deprecated alias for the same class,
# so importing from both places was redundant.
from langchain_community.document_loaders import TextLoader

## Loading a single text file
loader = TextLoader("data/text_file/python_intro.txt",encoding="utf-8")

# load() gives back a list of Document objects, even for a single file.
documents = loader.load()
print(type(documents))
print(documents)


# Summarise what was loaded: count, a 100-character preview, and the metadata.
print(f"Loaded {len(documents)} document")
print(f"Content preview: {documents[0].page_content[:100]} ....")
print(f"Metadata: {documents[0].metadata}")

<class 'list'>
[Document(metadata={'source': 'data/text_file/python_intro.txt'}, page_content="Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more. Key features of Python: Easy to learn and use, Large standard library, Cross-platform compatibility, Strong community support. Example: print('Hello, World!')")]
Loaded 1 document
Content preview: Python is a popular, high-level programming language known for its simplicity and readability. It su ....
Metadata: {'source': 'data/text_file/python_intro.txt'}


### DirectoryLoader - Multiple Text Files

In [21]:
# Import both loaders in this cell so it does not rely on an import made in an
# earlier cell, and use langchain_community rather than the deprecated
# `langchain.document_loaders` path.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

## Load all the text files from the directory
dir_loader = DirectoryLoader(
    "data/text_file",
    glob="**/*.txt",  # pattern of files to match
    loader_cls=TextLoader,  # loader class used for each matched file
    loader_kwargs={'encoding':'utf-8'},  # keyword arguments handed to loader_cls
    show_progress=True

)

documents=dir_loader.load()

print(f"Loaded {len(documents)} documents")

# One short summary per loaded document: where it came from and how long it is.
for i,doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"       Source: {doc.metadata['source']}")
    print(f"       Length: {len(doc.page_content)} characters")


100%|██████████| 2/2 [00:00<00:00, 2340.57it/s]

Loaded 2 documents

Document 1:
       Source: data/text_file/python_intro.txt
       Length: 467 characters

Document 2:
       Source: data/text_file/ml_intro.txt
       Length: 442 characters





### Text Splitting Strategies

In [39]:
# Splitter classes compared in this section, followed by a dump of the
# documents loaded above so the input to the splitters is visible.
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter

print(documents)

[Document(metadata={'source': 'data/text_file/python_intro.txt'}, page_content="Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more. Key features of Python: Easy to learn and use, Large standard library, Cross-platform compatibility, Strong community support. Example: print('Hello, World!')"), Document(metadata={'source': 'data/text_file/ml_intro.txt'}, page_content='Machine Learning (ML) is a field of artificial intelligence that enables computers to learn from data and make predictions or decisions without being explicitly programmed. ML is used in applications such as image recognition, natural language processing, and recommendation systems. Key concepts include supervised learning, unsupervised learning, and reinforcement learnin

In [40]:
### Input text: raw content of the first loaded document
text=documents[0].page_content
text  # bare expression so the notebook echoes the string as the cell output

"Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more. Key features of Python: Easy to learn and use, Large standard library, Cross-platform compatibility, Strong community support. Example: print('Hello, World!')"

In [46]:
### Method 1 - Character-based Text Splitter
print("Character-based Text Splitter")

# Splits on one fixed separator (a space here) and packs the pieces into chunks.
char_splitter = CharacterTextSplitter(
    chunk_size=200,       # max size of each chunk
    chunk_overlap=20,     # overlap between consecutive chunks
    separator=" ",        # the single separator to split on
    length_function=len,  # function used to measure length
)

char_chunks = char_splitter.split_text(text)

# Report how many chunks came out and show the first one in full.
print(f"Total Chunks: {len(char_chunks)}")
print(f"First Chunk: {char_chunks[0]}...")

Character-based Text Splitter
Total Chunks: 3
First Chunk: Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional...


In [48]:
# Show up to the first three chunks with a divider line between them.
# Slicing instead of indexing 0, 1 and 2 avoids an IndexError when the
# splitter produces fewer than three chunks.
print(*char_chunks[:3], sep="\n-------------------------------------------\n")

Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional
-------------------------------------------
and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more. Key features of Python: Easy to learn and use, Large standard
-------------------------------------------
use, Large standard library, Cross-platform compatibility, Strong community support. Example: print('Hello, World!')


In [50]:
# Method 2 - RecursiveCharacterTextSplitter (Recommended)
print("Recursive Character-based Text Splitter")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n", " ",""],  # Try these separators in order
    chunk_size=200,  # max size of each chunk
    chunk_overlap=20,  # overlap between chunks
    length_function=len  # function to measure length
)
recursive_chunks = recursive_splitter.split_text(text)
print(f"Total Chunks: {len(recursive_chunks)}")
print(f"First Chunk: {recursive_chunks[0][:100]}...")


# Show up to the first three chunks with a divider line between them.
# Slicing instead of indexing 0, 1 and 2 avoids an IndexError when the
# splitter produces fewer than three chunks.
print(*recursive_chunks[:3], sep="\n-------------------------------------------\n")

Recursive Character-based Text Splitter
Total Chunks: 3
First Chunk: Python is a popular, high-level programming language known for its simplicity and readability. It su...
Python is a popular, high-level programming language known for its simplicity and readability. It supports multiple programming paradigms, including procedural, object-oriented, and functional
-------------------------------------------
and functional programming. Python is widely used in web development, data science, automation, artificial intelligence, and more. Key features of Python: Easy to learn and use, Large standard
-------------------------------------------
use, Large standard library, Cross-platform compatibility, Strong community support. Example: print('Hello, World!')


In [53]:
# Method 3 - Token-based Text Splitter
print("Token-based Text Splitter")

# For this splitter both sizes are counted in tokens rather than characters.
token_splitter = TokenTextSplitter(
    chunk_overlap=10,  # overlap between chunks in tokens
    chunk_size=50,     # max size of each chunk in tokens
)

token_chunks = token_splitter.split_text(text)

print(f"Total Chunks: {len(token_chunks)}")
print(f"First Chunk: {token_chunks[0][:100]}...")
# len() of a string counts characters, so this is the first chunk's character length.
print(len(token_chunks[0]))


Token-based Text Splitter
Total Chunks: 2
First Chunk: Python is a popular, high-level programming language known for its simplicity and readability. It su...
273
