In [10]:
from langchain_core.documents import Document
import os

In [11]:
# Ensure the sample-data folders exist before any files are written into them.
for data_dir in ('../data/text_file', '../data/pdf_file'):
    os.makedirs(data_dir, exist_ok=True)


In [12]:
# Sample documents keyed by destination path. The values are written verbatim,
# including the leading newline and indentation inside the triple-quoted strings.
text = {
    '../data/text_file/Python.txt':"""
    Python is a high-level, versatile programming language widely used for data science, machine learning, web development, and automation. Its simplicity
    , rich ecosystem of libraries, and strong community support make it the go-to language for both beginners and professionals.""",
    '../data/text_file/Rag.txt':"""
    RAG is an advanced approach in large language models where external knowledge sources are combined with model outputs. It retrieves relevant documents
     from a database or search system and feeds them into the LLM, improving accuracy, domain adaptation, and freshness of responses.
    """
}

# Write with an explicit UTF-8 encoding so the files on disk match the encoding
# the loaders request when reading them back, regardless of the platform default.
for filepath, content in text.items():
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)


## Text Loader

In [13]:
from langchain_community.document_loaders import TextLoader
# Read a single text file as UTF-8; load() hands back a list of Document objects.
f = TextLoader(
    '../data/text_file/Python.txt',
    encoding='utf-8',
)
doc = f.load()
doc

[Document(metadata={'source': '../data/text_file/Python.txt'}, page_content='\n    Python is a high-level, versatile programming language widely used for data science, machine learning, web development, and automation. Its simplicity\n    , rich ecosystem of libraries, and strong community support make it the go-to language for both beginners and professionals.')]

### Directory Loader

In [14]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under the text folder through TextLoader.
dirload = DirectoryLoader(
    path='../data/text_file',
    # Restrict matching to text files so any stray non-text file in the folder
    # is not handed to TextLoader (same pattern style as the PDF loader cell).
    glob='**/*.txt',
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True
)
dir_docs = dirload.load()
print(dir_docs)

100%|██████████| 2/2 [00:00<00:00, 2456.40it/s]

[Document(metadata={'source': '../data/text_file/Rag.txt'}, page_content='\n    RAG is an advanced approach in large language models where external knowledge sources are combined with model outputs. It retrieves relevant documents\n     from a database or search system and feeds them into the LLM, improving accuracy, domain adaptation, and freshness of responses.\n    '), Document(metadata={'source': '../data/text_file/Python.txt'}, page_content='\n    Python is a high-level, versatile programming language widely used for data science, machine learning, web development, and automation. Its simplicity\n    , rich ecosystem of libraries, and strong community support make it the go-to language for both beginners and professionals.')]





In [None]:
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader

# Collect every PDF under the folder (recursively) and parse it with PyMuPDFLoader.
# NOTE: this rebinds `dirload` and `dir_docs`, replacing the text-file results
# produced by the earlier directory-loader cell.
dirload = DirectoryLoader(
    "../data/pdf_file/",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
)
dir_docs = dirload.load()
print(len(dir_docs))

9


In [19]:
# Split the loaded documents into overlapping chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping character chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # chunk size (characters)
    chunk_overlap=200,  # characters shared between consecutive chunks
    add_start_index=True,  # record each chunk's offset as 'start_index' metadata
)
all_splits = text_splitter.split_documents(dir_docs)

# Report the real input (the loaded documents) rather than a "blog post",
# which this notebook never loads.
print(f"Split {len(dir_docs)} documents into {len(all_splits)} sub-documents.")

Split blog post into 42 sub-documents.


In [20]:
# Preview only the first few chunks; displaying the whole list floods the output.
all_splits[:3]

[Document(metadata={'producer': 'cairo 1.18.0 (https://cairographics.org)', 'creator': '', 'creationdate': '2025-01-14T15:25:37+05:30', 'source': '../data/pdf_file/output.pdf', 'file_path': '../data/pdf_file/output.pdf', 'total_pages': 8, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': "D:20250114152537+05'30", 'page': 0, 'start_index': 0}, page_content='7 – Seeing the World as the Shell Sees It\n7 – Seeing the World as the Shell Sees It\nIn this chapter we are going to look at some of the “magic” that occurs on the command\nline when we press the Enter key. While we will examine several interesting and complex\nfeatures of the shell, we will do it with just one new command.\n●\necho – Display a line of text\nExpansion\nEach time we type a command and press the Enter key, bash performs several substitu-'),
 Document(metadata={'producer': 'cairo 1.18.0 (https://cairographics.org)', 'creator': '',