Data Ingestion


In [13]:
### Document data structure

from langchain_core.documents import Document

In [14]:
# Hand-build a single Document: a text payload plus a metadata dict describing it.
doc = Document(
    metadata={
        "source": "example.txt",
        "pages": 1,
        "author": "Harshul Chandrashekhar",
        "date_created": "2025-10-02",
    },
    page_content="This is the main text content I am using to create a RAG",
)

In [15]:
## Make sure the folder that will hold the sample text files exists
import os

# exist_ok=True: no error is raised when the folder is already present.
os.makedirs(name="../data/text_files", exist_ok=True)

In [16]:
# Maps each output path to the text that will be written to it.
sample_texts={
    "../data/text_files/python_intro.txt":"""
    Python is a high-level, interpreted programming language known for its clear syntax and readability.
    It is widely used in various domains, including web development, data analysis, artificial intelligence,
    and scientific computing. Python's design philosophy emphasizes code readability, allowing developers to express
    concepts in fewer lines of code compared to languages like C++ or Java.
    """,
    "../data/text_files/machine_learning.txt":""" Machine learning (ML) is a branch of artificial intelligence (AI) that enables computers to learn from data and improve their performance on a task without being explicitly programmed. """
}

# Write every entry to disk as UTF-8; "w" mode replaces any existing file at that path.
for target_path in sample_texts:
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(sample_texts[target_path])

print("Sample text file created")

Sample text file created


In [17]:
### TextLoader: load one text file into a list of Document objects
from langchain_community.document_loaders import TextLoader

loader = TextLoader(
    file_path="../data/text_files/python_intro.txt",
    encoding="utf-8",
)
docs = loader.load()
print(docs)


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="\n    Python is a high-level, interpreted programming language known for its clear syntax and readability.\n    It is widely used in various domains, including web development, data analysis, artificial intelligence,\n    and scientific computing. Python's design philosophy emphasizes code readability, allowing developers to express\n    concepts in fewer lines of code compared to languages like C++ or Java.\n    ")]


In [19]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

## Load every file under the folder that matches the **/*.txt glob,
## handing each one to TextLoader with UTF-8 decoding.
dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",
    show_progress=False,
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

documents = dir_loader.load()
documents



[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="\n    Python is a high-level, interpreted programming language known for its clear syntax and readability.\n    It is widely used in various domains, including web development, data analysis, artificial intelligence,\n    and scientific computing. Python's design philosophy emphasizes code readability, allowing developers to express\n    concepts in fewer lines of code compared to languages like C++ or Java.\n    "),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content=' Machine learning (ML) is a branch of artificial intelligence (AI) that enables computers to learn from data and improve their performance on a task without being explicitly programmed. ')]

In [27]:
### PDF Loader
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

# Load every file under ../data/pdf_files that matches the **/*.pdf glob,
# parsing each one with PyMuPDFLoader.
pdf_loader = DirectoryLoader(
    "../data/pdf_files",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)
pdf_docs = pdf_loader.load()

# Only the last expression of a cell is displayed, so show the result exactly once.
pdf_docs


[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': "../data/pdf_files/Bill's_Windsurf_Shop_Invoice.pdf", 'file_path': "../data/pdf_files/Bill's_Windsurf_Shop_Invoice.pdf", 'total_pages': 1, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="Invoice for Bill's Windsurf Shop\nEmail: Surf@Intuit.com\nInvoice Details:\nDescription\nQty\nUnit Price\nAmount\nDesign Service\n1\n$500.00\n$500.00\nConsulting\n2\n$200.00\n$400.00\nInstallation\n1\n$300.00\n$300.00\nTotal\n$1200.00\nInvoice Date: October 20, 2025\nDue Date: November 19, 2025"),
 Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': "../data/pdf_files/Amy's_Bird_Sanctuary_Invoice.pdf", 'file_path': "../data/pdf_files/Amy's_Bird_Sanctuary_Invoice.pdf", 'total_pages': 1, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 