# Data Ingestion

In [5]:
from langchain_core.documents import Document

In [6]:
# Build a Document by hand: the text goes in `page_content`, while `metadata`
# carries descriptive fields about that text.
doc_metadata = dict(
    source="my_document.txt",
    author="Aditya Singh",
    created_at="2025-10-16",
)
doc = Document(
    metadata=doc_metadata,
    page_content="This is the main text content I am using to create RAG",
)
doc

Document(metadata={'source': 'my_document.txt', 'author': 'Aditya Singh', 'created_at': '2025-10-16'}, page_content='This is the main text content I am using to create RAG')

In [9]:
import os
# Folder that will hold the sample .txt files; exist_ok=True means re-running
# this cell does not raise when the folder is already there.
text_files_dir = "../data/text_files"
os.makedirs(text_files_dir, exist_ok=True)

In [10]:
# Sample corpus for the loader demos below: maps each target file path to the
# exact text written to it. The literals end right after their last sentence so
# that no stray indentation/blank lines leak into the ingested documents.
sample_texts = {
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems""",
}

for filepath, content in sample_texts.items():
    # Make sure the target folder exists before writing, so re-running this
    # cell still works after the data folder has been removed.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [15]:

# Installs the package that the cells below import their document loaders from
# (TextLoader, DirectoryLoader, PyMuPDFLoader).
%pip install langchain_community

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [20]:

from langchain_community.document_loaders import TextLoader

# Load a single text file; the result is displayed as the cell output.
python_intro_path = "../data/text_files/python_intro.txt"
loader = TextLoader(python_intro_path, encoding="utf-8")
document = loader.load()
document


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]

In [23]:
from langchain_community.document_loaders import DirectoryLoader

# Load every file under ../data/text_files that matches **/*.txt, reading each
# one with TextLoader using UTF-8.
text_loader_options = {
    "glob": "**/*.txt",
    "loader_cls": TextLoader,
    "loader_kwargs": {"encoding": "utf-8"},
    "show_progress": False,
}
dir_loader = DirectoryLoader("../data/text_files", **text_loader_options)

documents = dir_loader.load()
documents

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.'),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervise

In [25]:
%pip install Pypdf

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [30]:
from langchain_community.document_loaders import DirectoryLoader,PyMuPDFLoader

# Load every file under ../data/pdf that matches **/*.pdf, parsing each one
# with PyMuPDFLoader (no extra loader kwargs are passed).
pdf_loader_options = {
    "glob": "**/*.pdf",
    "loader_cls": PyMuPDFLoader,
    "show_progress": False,
}
dir_loader = DirectoryLoader("../data/pdf", **pdf_loader_options)
documents = dir_loader.load()
documents

[Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/pdf1.pdf', 'file_path': '../data/pdf/pdf1.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '1NH23CD194_Aditya_Kumar', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="Aditya Kumar \n \n📍Bengaluru,IN \n📞+917061939341 \n✉️seemeadit21824@gmail.com  \nGitHub \n \nLinkedin\nSummary \nPassionate MERN stack developer with proficiency in data structures & algorithms, and basic machine \nlearning. Dedicated to building scalable web applications, optimizing performance, and leveraging \ntechnology to create innovative, real-world solutions that drive meaningful impact. \n \nSkills \n●\u200b\nProgramming Languages: | Python | C | C++ | Java | JavaScript | SQL | PHP | \n●\u200b\nFrontend Technologies: | HTML | Bootstrap | JS | TailwindCSS | React | \n●\u200b\nBackend Technologies: 

In [31]:
# Inspect the class of the first loaded item.
first_document = documents[0]
type(first_document)

langchain_core.documents.base.Document