## Data ingestion

In [15]:
### Document structure

from langchain_core.documents import Document

In [16]:
# A Document bundles the raw text (page_content) with a free-form metadata
# dict; here the metadata records where the text came from.
doc_metadata = {
    "source": "example.pdf",
    "page": 1,
    "author": "John Doe",
    "date_created": "2021-01-01",
}

doc = Document(
    metadata=doc_metadata,
    page_content="Hello, world! this is a test document",
)

# Bare last expression so the notebook displays the Document's repr.
doc

Document(metadata={'source': 'example.pdf', 'page': 1, 'author': 'John Doe', 'date_created': '2021-01-01'}, page_content='Hello, world! this is a test document')

In [19]:
### write the sample text files used by the loader cells below

import os

# exist_ok=True keeps this cell re-runnable: no error if the folder is already there.
os.makedirs("../data/text_files", exist_ok=True)

# NOTE: the leading spaces on the 2nd and 3rd lines of this literal are inside
# the triple-quoted string, so they are written to the file as-is.
python_intro_text = """Python is a popular, high-level programming language created by Guido van Rossum and first released in 1991. It is known for its simple and readable syntax, which makes it easy for beginners to learn. Python supports multiple programming styles, including procedural, object-oriented, and functional programming.
        It is widely used for web development, data analysis, automation, scientific computing, artificial intelligence, and more. Python runs on many platforms like Windows, macOS, and Linux, making it very versatile.
        Because Python code is easy to write and read, it is often used for rapid prototyping as well as production software."""

python_features_text = "Some of the features of Python are: \n\n 1. Easy to learn \n 2. Interactive \n 3. Dynamically typed \n 4. Extensible and embeddable \n 5. Large standard library \n 6. Extensive support libraries \n 7. Platform independent \n 8. Free and open source"

# Maps each destination path to the text that should be stored there.
sample_texts = {
    "../data/text_files/python_intro_text.txt": python_intro_text,
    "../data/text_files/python_features_text.txt": python_features_text,
}

# How each file is written:
#   - mode "w" creates the file, or truncates it if it already exists
#   - encoding="utf-8" makes the on-disk bytes independent of the platform default
#   - the with-block closes the handle even if the write raises
for target_path in sample_texts:
    with open(target_path, mode="w", encoding="utf-8") as handle:
        handle.write(sample_texts[target_path])

print("Files created successfully!")
        

Files created successfully!


### Document loader

In [None]:
### text loader: read one text file into a list of Document objects

from langchain.document_loaders import TextLoader
# or
# from langchain_community.document_loaders import TextLoader
# either way, the loader is the same
# NOTE(review): recent LangChain releases appear to deprecate the
# `langchain.document_loaders` path in favour of `langchain_community` --
# confirm against the installed version before switching.

# The file was written with encoding="utf-8" earlier in this notebook, so read
# it back with the same encoding instead of relying on the platform default
# (which is not UTF-8 on every system and can mis-decode or fail).
loader = TextLoader(
    "../data/text_files/python_intro_text.txt",
    encoding="utf-8",
)

# load() returns a list of Documents; the printed output below shows one
# Document whose metadata["source"] is the file path.
docs = loader.load()

print(docs)


[Document(metadata={'source': '../data/text_files/python_intro_text.txt'}, page_content='Python is a popular, high-level programming language created by Guido van Rossum and first released in 1991. It is known for its simple and readable syntax, which makes it easy for beginners to learn. Python supports multiple programming styles, including procedural, object-oriented, and functional programming.\n        It is widely used for web development, data analysis, automation, scientific computing, artificial intelligence, and more. Python runs on many platforms like Windows, macOS, and Linux, making it very versatile.\n        Because Python code is easy to write and read, it is often used for rapid prototyping as well as production software.')]


### Directory loader

In [30]:
from langchain.document_loaders import DirectoryLoader

# Load every *.txt file directly inside ../data/text_files, using TextLoader
# (imported in the text-loader cell above) for each match.
dirLoader = DirectoryLoader(
    "../data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,
    # Forwarded to each TextLoader: the files were written as UTF-8, so decode
    # them as UTF-8 rather than with the platform's default encoding.
    loader_kwargs={"encoding": "utf-8"},
)

# One Document per matched file, each tagged with its source path in metadata.
docs = dirLoader.load()

print(docs)

[Document(metadata={'source': '../data/text_files/python_intro_text.txt'}, page_content='Python is a popular, high-level programming language created by Guido van Rossum and first released in 1991. It is known for its simple and readable syntax, which makes it easy for beginners to learn. Python supports multiple programming styles, including procedural, object-oriented, and functional programming.\n        It is widely used for web development, data analysis, automation, scientific computing, artificial intelligence, and more. Python runs on many platforms like Windows, macOS, and Linux, making it very versatile.\n        Because Python code is easy to write and read, it is often used for rapid prototyping as well as production software.'), Document(metadata={'source': '../data/text_files/python_features_text.txt'}, page_content='Some of the features of Python are: \n\n 1. Easy to learn \n 2. Interactive \n 3. Dynamically typed \n 4. Extensible and embeddable \n 5. Large standard libr