## Data Ingestion

In [1]:
import os
from typing import List,Dict, Any
import pandas as pd

In [2]:
from langchain_core.documents import Document

In [3]:
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Understanding document structure in Langchain

### Error faced :

    Error 1 : import mistake, langchain_core.document --- it should be documents
    Error 2 : The parameters inside the Document() are specifically -  page_content, metadata


In [5]:
doc = Document(
    page_content = "This is the main text content that will be embedded and searched",
    metadata ={
        'source' : 'example.txt',
        'page' : 1,
        "author" : 'Akash Deep Sangwan',
        "date_created" : "2025-01-01",
        'custom-field' : 'any_value'
    }
)

print("Document Structure")
print(f"Content : {doc.page_content}")
print(f"Matadeta : {doc.metadata}")

"""
Metadata is Crucial :
    Filtering search result
    Tracking Document Sources
    Provide context in the Responses
    Debugging
"""

Document Structure
Content : This is the main text content that will be embedded and searched
Matadeta : {'source': 'example.txt', 'page': 1, 'author': 'Akash Deep Sangwan', 'date_created': '2025-01-01', 'custom-field': 'any_value'}


'\nMetadata is Crucial :\n    Filtering search result\n    Tracking Document Sources\n    Provide context in the Responses\n    Debugging\n'

## Reading a txt file

In [11]:
import os
os.makedirs("data/text_files", exist_ok=True)


In [None]:
sample_texts = {
    "data/text_files/python_intro.txt" :
    """
    Python is a high-level, interpreted programming language that emphasizes simplicity, 
    readability, and versatility. Developed by Guido van Rossum and released in 1991,
    Python has become one of the most popular languages in the world. It uses clear and 
    easy-to-understand syntax, making it an excellent choice for beginners while still being powerful 
    enough for professionals.

    Python supports multiple programming paradigms such as procedural, object-oriented, and functional 
    programming. It comes with a vast standard library and numerous third-party modules that allow developers 
    to perform a wide range of tasks — from web and software development to data analysis, machine learning, 
    and automation.

    Its cross-platform compatibility, active community, and ease of integration with other languages make 
    Python ideal for modern computing applications and rapid prototyping.
    """
}

# Inside sample_text : filepath is created and as it's value content of the file is provided

for filepath,content in sample_texts.items():
    with open(filepath, 'w', encoding = "utf-8") as f :
        f.write(content)

print("sample text file created!")
    


sample text file created!


### TextLoader - Read Single File

In [8]:
from langchain_community.document_loaders import TextLoader

# Syntax for TextLoader("filepath")
loader = TextLoader("data/text_files/python_intro.txt", encoding = 'utf-8')

documents = loader.load()
print(type(documents))
print(documents)

"""
 TextLoader function :
    - Parameters= : Filepath, Encoding - "UTF-8"
    - Output's the type : list
    - list[0] - output's the page1
    - lis1[0] - output's the page2
"""



<class 'list'>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='\n    Python is a high-level, interpreted programming language that emphasizes simplicity, \n    readability, and versatility. Developed by Guido van Rossum and released in 1991,\n    Python has become one of the most popular languages in the world. It uses clear and \n    easy-to-understand syntax, making it an excellent choice for beginners while still being powerful \n    enough for professionals.\n\n    Python supports multiple programming paradigms such as procedural, object-oriented, and functional \n    programming. It comes with a vast standard library and numerous third-party modules that allow developers \n    to perform a wide range of tasks — from web and software development to data analysis, machine learning, \n    and automation.\n\n    Its cross-platform compatibility, active community, and ease of integration with other languages make \n    Python ideal for modern computing 

'\n TextLoader function :\n    - Parameters= : Filepath, Encoding - "UTF-8"\n    - Output\'s the type : list\n    - list[0] - output\'s the page1\n    - lis1[0] - output\'s the page2\n'

### It means till we learned about the metadata and content of the file
- In build function like the text loader also performs the same functionality
- Just like the TextLoader there is library called directory loader - It can load the multiple files in one go
- However, the text-loader loads the one file at a time
- For DirectoryLoader to load the files has pre-condition that all the files in that directory should be of same type


## Text Splitting Techniques

In [9]:
print(documents) # will going to split the text of the documents into smaller chunks
text = documents[0] # return with metadata plus document content

t1 = documents[0].page_content # only returns the content
print(type(t1))  # page content contains the text - str

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='\n    Python is a high-level, interpreted programming language that emphasizes simplicity, \n    readability, and versatility. Developed by Guido van Rossum and released in 1991,\n    Python has become one of the most popular languages in the world. It uses clear and \n    easy-to-understand syntax, making it an excellent choice for beginners while still being powerful \n    enough for professionals.\n\n    Python supports multiple programming paradigms such as procedural, object-oriented, and functional \n    programming. It comes with a vast standard library and numerous third-party modules that allow developers \n    to perform a wide range of tasks — from web and software development to data analysis, machine learning, \n    and automation.\n\n    Its cross-platform compatibility, active community, and ease of integration with other languages make \n    Python ideal for modern computing applications an

In [None]:
"""
split.tex() ---> only accepts the argument, if it's a string
    - document[0] : will produce an Error, cause it's type is LangChain Document not a string
    - Solution :
         - document[0].pageContent : will return the <str>, it's a valid input for split text




"""

In [10]:
# method 1 : character text splitter
print("CHARACTER TEXT SPLITTER")

char_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 200,
    chunk_overlap = 20,
    length_function = len
)

char_chunk = char_splitter.split_text(t1) 


print(f"Created {len(char_chunk)} chunks")
print(f"First chunk : {char_chunk[0][:200]}")



"""
In here, clean splitting is being done
We used \n as seperator - overlap function doesn't work
"""

CHARACTER TEXT SPLITTER
Created 6 chunks
First chunk : Python is a high-level, interpreted programming language that emphasizes simplicity, 
    readability, and versatility. Developed by Guido van Rossum and released in 1991,


"\nIn here, clean splitting is being done\nWe used \n as seperator - overlap function doesn't work\n"

In [None]:
print(char_chunk[0])
print("-------------------------------------------")
print(char_chunk[1])
print("------------------------------------------")
print(char_chunk[2])
print("-----------------------------------------------")
print(char_chunk[3])
print("-----------------------------------------------")
print(char_chunk[4])
print("-----------------------------------------------")
print(char_chunk[5])
print("-----------------------------------------------")


# Overlap is happening here, for the output the clear split has been used




Python is a high-level, interpreted programming language that emphasizes simplicity, 
    readability, and versatility. Developed by Guido van Rossum and released in 1991,
-------------------------------------------
Python has become one of the most popular languages in the world. It uses clear and 
    easy-to-understand syntax, making it an excellent choice for beginners while still being powerful
------------------------------------------
enough for professionals.
    Python supports multiple programming paradigms such as procedural, object-oriented, and functional
-----------------------------------------------
programming. It comes with a vast standard library and numerous third-party modules that allow developers
-----------------------------------------------
to perform a wide range of tasks — from web and software development to data analysis, machine learning, 
    and automation.
-----------------------------------------------
and automation.
    Its cross-platform compatibil

### Method 2 : Recursive Character Text Splitter

- common error
    1) Separators - while using list, when it's a single separator 
    

In [47]:
recursive_splitter = RecursiveCharacterTextSplitter(
    separators = ['\n\n', '\n', " ", ""],
    chunk_size = 200,
    chunk_overlap = 20,
    length_function= len
)

recursive_chunk = recursive_splitter.split_text(t1)
print(f"Created {len(recursive_chunk)} chunks")
print(f"First chunk : {recursive_chunk[0][:200]}") 


Created 7 chunks
First chunk : Python is a high-level, interpreted programming language that emphasizes simplicity, 
    readability, and versatility. Developed by Guido van Rossum and released in 1991,


In [None]:
print(recursive_chunk[0])
print("----------------------------")
print(recursive_chunk[1])
print("-----------------------------")
print(recursive_chunk[2])
print("-----------------------------")
print(recursive_chunk[3])
print("-----------------------------")


# Here we don't get to see any overlapping, it coz of the separator we have used, provides the clean split =\n


"""
There is some problem here, this code should have the splitting 
"""

Python is a high-level, interpreted programming language that emphasizes simplicity, 
    readability, and versatility. Developed by Guido van Rossum and released in 1991,
----------------------------
Python has become one of the most popular languages in the world. It uses clear and 
    easy-to-understand syntax, making it an excellent choice for beginners while still being powerful
-----------------------------
enough for professionals.
-----------------------------
Python supports multiple programming paradigms such as procedural, object-oriented, and functional
-----------------------------


In [None]:
"""
Character based Splitting

"""

In [None]:
char_splitter = CharacterTextSplitter(
    separator = " ", # split on new = lines
    chunk_size =  200,
    chunk_overlap = 20,
    length_function= len
)

char_chunk = char_splitter.split_text(t1)

In [14]:
print(f"No. of chunks : {len(char_chunk)}")

No. of chunks : 5


In [20]:
for i in range(len(char_chunk)-1):
    print(f"chunk : {i+1} = {char_chunk[i]}")
    print(f"chunk : {i+2} = {char_chunk[i+1]}")
    print("-------")


# Here overlapping is happening

    

chunk : 1 = Python is a high-level, interpreted programming language that emphasizes simplicity, 
 readability, and versatility. Developed by Guido van Rossum and released in 1991,
 Python has become one of the
chunk : 2 = become one of the most popular languages in the world. It uses clear and 
 easy-to-understand syntax, making it an excellent choice for beginners while still being powerful 
 enough for
-------
chunk : 2 = become one of the most popular languages in the world. It uses clear and 
 easy-to-understand syntax, making it an excellent choice for beginners while still being powerful 
 enough for
chunk : 3 = enough for professionals.

 Python supports multiple programming paradigms such as procedural, object-oriented, and functional 
 programming. It comes with a vast standard library and numerous
-------
chunk : 3 = enough for professionals.

 Python supports multiple programming paradigms such as procedural, object-oriented, and functional 
 programming. It comes with a va

In [None]:
# Token Text Splitter
token_split = TokenTextSplitter(
    chunk_size = 50,
    chunk_overlap = 10 ,
    length_function = len
)

In [None]:
token_chunk = token_split.split_text(t1)
print("No of chunks : ", len(token_chunk))

No of chunks :  6


In [None]:
print(token_chunk[0])
print("-----------------")
print(token_chunk[1])
print("-----------------")
print(token_chunk[2])
print("-----------------")
print(token_chunk[3])
print("-----------------")


"""
    Overlapping - Yes
     -It's more accurate
     - Separator = tokens
     - slower than CharacterTextSplit
     - Better Overlapping



-- Best Text Splitter is RecursiveTextSplitter, Cause it's 
"""



    Python is a high-level, interpreted programming language that emphasizes simplicity, 
    readability, and versatility. Developed by Guido van Rossum and released in 1991,
    Python has become one
-----------------
 1991,
    Python has become one of the most popular languages in the world. It uses clear and 
    easy-to-understand syntax, making it an excellent choice for beginners while still being powerful 
 
-----------------
 choice for beginners while still being powerful 
    enough for professionals.

    Python supports multiple programming paradigms such as procedural, object-oriented, and functional 
    programming. It comes with a vast standard
-----------------
   programming. It comes with a vast standard library and numerous third-party modules that allow developers 
    to perform a wide range of tasks — from web and software development to data analysis, machine learning, 
   
-----------------
