In [1]:
import os
from typing import List, Any, Dict
import pandas as pd

In [2]:
from langchain_core.documents import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter, 
    CharacterTextSplitter, 
    TokenTextSplitter)

### Understanding Document Structure

In [7]:
## Create a document

# A Document pairs the text (page_content) with a metadata dict describing it.
doc = Document(
    metadata=dict(source="test.txt", title="Test Document", page_number=1),
    page_content="Hello, world!",
)

In [None]:
# Bare last expression: the notebook displays the Document built in the previous cell.
doc 

Document(metadata={'source': 'test.txt', 'title': 'Test Document', 'page_number': 1}, page_content='Hello, world!')

### Text Files

In [10]:
### Text Loader
# Import from langchain_community, the same package the DirectoryLoader import
# in this notebook uses; the `langchain.document_loaders` path is a deprecated
# re-export of these loaders.
from langchain_community.document_loaders import TextLoader

In [13]:
# Loading a single text file
loader = TextLoader(
    file_path="data/text_files/python_intro.txt",
    encoding="utf-8",
)
# Bare last expression so the notebook displays the loaded documents.
loader.load()

[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]

In [14]:
## Loading text files from directory
from langchain_community.document_loaders import DirectoryLoader

In [16]:
## Load all the text files in the directory

dir_loader = DirectoryLoader(
    "data/text_files",
    glob="*.txt",
    loader_cls=TextLoader,  # Loader class to use
    loader_kwargs={"encoding": "utf-8"},  # Encoding to use
    show_progress=True,  # Show progress
)

# Sort by source path so that positional access such as `documents[0]` picks
# the same file on every run and platform.
# NOTE(review): DirectoryLoader appears to return files in filesystem glob
# order, which is not guaranteed to be stable - confirm against its docs.
documents = sorted(dir_loader.load(), key=lambda loaded: loaded.metadata["source"])

100%|██████████| 2/2 [00:00<00:00, 1833.17it/s]


In [17]:
# Number of Document objects returned by the directory loader.
len(documents)

2

In [19]:
print(f"Total documents loaded: {len(documents)}")

# One summary per loaded document, followed by a blank line and a rule.
# NOTE: the "Source :" line shows the first 100 characters of the content;
# the originating file path appears in the metadata line.
for i, doc in enumerate(documents):
    print(
        f"Document {i+1}:",
        f"Source : {doc.page_content[:100]}",
        f"Metadata: {doc.metadata}",
        "",
        "-"*100,
        sep="\n",
    )

Total documents loaded: 2
Document 1:
Source : Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system
Metadata: {'source': 'data\\text_files\\machine_learning.txt'}

----------------------------------------------------------------------------------------------------
Document 2:
Source : Python Programming Introduction

Python is a high-level, interpreted programming language known for 
Metadata: {'source': 'data\\text_files\\python_intro.txt'}

----------------------------------------------------------------------------------------------------


### Text Splitting Strategies

In [20]:
from langchain.text_splitter import (
    CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter
)

In [23]:
# Use the text of the first loaded document as the sample input for the
# splitter cells that follow.
text = documents[0].page_content
print(text)

Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems


    


In [28]:
## Method 1 : Character Text Splitter
# Sizes are measured with len(), so chunk_size and chunk_overlap are counted
# in characters; the text is split on single newlines.
character_text_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    separator="\n",
    length_function=len,
)

char_chunks = character_text_splitter.split_text(text)
print("Total chunks:", len(char_chunks))


Total chunks: 4


In [29]:
# Show each chunk followed by a blank line and a 100-dash rule.
for chunk in char_chunks:
    print(chunk, "", "-"*100, sep="\n")

Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems to learn and improve

----------------------------------------------------------------------------------------------------
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.
Types of Machine Learning:

----------------------------------------------------------------------------------------------------
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

----------------------------------------------------------------------------------------------------
Applications include image recognition, speech processing, and recommendation systems

----------------------------------------------------------------------------------------------------


In [30]:
## Method 2 : Recursive Character Text Splitter

# Separators are listed from coarsest to finest: paragraph break, newline,
# space, then the empty string. Sizes are measured with len(), i.e. in characters.
recursive_character_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

recursive_chunks = recursive_character_text_splitter.split_text(text)
print("Total chunks:", len(recursive_chunks))

Total chunks: 6


In [31]:
# Print every recursive chunk, then an empty line and a 100-dash separator.
for chunk in recursive_chunks:
    print("\n".join([chunk, "", "-"*100]))

Machine Learning Basics

----------------------------------------------------------------------------------------------------
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs

----------------------------------------------------------------------------------------------------
that can access data and use it to learn for themselves.

----------------------------------------------------------------------------------------------------
Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data

----------------------------------------------------------------------------------------------------
3. Reinforcement Learning: Learning through rewards and penalties

----------------------------------------------------------------------------------------------------
Applicatio