### Text loader

In [None]:
import nltk

# Keep the NLTK resources in a project-local folder so the notebook does not
# rely on a machine-wide nltk_data directory.
NLTK_DATA_DIR = './nltk_data'
nltk.download('punkt', download_dir=NLTK_DATA_DIR)
# Register the folder only once: re-running this cell would otherwise append
# the same entry to NLTK's search path again and again.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Loaders are imported from langchain_community, the same package the CSV
# loader cell uses; the `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import TextLoader

loader = TextLoader('nvda_news_1.txt')
data = loader.load()  # list of Document objects

# Also worth inspecting: data[0] (the whole Document) and
# data[0].page_content (the raw text).
data[0].metadata


In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# First pass: default settings.
loader = CSVLoader('movies.csv')
data = loader.load()
# Also worth inspecting here: data[0].metadata and data[0].page_content

# Second pass: use the 'title' column as each document's source.
# The loader has to be run again -- creating it is not enough, otherwise
# `data` would still hold the documents produced by the first loader.
loader = CSVLoader('movies.csv', source_column='title')
data = loader.load()

# Only the last expression of a cell is displayed, so print the metadata
# explicitly and leave the document count as the cell's result.
print(data[0].metadata)
len(data)


### URL loader (WebBaseLoader)

In [None]:
# WebBaseLoader parses pages with BeautifulSoup: pip3 install beautifulsoup4
# The packages below are only needed for UnstructuredURLLoader:
# pip3 install unstructured libmagic python-magic python-magic-bin

# Imported from langchain_community, the same package the CSV loader cell
# uses; the `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import WebBaseLoader

# Articles to download; one Document is returned per URL, in this order.
news_urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]

loader = WebBaseLoader(news_urls)
docs = loader.load()

# Text of the first article; the text-splitter cells further down read
# this variable, so its name must stay as it is.
newsContent = docs[0].page_content
print(newsContent)



### Text splitters

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# CharacterTextSplitter cuts the text on one separator only and then packs
# the resulting pieces into chunks.
splitter = CharacterTextSplitter(
    # Target chunk length in characters. A single line longer than this
    # cannot be cut any further by this splitter.
    chunk_size=200,
    # Characters shared between consecutive chunks. 0 means no overlap; a
    # higher value (e.g. 50) carries the end of one chunk into the start of
    # the next so that some context is preserved.
    chunk_overlap=0,
    # Split on newline characters.
    separator="\n",
)

# `newsContent` is the article text produced by the URL loader cell.
chunks = splitter.split_text(newsContent)
len(chunks)  # number of resulting chunks


CharacterTextSplitter

🔻 Drawbacks of CharacterTextSplitter:
- May split in the middle of words or sentences, leading to poor chunk quality.
- No language or token awareness, so it can break context or underutilize model capacity.
- Not ideal for semantic tasks like summarization or Q&A.

✅ Use Cases:
- Best for simple, line-separated data (e.g., logs, code).
- Useful when you need quick and lightweight splitting without language overhead.
- Good for preprocessing short texts where sentence structure isn’t critical.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,  # Overlap helps maintain context between chunks
    # Separators are tried in this order, from the most natural break
    # (blank line) down to single characters.
    separators=["\n\n", "\n", ".", "!", "?", " ", ""],
)

# `newsContent` is the article text produced by the URL loader cell.
chunks = splitter.split_text(newsContent)

# A bare `len(chunks)` in the middle of a cell is evaluated and thrown away,
# so print the count to actually see it.
print(f"Total chunks: {len(chunks)}")

# Inspect every chunk: 1-based position, length and content.
for position, chunk in enumerate(chunks, start=1):
    print(f"--- Chunk {position} ---")
    print(f"Length: {len(chunk)} characters")
    print(chunk)
    print()

✅ Use Cases of RecursiveCharacterTextSplitter:
- Text summarization: keeps sentences/paragraphs intact where possible, for better model understanding.
- Question answering over documents: preserves context so questions can refer to nearby sentences.
- Search + retrieval (RAG): splits into dense, semantically complete chunks for embedding & retrieval.

❌ Drawbacks of RecursiveCharacterTextSplitter:
- Slower than simple splitters, due to its recursive logic and merging steps.
- Uneven chunk sizes: chunks can vary depending on where natural breaks occur.
- Not ideal for structured/tabular text, like logs or code, where natural language structure isn’t relevant.

**Merging** is the process of recombining the small fragments produced by splitting into meaningful chunks that fit within a given chunk_size while preserving natural structure (like paragraphs or sentences).

✅ Why it matters:
A splitter like RecursiveCharacterTextSplitter works in two steps:
1. It splits the text on natural boundaries (such as `\n\n`, `\n`, `.`, or a space).
2. It then merges the smaller pieces back together into chunks of up to chunk_size characters, with an optional chunk_overlap between neighbouring chunks.

