### Text loader

In [None]:
# Load a plain-text file through LangChain's TextLoader.
# Imported from `langchain_community`, the same package the CSV loader cell
# uses; the `langchain.document_loaders` path is a deprecated alias.
from langchain_community.document_loaders import TextLoader

loader = TextLoader('nvda_news_1.txt')
data = loader.load()

# Other things worth inspecting on the first loaded document:
#   data[0]               -> the whole document object
#   data[0].page_content  -> its text
# The last expression of the cell is what the notebook displays.
data[0].metadata


In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# First pass: load the CSV with default settings.
loader = CSVLoader('movies.csv')
data = loader.load()
# data[0].metadata
# data[0].page_content

# Second pass: re-create the loader with source_column='title'.
# The loader must be loaded AGAIN; otherwise `data` still holds the documents
# produced by the first loader and the metadata shown below would not reflect
# the `source_column` setting at all.
loader = CSVLoader('movies.csv', source_column='title')
data = loader.load()

# Printed explicitly: a bare expression in the middle of a cell is discarded,
# only the final expression is displayed.
print(data[0].metadata)
len(data)


### URL loader (WebBaseLoader)

In [None]:
# Install hints:
#   pip3 install beautifulsoup4   (HTML parsing used by WebBaseLoader)
#   pip3 install unstructured libmagic python-magic python-magic-bin
#     (only needed if you switch to the Unstructured-based URL loader)

# Imported from `langchain_community`, matching the CSV loader cell; the
# `langchain.document_loaders` path is a deprecated alias.
from langchain_community.document_loaders import WebBaseLoader

# Each URL in the list is fetched and loaded as its own document.
loader = WebBaseLoader([
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html"
])

docs = loader.load()

# Only the first page's text is kept; the text-splitter cells further down
# read it through the name `newsContent`.
newsContent = docs[0].page_content
print(newsContent)



### Text splitters

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings gathered in one mapping so each option carries its own note.
splitter_options = {
    # Break the text on newline characters.
    'separator': '\n',
    # Target upper bound, in characters, for each chunk.
    'chunk_size': 200,
    # Characters shared between consecutive chunks. Overlap carries context
    # from the end of one chunk into the start of the next; 0 disables it,
    # a larger value (e.g. 50) retains more context across chunks.
    'chunk_overlap': 0,
}
splitter = CharacterTextSplitter(**splitter_options)

# `newsContent` is the page text produced by the URL loader cell.
chunks = splitter.split_text(newsContent)
len(chunks)  # number of resulting chunks (displayed as the cell result)


CharacterTextSplitter

🔻 Drawbacks of CharacterTextSplitter:
- May split in the middle of words or sentences, leading to poor chunk quality.
- No language or token awareness, so it can break context or underutilize model capacity.
- Not ideal for semantic tasks like summarization or Q&A.

✅ Use Cases:
- Best for simple, line-separated data (e.g., logs, code).
- Useful when you need quick and lightweight splitting without language overhead.
- Good for preprocessing short texts where sentence structure isn’t critical.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,  # Overlap helps maintain context between chunks
    # Separators are tried in order, from the most natural break (blank line)
    # down to splitting between any two characters.
    separators=["\n\n", "\n", ".", "!", "?", " ", ""],
)

chunks = splitter.split_text(newsContent)

# Printed explicitly: a bare `len(chunks)` in the middle of a cell is
# evaluated and thrown away, so the count would never be shown.
print(f"Total chunks: {len(chunks)}")

# Inspect every chunk: 1-based position, length, and content.
for i, chunk in enumerate(chunks, start=1):
    print(f"--- Chunk {i} ---")
    print(f"Length: {len(chunk)} characters")
    print(chunk)
    print()

✅ Use Cases of RecursiveCharacterTextSplitter:
- **Text summarization**: keeps sentences/paragraphs intact for better model understanding.
- **Question answering over documents**: preserves context so questions can refer to nearby sentences.
- **Search + retrieval (RAG)**: splits into dense, semantically complete chunks for embedding & retrieval.

❌ Drawbacks of RecursiveCharacterTextSplitter:
- **Slower than simple splitters**: due to recursive logic and merging steps.
- **Uneven chunk sizes**: chunks can vary depending on where natural breaks occur.
- **Not ideal for structured/tabular text**: like logs or code, where natural-language structure isn't relevant.

**Merging** is the process of recombining small fragments (after splitting) into meaningful chunks that meet a certain chunk_size while preserving natural structure (like paragraphs or sentences).

✅ Why it matters:
When you use a splitter like RecursiveCharacterTextSplitter, it:
- Splits using natural boundaries (like `\n\n`, `\n`, `.`, and a single space).
- Then merges the smaller pieces back together into chunks of up to `chunk_size`, with optional `chunk_overlap`.



### Faiss tutorial
```
!pip install faiss-cpu
!pip install sentence-transformers
```

FAISS (Facebook AI Similarity Search) is a library made by Facebook AI Research. It's used to search for similar items (like text or images) very quickly. Typical uses:
- Finding similar documents
- Recommending items
- Clustering data

In [None]:
import pandas as pd

# Show up to 100 characters per column so the text column stays readable.
pd.set_option('display.max_colwidth', 100)

df = pd.read_csv("sample_text.csv")

# Printed explicitly: a bare `df.shape` before the final expression is
# evaluated and discarded, so the shape would never be shown.
print(df.shape)
df

### Step 1 : Create source embeddings for the text column

In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model, selected by name.
model_name = "all-mpnet-base-v2"
encoder = SentenceTransformer(model_name)

In [None]:
# 2. Calculate embeddings by calling encoder.encode()
sentences = df.text

# Pass a plain list rather than the pandas Series itself: encode() is
# documented to take a string or a list of strings, and a list keeps row
# order without depending on the DataFrame having a default 0..n-1 index.
embeddings = encoder.encode(sentences.tolist())

# An embedding is a vector, but specifically a *learned* one: it captures
# semantic meaning and relationships between data points.
print(embeddings.shape)
print(embeddings)

# 3. (Optional) Calculate the embedding similarities
# similarities = encoder.similarity(embeddings, embeddings)
# print(similarities)


In [None]:
# Size of the second axis of `embeddings`, i.e. the length of each embedding
# vector. The FAISS index below is created with this dimensionality.
dim = embeddings.shape[1]
dim

### Step 2 : Build a FAISS Index for vectors

In [None]:
import faiss

# Create an empty index (a data structure for searching) sized for
# `dim`-dimensional vectors. It uses L2 distance, another name for Euclidean
# distance. "Flat" means brute-force search: every stored vector is checked,
# which is simple but accurate.
index = faiss.IndexFlatL2(dim)
index

### Step 3 : Add the source vectors to the index (normalize them first if you want L2 distance to rank results like cosine similarity; the cell below adds them as-is)

In [None]:
# Add the source embeddings to the index. No explicit normalization is applied
# in this notebook; the vectors go in exactly as the encoder produced them.
index.add(embeddings)
index

### Step 4 : Encode the search text using the same encoder and reshape the output vector into a batch of one (normalize it too if the source vectors were normalized)


In [None]:
# Define a natural-language search query. Swap in one of the commented
# alternatives below to test other queries.
search_query = "I want to go to manali"
# search_query = "looking for places to visit during the holidays"
# search_query = "An apple a day keeps the doctor away"

# Convert the search query into a vector (embedding) with the same `encoder`
# that produced the source embeddings.
# The result is typically a 1D array of floats, e.g., shape (768,); the final
# expression displays the actual shape.
vec = encoder.encode(search_query)
vec.shape

In [None]:
import numpy as np

# FAISS (like many libraries) expects a batch of vectors shaped (n, dim), so
# the single 1D query vector is wrapped into a one-row 2D array, e.g. (1, 768).
query_array = np.array(vec)
svec = np.reshape(query_array, (1, -1))

svec.shape

### Step 5: Search for similar vectors in the FAISS index created above

In [None]:
# Number of nearest neighbours to retrieve for the query.
top_k = 2

# Search the FAISS index for the `top_k` entries closest to `svec`.
# Two arrays come back:
#   distances - the distance (or similarity) score of each match
#   I         - the index (position) of each match inside the FAISS index
distances, I = index.search(svec, k=top_k)

# How to read the scores:
#   IndexFlatIP -> higher is better (inner-product similarity)
#   IndexFlatL2 -> lower is better (shorter Euclidean distance)
# NOTE(review): FAISS documents IndexFlatL2 scores as squared distances -
# confirm if the absolute values matter to you.
for result in (distances, I):
    print(result)


In [None]:
# `I` holds one row of match positions per query; there is a single query,
# so take row 0 and convert it to a plain Python list.
row_indices = I[0].tolist()
row_indices

In [None]:
# FAISS reports matches as positions in insertion order (0..n-1), not as
# DataFrame labels, so rows are selected by position with `iloc`. `loc` only
# gives the same rows while the DataFrame keeps its default integer index.
# FAISS pads with -1 when it finds fewer than k neighbours; those are skipped
# because `iloc[-1]` would silently return the last row.
valid_positions = [pos for pos in row_indices if pos >= 0]
res = df.iloc[valid_positions]

print(search_query)
print(res)