In [None]:
# !pip install langchain

# !pip install -U langchain-community

# !pip install pypdf

# !pip install -U tqdm

# !pip install -U ipywidgets  

# !pip install -U sentence-transformers==2.2.2

# !pip install torch transformers

# !pip install InstructorEmbedding
# pip install tqdm
# !pip install --upgrade sentence-transformers huggingface-hub
# pip install faiss-cpu

Collecting langchain
  Using cached langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting langchain-core<1.0.0,>=0.3.35 (from langchain)
  Using cached langchain_core-0.3.37-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.6 (from langchain)
  Using cached langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Using cached langsmith-0.3.10-py3-none-any.whl.metadata (14 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.38-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting requests<3,>=2 (from langchain)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting PyYAML>=5.3 (from langchain)
  Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Usin

# Libraries

In [5]:
import os


os.chdir("../")


In [6]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import  RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from tqdm import tqdm


# Documents

## Load 

In [7]:
DATASET_PATH = "Dataset"

In [8]:
loader=PyPDFDirectoryLoader(path=DATASET_PATH,
                            recursive=True,
                            silent_errors=True)

In [9]:
documnets=loader.load()

## Split Text

In [10]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True
)

In [11]:
split_documents = text_splitter.split_documents(documnets)

## Vectorize

### Option 1

In [9]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [10]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [14]:
embeddings = []
metadatas = []

# Use tqdm to show progress
for doc in tqdm(split_documents, desc="Embedding document splits"):
    # Tokenize the split text
    encoded_input = tokenizer(doc.page_content, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    embedding = F.normalize(embedding, p=2, dim=1)

    # Save embedding and metadata
    embeddings.append(embedding.squeeze().cpu().numpy())
    metadatas.append(doc.metadata)

Embedding document splits: 100%|██████████| 14743/14743 [30:36<00:00,  8.03it/s] 


In [24]:
import pickle

# Save embeddings list
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

# Save metadatas list
with open('metadatas.pkl', 'wb') as f:
    pickle.dump(metadatas, f)

### Option 2

In [12]:
# Initialize LangChain's FAISS wrapper
vector_store = FAISS.from_documents(split_documents, HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))


  vector_store = FAISS.from_documents(split_documents, HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))


In [13]:
vector_store.save_local("vector_db/faiss_index")

In [14]:
query_text = "Tell me about Frodo Baggins"
results = vector_store.similarity_search(query_text, k=5)

for result in results:
    print(result.page_content)
    print("Metadata:", result.metadata)
    print("------")

You have leave to walk over my land, if you have a mind,
Mr. Peregrin. And you, Mr. Baggins – though I daresay you
still like mushrooms.’ He laughed. ‘Ah yes, I recognized the
name. I recollect the time when young Frodo Baggins was
one of the worst young rascals of Buckland. But it wasn’t
Metadata: {'producer': 'Acrobat Distiller 6.0 (Windows)', 'creator': 'SYSTEM400 Rev 17.01', 'creationdate': '2008-07-23T14:49:49-05:00', 'title': 'The Fellowship of the Ring', 'author': 'J. R. R. Tolkien', 'moddate': '2009-04-15T19:53:53+05:30', 'ebx_publisher': 'HarperCollins', 'source': 'Dataset\\books\\j-r-r-tolkien-lord-of-the-rings-01-the-fellowship-of-the-ring-retail-pdf.pdf', 'total_pages': 571, 'page': 150, 'page_label': '122', 'start_index': 929}
------
tell you.’
‘Go on!’ said Frodo faintly.
‘It was the Sackville-Bagginses that were his downfall, as you
might expect. One day, a year before the Party, I happened to
be walking along the road, when I saw Bilbo ahead. Suddenly
in the distance th