### Working with Embedding Models and Storing Embeddings in a Vector DB

In [1]:
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the key.
load_dotenv()

# os.environ only accepts string values: assigning the None that os.getenv
# returns for a missing variable raises an opaque TypeError. Check explicitly
# and fail with an actionable message instead.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

In [2]:
# Read the raw speech text into a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("speech.txt", "r", encoding="utf-8") as file:
    speech = file.read()

from langchain_community.document_loaders import TextLoader

# Load speech.txt through LangChain's TextLoader so later cells can work with
# the result of loader.load(). The encoding is explicit so the file is decoded
# the same way on every platform instead of using the locale default.
loader = TextLoader("speech.txt", encoding="utf-8")
speech_docs = loader.load()

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter settings: chunks of at most 100 units with a 20-unit overlap,
# where size is measured by the built-in len.
splitter_options = {
    "chunk_size": 100,
    "chunk_overlap": 20,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents and show how many chunks were produced.
final_docs = text_splitter.split_documents(speech_docs)
len(final_docs)

126

In [4]:
# Display the chunks produced by the text splitter for inspection.
# NOTE(review): this renders the entire list; consider slicing
# (e.g. final_docs[:5]) if the output becomes too long to read.
final_docs

[Document(metadata={'source': 'speech.txt'}, page_content='DEMOCRACY: THE CORNERSTONE OF HUMAN PROGRESS AND FREEDOM'),
 Document(metadata={'source': 'speech.txt'}, page_content='Honorable guests, distinguished colleagues, and fellow citizens,\n\nINTRODUCTION'),
 Document(metadata={'source': 'speech.txt'}, page_content="Today, I stand before you to speak about one of humanity's greatest achievements: democracy. More"),
 Document(metadata={'source': 'speech.txt'}, page_content='democracy. More than just a political system, democracy represents the collective aspiration of'),
 Document(metadata={'source': 'speech.txt'}, page_content='aspiration of billions of people for freedom, equality, and self-determination. It is the'),
 Document(metadata={'source': 'speech.txt'}, page_content='It is the foundation upon which modern civilization has built its most cherished values and'),
 Document(metadata={'source': 'speech.txt'}, page_content='values and institutions.'),
 Document(metadata={'source

In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding client configured for the "models/text-embedding-004" model.
embeddings = GoogleGenerativeAIEmbeddings(max_retries=3, model="models/text-embedding-004")

# Embed the text of every chunk, then show
# (number of vectors, length of the first vector).
chunk_texts = [chunk.page_content for chunk in final_docs]
vectors = embeddings.embed_documents(chunk_texts)
(len(vectors), len(vectors[0]))


(126, 768)

### Using Chroma DB

In [None]:
from langchain_community.vectorstores import Chroma

# Build a Chroma vector store from the chunks, using the embedding client
# defined above and "chroma_db" as the persist directory.
# NOTE(review): because a persist directory is used, re-running this cell may
# add the same chunks to the existing store again - confirm before re-running.
chroma_db = Chroma.from_documents(
    embedding=embeddings,
    documents=final_docs,
    persist_directory="chroma_db",
)

In [None]:
# Question used for the similarity searches in this notebook.
query = "What is the main topic of the speech?"

# Search the Chroma store; each result is displayed as a (Document, score) pair.
similar_docs = chroma_db.similarity_search_with_score(query=query)
similar_docs

[(Document(metadata={'source': 'speech.txt'}, page_content='Honorable guests, distinguished colleagues, and fellow citizens,\n\nINTRODUCTION'),
  0.28346580266952515),
 (Document(metadata={'source': 'speech.txt'}, page_content="Today, I stand before you to speak about one of humanity's greatest achievements: democracy. More"),
  0.33668404817581177),
 (Document(metadata={'source': 'speech.txt'}, page_content='Thank you.\n\n[Word count: Approximately 1,200 words]'),
  0.34432676434516907),
 (Document(metadata={'source': 'speech.txt'}, page_content='equality, and justice.'),
  0.38298314809799194)]

In [9]:
# Run the same query again, this time asking Chroma for relevance scores;
# each result is displayed as a (Document, score) pair.
similar_docs = chroma_db.similarity_search_with_relevance_scores(query=query)
similar_docs

[(Document(metadata={'source': 'speech.txt'}, page_content='Honorable guests, distinguished colleagues, and fellow citizens,\n\nINTRODUCTION'),
  0.799559408697891),
 (Document(metadata={'source': 'speech.txt'}, page_content="Today, I stand before you to speak about one of humanity's greatest achievements: democracy. More"),
  0.7619284264175452),
 (Document(metadata={'source': 'speech.txt'}, page_content='Thank you.\n\n[Word count: Approximately 1,200 words]'),
  0.7565242099875087),
 (Document(metadata={'source': 'speech.txt'}, page_content='equality, and justice.'),
  0.7291900188997381)]

### Using FAISS

In [12]:
from langchain_community.vectorstores import FAISS
# Build a FAISS index from the vectors computed earlier instead of embedding
# the chunks a second time.
texts = [doc.page_content for doc in final_docs]

# zip() silently truncates to the shorter input. If `vectors` is stale relative
# to `final_docs` (e.g. the splitter cell was re-run without re-embedding),
# chunks would be dropped or paired with the wrong vectors and metadata, so
# check the lengths explicitly.
if len(texts) != len(vectors):
    raise ValueError(
        f"Got {len(texts)} chunks but {len(vectors)} embedding vectors; "
        "re-run the embedding cell so they match."
    )

text_embedding_list = list(zip(texts, vectors))
vector_store = FAISS.from_embeddings(
    text_embeddings=text_embedding_list,
    embedding=embeddings,
    metadatas=[doc.metadata for doc in final_docs],
)

# Query the FAISS index with the same question used for Chroma.
similar_docs = vector_store.similarity_search(query)
similar_docs

[Document(id='c94f3846-7316-4e07-b5e5-ad940e63ac00', metadata={'source': 'speech.txt'}, page_content='Honorable guests, distinguished colleagues, and fellow citizens,\n\nINTRODUCTION'),
 Document(id='33801034-38c9-46a0-b6b8-b54bc25ecbe4', metadata={'source': 'speech.txt'}, page_content="Today, I stand before you to speak about one of humanity's greatest achievements: democracy. More"),
 Document(id='23151d5c-c1a3-4d66-8d6a-8f6bdd432bee', metadata={'source': 'speech.txt'}, page_content='Thank you.\n\n[Word count: Approximately 1,200 words]'),
 Document(id='a0d0f166-c1fe-408e-b520-3d190801f5bb', metadata={'source': 'speech.txt'}, page_content='equality, and justice.')]

In [13]:
# Build a second FAISS index directly from the Document chunks, passing the
# embedding client so the store can embed them itself.
vector_store2 = FAISS.from_documents(embedding=embeddings, documents=final_docs)

# Run the same query against this index and display the matching documents.
similar_docs = vector_store2.similarity_search(query=query)
similar_docs

[Document(id='dc685dfe-8c39-4201-934a-5f57b4d6719f', metadata={'source': 'speech.txt'}, page_content='Honorable guests, distinguished colleagues, and fellow citizens,\n\nINTRODUCTION'),
 Document(id='786d3a55-1541-49ca-bcf7-a70aa2fe5008', metadata={'source': 'speech.txt'}, page_content="Today, I stand before you to speak about one of humanity's greatest achievements: democracy. More"),
 Document(id='aef70217-4a3c-401e-9206-7227096fd7fb', metadata={'source': 'speech.txt'}, page_content='Thank you.\n\n[Word count: Approximately 1,200 words]'),
 Document(id='be5ba72b-58d6-4f50-8085-61f7b780df37', metadata={'source': 'speech.txt'}, page_content='equality, and justice.')]

### When to Use Which?

#### Use `from_documents` when:
- You want simplicity and convenience
- You have `Document` objects ready
- You want FAISS to handle embedding generation

#### Use `from_embeddings` when:
- You have already computed the embeddings
- You want more control over the process
- You're working with custom embedding pipelines
- You want to avoid re-computing embeddings