# connecting mongodb atlas with langchain

Establishing connection to mongodb

In [3]:
from pymongo import MongoClient # import mongo client to connect
import json # import json to load credentials
import urllib.parse

# load credentials from json file
with open('credentials_mongodb.json') as f:
    login = json.load(f)

# assign credentials to variables
username = login['username']
password = urllib.parse.quote(login['password'])
host = login['host']
url = "mongodb+srv://{}:{}@{}/?retryWrites=true&w=majority".format(username, password, host)

# connect to the database
client = MongoClient(url)

Select your database and collection, for example

In [2]:
database = client['goodreads']
collection = database['reviews']
collection.find_one()  # check connection

{'_id': '18108f92b5e47fbb2154ab466d87f217',
 'book_id': '29527139',
 'user_id': '34216df2a5f3846b17e3f96bea6c2ad7',
 'rating': 3,
 'review_text': 'Read for class',
 'date_added': 'Tue Aug 22 10:29:39 -0700 2017',
 'book_title': 'All the Pretty Things: The Story of a Southern Girl Who Went Through Fire to Find Her Way Home',
 'embedding': [-0.011579134501516819,
  0.029720719903707504,
  -0.0642186850309372,
  0.03411995247006416,
  -0.042396899312734604,
  0.03216535225510597,
  0.11389686167240143,
  0.033041682094335556,
  -0.021103762090206146,
  0.03595007210969925,
  0.020850829780101776,
  0.07011657953262329,
  0.0017828595591709018,
  -0.06910087168216705,
  -0.050639692693948746,
  -0.057170893996953964,
  -0.05236879363656044,
  0.02930048108100891,
  -0.08091437816619873,
  -0.006276468746364117,
  0.029484041035175323,
  0.041949328035116196,
  -0.029490381479263306,
  0.048226360231637955,
  -0.13093781471252441,
  0.0518830306828022,
  -0.036288756877183914,
  -0.04891797

In [3]:
collection.update_many({}, {"$unset": {"embedding": ""}})


UpdateResult({'n': 8132, 'electionId': ObjectId('7fffffff0000000000000413'), 'opTime': {'ts': Timestamp(1763618327, 8133), 't': 1043}, 'nModified': 8125, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1763618327, 8133), 'signature': {'hash': b'\x05F\xb7\xa1WP\xa52\x824D\xeb\x02.n\xe9$\xcc#\x93', 'keyId': 7516640277822439438}}, 'operationTime': Timestamp(1763618327, 8133), 'updatedExisting': True}, acknowledged=True)

## 2. Load your API keys




First step, I would like you to set up langsmith API and google AI studio API

1. Create an empty `.env` file in the same working directory of this notebook

Copy the following to the `.env` file content

```
# Environment variables for LangSmith
export LANGSMITH_TRACING="true"
export LANGSMITH_API_KEY="..."

# add google api key for testing
export GOOGLE_API_KEY="..."
```

2. Create a LangSmith account & API key
- Go to LangSmith and sign in.
[https://smith.langchain.com](https://smith.langchain.com)
- Log in with your Github account
- Open Settings â†’ API Keys.
- Click Create API key (personal) and copy it.
- Now paste your API key in the `.env` file

3. Get a Gemini API key
- Visit [Google AI Studio](https://aistudio.google.com) â†’ Get API Key and sign in.
- Click Create API key and copy it.
- Paste it in the `.env` file

Once you're done setting up your `.env` file, run the cell below to import your API keys to the environment of this notebook

In [4]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access the environment variables
os.environ['LANGCHAIN_TRACING_V2'] = os.getenv('LANGSMITH_TRACING')
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGSMITH_API_KEY')

# get google api key from .env
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')

## 3. Create your vector embeddings

In [5]:
# ðŸ§© Imports
import langchainhub as hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient



  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [23]:
# # Create complete backup of reviews collection
# backup_collection = database['reviews_complete_backup']

# # Check if backup exists and drop if it does
# if backup_collection.count_documents({}) > 0:
#     backup_collection.drop()

# # Copy all documents in batches
# batch_size = 10000
# total_docs = collection.count_documents({})

# for skip in range(0, total_docs, batch_size):
#     batch = list(collection.find().skip(skip).limit(batch_size))
#     if batch:
#         backup_collection.insert_many(batch)

In [None]:
# # Clean up documents with unknown book titles
# bad_titles = ["", None, "Unknown Title", "unknown", "UNKNOWN", "N/A", "n/a", "NA", "na"]

# query = {
#     "$or": [
#         {"book_title": {"$exists": False}},
#         {"book_title": {"$in": bad_titles}}
#     ]
# }


# # Delete from main collection
# collection.delete_many(query)

DeleteResult({'n': 165955, 'electionId': ObjectId('7fffffff0000000000000410'), 'opTime': {'ts': Timestamp(1762568592, 629), 't': 1040}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1762568592, 629), 'signature': {'hash': b'\xdf\xfdj\xe9\xaa|\xb2g?\xb1\xd4R\x9c\xa7\x85X\x85\x0c(G', 'keyId': 7516640277822439438}}, 'operationTime': Timestamp(1762568592, 629)}, acknowledged=True)

In [7]:
#Simple embedding creation with Sentence Transformers
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import SentenceTransformerEmbeddings

# Create embeddings model
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Process all documents without embeddings
docs = list(collection.find({"embedding": {"$exists": False}}))

for doc in docs:
    text = doc.get("review_text", "")
    if text:
        vector = embeddings.embed_query(text)
        collection.update_one({'_id': doc['_id']}, {'$set': {'embedding': vector}})

  embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')


In [8]:
collection.find_one()  # check the embedding field has been added

{'_id': '6659e2a0bcface86f0835714023f24f4',
 'book_id': '32620360',
 'user_id': '01ec1a320ffded6b2dd47833f2c8e4fb',
 'rating': 4,
 'review_text': '"I promise to love you forever. As long as there is love in this world, we will be a part of it." \n So sweet!! Filled with tons of romance and some real emotion. Don\'t read any spoilers for this book! I\'m glad I didn\'t know anything about it. \n Renee Carlino\'s books ALWAYS captivate me. \'Swear on This Life\' is one of my all-time favorite stories. I will say that Adam is definitely my favorite thing about this book. *SIGH* Adam is one of the most memorable characters ever created by Ms. Carlino. Impulsive, sweet, charming, creative and loving, he really puts himself out there when he meets Charlotte. While I\'m not always a fan of insta-love, it just WORKED here. Charlotte opened up her heart to Adam because he seemed, well, absolutely perfect. \n I was absolutely enchanted by Adam and could understand how Charlotte fell for him. But 

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")


In [10]:
# create vector_store (adjust index_name/text_key to your setup)
vector_store = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embeddings,
    text_key="review_text",         # field that contains the review text
    index_name="realvector_index",  # your Atlas index name
    relevance_score_fn="cosine"
)

  vector_store = MongoDBAtlasVectorSearch(


In [11]:
retriever = vector_store.as_retriever()
retriever.invoke("recommend me a book")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Document(metadata={'_id': 'c19db518d802fc952c11e483b4e18441', 'book_id': '1677816', 'user_id': 'e9fae29f3c1cc8cf05638b2aa2cc5c40', 'rating': 5, 'date_added': 'Sun Aug 13 13:55:20 -0700 2017', 'book_title': 'Who Switched Off My Brain?', 'embedding': [-0.023662587627768517, -0.02031187154352665, 0.00911265891045332, 0.11186811327934265, -0.09902670234441757, 0.010724562220275402, 0.019367944449186325, -0.037828054279088974, -0.08097705990076065, -0.012916097417473793, -0.0053146216087043285, 0.07223277539014816, 0.026717770844697952, -0.04392848163843155, -0.017996108159422874, -0.021665366366505623, -0.03832380473613739, -0.04782494902610779, 0.05749989300966263, -0.08058914542198181, -0.019027667120099068, -0.012692631222307682, 0.05361436679959297, -0.0014824233949184418, -0.014779212884604931, 0.015910891816020012, 0.025235295295715332, 0.001169910072349012, -0.0533997118473053, -0.013752877712249756, -0.016772359609603882, 0.0892382487654686, -0.04436114430427551, -0.03956636041402

## 5. Create your prompt template

## Decomposition

In [13]:
from langchain_core.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. Do not Hallucinate
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser

generate_queries = (
    prompt_perspectives 
    | llm 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [14]:
# get the questions
questions = generate_queries.invoke({"question": "I'm looking for a book to help me improve my leadership skills and communicate more effectively at work?"})


In [15]:
questions

['What books can help me develop strong leadership qualities and improve professional communication skills?',
 'Suggest a practical guide for enhancing management capabilities and effective workplace communication.',
 "I'm looking for literature that provides actionable advice on leadership development and clear communication in a corporate environment.",
 'Find books focused on mastering leadership techniques and interpersonal communication for career growth.',
 'Recommendations for resources to become a more impactful leader and a more articulate communicator at work.']

In [16]:
from typing import Any
from langchain_core.load import dumps, loads
from langchain_core.runnables import RunnableLambda

def get_unique_union(documents: list[list[Any]]):
    """Unique union of retrieved docs."""
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Deduplicate with stable ordering
    unique_docs = sorted(set(flattened_docs))
    # Return reconstructed Document objects
    return [loads(doc) for doc in unique_docs]

# Wrap as Runnable
unique_union_runnable = RunnableLambda(get_unique_union)

# Build retrieval chain
retrieval_chain = generate_queries | retriever.map() | unique_union_runnable




In [36]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# create runnable chain

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)
question = "I'm looking for a book to help me improve my leadership skills and communicate more effectively at work?"
final_rag_chain.invoke({"question":question})


'Based on the context, here are a few books that could help you improve your leadership skills and communicate more effectively at work:\n\n*   **Extreme Ownership: How U.S. Navy SEALs Lead and Win**: This book focuses on the message to "take ownership, stop making excuses, and be an extreme leader." It provides good examples and stories relevant to leadership.\n*   **The Coaching Habit: Say Less, Ask More & Change the Way You Lead Forever**: This book offers useful prompts and tips, emphasizing acting like a coach by listening more and asking the right questions, which are crucial for effective communication and leadership.\n*   **Christian Coaching: Helping Others Turn Potential into Reality**: Described as a textbook on coaching, it highlights that coaching involves "basically listening and asking questions." This directly addresses both leadership (through coaching principles) and effective communication.\n*   **The Art of War**: This is described as an "interesting little book abo

In [19]:
# stress test with another question
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# create runnable chain

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)
question = "What are the books that started_at for review is 2019?"
final_rag_chain.invoke({"question":question})

'I am sorry, but the provided context does not contain any information about reviews that `started_at` for the year 2019. The `date_added` for all reviews in the context is in 2017, and there is no `started_at` field in the metadata.'

## MultiQuery

In [None]:
from rag_app.application import load_prompt
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Multi Query: load template from prompts/multi_query_v1.txt
# Notebook path -> prompts file: ../rag_app/prompts/multi_query_v1.txt
multi_query_template = load_prompt('prompts/multi_query_v1.txt')
prompt_perspectives = ChatPromptTemplate.from_template(multi_query_template)

generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)


In [None]:
# get the questions
questions = generate_queries.invoke({"question": "What books would you recommend for a young female reader interested in romance and adventure?"})

In [29]:
# show the questions
for i,q in enumerate(questions):
    print(i,q)

0 Recommend YA books that combine romance with adventurous plots.
1 What novels feature young female protagonists experiencing both love stories and thrilling adventures?
2 Adventure romance books for teen girls.
3 Looking for captivating books with exciting quests and romantic storylines, suitable for young women.
4 Stories where young heroines find love while on an epic journey or grand adventure.


In [32]:
from langchain_core.load import dumps, loads


def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "What books would you recommend for a young female reader interested in romance and adventure?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

14

In [33]:
for i in docs:
    print(i.page_content[:200], "\n---")

I enjoyed this book. It is told from the viewpoint of a young teenage girl through her diary entries. The story doesn't stretch the believability factor, and it pulls all the emotional strings. 
---
Sometimes you just need a book about a young woman whose life is a mess and then she goes on an adventure and well basically everything works out. Start to finish a pleasant hour or two of your time.  
---
These are lovely books. The characters are interesting and rather fun. 
---
A very enjoyable book with a strong heroine. Will look for more books by this author 
---
Good read 
 This is a great read. Its short but it is packed with adventure & passion & heart ache & love. 
---
This was such a cute book, although pretty predictable. 
 But I'm a sucker for anything romantic..... 
---
Whew, this was a steamy one! Thanks to BookSparks for the review copy! 
 WORTH THE WAIT was exactly what I look for in a romance novel - lots of passion mixed with strong female characters, a little d 
---
This

In [34]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})


'Based on the context provided, here are some books that would be recommended for a young female reader interested in romance and adventure:\n\n**Strong Recommendations for both Romance and Adventure:**\n\n*   **Life As We Knew It (Last Survivors, #1)**: This book is told from the viewpoint of a "young teenage girl through her diary entries" and "pulls all the emotional strings." The title "Last Survivors" suggests an adventure or dystopian element. (Rating: 4)\n*   **Dirty Promise**: This is described as a "great read... packed with adventure & passion & heart ache & love." This directly hits both interests. (Rating: 4)\n*   **Daughter of Smoke & Bone (Daughter of Smoke & Bone, #1)**: While one reviewer disliked the amount of romance ("worst hijacking of a plot by romance"), this indicates a strong romantic element. The title suggests a fantasy/adventure setting. (Rating: 2, so read with caution regarding reviewer\'s opinion)\n*   **Chasing Paradise (A Paradise Novel Book 1)**: An "ea

## RAG Fusion

In [50]:
from langchain_core.prompts import ChatPromptTemplate
# RAG-Fusion: Related
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Provide these alternative questions separated by newlines. Generate multiple search queries related to: {question} \n
Output (4 queries):"""

prompt_rag_fusion = ChatPromptTemplate.from_template(template)
generate_queries = ( 
    prompt_rag_fusion | 
    llm | 
    StrOutputParser() | 
    (lambda x: x.split("\n")))

In [51]:
questions = generate_queries.invoke({"question": "I'm looking for a book to help me improve my leadership skills and communicate more effectively at work?"})

In [52]:
# show the questions
for i, q in enumerate(questions):
    print(f"Question {i+1}: {q}")

Question 1: Best books for improving leadership and communication skills
Question 2: Books on effective workplace communication and leadership development
Question 3: Top books for enhancing leadership and communication in a professional setting
Question 4: Recommended books for managers to improve communication and leadership abilities


In [53]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

7

In [54]:
docs

[(Document(metadata={'_id': '22fb21784141b9f6b81bf0dc16d9bfe7', 'book_id': '23848190', 'user_id': '233d30286f4046b54c54ac721c00ab93', 'rating': 3, 'date_added': 'Sun Aug 27 12:55:52 -0700 2017', 'book_title': 'Extreme Ownership: How U.S. Navy SEALs Lead and Win', 'embedding': [0.013981848023831844, 0.00367926643230021, 0.006951062474399805, 0.05324932560324669, -0.024916227906942368, 0.03627992048859596, 0.02617749013006687, -0.013107218779623508, -0.07046651840209961, 0.01572895050048828, 0.014251705259084702, 0.09929248690605164, 0.026174772530794144, -0.029616832733154297, 0.005123180337250233, -0.00970066711306572, 0.015998493880033493, 0.016006123274564743, 0.04998808354139328, -0.1078038439154625, -0.013756589032709599, -0.013561351224780083, 0.04113210737705231, 0.07821140438318253, -0.07332098484039307, -0.0011893565533682704, -0.013220073655247688, -0.026587190106511116, -0.006487476639449596, -0.046816110610961914, -0.03171054646372795, -0.0640234649181366, 0.0487469621002674

In [55]:
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'Based on the context, "The Coaching Habit: Say Less, Ask More & Change the Way You Lead Forever" seems like an excellent fit for your needs. One review describes it as a "Great book on conversations: how to listen more and ask the right questions," which directly addresses improving communication skills. Its title also indicates a focus on changing how you lead.\n\nWhile "Extreme Ownership: How U.S. Navy SEALs Lead and Win" is mentioned as a book about leadership, the reviews focus more on "taking ownership" and being a "leader" in a general sense, and less on the communication aspect you specifically asked for. "The Art of War" also touches on leadership, but in the context of warfare strategy, and doesn\'t focus on workplace communication.'

In [56]:
from langsmith import Client as LangSmithClient
import json
import os

client = LangSmithClient()

# Get the most recent run
runs = client.list_runs(project_name="default", limit=1)
latest_run = next(runs)

# Read the trace
trace = client.read_run(latest_run.id)

# Ensure logs directory exists
os.makedirs("logs", exist_ok=True)

# Export to JSON safely
filename = f"logs/{latest_run.id}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(trace.dict(), f, indent=2, default=str)

print(f"Trace saved to {filename}")



Trace saved to logs/ca8f6f9b-07d3-47ee-b616-fddeab78ffdb.json
