In [2]:
# https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/mongodb_atlas

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient

In [4]:
import os
from dotenv import load_dotenv

# Read settings from a local .env file. override=True is passed so that values
# from the file take precedence over variables already set in the environment.
load_dotenv(override=True)

# Credentials are looked up in the environment; a missing variable raises KeyError.
openai_api_key = os.environ["OPENAI_API_KEY_3.5"]
mongodb_conn_string = os.environ["MONGO_URI"]

# Names of the MongoDB database, collection and vector search index used below.
db_name, collection_name, index_name = "search_db", "search_col", "vsearch_index"

In [5]:
# Step 1: Load
# Pages fetched as the source corpus for the vector store.
source_urls = (
    "https://en.wikipedia.org/wiki/AT%26T",
    "https://en.wikipedia.org/wiki/Bank_of_America",
)
loaders = [WebBaseLoader(url) for url in source_urls]

# Collect the documents from every loader into one flat list.
data = []
for loader in loaders:
    data += loader.load()

In [6]:
# Step 2: Transform (Split)
# Separators are tried in order: paragraph break, line break, end of sentence, space.
# The sentence separator is a regex lookbehind, so it is written as a raw string
# (a plain "\." is an invalid escape sequence) and is_separator_regex=True is set.
# Without that flag the splitter escapes each separator and matches it literally,
# which means the lookbehind pattern would never split anything.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    is_separator_regex=True,
    length_function=len,
)
docs = text_splitter.split_documents(data)
print(f'Split into {len(docs)} docs')

Split into 289 docs


In [7]:
# Step 3: Embed
# https://api.python.langchain.com/en/latest/embeddings/langchain.embeddings.openai.OpenAIEmbeddings.html
# Build the embeddings object that is later handed to the vector store; the API
# key is the value read from the environment in the configuration cell.
# NOTE(review): the saved output of this cell shows a deprecation warning,
# presumably about this import path -- confirm against the installed langchain version.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

  warn_deprecated(


In [8]:
# Step 4: Store
# Connect to MongoDB, then resolve the database and the collection that will
# hold the documents together with their embeddings.
client = MongoClient(mongodb_conn_string)
database = client[db_name]
collection = database[collection_name]

In [9]:
# Reset w/out deleting the Search Index
# The empty filter {} matches every document, so this empties the collection
# while leaving the collection itself in place. Being the last expression in
# the cell, the DeleteResult is displayed as the cell output.
collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000313'), 'opTime': {'ts': Timestamp(1708858951, 13), 't': 787}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1708858951, 13), 'signature': {'hash': b'\xf7I\xee\x13\xb4\xc7\xdf7\xa9\xcf\x9e\xc6\xbf\xfd\xfa\x11"\xb20p', 'keyId': 7298821372183052290}}, 'operationTime': Timestamp(1708858951, 13)}, acknowledged=True)

In [10]:
# Embed every chunk and write it, with its vector, into the Atlas collection.
# Implementation reference:
# https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/mongodb_atlas.py
docsearch = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embeddings,
    collection=collection,
    index_name=index_name,
)

In [12]:
import argparse
from pymongo import MongoClient
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
import warnings

In [13]:
# Filter out the UserWarning from langchain
# Only UserWarning instances attributed to a module whose name matches
# "langchain.chains.llm" are silenced; other warnings are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module="langchain.chains.llm")

In [19]:
# Process arguments
parser = argparse.ArgumentParser(description='Atlas Vector Search Demo')
# er.add_argument('-q', '--question', help="The question to ask")
args = parser.parse_args()

if args.question is None:
    # Some questions to try...
    query = "How big is the telecom company?"
    query = "Who started AT&T?"
    #query = "Where is AT&T based?"
    #query = "What venues are AT&T branded?"
    #query = "How big is BofA?"
    #query = "When was the financial institution started?"
    #query = "Does the bank have an investment arm?"
    #query = "Where does the bank's revenue come from?"
    #query = "Tell me about charity."
    #query = "What buildings are BofA branded?"

else:
    query = args.question

print("\nYour question:")
print("-------------")
print(query)

usage: ipykernel_launcher.py [-h]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/ubuntu/.local/share/jupyter/runtime/kernel-v2-343279rsxyOmoNc1u.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Initialize MongoDB python client
# Connection settings are the notebook-level variables set in the configuration
# cell (mongodb_conn_string, db_name, collection_name, index_name,
# openai_api_key); this notebook does not import a `params` module.
client = MongoClient(mongodb_conn_string)
collection = client[db_name][collection_name]

# initialize vector store
vectorStore = MongoDBAtlasVectorSearch(
    collection, OpenAIEmbeddings(openai_api_key=openai_api_key), index_name=index_name
)

# perform a similarity search between the embedding of the query and the embeddings of the documents
print("\nQuery Response:")
print("---------------")
# The result-count keyword is lowercase `k`; an uppercase `K` is not the
# parameter that limits the number of returned documents.
# NOTE: this rebinds `docs`, which earlier in the notebook holds the split chunks.
docs = vectorStore.max_marginal_relevance_search(query, k=1)

# Guard against an empty result (e.g. empty collection or index not ready yet)
# instead of failing with IndexError on docs[0].
if docs:
    print(docs[0].metadata['title'])
    print(docs[0].page_content)
else:
    print("No matching documents found.")

# Contextual Compression
# An LLM-based extractor wraps the vector store retriever: documents are
# retrieved first and then passed through the compressor for this query.
llm = OpenAI(openai_api_key=openai_api_key, temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorStore.as_retriever()
)

print("\nAI Response:")
print("-----------")
compressed_docs = compression_retriever.get_relevant_documents(query)
# The retriever may return no documents, so check before indexing.
if compressed_docs:
    print(compressed_docs[0].metadata['title'])
    print(compressed_docs[0].page_content)
else:
    print("No relevant content found.")
