In [36]:


import os
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_mongodb import MongoDBAtlasVectorSearch
from loading_doc_helper import (
    load_pdf, load_youtube, load_link, 
    split_text, embed_and_upload,
    check_if_source_exists,
    DB_NAME, COLLECTION_NAME, embedding_model, client, ATLAS_VECTOR_SEARCH_INDEX_NAME
)

# Resolve the collection that is handed to the vector store below.
database = client[DB_NAME]
collection = database[COLLECTION_NAME]
os.environ["USER_AGENT"] = "TranslationProject/1.0"


# Chat model used to generate answers; settings are passed through as keyword arguments.
llm_settings = {"model_name": "gpt-3.5-turbo", "temperature": 0}
llm = ChatOpenAI(**llm_settings)

# Vector store built from the collection, the imported embedding model and the index name.
vector_db = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embedding_model,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

# Fetch the retrieval QA chat prompt from LangChain Hub.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Build the document-combining chain first, then wrap it with the store's retriever.
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retriever = vector_db.as_retriever()
rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Invoke the chain once with a sample question and show the raw result.
sample_query = {"input": "Who is Donald Trump"}
output = rag_chain.invoke(sample_query)

print(output)

2025-10-03 12:05:33,421 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'input': 'Who is Donald Trump', 'context': [Document(id='68df86a8534fd1af8b22df15', metadata={'_id': '68df86a8534fd1af8b22df15', 'source': 'https://en.wikipedia.org/wiki/Donald_Trump', 'title': 'Donald Trump - Wikipedia', 'language': 'en'}, page_content="Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman who is the 47th president of the United States. A member of the Republican Party, he served as the 45th president from 2017 to 2021.\nBorn into a wealthy family in New York City, Trump graduated from the University of Pennsylvania in 1968 with a bachelor's degree in economics. He became the president of his family's real estate business in 1971, renamed it the Trump Organization, and began acquiring and building skyscrapers, hotels, casinos, and golf courses. He launched side ventures, many licensing the Trump name, and filed for six business bankruptcies in the 1990s and 2000s. From 2004 to 2015, he hosted the reality television show 

In [37]:
# Collect a "title - url" label for each document in the chain's context.
# `title` comes from the 'title' metadata key and `url` from the 'source' key.
raw_sources = []
for doc in output['context']:
    title = doc.metadata.get('title')
    url = doc.metadata.get('source')
    # Join only the parts that are present, so a missing title or URL is
    # omitted instead of being rendered as the literal text "None".
    label = " - ".join(part for part in (title, url) if part)
    if label:
        raw_sources.append(label)

# Last expression of the cell: display the generated answer.
output['answer']

'Donald John Trump is an American politician, media personality, and businessman who served as the 45th president of the United States from 2017 to 2021. He is a member of the Republican Party and was born on June 14, 1946, in New York City. Trump graduated from the University of Pennsylvania in 1968 with a bachelor\'s degree in economics and went on to become the president of the Trump Organization, a real estate business started by his family. He is known for his involvement in various business ventures, including building skyscrapers, hotels, casinos, and golf courses. Trump also gained fame as the host of the reality television show "The Apprentice" from 2004 to 2015.'

In [39]:
# Print a marker only when the chain output has a truthy "context" entry.
context_docs = output.get("context")
if context_docs:
    print("true")

true
