In [None]:
pip install langchain langchain-openai python-dotenv beautifulsoup4 google-search-results langchain_text_splitters langchain-chroma

In [22]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
import re

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Smoke-test the web loader on one known page before using it in the pipeline.
url = "https://en.wikipedia.org/wiki/PK_(film)"
loader = WebBaseLoader(url)
documents = loader.load()

# Show the text of the first loaded document.
first_document = documents[0]
print(first_document.page_content)

In [38]:
from langchain.prompts import ChatPromptTemplate


# Instruction text for the model; {question} is substituted when the chain is invoked.
TITLE_PROMPT_TEMPLATE = """
Extract the main movie or show name (title only, no extra words) from this question:

Question: "{question}"

Only return the movie/show name.
"""

prompt = ChatPromptTemplate.from_template(TITLE_PROMPT_TEMPLATE)

def extract_title_via_llm(question: str) -> str:
    """Ask the LLM for the movie/show title mentioned in ``question``.

    Pipes the module-level ``prompt`` into the module-level ``llm`` and
    invokes the resulting chain exactly once.

    Args:
        question: Free-form user question.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # NOTE(review): `llm` is not created in this cell; it must already exist
    # in the kernel when this function is called.
    chain = prompt | llm
    # Invoke once and reuse the reply; calling invoke a second time would
    # double the number of LLM requests for the same answer.
    result = chain.invoke({"question": question})
    return result.content.strip()

# Try the extractor on a sample question and show what it returns.
sample_question = "what happens at the end of game of thrones"
extracted_title = extract_title_via_llm(sample_question)
print("Extracted Title:", extracted_title)

Extracted Title: Game of Thrones


In [None]:
import os
from dotenv import load_dotenv
from langchain_community.utilities import SerpAPIWrapper


# Read SERP_API_KEY from the .env file / process environment.
load_dotenv()
serp_api_key = os.getenv("SERP_API_KEY")

# Search options handed to the SerpAPI wrapper.
params = {
    "num": 5,
    "gl": "us",
    "hl": "en",
    "engine": "google",
    "device": "desktop",
    "google_domain": "google.com"
}
search = SerpAPIWrapper(serpapi_api_key=serp_api_key, params=params)

# Build the query from a fixed example title.
extracted_title = "Inception"
query = f"{extracted_title} wikipedia"

# Run the search and keep only the organic hits (empty list when absent).
results = search.results(query)
organic_results = results.get("organic_results", [])

# Report each hit's title, URL and snippet, or say that nothing came back.
print("🔍 Top Organic Search Results:")
if organic_results:
    for position, hit in enumerate(organic_results, start=1):
        title = hit.get("title", "No Title")
        link = hit.get("link", "No Link")
        snippet = hit.get("snippet", "No Snippet")
        print(
            f"\n🔹 Result {position}",
            f"🔗 Title: {title}",
            f"🌐 URL: {link}",
            f"📝 Snippet: {snippet}",
            sep="\n",
        )
else:
    print("No organic results found.")


In [77]:
import os
from dotenv import load_dotenv
from langchain_community.utilities import SerpAPIWrapper
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.prompts import PromptTemplate
import chromadb
from uuid import uuid4

# Read keys and endpoints from the environment once and reuse them below.
load_dotenv()
serp_api_key = os.getenv("SERP_API_KEY")
azure_api_key = os.getenv("AZURE_OPENAI_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_version = "2024-02-15-preview"

# Chat model used for topic extraction and for answering.
llm = AzureChatOpenAI(
    openai_api_key=azure_api_key,
    openai_api_version=azure_api_version,
    azure_endpoint=azure_endpoint,
    deployment_name="gpt-4.1",
    temperature=0,
)

# Embedding model used when chunks are written to / queried from the vector store.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3-large",
    openai_api_key=azure_api_key,
    openai_api_version=azure_api_version,
    azure_endpoint=azure_endpoint,
    chunk_size=1000
)

# Search options handed to the SerpAPI wrapper.
params = {
    "num": 5,
    "gl": "us",
    "hl": "en",
    "engine": "google",
    "device": "desktop",
    "google_domain": "google.com"
}
search = SerpAPIWrapper(serpapi_api_key=serp_api_key, params=params)

# Topic-extraction prompt: condenses a user question into a short
# "<company/product> <business intent>" search phrase via few-shot examples.
TOPIC_PROMPT_TEMPLATE = """
You are an expert analyst assistant.

Extract the core business query topic from the user's question. Your answer should be short and precise, combining the company/product name and the business intent.

Examples:
- Question: "Is OpenAI profitable yet?" → Answer: "OpenAI profitability"
- Question: "What are the growth plans of SpaceX?" → Answer: "SpaceX future plans"
- Question: "How is Google's AI division performing?" → Answer: "Google AI performance"
- Question: "What is Apple's next product launch?" → Answer: "Apple product roadmap"

Now, analyze and extract the main topic:

Question: "{question}"

Return only the business query topic.
"""

prompt = ChatPromptTemplate.from_template(TOPIC_PROMPT_TEMPLATE)

def extract_title_via_llm(question: str) -> str:
    """Run ``question`` through the module-level ``prompt`` and ``llm``.

    Returns the model's reply text with leading/trailing whitespace removed.
    """
    response = (prompt | llm).invoke({"question": question})
    return response.content.strip()

def get_wikipedia_link(extracted_title: str) -> "str | None":
    """Search the web for ``extracted_title`` and return the best hit's URL.

    Despite the name, the query is restricted to TechCrunch, Bloomberg,
    Business Insider and Wikipedia, so the returned link may point to any of
    those sites.

    Args:
        extracted_title: Short topic phrase to search for.

    Returns:
        The link of the highest-ranked organic result that has a non-empty
        ``link`` field, or ``None`` when no such result exists.
    """
    query = f"{extracted_title} site:techcrunch.com OR site:bloomberg.com OR site:businessinsider.com OR site:wikipedia.org"
    results = search.results(query)
    # Treat a missing or null "organic_results" entry as "no hits".
    organic_results = results.get("organic_results") or []
    # Walk hits in rank order so a top hit without a link does not hide
    # usable lower-ranked hits.
    return next((hit["link"] for hit in organic_results if hit.get("link")), None)

def load_wikipedia_content(url: str) -> str:
    """Fetch ``url`` with ``WebBaseLoader`` and return the first document's text.

    Returns the literal string "No content loaded." when the loader yields
    no documents.
    """
    docs = WebBaseLoader(url).load()
    if not docs:
        return "No content loaded."
    return docs[0].page_content


def create_vectorstore(content: str, reset: bool = True):
    """Chunk ``content``, embed it, and store it in a persistent Chroma collection.

    The text is split into 1000-character chunks with 200 characters of
    overlap, each chunk is wrapped in a ``Document`` and written to the
    ``wikipedia_rag`` collection under ``./chroma_store``.

    Args:
        content: Raw page text to index.
        reset: When True (default) the collection is dropped before indexing
            so the store holds only chunks from ``content``. Pass False to
            keep appending to whatever earlier calls persisted.

    Returns:
        The ``Chroma`` vector store bound to the collection.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_texts = splitter.split_text(content)
    documents = [Document(page_content=text, metadata={"source": "Wikipedia"}) for text in split_texts]
    uuids = [str(uuid4()) for _ in documents]

    client = chromadb.PersistentClient(path="./chroma_store")
    collection_name = "wikipedia_rag"

    if reset:
        # Chunk ids are random, so every call adds new rows. Without a reset,
        # chunks from earlier pages (and duplicates from re-runs) stay in the
        # persistent collection and get retrieved for unrelated questions.
        # get_or_create first so the delete cannot fail on a fresh store.
        client.get_or_create_collection(collection_name)
        client.delete_collection(collection_name)

    vectorstore = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embeddings
    )

    # Skip the write when the splitter produced no chunks (e.g. empty content).
    if documents:
        vectorstore.add_documents(documents=documents, ids=uuids)
    return vectorstore

def run_rag_pipeline(user_question: str):
    """Answer ``user_question`` with a search -> load -> index -> retrieve flow.

    Steps:
        1. Condense the question into a search topic with the LLM.
        2. Search for a source page and take its URL.
        3. Download the page text.
        4. Index the text in the vector store.
        5. Run a "stuff" RetrievalQA chain over it and print the answer.

    Progress and the final answer are printed; nothing is returned. The
    function returns early when no source URL is found.

    Args:
        user_question: The question to answer.
    """
    title = extract_title_via_llm(user_question)
    print("🎬 Extracted Title:", title)

    wiki_url = get_wikipedia_link(title)
    print("🔗 Wikipedia URL:", wiki_url)

    if not wiki_url:
        print("❌ Could not find Wikipedia link.")
        return

    # Download the page text. This step was missing, so `content` below was
    # undefined and the function raised NameError on a fresh kernel.
    content = load_wikipedia_content(wiki_url)
    print(f"📄 Loaded {len(content)} characters from Wikipedia.")

    vectorstore = create_vectorstore(content)
    retriever = vectorstore.as_retriever()

    # Prompt
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""Use the following context to answer the question as accurately as possible.

    Context:
    {context}

    Question: {question}

    Answer:"""
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=False
    )

    # invoke() replaces the deprecated run(); RetrievalQA reads the question
    # from the "query" key and puts the answer text under "result".
    answer = qa_chain.invoke({"query": user_question})["result"]

    print("\n💬 Answer:")
    print(answer)


# 🔧 End-to-end example: run the whole pipeline on one sample question.
example_question = "What is Apple's next product launch?"
run_rag_pipeline(example_question)


🎬 Extracted Title: Apple product roadmap
🔗 Wikipedia URL: https://en.wikipedia.org/wiki/List_of_Apple_products
📄 Loaded 55658 characters from Wikipedia.

💬 Answer:
Based on the provided context, the next Apple product launch is scheduled for **July 15, 2024**, with the release of the **HomePod Mini (midnight)**.
