In [None]:
!pip install requests beautifulsoup4 tldextract
!pip install langchain faiss-cpu transformers

*****************  Import Libraries  **********************

In [22]:
import requests
from bs4 import BeautifulSoup
import tldextract
from urllib.parse import urljoin, urlparse
import html2text
import requests

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import re

from langchain_groq import ChatGroq

from dotenv import load_dotenv
load_dotenv()

True

********************************  URL Input ********************************

In [23]:

url = 'https://python.langchain.com/docs/introduction/'
print("URL loaded:", url)


URL loaded: https://python.langchain.com/docs/introduction/


In [None]:

def scrape_website(url, max_pages=10):
    visited = set()
    to_visit = [url]
    extracted_text = ""

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        try:
            response = requests.get(current_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')
            visited.add(current_url)
            print(f"Scraping: {current_url}")

            # Extract visible text
            # page_text = ' '.join([p.get_text() for p in soup.find_all('p')])
            # extracted_text += page_text + "\n"

            # Instead of just 'p', target a wider set of content tags
            content_tags = ['p', 'h1', 'h2', 'h3', 'li', 'code', 'pre']
            all_content = []
            for tag_name in content_tags:
                all_content.extend(soup.find_all(tag_name))
                
            page_text = ' '.join([tag.get_text() for tag in all_content])
            extracted_text += page_text + "\n"

            # Extract internal links
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                full_url = urljoin(current_url, href)
                parsed = urlparse(full_url)
                base_domain = tldextract.extract(url).domain
                if base_domain in parsed.netloc and full_url not in visited:
                    to_visit.append(full_url)
        except Exception as e:
            print(f"Failed to scrape {current_url}: {e}")
    
    return extracted_text

# Execute scraping
scraped_text = scrape_website(url)

scrapped_file_data = "scraped_data.txt"
with open(scrapped_file_data, 'w', encoding='utf-8') as f:
    f.write(scraped_text)
print(f"\nSuccessfully saved scraped data to {scrapped_file_data}")

********************************  Scrap & clean the text  ********************************

In [54]:


# content_tags = ['p', 'h1', 'h2', 'h3', 'li', 'code', 'pre']
def scrape_website_focused_markdown(url, max_pages=10):
    visited = set()
    to_visit = [url]
    extracted_markdown = "" 
    total_url_collection = 1
    # Initialize the converter
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.body_width = 0 

    # Define the tags we want to keep
    content_tags = ['p', 'h1', 'h2', 'h3', 'code']

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        try:
            response = requests.get(current_url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')
            visited.add(current_url)
            print(f"\n🔎 Scraping new: {current_url}")

            # --- Focused HTML Extraction ---
            isolated_html = ""

            # Collect only desired tags, ignoring menu/sidebar
            all_content_tags = soup.find_all(content_tags)
            for tag in all_content_tags:
                if tag.find_parent(class_=lambda c: c and any(x in c.lower() for x in ["menu", "sidebar"])):
                    continue
                isolated_html += tag.prettify()

            # Convert extracted HTML to Markdown
            if isolated_html.strip():
                markdown_text = h.handle(isolated_html)
                extracted_markdown += f"\n\n--- Page: {current_url} ---\n\n"
                extracted_markdown += markdown_text
            
            # --- Link Extraction with user confirmation ---
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                full_url = urljoin(current_url, href)
                parsed = urlparse(full_url)
                base_domain = tldextract.extract(url).domain

                if total_url_collection < max_pages:
                    if base_domain in parsed.netloc and full_url not in visited:
                        choice = input(f"👉 Found link: {full_url}\nDo you want to crawl this link? (y/n): ").strip().lower()
                        if choice == "y":
                            to_visit.append(full_url)
                            total_url_collection = total_url_collection + 1
                            print("✅ Added to queue", full_url, total_url_collection)
                        else:
                            print("❌ Skipped", full_url, total_url_collection)

        except Exception as e:
            print(f"⚠️ Failed to scrape {current_url}: {e}")
    
    return extracted_markdown    

# Execute scraping
scraped_text = scrape_website_focused_markdown(url)



🔎 Scraping new: https://python.langchain.com/docs/introduction/
❌ Skipped https://python.langchain.com/docs/introduction/#__docusaurus_skipToContent_fallback 1
✅ Added to queue https://docs.langchain.com/oss/python/langchain/overview 2
❌ Skipped https://python.langchain.com/ 2
❌ Skipped https://python.langchain.com/docs/integrations/providers/ 2
❌ Skipped https://python.langchain.com/api_reference/ 2
❌ Skipped https://python.langchain.com/docs/contributing/ 2
❌ Skipped https://python.langchain.com/docs/people/ 2
✅ Added to queue https://python.langchain.com/docs/troubleshooting/errors/ 3
✅ Added to queue https://docs.smith.langchain.com 4
✅ Added to queue https://langchain-ai.github.io/langgraph/ 5
✅ Added to queue https://smith.langchain.com/hub 6
✅ Added to queue https://js.langchain.com 7
✅ Added to queue https://python.langchain.com/v0.2/docs/introduction 8
❌ Skipped https://python.langchain.com/v0.1/docs/get_started/introduction 8
✅ Added to queue https://chat.langchain.com 9
❌ S

In [55]:

def clean_scraped_text(text: str) -> str:
    """
    Clean scraped text:
    - Collapse multiple spaces into one (but keep newlines)
    - Collapse multiple blank lines into a single newline
    - Strip leading/trailing spaces
    """
    # Replace multiple spaces (but not newlines) with one
    text = re.sub(r'[ \t]+', ' ', text)

    # Collapse multiple blank lines into a single newline
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    # Strip spaces at line starts/ends
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = re.sub(r'\n[ \t]+', '\n', text)

    return text.strip()

clean_text   = clean_scraped_text(scraped_text)

scrapped_file_data = "scraped_data.txt"
with open(scrapped_file_data, 'w', encoding='utf-8') as f:
    f.write(clean_text)

print(f"\nSuccessfully saved scraped data to {scrapped_file_data}")


Successfully saved scraped data to scraped_data.txt


In [26]:
scraped_text = ''
scrapped_file_data = "scraped_data.txt"
with open(scrapped_file_data, 'r', encoding='utf-8') as f:
    scraped_text = f.read()

def chunk_text(text, chunk_size=1500, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)

# Chunk the scraped text
chunks = chunk_text(scraped_text)
print(f"Text split into {len(chunks)} chunks.")


Text split into 72 chunks.


In [None]:
chunks

In [57]:
# 🧠 Cell 6: Load Embedding Model
# Load embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("Embedding model loaded.")

Embedding model loaded.


In [None]:
# Create FAISS vector store
vectorstore = FAISS.from_texts(chunks, embedding_model)
print("Vector store created.")

Vector store created.


In [58]:
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the following extracted content to answer the question.
Answer in a clear, factual, and concise way. If the answer is not in the context, say "I don’t know."

Context:
{context}

Question:
{question}

Answer:
""")

map_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Use the following context to answer the question:

{context}

Question: {question}
Answer:
"""
)

# Prompt for the reduce step (combine answers)
combine_prompt = PromptTemplate(
    input_variables=["summaries", "question"],
    template="""
The following are answers from different documents:
{summaries}

Given the above, provide a final, concise answer to the question:

Question: {question}
Answer:
"""
)


question_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are given a document and a question. Use the document to answer.

Document:
{context}

Question: {question}
Answer:
"""
)

# Refine prompt (subsequent documents)
refine_prompt = PromptTemplate(
    input_variables=["existing_answer", "context", "question"],
    template="""
We have an existing answer: {existing_answer}

Here is another document that may help refine it:
{context}

Question: {question}

Update the answer if the document provides new useful information. 
If not, keep the original answer.

Refined Answer:
"""
)


rerank_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are given a document and a question.

Document:
{context}

Question: {question}

Provide:
1. An answer to the question (if the document is relevant).
2. A relevance score between 0 and 10 (higher means more relevant).

Format:
Answer: <your answer here>
Score: <number between 0 and 10>
"""
)


# =====================================================================
# 🔹 chain_type in RetrievalQA
# =====================================================================
#
# "stuff"
#   - Simplest method.
#   - All retrieved documents are stuffed (concatenated) into the prompt 
#     along with your query.
#   - Works well if documents are short and the number of tokens is small.
#
# ---------------------------------------------------------------------
#
# "map_reduce"
#   - Each retrieved document is first processed individually with the LLM (map step).
#   - Then the outputs are combined/summarized (reduce step).
#   - Better for handling many long documents, since it avoids hitting token limits.
#
# ---------------------------------------------------------------------
#
# "refine"
#   - Processes documents sequentially.
#   - Starts with the first document → generates an initial answer.
#   - Then refines that answer using each subsequent document.
#   - Useful when you want the model to incrementally improve its response.
#
# ---------------------------------------------------------------------
#
# "map_rerank"
#   - LLM scores each document separately for relevance and produces an answer.
#   - The best-scored answer is returned.
#   - Useful when documents may not all be relevant.
#
# =====================================================================


In [59]:
def load_qa_chain(vectorstore, model_name="google/flan-t5-base"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    hf_pipeline = pipeline("text2text-generation", model=model, tokenizeder=tokenizer)
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    
    qa_chain = RetrievalQA.from_chain_type(
                                            llm=llm,
                                            retriever=retriever,
                                            chain_type="stuff",  # This tells LangChain to use simple context stuffing
                                            chain_type_kwargs={"prompt": custom_prompt},
                                            return_source_documents=True
                                            )
    return qa_chain


def load_groq_qa_chain(vectorstore, model_name, chain_type):
    llm       = ChatGroq(model=model_name, temperature=0)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    if chain_type == "stuff":
        qa_chain = RetrievalQA.from_chain_type(
                                                llm=llm,
                                                retriever=retriever,
                                                chain_type="stuff",
                                                chain_type_kwargs={"prompt": custom_prompt},
                                                return_source_documents=True
                                                )
    elif chain_type == "map_reduce":
        qa_chain = RetrievalQA.from_chain_type(
                                                llm=llm,
                                                retriever=retriever,
                                                chain_type="map_reduce",
                                                chain_type_kwargs={
                                                                        "question_prompt": map_prompt,   # 👈 must be question_prompt
                                                                        "combine_prompt": combine_prompt
                                                                    },
                                                return_source_documents=True
                                            )

    elif chain_type == "refine":
        qa_chain = RetrievalQA.from_chain_type(
                                                llm=llm,
                                                retriever=retriever,
                                                chain_type="refine",
                                                chain_type_kwargs={
                                                    "question_prompt": question_prompt,
                                                    "refine_prompt": refine_prompt,
                                                    "document_variable_name": "context"  # 👈 matches your prompt template
                                                },
                                                return_source_documents=True
                                            )
    
    elif chain_type == "map_rerank":
        qa_chain = RetrievalQA.from_chain_type(
                                                llm=llm,
                                                retriever=retriever,
                                                chain_type="map_rerank",
                                                chain_type_kwargs={"prompt": rerank_prompt},
                                                return_source_documents=True
                                            )
    return qa_chain

# Load QA chain
# qa_chain = load_qa_chain(vectorstore, 'google/flan-t5-base')

print("QA chain ready.")

QA chain ready.


In [60]:
query  = input("Ask a question about the website: ")
print(query)

Compare Langchain, Langgraph  and Langsmith in bullet points


In [45]:
qa_groq_chain = load_groq_qa_chain(vectorstore, model_name="llama-3.1-8b-instant", chain_type="stuff")
result = qa_groq_chain({"query": query})
for docs in result['source_documents']:
    print(docs)
print("Answer:", result['result'])

page_content="Check out [ LangGraph-specific how-tos here ](https://langchain-ai.github.io/langgraph/how-tos/) .\n\n### [ Conceptual guide ](/docs/concepts/) \u200b\n\nIntroductions to all the key parts of LangChain you’ll need to know! [ Here ](/docs/concepts/) you'll find high level explanations of all LangChain concepts.\n\nFor a deeper dive into LangGraph concepts, check out [ this page ](https://langchain-ai.github.io/langgraph/concepts/) .\n\n### [ Integrations ](/docs/integrations/providers/) \u200b\n\nLangChain is part of a rich ecosystem of tools that integrate with our framework and build on top of it. If you're looking to get up and running quickly with [ chat models ](/docs/integrations/chat/) , [ vector stores ](/docs/integrations/vectorstores/) , or other LangChain components from a specific provider, check out our growing list of [ integrations ](/docs/integrations/providers/) .\n\n### [ API reference ](https://python.langchain.com/api_reference/) \u200b\n\nHead to the r

In [44]:

qa_groq_chain = load_groq_qa_chain(vectorstore, model_name="llama-3.1-8b-instant", chain_type="map_reduce")
result = qa_groq_chain({"query": query})

for docs in result['source_documents']:
    print(docs)

print("Answer:", result['result'])

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1502 > 1024). Running this sequence through the model will result in indexing errors


page_content="Check out [ LangGraph-specific how-tos here ](https://langchain-ai.github.io/langgraph/how-tos/) .\n\n### [ Conceptual guide ](/docs/concepts/) \u200b\n\nIntroductions to all the key parts of LangChain you’ll need to know! [ Here ](/docs/concepts/) you'll find high level explanations of all LangChain concepts.\n\nFor a deeper dive into LangGraph concepts, check out [ this page ](https://langchain-ai.github.io/langgraph/concepts/) .\n\n### [ Integrations ](/docs/integrations/providers/) \u200b\n\nLangChain is part of a rich ecosystem of tools that integrate with our framework and build on top of it. If you're looking to get up and running quickly with [ chat models ](/docs/integrations/chat/) , [ vector stores ](/docs/integrations/vectorstores/) , or other LangChain components from a specific provider, check out our growing list of [ integrations ](/docs/integrations/providers/) .\n\n### [ API reference ](https://python.langchain.com/api_reference/) \u200b\n\nHead to the r

In [51]:
qa_groq_chain = load_groq_qa_chain(vectorstore, model_name="llama-3.1-8b-instant", chain_type="refine")
result = qa_groq_chain({"query": query})

for docs in result['source_documents']:
    print(docs)

print("Answer:", result['result'])

page_content="Check out [ LangGraph-specific how-tos here ](https://langchain-ai.github.io/langgraph/how-tos/) .\n\n### [ Conceptual guide ](/docs/concepts/) \u200b\n\nIntroductions to all the key parts of LangChain you’ll need to know! [ Here ](/docs/concepts/) you'll find high level explanations of all LangChain concepts.\n\nFor a deeper dive into LangGraph concepts, check out [ this page ](https://langchain-ai.github.io/langgraph/concepts/) .\n\n### [ Integrations ](/docs/integrations/providers/) \u200b\n\nLangChain is part of a rich ecosystem of tools that integrate with our framework and build on top of it. If you're looking to get up and running quickly with [ chat models ](/docs/integrations/chat/) , [ vector stores ](/docs/integrations/vectorstores/) , or other LangChain components from a specific provider, check out our growing list of [ integrations ](/docs/integrations/providers/) .\n\n### [ API reference ](https://python.langchain.com/api_reference/) \u200b\n\nHead to the r