<a href="https://colab.research.google.com/github/AhlemAmmar/AI-Powered-FAQ-Bot-RAG-based-/blob/main/AI_Powered_HTML_Website_(RAG_based).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crawl an HTML-Based Website

In [59]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque



In [60]:


def _paragraph_text(soup):
    """Return the non-empty ``<p>`` texts of a parsed page, one per line."""
    stripped = (p.get_text(strip=True) for p in soup.select("p"))
    return "\n".join(text for text in stripped if text)


def _in_scope_links(soup, page_url, scope_prefix, min_depth, max_depth):
    """Yield normalised URLs of the page's links that fall inside the crawl scope."""
    for link in soup.find_all("a", href=True):
        try:
            # Resolve the complete href (not just its path) so that links to
            # other hosts keep their own host and are rejected by the prefix
            # check below instead of being rewritten onto the current site.
            resolved = urlparse(urljoin(page_url, link["href"]))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link only.
            continue

        # Depth = number of path segments of the *resolved* URL, so relative
        # and absolute hrefs pointing at the same page are measured alike.
        depth = len(resolved.path.strip("/").split("/"))
        if not min_depth <= depth <= max_depth:
            continue

        # Drop params/query/fragment so variants of one page collapse into a
        # single URL and are not crawled twice.
        full_url = resolved._replace(params="", query="", fragment="").geturl()
        if full_url.startswith(scope_prefix):
            yield full_url


def scrape_wikibooks_paragraphs(start_url, visited_pages, max_pages=100, levelMin=2, levelMax=3):
    """Breadth-first crawl of the pages below ``start_url``, collecting paragraph text.

    Despite its name the function is not specific to Wikibooks: it fetches
    ``start_url``, then follows every link whose resolved URL starts with
    ``start_url + "/"`` and whose path depth lies between ``levelMin`` and
    ``levelMax``.

    Args:
        start_url: Page where the crawl starts; it also defines the crawl
            scope (only URLs strictly below it are followed).
        visited_pages: List that receives one dict per successfully fetched
            page. It is mutated in place, so results from several calls can
            be accumulated in the same list.
        max_pages: Maximum number of pages to scrape successfully in this call.
        levelMin: Smallest accepted number of URL path segments for a link.
        levelMax: Largest accepted number of URL path segments for a link.
            Depths between the two bounds are accepted as well.

    Returns:
        The same ``visited_pages`` list, extended with dicts of the form
        ``{"url": str, "response": int, "content": str}`` where ``content``
        is the text of all non-empty ``<p>`` elements joined by newlines.

    Network and parsing errors are printed and the affected page is skipped;
    they never abort the crawl. Every URL is requested at most once per call.
    """
    # Tolerate a start URL written with a trailing slash.
    scope_prefix = start_url.rstrip("/") + "/"
    min_depth, max_depth = min(levelMin, levelMax), max(levelMin, levelMax)

    # Some sites refuse the default python-requests user agent.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    queue = deque([start_url])
    # Every URL ever enqueued. Tracking this (rather than only the pages that
    # succeeded) keeps duplicates out of the queue and prevents a failing URL
    # from being requested again each time another page links to it.
    seen = {start_url}
    scraped_count = 0

    while queue and scraped_count < max_pages:
        url = queue.popleft()
        print(f'visited webPage n°{scraped_count + 1}: {url}')

        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code != 200:
                continue
            soup = BeautifulSoup(resp.text, "html.parser")
            page_text = _paragraph_text(soup)
            child_urls = list(
                _in_scope_links(soup, url, scope_prefix, min_depth, max_depth)
            )
        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not stop the crawl.
            print("Error:", e)
            continue

        # Commit only after the whole page was processed, so a failure never
        # leaves a half-recorded page behind.
        for child_url in child_urls:
            if child_url not in seen:
                seen.add(child_url)
                queue.append(child_url)

        visited_pages.append(
            {
                "url": url,
                "response": resp.status_code,
                "content": page_text,
            }
        )
        scraped_count += 1

    return visited_pages

if __name__ == "__main__":
    # Accumulates one {"url", "response", "content"} dict per scraped page,
    # across all start URLs; later cells read this list.
    website_scrapped=[]
    urls=["https://www.discovertunisia.com/en/tunisia-all-year-round"]
    for url in urls:
        # The scraper appends to the list it receives and returns that same list.
        website_scrapped = scrape_wikibooks_paragraphs(url,website_scrapped)




visited webPage n°1: https://www.discovertunisia.com/en/tunisia-all-year-round
visited webPage n°2: https://www.discovertunisia.com/en/tunisia-all-year-round/winter
visited webPage n°3: https://www.discovertunisia.com/en/tunisia-all-year-round/spring
visited webPage n°4: https://www.discovertunisia.com/en/tunisia-all-year-round/summer
visited webPage n°5: https://www.discovertunisia.com/en/tunisia-all-year-round/autumn


In [61]:
website_scrapped

[{'url': 'https://www.discovertunisia.com/en/tunisia-all-year-round',
  'response': 200,
  'content': 'Mild and sunny, the winter is perfect for long walks, horse rides, drives and motorbike trips. It’s the ideal season for exploring the Sahara or visiting medinas, museums and archaeological sites. Alternatives include a spa treatment and fitness regime or a round of golf on one of Tunisia’s greens.\nThe season of light and greenery, it is springtime in Tunisia that inspired the painter, Paul Klee, during his famous trip to the country. Make the most of this beautiful season for a break by the water, to practise your favourite water sport or to explore sites and locations imbued with history.\nIn the summer, Tunisia is the ultimate seaside destination: endless beaches, hotels and clubs to suit everyone and bustling tourist resorts. In summer, the country really comes alive at night, with plenty of opportunities to let your hair down.\nMake the most of the mild autumns for a swim in the

## Set Up the RAG Pipeline

In [62]:
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph

In [63]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

In [64]:
from google.colab import userdata
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"]=userdata.get('LANGSMITH_API_KEY')

In [65]:
%pip install -qU "langchain[google-genai]"

In [66]:
if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')


In [67]:
from langchain.chat_models import init_chat_model
# chat model
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

In [68]:
%pip install -qU langchain-huggingface

In [69]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [70]:
%pip install -qU langchain-core

In [71]:
from langchain_core.vectorstores import InMemoryVectorStore
# In-memory vector store; it is given the embeddings model created earlier.
vector_store = InMemoryVectorStore(embeddings_model)

**We can use `WebBaseLoader`, a document loader that loads web pages into the LangChain `Document` format. It returns a list of `Document` objects (one per page), each containing the page's text as a single string.**

In [72]:
from langchain_community.document_loaders import BSHTMLLoader
# Load a single page with WebBaseLoader.
# NOTE(review): BSHTMLLoader is imported just above but is not used in this cell.
page_url = "https://www.discovertunisia.com/en/tunisia-all-year-round"
loader = WebBaseLoader(web_paths=[page_url])
docs = []
# A top-level `async for` is only accepted where top-level async code is
# supported (such as a notebook cell); in a plain Python script it is a
# syntax error and would have to live inside an `async def`.
async for doc in loader.alazy_load():
    docs.append(doc)

# Exactly one URL was requested, so exactly one Document is expected.
assert len(docs) == 1
doc = docs[0]

Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.35s/it]


In [73]:
print(f"{doc.metadata}\n")
print(doc.page_content[:500].strip())

{'source': 'https://www.discovertunisia.com/en/tunisia-all-year-round', 'title': 'Holidays in Tunisia - All year round - Discover Tunisia', 'description': "Holidays in Tunisia doesn't need a specific season to enjoy it, you can visit Tunisia all year round and any season, so you can have a very nice program any time of year in Tunisia don't hesitate to spend your holidays in Tunisia", 'language': 'No language found.'}

Holidays in Tunisia - All year round - Discover Tunisia


































Skip to main content


 









English 


Français
العربية
Italiano
English
Deutsch
中国
Pyc
český
Polski
Español












Contact us
Travel to Tunisia (Last update)
 






Discover Tunisia

Toggle navigation








It’s time for Tunisia English
Swedish  
Danish 

Discover
Tunisia all year round
Thematic Routes Cinematographic  Route
Culinary Route
UNESCO Route

Media center
Even


**However, we will use our custom HTML scraper instead: it retrieves the paragraph text from a web page and its first-level child pages, and leaves out the unimportant content.**

In [74]:
from langchain.docstore.document import Document

In [75]:
# Wrap every scraped page in a Document: the paragraph text becomes the
# page content and the page URL is kept as metadata.
docs = [
    Document(page_content=item["content"], metadata={"url": item["url"]})
    for item in website_scrapped
]

In [76]:
print(f"Total characters: {len(docs[0].page_content)}")

Total characters: 1189


In [77]:
# Splitting documents
#
# A TextSplitter is an object that splits a list of Documents into smaller
# chunks (it is a subclass of DocumentTransformer).

# Sizes are expressed in characters: each chunk holds at most 1000 of them
# and shares 200 with the chunk before it.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# The documents are split recursively on common separators (such as new
# lines) until every chunk has an appropriate size.
all_splits = text_splitter.split_documents(docs)

split_count = len(all_splits)
print(f"Split blog post into {split_count} sub-documents.")

Split blog post into 12 sub-documents.


In [78]:
# Storing documents

"""
 embed the contents of each document split and insert these embeddings into a vector store
 """
_ = vector_store.add_documents(documents=all_splits)

In [79]:
# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")

In [80]:
example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [81]:
"""
To use LangGraph, we need to define three things:

The state of our application;
The nodes of our application (i.e., application steps);
The "control flow" of our application (e.g., the ordering of the steps).
"""



# Define state for application
# The state of our application controls what data is input to the application, transferred between steps, and output by the application
class State(TypedDict):
    # Question supplied when the graph is invoked; read by both retrieve and generate.
    question: str
    # Documents returned by retrieve; generate joins their page_content into the prompt context.
    context: List[Document]
    # Text of the model response, written by generate.
    answer: str


# Define application steps : NODES

def retrieve(state: State):
    """Retrieval node: search the vector store with the question held in the state.

    Returns a partial state update whose ``"context"`` entry is the result
    of the similarity search.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question)}


def generate(state: State):
    """Generation node: answer the question from the retrieved context.

    The page contents of the context documents are joined with blank lines,
    filled into the prompt together with the question, and sent to the chat
    model. Returns a partial state update whose ``"answer"`` entry is the
    content of the model response.
    """
    context_chunks = [doc.page_content for doc in state["context"]]
    prompt_inputs = {
        "question": state["question"],
        "context": "\n\n".join(context_chunks),
    }
    llm_reply = llm.invoke(prompt.invoke(prompt_inputs))
    return {"answer": llm_reply.content}

In [82]:
# Control flow
## Compile application and test
# Register the two steps as a sequence: retrieve first, then generate.
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
# Make "retrieve" the entry point of the graph.
# NOTE(review): the node name is presumably derived from the function name - confirm against the LangGraph docs.
graph_builder.add_edge(START, "retrieve")
# Compile the builder into the object that is invoked with a question below.
graph = graph_builder.compile()

In [83]:
response = graph.invoke({"question": "what is the subject of documents"})
print(response["answer"])

The documents describe various attractions, activities, and landscapes in Tunisia, highlighting different seasons and regions. They cover romantic beaches, historic towns, natural beauty, archaeological sites, and sports like golf and hunting. Therefore, the subject of the documents is tourism and travel in Tunisia.
