In [1]:
!pip install beautifulsoup4 requests
!pip install langchain sentence-transformers faiss-cpu
!pip install transformers langdetect nltk




In [2]:
pip install -U langchain-community



In [3]:
import requests
from bs4 import BeautifulSoup
import re
from langdetect import detect
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import os


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
def scrape_books(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        books = []
        book_elements = soup.select('article.product_pod')

        for book in book_elements:
            title = book.h3.a['title']
            price = book.select_one('.price_color').text
            books.append(f"Book: {title} - Price: {price}")

        return "\n".join(books)

    except Exception as e:
        print(f"Error scraping website: {e}")
        return ""


In [21]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[\r\n\t]', ' ', text)
    return text.strip()


In [22]:
def split_text_into_chunks(text):
    if not text:
        return []
    sentences = text.split('\n')
    chunks = [sentence.strip() for sentence in sentences if sentence.strip()]
    return chunks


In [23]:
website_url = "https://books.toscrape.com/"
website_text = scrape_books(website_url)

cleaned_text = clean_text(website_text)
chunks = split_text_into_chunks(cleaned_text)

if not chunks:
    raise ValueError("❌ No products found! Please check the scraping logic or website structure.")

print(f"✅ Total Chunks Retrieved: {len(chunks)}")
print(f"Sample Chunk: {chunks[0]}")


✅ Total Chunks Retrieved: 1
Sample Chunk: Book: A Light in the Attic - Price: Â£51.77 Book: Tipping the Velvet - Price: Â£53.74 Book: Soumission - Price: Â£50.10 Book: Sharp Objects - Price: Â£47.82 Book: Sapiens: A Brief History of Humankind - Price: Â£54.23 Book: The Requiem Red - Price: Â£22.65 Book: The Dirty Little Secrets of Getting Your Dream Job - Price: Â£33.34 Book: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull - Price: Â£17.93 Book: The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics - Price: Â£22.60 Book: The Black Maria - Price: Â£52.15 Book: Starving Hearts (Triangular Trade Trilogy, #1) - Price: Â£13.99 Book: Shakespeare's Sonnets - Price: Â£20.66 Book: Set Me Free - Price: Â£17.46 Book: Scott Pilgrim's Precious Little Life (Scott Pilgrim #1) - Price: Â£52.29 Book: Rip it Up and Start Again - Price: Â£35.02 Book: Our Band Could Be Your Life: Scenes from the American Indie Underground, 19

In [24]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

texts = chunks
metadatas = [{"source": f"product_{i}"} for i in range(len(texts))]

vectordb = FAISS.from_texts(texts, embedding=embedding_model, metadatas=metadatas)

print("✅ Vectorstore created successfully!")


✅ Vectorstore created successfully!


In [25]:
qa_model_pipeline = pipeline("text2text-generation", model="google/flan-t5-small", max_new_tokens=300)

llm = HuggingFacePipeline(pipeline=qa_model_pipeline)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
    return_source_documents=True
)

print("✅ QA Chain initialized successfully!")


Device set to use cpu


✅ QA Chain initialized successfully!


In [26]:
def chat():
    print("💬 Chatbot ready! Type 'exit' to quit.")
    while True:
        query = input("You: ")
        if query.lower() == 'exit':
            break
        result = qa_chain.invoke({"query": query})
        answer = result['result']
        print(f"Bot: {answer}\n")


In [28]:
chat()


💬 Chatbot ready! Type 'exit' to quit.
You: "How much is 'The Grand Design
Bot: £20.66

You: "What books are available
Bot: Book: A Light in the Attic - Price £53.74 Book: Soumission - Price £50.10 Book: Sharp Objects - Price £47.82 Book: Sapiens: A Brief History of Humankind - Price £52.29 Book: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull - Price £52.29 Book: Set Me Free - Price £37.02 Book: Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991 - Price £37.59 Book: Olio - Price £23.88 Book: Libertarianism for Beginners - Price £45.17

You: How much is  Libertarianism for Beginners
Bot: £45.17

You: Which book is cheaper: "Tipping the Velvet" or "Soumission
Bot: Book: Soumission

You: Do you have any books about "Science
Bot: No

You: Can I buy books online from here
Bot: Yes

You: ecommend me a book to read
Bot: Book: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull

You: rec