In [None]:
!pip install sentence-transformers

In [None]:
# ===================== STEP 1: IMPORTS =====================
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import requests
import numpy as np
import time

# ===================== STEP 2: FETCH & CLEAN WEBPAGE =====================
def fetch_text(url):
    """Download a web page and return its visible text.

    The page is requested with a browser-like User-Agent, parsed with
    BeautifulSoup's ``html.parser``, stripped of ``<script>`` and ``<style>``
    elements, and flattened to a single space-separated string.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text, or an empty string if the request fails, the
        server answers with an HTTP error status, or parsing raises.
    """
    print(f"Fetching: {url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/114.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=20)
        # Treat 4xx/5xx responses as failures; otherwise the body of an error
        # page (e.g. "404 Not Found") would be returned as if it were content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove script/style elements so their source does not end up in the text.
        for tag in soup(["script", "style"]):
            tag.extract()

        text = soup.get_text(separator=' ', strip=True)
        print("Text length:", len(text))
        return text
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported and turned
        # into an empty result that the caller checks for.
        print(" Error while fetching:", e)
        return ""

# ===================== STEP 3: CHUNK TEXT =====================
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of *text*; empty when *text* is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the step zero or negative and the
        # loop below would never terminate; a negative overlap would silently
        # skip characters between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# ===================== STEP 4: CREATE EMBEDDINGS =====================
def create_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """Load a SentenceTransformer model and encode every chunk with it.

    Args:
        chunks: Sequence of text chunks to encode.
        model_name: Name passed to ``SentenceTransformer`` to select the model.

    Returns:
        A ``(model, embeddings)`` pair, where ``embeddings`` is the encoder
        output wrapped in ``np.array`` (presumably one vector per chunk).
        ``(None, None)`` is returned when *chunks* is empty, in which case
        no model is loaded.
    """
    has_chunks = len(chunks) > 0
    if not has_chunks:
        return None, None

    print("🔹 Loading embedding model...")
    encoder = SentenceTransformer(model_name)

    print("🔹 Creating embeddings...")
    vectors = np.array(encoder.encode(chunks, show_progress_bar=True))
    return encoder, vectors

# ===================== STEP 5: ANSWER QUESTIONS =====================
def answer_question(question, chunks, embeddings, model, top_k=3, threshold=0.25):
    """Retrieve the chunks most similar to *question*.

    The question is encoded with *model*, compared against *embeddings* by
    cosine similarity, and the ``top_k`` highest-scoring chunks are returned
    in descending order of similarity.

    Args:
        question: Question text to encode.
        chunks: Text chunks, indexed in step with the rows of *embeddings*.
        embeddings: Chunk embeddings, or ``None`` if none were created.
        model: Object exposing ``encode(list_of_str)``.
        top_k: Maximum number of chunks to return; must be at least 1.
        threshold: Minimum similarity the best match must reach. Only the
            best match is tested; lower-ranked chunks are returned even if
            they fall below it.

    Returns:
        A ``(answer_text, retrieved)`` pair. ``answer_text`` is the selected
        chunks joined by ``"\\n\\n---\\n\\n"`` and ``retrieved`` is a list of
        ``(chunk, similarity)`` tuples. When nothing can be answered,
        ``answer_text`` is an explanatory message and ``retrieved`` is empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if len(chunks) == 0 or embeddings is None:
        return " No chunks available to answer questions.", []

    if top_k < 1:
        # A slice of [-0:] would select every chunk and a negative top_k an
        # arbitrary subset, so reject these instead of returning wrong results.
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    q_emb = model.encode([question])
    try:
        sims = cosine_similarity(q_emb, embeddings)[0]
    except ValueError:
        return " Cannot compute similarity; embeddings might be empty.", []

    # Indices of the top_k highest similarities, best first.
    top_idx = sims.argsort()[::-1][:top_k]

    if sims[top_idx[0]] < threshold:
        return " Not enough information found in the crawled content.", []

    retrieved = [(chunks[i], sims[i]) for i in top_idx]
    answer_text = "\n\n---\n\n".join(chunk for chunk, _ in retrieved)
    return answer_text, retrieved

# ===================== STEP 6: MAIN PROGRAM =====================
if __name__ == "__main__":
    # Interactive driver: fetch one page, index it, then answer questions
    # from the console until the user types 'exit' or closes the input.
    url = input("Enter starting website URL: ").strip()
    text = fetch_text(url)

    if not text:
        print(" No text fetched. Exiting.")
        # exit() is a convenience injected by the `site` module and is not
        # guaranteed to exist; SystemExit always works. Non-zero status
        # signals the failure to the calling shell.
        raise SystemExit(1)

    chunks = chunk_text(text)
    print(f" Total chunks created: {len(chunks)}")

    model, embeddings = create_embeddings(chunks)
    if embeddings is None:
        print(" Embeddings could not be created. Exiting.")
        raise SystemExit(1)

    while True:
        try:
            question = input("\nAsk a question (or type 'exit'): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session instead of a traceback.
            break
        if question.lower() == 'exit':
            break
        if not question:
            # Nothing to search for; ask again rather than embedding "".
            continue

        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()
        answer, sources = answer_question(question, chunks, embeddings, model)
        end_time = time.perf_counter()

        print("\n==================== ANSWER ====================\n")
        print(answer[:2000])  # Limit output length for readability

        if sources:
            print("\n==================== SOURCES ====================\n")
            for idx, (_, sim) in enumerate(sources, start=1):
                print(f"Chunk {idx} | similarity={sim:.3f}")
        print("\n==================== TIMING ====================\n")
        print(f"Total time: {end_time - start_time:.2f} sec")
        print("\n================================================\n")
