In [None]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Initialize the Hugging Face pipeline for Question Answering.
# No explicit model is given, so the checkpoint is whichever one the
# transformers library selects by default for the "question-answering" task.
qa_pipeline = pipeline("question-answering")

# Initialize Embedding Model.
# The same model encodes document chunks (store_embeddings) and user
# queries (query_pipeline), so both live in the same vector space.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to Crawl and Scrape Website Content
def scrape_website(url, timeout=10):
    """Fetch *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a silent host.

    Returns:
        The text of every paragraph joined with single spaces, or ``""``
        when the request fails or the server answers with a non-200 status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable site must not abort ingestion of the remaining URLs.
        print(f"Failed to fetch {url}. Error: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch {url}. Status code: {response.status_code}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    # Extract text from paragraphs
    return " ".join(p.get_text() for p in soup.find_all("p"))

# Function to Segment Content into Chunks
def segment_content(text, max_chunk_size=500):
    """Split *text* into chunks of at most *max_chunk_size* words.

    The text is tokenised on whitespace, so runs of spaces and newlines are
    collapsed; every chunk is its words re-joined with single spaces.

    Args:
        text: Raw text to segment.
        max_chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in document order (empty for blank text).
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_chunk_size):
        window = tokens[start:start + max_chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Function to Store Embeddings in FAISS
def store_embeddings(urls):
    """Scrape each URL, embed its chunks and add them to a new FAISS L2 index.

    Args:
        urls: Iterable of page URLs to ingest.

    Returns:
        A tuple ``(index, ids, embeddings, metadata)``. Entry ``n`` of
        ``ids``, ``embeddings`` and ``metadata`` describes the ``n``-th
        vector added to ``index``, so a FAISS result label can be used
        directly as a position in ``metadata``.
    """
    # Ask the model for its output size instead of hard-coding it;
    # 384 (all-MiniLM-L6-v2) is kept as the fallback.
    dimension = embedding_model.get_sentence_embedding_dimension() or 384
    index = faiss.IndexFlatL2(dimension)
    ids = []
    embeddings = []
    metadata = []

    for url in urls:
        print(f"Scraping: {url}")
        content = scrape_website(url)
        if not content:
            continue

        print("Segmenting content...")
        chunks = segment_content(content)
        if not chunks:
            # Whitespace-only content yields no chunks; nothing to embed.
            continue

        print("Generating embeddings...")
        # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
        chunk_embeddings = np.ascontiguousarray(
            embedding_model.encode(chunks), dtype="float32"
        )
        # Add the whole batch at once rather than one vector per call.
        index.add(chunk_embeddings)

        for i, (chunk, chunk_embedding) in enumerate(zip(chunks, chunk_embeddings)):
            ids.append(f"{url}-{i}")
            embeddings.append(chunk_embedding)
            metadata.append({"url": url, "chunk_id": i, "text": chunk})

    return index, ids, embeddings, metadata

# Function to Perform Query and Generate Response
def query_pipeline(user_query, index, metadata, k=5):
    """Answer *user_query* from the chunks nearest to it in *index*.

    Args:
        user_query: Question text entered by the user.
        index: FAISS index holding the chunk embeddings.
        metadata: List of per-chunk dicts (each with a ``"text"`` key) whose
            positions correspond to the vector labels in ``index``.
        k: Maximum number of nearest chunks to use as context.

    Returns:
        The answer string extracted by the QA model, or an explanatory
        message when there is no indexed content to search.
    """
    no_content_message = "No indexed content is available to answer this question."
    if index.ntotal == 0 or not metadata:
        return no_content_message

    print("Generating query embedding...")
    query_embedding = np.asarray(
        embedding_model.encode([user_query]), dtype="float32"
    )

    print("Searching in FAISS...")
    # Never request more neighbours than the index holds.
    top_k = max(1, min(k, index.ntotal))
    _, neighbours = index.search(query_embedding, k=top_k)

    # FAISS pads missing results with label -1; indexing metadata with it
    # would silently pick the last chunk, so drop anything out of range.
    hits = [int(idx) for idx in neighbours[0] if 0 <= idx < len(metadata)]
    if not hits:
        return no_content_message

    # Construct context for QA model
    context = "\n".join(
        f"Chunk {rank}: {metadata[idx]['text']}"
        for rank, idx in enumerate(hits, start=1)
    )

    print("Querying Hugging Face QA model...")
    response = qa_pipeline(question=user_query, context=context)
    return response["answer"]

# Main Script
if __name__ == "__main__":
    # URLs to process
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/",
    ]

    # Step 1: Crawl, Embed, and Store in FAISS
    print("Starting data ingestion...")
    index, ids, embeddings, metadata = store_embeddings(urls)

    # Step 2: Query the System
    print("Ready for user queries.")
    while True:
        try:
            query = input("\nEnter your question (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
            break
        if not query:
            # Blank input is not a question; prompt again instead of querying.
            continue
        if query.lower() == "exit":
            break
        response = query_pipeline(query, index, metadata)
        print("\nResponse:")
        print(response)
