# Assingment 3.1

This notebook contains the solution and outputs for **Assingment 3.1**.

## Setup and Imports

In [3]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import pipeline
import time
from urllib.parse import urljoin

# Initialize components
try:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    chroma_client = chromadb.Client()
    # Check if collection exists, delete and recreate or reuse
    collection_name = "web_content"
    try:
        collection = chroma_client.get_collection(collection_name)
        print(f"Collection '{collection_name}' already exists. Reusing it.")
    except:
        print(f"Creating new collection '{collection_name}'.")
        chroma_client.delete_collection(collection_name)  # Delete if exists
        collection = chroma_client.create_collection(collection_name)
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=-1)  # Explicitly use CPU
    print("Components initialized successfully.")
except Exception as e:
    print(f"Error initializing components: {e}")
    raise

# Function to crawl a website and extract text
def crawl_website(url, max_pages=5):
    visited = set()
    to_visit = [url]
    documents = []
    
    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        try:
            headers = {"User-Agent": "Mozilla/5.0"}  # Added to avoid rate-limiting
            response = requests.get(current_url, timeout=5, headers=headers)
            if response.status_code != 200:
                print(f"Skipping {current_url}: Status code {response.status_code}")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract text from paragraphs
            text = ' '.join([p.get_text().strip() for p in soup.find_all('p') if p.get_text().strip()])
            if text:
                documents.append({"url": current_url, "text": text})
                print(f"Crawled {current_url}: {len(text)} characters extracted")
            else:
                print(f"No text extracted from {current_url}")
            
            # Find links to follow
            for link in soup.find_all('a', href=True):
                next_url = urljoin(current_url, link['href'])
                if next_url.startswith(url) and next_url not in visited:
                    to_visit.append(next_url)
            
            visited.add(current_url)
        except Exception as e:
            print(f"Error crawling {current_url}: {e}")
    
    if not documents:
        print("Warning: No documents were crawled.")
    return documents

# Function to store documents in vector database
def store_documents(documents):
    if not documents:
        print("No documents to store.")
        return
    for i, doc in enumerate(documents):
        try:
            embedding = embedder.encode(doc["text"]).tolist()
            collection.add(
                documents=[doc["text"]],
                embeddings=[embedding],
                metadatas=[{"url": doc["url"]}],
                ids=[f"doc_{i}"]
            )
            print(f"Stored document {i} from {doc['url']}")
        except Exception as e:
            print(f"Error storing document {i} from {doc['url']}: {e}")

# Function to retrieve relevant documents and answer question
def answer_question(question):
    try:
        query_embedding = embedder.encode(question).tolist()
        results = collection.query(query_embeddings=[query_embedding], n_results=2)
        documents = results["documents"][0] if results["documents"] else []
        
        if not documents:
            print(f"No documents retrieved for question: {question}")
            return "No relevant information found.", ""
        
        context = " ".join(documents)
        if not context.strip():
            print(f"Empty context for question: {question}")
            return "No relevant information found.", ""
        
        print(f"Context length for question '{question}': {len(context)} characters")
        answer = qa_pipeline(question=question, context=context)
        return answer["answer"], context
    except Exception as e:
        print(f"Error answering question '{question}': {e}")
        return "Error processing question.", ""

# Main pipeline
def rag_pipeline(start_url, questions):
    print("Crawling website...")
    start_time = time.time()
    documents = crawl_website(start_url)
    print(f"Crawled {len(documents)} pages in {time.time() - start_time:.2f} seconds")
    
    print("Storing documents...")
    store_documents(documents)
    
    print("Answering questions...")
    results = []
    for question in questions:
        answer, context = answer_question(question)
        results.append({"question": question, "answer": answer, "context": context})
    
    return results

# Example usage
if __name__ == "__main__":
    try:
        # Example website and questions
        start_url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
        questions = [
            "What is artificial intelligence?",
            "What are the applications of AI?",
            "Who invented the term AI?"
        ]
        
        results = rag_pipeline(start_url, questions)
        for result in results:
            print(f"Question: {result['question']}")
            print(f"Answer: {result['answer']}")
            print(f"Context: {result['context'][:200]}..." if result['context'] else "Context: None")
            print("-" * 50)
        
        # Evaluation (simplified)
        print("Evaluation: Manual inspection required for real datasets like Natural Questions")
    except Exception as e:
        print(f"Error running RAG pipeline: {e}")

Collection 'web_content' already exists. Reusing it.


Device set to use cpu


Components initialized successfully.
Crawling website...
Crawled https://en.wikipedia.org/wiki/Artificial_intelligence: 86139 characters extracted
Crawled https://en.wikipedia.org/wiki/Artificial_intelligence#bodyContent: 86139 characters extracted
Crawled https://en.wikipedia.org/wiki/Artificial_intelligence#Goals: 86139 characters extracted
Crawled https://en.wikipedia.org/wiki/Artificial_intelligence#Reasoning_and_problem-solving: 86139 characters extracted
Crawled https://en.wikipedia.org/wiki/Artificial_intelligence#Knowledge_representation: 86139 characters extracted
Crawled 5 pages in 9.19 seconds
Storing documents...
Stored document 0 from https://en.wikipedia.org/wiki/Artificial_intelligence
Stored document 1 from https://en.wikipedia.org/wiki/Artificial_intelligence#bodyContent
Stored document 2 from https://en.wikipedia.org/wiki/Artificial_intelligence#Goals
Stored document 3 from https://en.wikipedia.org/wiki/Artificial_intelligence#Reasoning_and_problem-solving
Stored docu