In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

class SimpleCrawler:
    def __init__(self, base_url, max_pages=50):
        self.base_url = base_url
        self.visited = set()
        self.to_visit = [base_url]
        self.max_pages = max_pages
        self.documents = {}  # doc_id -> text

    def crawl(self):
        while self.to_visit and len(self.visited) < self.max_pages:
            url = self.to_visit.pop(0)
            if url in self.visited:
                continue

            print(f"Crawling: {url}")
            try:
                response = requests.get(url, timeout=5)
                if response.status_code != 200:
                    continue
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract text (simple)
                text = soup.get_text(separator=' ', strip=True)

                # Store doc
                doc_id = len(self.visited)
                self.documents[doc_id] = {'url': url, 'text': text}

                self.visited.add(url)

                # Find new links within the same domain
                base_domain = urlparse(self.base_url).netloc
                for link in soup.find_all('a', href=True):
                    abs_link = urljoin(url, link['href'])
                    if urlparse(abs_link).netloc == base_domain and abs_link not in self.visited:
                        self.to_visit.append(abs_link)

                time.sleep(1)  # polite crawling

            except Exception as e:
                print(f"Failed to crawl {url}: {e}")

        print(f"Crawled {len(self.documents)} pages.")
        return self.documents
import json
if __name__ == "__main__":
    base_url = 'https://en.wikipedia.org/wiki/Web_crawler'
    crawler = SimpleCrawler(base_url, max_pages=10)
    docs = crawler.crawl()
    filename = "documents.json"
    with open(filename) as f:
        json.dump(docs,f)


Crawling: https://en.wikipedia.org/wiki/Web_crawler
Crawling: https://en.wikipedia.org/wiki/Web_crawler#bodyContent
Crawling: https://en.wikipedia.org/wiki/Main_Page
Crawling: https://en.wikipedia.org/wiki/Wikipedia:Contents
Crawling: https://en.wikipedia.org/wiki/Portal:Current_events
Crawling: https://en.wikipedia.org/wiki/Special:Random
Crawling: https://en.wikipedia.org/wiki/Wikipedia:About
Crawling: https://en.wikipedia.org/wiki/Wikipedia:Contact_us
Crawling: https://en.wikipedia.org/wiki/Help:Contents
Crawling: https://en.wikipedia.org/wiki/Help:Introduction
Crawled 10 pages.

Doc 0: https://en.wikipedia.org/wiki/Web_crawler

Doc 1: https://en.wikipedia.org/wiki/Web_crawler#bodyContent

Doc 2: https://en.wikipedia.org/wiki/Main_Page

Doc 3: https://en.wikipedia.org/wiki/Wikipedia:Contents

Doc 4: https://en.wikipedia.org/wiki/Portal:Current_events

Doc 5: https://en.wikipedia.org/wiki/Special:Random

Doc 6: https://en.wikipedia.org/wiki/Wikipedia:About

Doc 7: https://en.wikipedi

In [9]:
import json
filename = "documents.json"
with open(filename,'w') as f:
    json.dump(docs,f)

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json

# Download required NLTK data files once
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    filtered = [t for t in tokens if t.isalpha() and t not in stop_words]
    return filtered

def build_inverted_index(docs):
    inverted_index = defaultdict(set)
    for doc_id, doc in docs.items():
        tokens = preprocess(doc['text'])
        for token in set(tokens):  # add once per document
            inverted_index[token].add(doc_id)
    # Convert sets to lists for JSON serialization
    inverted_index = {k: list(v) for k,v in inverted_index.items()}
    return inverted_index

if __name__ == "__main__":
    # Assume you loaded docs from previous crawler step
    with open("documents.json", "r") as f:
        docs = json.load(f)

    inverted_index = build_inverted_index(docs)

    with open("inverted_index.json", "w") as f:
        json.dump(inverted_index, f)

    print(f"Inverted index built with {len(inverted_index)} unique tokens.")


Inverted index built with 3803 unique tokens.


[nltk_data] Downloading package punkt to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle

def build_semantic_index(docs, model_name='all-MiniLM-L6-v2'):
    """
    docs: dict of doc_id -> { 'url':..., 'text':... }
    returns: dict doc_id -> embedding vector (numpy array)
    """
    model = SentenceTransformer(model_name)
    embeddings = {}
    for doc_id, doc in docs.items():
        emb = model.encode(doc['text'], convert_to_numpy=True)
        embeddings[doc_id] = emb
    return embeddings

if __name__ == "__main__":
    with open("documents.json", "r") as f:
        docs = json.load(f)

    embeddings = build_semantic_index(docs)

    # Save embeddings (can't save numpy arrays directly as JSON)
    with open("semantic_embeddings.pkl", "wb") as f:
        pickle.dump(embeddings, f)

    print(f"Built semantic embeddings for {len(embeddings)} documents.")



  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Built semantic embeddings for 10 documents.


In [None]:
import json
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [t for t in tokens if t.isalpha() and t not in stop_words]

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2) + 1e-10)
with open("inverted_index.json", "r") as f:
    inverted_index = json.load(f)

with open("semantic_embeddings.pkl", "rb") as f:
    semantic_embeddings = pickle.load(f)

# For demo, assume docs dictionary is available here too
with open("documents.json", "r") as f:
    docs = json.load(f)

model = SentenceTransformer('all-MiniLM-L6-v2')
def keyword_search(query_tokens, inverted_index):
    """
    Returns dict doc_id -> keyword score (e.g. term frequency in query)
    """
    doc_scores = Counter()
    for token in query_tokens:
        if token in inverted_index:
            for doc_id in inverted_index[token]:
                doc_scores[doc_id] += 1
    return doc_scores
def semantic_search(query, semantic_embeddings, model, top_k=10):
    query_emb = model.encode(query, convert_to_numpy=True)
    scores = {}
    for doc_id, emb in semantic_embeddings.items():
        scores[doc_id] = cosine_similarity(query_emb, emb)
    top_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return dict(top_docs)
def combine_scores(kw_scores, sem_scores, alpha=0.5):
    """
    alpha: weight for keyword score (0-1), (1-alpha) for semantic score
    Normalize scores before combining
    """
    all_doc_ids = set(kw_scores.keys()).union(sem_scores.keys())

    # Normalize
    max_kw = max(kw_scores.values()) if kw_scores else 1
    max_sem = max(sem_scores.values()) if sem_scores else 1

    combined_scores = {}
    for doc_id in all_doc_ids:
        kw_score = kw_scores.get(doc_id, 0) / max_kw
        sem_score = sem_scores.get(doc_id, 0) / max_sem
        combined_scores[doc_id] = alpha * kw_score + (1 - alpha) * sem_score

    # Sort descending
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked
def search(query, inverted_index, semantic_embeddings, model, docs, alpha=0.5, top_k=5):
    query_tokens = preprocess(query)
    kw_scores = keyword_search(query_tokens, inverted_index)
    sem_scores = semantic_search(query, semantic_embeddings, model, top_k=top_k*3)  # get more sem results to combine

    combined = combine_scores(kw_scores, sem_scores, alpha=alpha)
    top_results = combined[:top_k]

    results = []
    for doc_id, score in top_results:
        doc = docs[str(doc_id)]
        snippet = doc['text'][:200].replace('\n',' ') + '...'  # simple snippet
        results.append({'doc_id': doc_id, 'url': doc['url'], 'score': score, 'snippet': snippet})

    return results


if __name__ == "__main__":
    query = "machine learning web crawler"
    results = search(query, inverted_index, semantic_embeddings, model, docs)
    for r in results:
        print(f"Doc {r['doc_id']} | Score: {r['score']:.3f} | URL: {r['url']}\nSnippet: {r['snippet']}\n")


Doc 0 | Score: 1.000 | URL: https://en.wikipedia.org/wiki/Web_crawler
Snippet: Web crawler - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Co...

Doc 1 | Score: 1.000 | URL: https://en.wikipedia.org/wiki/Web_crawler#bodyContent
Snippet: Web crawler - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Co...

Doc 2 | Score: 0.418 | URL: https://en.wikipedia.org/wiki/Main_Page
Snippet: Wikipedia, the free encyclopedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn t...

Doc 6 | Score: 0.199 | URL: https://en.wikipedia.org/wiki/Wikipedia:About
Snippet: Wikipedia:About - Wikipedia Jump to content Main menu M

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
with open("inverted_index.json", "r") as f:
    inverted_index = json.load(f)

with open("semantic_embeddings.pkl", "rb") as f:
    semantic_embeddings = pickle.load(f)

with open("documents.json", "r") as f:
    docs = json.load(f)

model = SentenceTransformer('all-MiniLM-L6-v2')
def generate_summary(text, max_length=60, min_length=30):
    input_text = text if len(text) < 1000 else text[:1000]
    summary = summarizer(input_text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']
def search_with_summary(query, inverted_index, semantic_embeddings, model, docs, alpha=0.5, top_k=5):
    query_tokens = preprocess(query)
    kw_scores = keyword_search(query_tokens, inverted_index)
    sem_scores = semantic_search(query, semantic_embeddings, model, top_k=top_k*3)

    combined = combine_scores(kw_scores, sem_scores, alpha=alpha)
    top_results = combined[:top_k]

    results = []
    for doc_id, score in top_results:
        doc = docs[str(doc_id)]
        snippet = doc['text'][:500].replace('\n',' ') + '...' 
        summary = generate_summary(snippet)
        results.append({
            'doc_id': doc_id,
            'url': doc['url'],
            'score': score,
            'snippet': snippet,
            'summary': summary
        })

    return results

if __name__ == "__main__":
    query = "machine learning web crawler"
    results = search_with_summary(query, inverted_index, semantic_embeddings, model, docs)
    for r in results:
        print(f"Doc {r['doc_id']} | Score: {r['score']:.3f} | URL: {r['url']}\nSummary: {r['summary']}\n")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu
Your max_length is set to 100, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 100, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 100, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. sum

Doc 0 | Score: 1.000 | URL: https://en.wikipedia.org/wiki/Web_crawler
Summary: Web crawler - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Community portal Recent changes Upload file Special pages Search Search Search Appearance Donate Create account Log in Personal tools Log in Log in Pages for logged out editors learn more Contributions Talk

Doc 1 | Score: 1.000 | URL: https://en.wikipedia.org/wiki/Web_crawler#bodyContent
Summary: Web crawler - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Community portal Recent changes Upload file Special pages Search Search Search Appearance Donate Create account Log in Personal tools Log in Log in Pages for logged out editors learn more Contributions Talk

Doc 2 | Score: 0.4