In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

class SimpleCrawler:
    def __init__(self, base_url, max_pages=50):
        self.base_url = base_url
        self.visited = set()
        self.to_visit = [base_url]
        self.max_pages = max_pages
        self.documents = {}  # doc_id -> text

    def crawl(self):
        while self.to_visit and len(self.visited) < self.max_pages:
            url = self.to_visit.pop(0)
            if url in self.visited:
                continue

            print(f"Crawling: {url}")
            try:
                response = requests.get(url, timeout=5)
                if response.status_code != 200:
                    continue
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract text (simple)
                text = soup.get_text(separator=' ', strip=True)

                # Store doc
                doc_id = len(self.visited)
                self.documents[doc_id] = {'url': url, 'text': text}

                self.visited.add(url)

                # Find new links within the same domain
                base_domain = urlparse(self.base_url).netloc
                for link in soup.find_all('a', href=True):
                    abs_link = urljoin(url, link['href'])
                    if urlparse(abs_link).netloc == base_domain and abs_link not in self.visited:
                        self.to_visit.append(abs_link)

                time.sleep(1)  # polite crawling

            except Exception as e:
                print(f"Failed to crawl {url}: {e}")

        print(f"Crawled {len(self.documents)} pages.")
        return self.documents
import json
if __name__ == "__main__":
    base_url = 'https://en.wikipedia.org/wiki/Web_crawler'
    crawler = SimpleCrawler(base_url, max_pages=10)
    docs = crawler.crawl()
    filename = "documents.json"
    with open(filename) as f:
        json.dump(docs,f)


Crawling: https://en.wikipedia.org/wiki/Web_crawler
Crawling: https://en.wikipedia.org/wiki/Web_crawler#bodyContent
Crawling: https://en.wikipedia.org/wiki/Main_Page
Crawling: https://en.wikipedia.org/wiki/Wikipedia:Contents
Crawling: https://en.wikipedia.org/wiki/Portal:Current_events
Crawling: https://en.wikipedia.org/wiki/Special:Random
Crawling: https://en.wikipedia.org/wiki/Wikipedia:About
Crawling: https://en.wikipedia.org/wiki/Wikipedia:Contact_us
Crawling: https://en.wikipedia.org/wiki/Help:Contents
Crawling: https://en.wikipedia.org/wiki/Help:Introduction
Crawled 10 pages.

Doc 0: https://en.wikipedia.org/wiki/Web_crawler

Doc 1: https://en.wikipedia.org/wiki/Web_crawler#bodyContent

Doc 2: https://en.wikipedia.org/wiki/Main_Page

Doc 3: https://en.wikipedia.org/wiki/Wikipedia:Contents

Doc 4: https://en.wikipedia.org/wiki/Portal:Current_events

Doc 5: https://en.wikipedia.org/wiki/Special:Random

Doc 6: https://en.wikipedia.org/wiki/Wikipedia:About

Doc 7: https://en.wikipedi

In [9]:
import json
filename = "documents.json"
with open(filename,'w') as f:
    json.dump(docs,f)

In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import json

# Download required NLTK data files once
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    filtered = [t for t in tokens if t.isalpha() and t not in stop_words]
    return filtered

def build_inverted_index(docs):
    inverted_index = defaultdict(set)
    for doc_id, doc in docs.items():
        tokens = preprocess(doc['text'])
        for token in set(tokens):  # add once per document
            inverted_index[token].add(doc_id)
    # Convert sets to lists for JSON serialization
    inverted_index = {k: list(v) for k,v in inverted_index.items()}
    return inverted_index

if __name__ == "__main__":
    # Assume you loaded docs from previous crawler step
    with open("documents.json", "r") as f:
        docs = json.load(f)

    inverted_index = build_inverted_index(docs)

    with open("inverted_index.json", "w") as f:
        json.dump(inverted_index, f)

    print(f"Inverted index built with {len(inverted_index)} unique tokens.")


Inverted index built with 3803 unique tokens.


[nltk_data] Downloading package punkt to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle

def build_semantic_index(docs, model_name='all-MiniLM-L6-v2'):
    """
    docs: dict of doc_id -> { 'url':..., 'text':... }
    returns: dict doc_id -> embedding vector (numpy array)
    """
    model = SentenceTransformer(model_name)
    embeddings = {}
    for doc_id, doc in docs.items():
        emb = model.encode(doc['text'], convert_to_numpy=True)
        embeddings[doc_id] = emb
    return embeddings

if __name__ == "__main__":
    with open("documents.json", "r") as f:
        docs = json.load(f)

    embeddings = build_semantic_index(docs)

    # Save embeddings (can't save numpy arrays directly as JSON)
    with open("semantic_embeddings.pkl", "wb") as f:
        pickle.dump(embeddings, f)

    print(f"Built semantic embeddings for {len(embeddings)} documents.")



ModuleNotFoundError: No module named 'sentence_transformers'

In [17]:
word_tokenize("helllodoodood ddsdsdsd")

['helllodoodood', 'ddsdsdsd']

In [15]:
import nltk

try:
    nltk.data.find('tokenizers/punkt')
    print("Punkt tokenizer is already downloaded within the virtual environment.")
except nltk.downloader.DownloadError:
    print("Downloading the 'punkt' tokenizer within the virtual environment...")
    nltk.download('punkt')
    print("'punkt' tokenizer downloaded successfully within the virtual environment.")

try:
    nltk.data.find('corpora/stopwords')
    print("Stopwords corpus is already downloaded within the virtual environment.")
except nltk.downloader.DownloadError:
    print("Downloading the 'stopwords' corpus within the virtual environment...")
    nltk.download('stopwords')
    print("'stopwords' corpus downloaded successfully within the virtual environment.")

from nltk.tokenize import word_tokenize

# Now try tokenizing again
text = "helllodoodood ddsdsdsd"
tokens = word_tokenize(text)
print(tokens)

Punkt tokenizer is already downloaded within the virtual environment.
Stopwords corpus is already downloaded within the virtual environment.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Bhuvain Jhamb/nltk_data'
    - 'c:\\Users\\Bhuvain Jhamb\\Desktop\\Semantic-Search-Engine-with-Summarization-Ranking\\myenv\\nltk_data'
    - 'c:\\Users\\Bhuvain Jhamb\\Desktop\\Semantic-Search-Engine-with-Summarization-Ranking\\myenv\\share\\nltk_data'
    - 'c:\\Users\\Bhuvain Jhamb\\Desktop\\Semantic-Search-Engine-with-Summarization-Ranking\\myenv\\lib\\nltk_data'
    - 'C:\\Users\\Bhuvain Jhamb\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [16]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\Bhuvain
[nltk_data]     Jhamb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True