**Practical 7**

**Aim: Web Crawling and Indexing**
*   Develop a web crawler to fetch and index web pages.
*   Handle challenges such as robots.txt, dynamic content, and crawling delays.



In [None]:
print("T114 | Bhumika Shelar")
import requests
import time
import re
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from collections import defaultdict

# -------------------------------
# Common words left out of the index
# -------------------------------
stop_words = set("the is in and to of a for on".split())

# -------------------------------
# Check robots.txt
# -------------------------------
def _robots_url(url):
    """Return the robots.txt URL for the site that serves *url*.

    robots.txt lives at the root of the host, so the path, query string and
    fragment of *url* are discarded and only the scheme and host are kept.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))


def allowed(url):
    """Report whether robots.txt permits any user agent ("*") to fetch *url*.

    Args:
        url: Absolute URL of the page that is about to be crawled.

    Returns:
        True if the site's robots.txt allows fetching *url*, False otherwise.

    Raises:
        Whatever ``RobotFileParser.read()`` raises when robots.txt cannot be
        downloaded (for example on a network failure); it is not caught here.
    """
    rp = RobotFileParser()
    # Appending "/robots.txt" to the page URL points at the wrong location
    # ("//robots.txt" for a URL ending in "/", or a file under the page's
    # path), so the location is rebuilt from the host root instead.
    rp.set_url(_robots_url(url))
    rp.read()
    return rp.can_fetch("*", url)

# -------------------------------
# Crawl and index
# -------------------------------
def crawl(url):
    """Fetch a single page and build an inverted index of its words.

    The page is downloaded only if ``allowed(url)`` says robots.txt permits
    it. Its text is lower-cased, split into word tokens, stripped of the
    entries in ``stop_words``, and every remaining distinct word is mapped
    to a list containing *url*.

    Args:
        url: Absolute URL of the page to crawl.

    Returns:
        A ``defaultdict(list)`` mapping each indexed word to ``[url]``. It is
        empty when crawling is disallowed or the server answers with an
        error status.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    index = defaultdict(list)

    if not allowed(url):
        print("Crawling not allowed by robots.txt")
        return index

    # Without a timeout a stalled server would block the crawler forever.
    response = requests.get(url, timeout=10)
    time.sleep(2)  # crawling delay, applied after every request that was sent

    # Do not index the body of an error page (4xx / 5xx).
    if not response.ok:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return index

    soup = BeautifulSoup(response.text, "html.parser")

    # A separator keeps text from adjacent elements (e.g. neighbouring links)
    # from being glued together into one token.
    text = soup.get_text(separator=" ").lower()
    words = re.findall(r'\b\w+\b', text)

    # Build inverted index from the distinct non-stop words
    for word in {w for w in words if w not in stop_words}:
        index[word].append(url)

    return index

# -------------------------------
# Run crawler
# -------------------------------
url = "https://www.youtube.com/"
index = crawl(url)

print("Indexed words:")
# Show the first ten entries of the index, stopping early once ten are printed.
for position, (key, value) in enumerate(index.items(), start=1):
    print(f"{key} : {value}")
    if position == 10:
        break

T114 | Bhumika Shelar
Indexed words:
llc : ['https://www.youtube.com/']
google : ['https://www.youtube.com/']
youtube簡介媒體著作權與我們聯絡創作者廣告開發人員條款隱私權政策與安全性youtube : ['https://www.youtube.com/']
運作方式測試新功能 : ['https://www.youtube.com/']
2026 : ['https://www.youtube.com/']
