'''NAME: Aher Swami Sandip
ROLL NO. 01
COURSE: AI&DS
CLASS: BE
SUB: Computer Laboratory-II (Information Retrieval)'''

# In [2]:
import os
import math
import requests
from bs4 import BeautifulSoup
import networkx as nx

# Step 1: Crawling - Collect links from a page
def get_links_from_url(url, timeout=10):
    """Download *url* and return the absolute links found on the page.

    Only ``href`` values that start with ``"http"`` are kept, so relative
    paths, fragments and non-HTTP schemes are ignored.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set of link strings. The set is empty when the page cannot be
        fetched (network failure, timeout, malformed URL).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server
        # and a single bad host would hang the whole crawl.
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Best-effort crawl: an unreachable or malformed URL contributes no
        # links. Only fetch/URL errors are caught so that Ctrl-C and genuine
        # programming errors still propagate.
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    return {
        a_tag['href']
        for a_tag in soup.find_all('a', href=True)
        if a_tag['href'].startswith("http")
    }

# Step 2: Build a directed graph from a list of URLs
def build_web_graph(seed_urls, max_depth=1):
    """Crawl outward from *seed_urls* and return the discovered link graph.

    The crawl is breadth-first: all seeds are fetched first (depth 0), then
    the pages they link to (depth 1), and so on up to *max_depth*. Because
    every page is fetched at the shallowest depth at which it is reachable,
    the result does not depend on the order of the seeds or on the iteration
    order of the link sets.

    Args:
        seed_urls: Iterable of URLs to start crawling from.
        max_depth: Largest link distance from a seed that is still fetched.
            Links found on pages at this depth are recorded as edges, but
            the pages they point to are not fetched themselves.

    Returns:
        A dict mapping each fetched URL to the set of links found on it.
    """
    graph = {}
    visited = set()
    frontier = list(seed_urls)

    # One pass per depth level: 0 .. max_depth inclusive. A negative
    # max_depth yields no passes and therefore an empty graph.
    for _ in range(max_depth + 1):
        next_frontier = []
        for url in frontier:
            if url in visited:
                continue
            visited.add(url)
            links = get_links_from_url(url)
            graph[url] = links
            next_frontier.extend(links)
        if not next_frontier:
            break
        frontier = next_frontier

    return graph

# Step 3: Compute PageRank using simplified algorithm
def compute_pagerank(graph, damping=0.85, max_iterations=100, tol=1e-6):
    """Compute simplified PageRank scores for the pages in *graph*.

    Every page starts with rank ``1 / n``. On each iteration a page receives
    ``(1 - damping) / n`` plus ``damping`` times the sum, over the pages that
    link to it, of that page's rank divided by its number of outgoing links.

    Only the keys of *graph* are ranked. A page's out-link count includes
    links to URLs that are not keys of *graph*, and pages without outgoing
    links pass no rank on, so the returned scores need not sum to 1.

    Args:
        graph: Dict mapping each page to a collection of pages it links to.
        damping: Weight given to rank received through links.
        max_iterations: Upper bound on the number of update rounds.
        tol: Iteration stops once the summed absolute rank change of a
            round falls below this value.

    Returns:
        A dict mapping each page in *graph* to its score.
    """
    pages = list(graph)
    n = len(pages)
    if n == 0:
        return {}

    # The link structure never changes between iterations, so the inbound
    # map and out-degrees are built once instead of being rescanned for
    # every page on every round.
    inbound = {page: [] for page in pages}
    for source in pages:
        for target in graph[source]:
            sources = inbound.get(target)
            if sources is None:
                continue  # link points outside the ranked pages
            # Sources arrive in `pages` order, so a repeated link from the
            # same source (possible when links are given as a list) would
            # be adjacent; record each source at most once.
            if not sources or sources[-1] != source:
                sources.append(source)
    out_degree = {page: len(graph[page]) for page in pages}

    base = (1 - damping) / n
    ranks = {page: 1.0 / n for page in pages}

    for _ in range(max_iterations):
        new_ranks = {}
        for page in pages:
            rank_sum = sum(ranks[p] / out_degree[p] for p in inbound[page])
            new_ranks[page] = base + damping * rank_sum

        # Check convergence
        delta = sum(abs(new_ranks[page] - ranks[page]) for page in pages)
        ranks = new_ranks
        if delta < tol:
            break

    return ranks

# Step 4: Run everything together
if __name__ == "__main__":
    # Entry points for the crawl: two Wikipedia articles.
    seed_urls = [
        "https://en.wikipedia.org/wiki/Web_crawler",
        "https://en.wikipedia.org/wiki/PageRank",
    ]

    # Stage 1: fetch the seeds and the pages they link to.
    print("Building web graph...")
    web_graph = build_web_graph(seed_urls, max_depth=1)

    # Stage 2: score every fetched page.
    print("Computing PageRank...")
    pagerank_scores = compute_pagerank(web_graph)

    # Stage 3: report pages from highest to lowest score.
    print("\nPageRank Results:")
    ranked = sorted(
        pagerank_scores.items(),
        key=lambda entry: entry[1],
        reverse=True,
    )
    for page, score in ranked:
        print(f"{page}: {score:.5f}")


# Output:
# Building web graph...
#
#
#   soup = BeautifulSoup(response.text, 'html.parser')
#
#
# Computing PageRank...
#
# PageRank Results:
# https://en.wikipedia.org/wiki/Web_crawler: 0.07500
# https://en.wikipedia.org/wiki/PageRank: 0.07500
