In [6]:
from collections import deque

import numpy as np
import requests
from bs4 import BeautifulSoup

In [8]:
def get_links(url, timeout=10):
    """Fetch a page and return the absolute HTTP(S) links it contains.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a single unresponsive
        server would block the whole crawl indefinitely.

    Returns
    -------
    set
        ``href`` values of the page's ``<a>`` tags that start with
        ``http://`` or ``https://``. Relative links are not resolved and are
        therefore left out. An empty set is returned when the request fails
        (connection error, timeout, or non-2xx status); the error is printed
        rather than raised so one bad page does not abort a crawl.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error: {e}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))
    # Match on the full scheme prefix so strings that merely begin with the
    # letters "http" are not mistaken for absolute web links.
    return {href for href in hrefs if href.startswith(('http://', 'https://'))}

In [9]:
def build_graph(start_urls, max_pages=10, fetch_links=None):
    """Crawl breadth-first from the start URLs and build a link graph.

    Parameters
    ----------
    start_urls : iterable of str, or str
        Seed pages. They are crawled first, in the order given; a single
        string is treated as one URL.
    max_pages : int, optional
        Maximum number of pages to fetch.
    fetch_links : callable, optional
        ``fetch_links(url)`` must return an iterable of the URLs that ``url``
        links to. Defaults to ``get_links``; passing a stub makes it possible
        to build a graph without network access.

    Returns
    -------
    page_urls : list of str
        Crawled pages in crawl order.
    adjacency_matrix : numpy.ndarray, shape (n, n)
        ``adjacency_matrix[i, j]`` is 1 when ``page_urls[i]`` links to
        ``page_urls[j]`` and 0 otherwise. Links to pages that were not
        crawled are dropped.
    """
    if fetch_links is None:
        fetch_links = get_links
    if isinstance(start_urls, str):
        start_urls = [start_urls]

    # FIFO queue instead of set.pop(): pop() picks an arbitrary element, which
    # made crawl order differ between runs and could starve a start URL once
    # the first page had added its links.
    to_crawl = deque(dict.fromkeys(start_urls))
    seen = set(to_crawl)  # every URL ever queued, so nothing is queued twice
    page_urls = []
    url_to_links = {}

    while to_crawl and len(page_urls) < max_pages:
        url = to_crawl.popleft()
        print(f"Crawling {url}")

        links = fetch_links(url)
        url_to_links[url] = links
        page_urls.append(url)

        # Sorted so the queue order does not depend on set iteration order.
        for link in sorted(links):
            if link not in seen:
                seen.add(link)
                to_crawl.append(link)
        print(f"Links found: {len(links)}, To crawl: {len(to_crawl)}, Crawled: {len(page_urls)}")

    n = len(page_urls)
    adjacency_matrix = np.zeros((n, n))
    url_to_index = {url: idx for idx, url in enumerate(page_urls)}
    for i, url in enumerate(page_urls):
        for link in url_to_links[url]:
            j = url_to_index.get(link)
            if j is not None:
                adjacency_matrix[i, j] = 1
    return page_urls, adjacency_matrix

In [10]:
def pagerank(adjacency_matrix, d=0.85, max_iter=100, tol=1.0e-6):
    """Compute PageRank scores by power iteration.

    Parameters
    ----------
    adjacency_matrix : array-like, shape (N, N)
        Link matrix where ``adjacency_matrix[i, j]`` is non-zero when page
        ``i`` links to page ``j`` (rows are link sources). This is the layout
        ``build_graph`` produces.
    d : float, optional
        Damping factor.
    max_iter : int, optional
        Maximum number of power-iteration steps.
    tol : float, optional
        Iteration stops once the L1 distance between two successive rank
        vectors drops below this value.

    Returns
    -------
    numpy.ndarray, shape (N,)
        One score per page, in the same order as the matrix rows. An empty
        array is returned for an empty matrix.

    Raises
    ------
    ValueError
        If the matrix is not square, or if the iteration does not converge
        within ``max_iter`` steps.
    """
    links = np.asarray(adjacency_matrix, dtype=float)
    if links.size == 0:
        return np.zeros(0)
    if links.ndim != 2 or links.shape[0] != links.shape[1]:
        raise ValueError(f"adjacency_matrix must be square, got shape {links.shape}.")
    N = links.shape[0]

    # Out-degree of each page is its ROW sum, because row i lists the pages
    # that page i links to.
    out_degree = links.sum(axis=1)
    dangling_nodes = out_degree == 0
    safe_out_degree = np.where(dangling_nodes, 1.0, out_degree)  # avoid division by zero

    # Column-stochastic transition matrix: M[j, i] is the probability of
    # moving from page i to page j.
    M = (links / safe_out_degree[:, np.newaxis]).T

    # Handle dangling nodes (pages with no outgoing links) by treating them
    # as if they link to every page.
    M[:, dangling_nodes] = 1.0 / N

    # Initial PageRank vector (uniform distribution)
    rank = np.ones(N) / N

    for _ in range(max_iter):
        new_rank = (1 - d) / N + d * M.dot(rank)

        # Check for convergence
        if np.linalg.norm(new_rank - rank, 1) < tol:
            return new_rank

        rank = new_rank

    raise ValueError(f"PageRank did not converge within {max_iter} iterations.")

In [11]:
# Seed pages the crawl starts from.
start_urls = [
    'https://news.ycombinator.com/',
    'https://www.example.com/',
]

In [None]:
# Crawl outward from the seed URLs; keep the crawled pages and the link matrix between them.
page_urls, adjacency_matrix = build_graph(start_urls)

Crawling https://news.ycombinator.com/


In [None]:
# tol is the L1 distance between successive rank vectors. The previous value
# of 1 is so loose that iteration stopped after a step or two, returning
# scores that had not converged. Use a tight tolerance and rely on the larger
# max_iter budget instead.
page_rank_values = pagerank(adjacency_matrix, max_iter=1000, tol=1.0e-6)

In [None]:
# Report the crawled pages from highest to lowest PageRank score.
print("PageRank Scores:")
ranked_pages = sorted(zip(page_urls, page_rank_values), key=lambda pair: -pair[1])
for page_url, score in ranked_pages:
    print(f"{page_url}: {score}")