Implement Page Rank Algorithm. (Use python or beautiful soup for 
implementation).

In [3]:
pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
from bs4 import BeautifulSoup
import numpy as np

In [8]:
# Step 1: Scrape a small set of web pages
def get_links(url):
    links = []
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            links.append(link['href'])
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    return links

In [10]:
# Step 2: Create a small web graph
urls = [
    'https://www.wikipedia.org/',
    'https://en.wikipedia.org/wiki/Web_scraping',
    'https://en.wikipedia.org/wiki/Pagerank',
    'https://en.wikipedia.org/wiki/Algorithm',
]

link_graph = {url: get_links(url) for url in urls}

In [12]:
# Step 3: Build the adjacency matrix
n = len(urls)
adjacency_matrix = np.zeros((n, n))

url_index = {url: i for i, url in enumerate(urls)}

for url, links in link_graph.items():
    for link in links:
        if link in url_index:
            adjacency_matrix[url_index[url]][url_index[link]] = 1

In [14]:
# Step 4: Calculate PageRank
def calculate_pagerank(adjacency_matrix, d=0.85, num_iterations=100):
    n = adjacency_matrix.shape[0]
    pagerank = np.ones(n) / n
    for _ in range(num_iterations):
        new_pagerank = (1 - d) / n + d * (adjacency_matrix.T @ pagerank) / np.sum(adjacency_matrix.T @ pagerank)
        pagerank = new_pagerank
    return pagerank

pagerank_values = calculate_pagerank(adjacency_matrix)
pagerank_values_dict = {url: pagerank_values[i] for i, url in enumerate(urls)}

  new_pagerank = (1 - d) / n + d * (adjacency_matrix.T @ pagerank) / np.sum(adjacency_matrix.T @ pagerank)


In [16]:
# Step 5: Output the results
print("PageRank Values:")
for url, rank in pagerank_values_dict.items():
    print(f"{url}: {rank:.4f}")

PageRank Values:
https://www.wikipedia.org/: nan
https://en.wikipedia.org/wiki/Web_scraping: nan
https://en.wikipedia.org/wiki/Pagerank: nan
https://en.wikipedia.org/wiki/Algorithm: nan
