In [42]:
import sqlite3

# Open the crawl database and create the cursor used for all queries below.
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Every page that appears as the source of at least one link.
# These are the pages that "send" PageRank.
cur.execute('''SELECT DISTINCT from_id FROM Links''')
from_ids = [record[0] for record in cur.fetchall()]

# Collect the links that PageRank flows along and the pages (to_ids) that
# "receive" it. Each kept row is stored in `links` unchanged, as the
# (from_id, to_id) tuple returned by the query.
to_ids = list()
links = list()

# Set views of the two id lists make every membership test O(1) instead of
# a linear list scan per row. The lists themselves keep their original
# order and type for the code that uses them afterwards.
sender_set = set(from_ids)
receiver_set = set()

cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''')
for row in cur:
    from_id, to_id = row
    # Skip self-links
    if from_id == to_id:
        continue
    # Keep only links whose two endpoints are both sending pages
    if from_id not in sender_set or to_id not in sender_set:
        continue
    links.append(row)
    # Record each receiver once, in first-seen order
    if to_id not in receiver_set:
        receiver_set.add(to_id)
        to_ids.append(to_id)

# Read the stored rank (Pages.new_rank) of every sending page. These values
# are the starting point for the iterations, keyed by page id.
rank_query = '''SELECT new_rank FROM Pages WHERE id = ?'''
prev_ranks = {}
for page in from_ids:
    stored = cur.execute(rank_query, (page,)).fetchone()
    prev_ranks[page] = stored[0]

# Ask how many PageRank iterations to perform. An empty answer, or one that
# contains only whitespace, means a single iteration; anything else must
# parse as an integer (int() raises ValueError otherwise).
sval = input('How many iterations:').strip()
many = int(sval) if sval else 1

# Sanity check: ensure that there are nodes to compute PageRank for
if not prev_ranks:
    print("Nothing to page rank. Check data.")
    # Release the cursor and connection before stopping so the database
    # file is not left open by this run.
    cur.close()
    conn.close()
    quit()

# Perform PageRank iterations in memory for speed.
#
# Inputs (built earlier in the script): `prev_ranks` maps page id -> rank,
# `links` holds (from_id, to_id) pairs, `to_ids` lists the receiving pages,
# and `many` is the number of rounds. The result is left in `next_ranks`
# (and `prev_ranks`, which is rebound every round).

# Group the outgoing links by source page once, rather than rescanning the
# whole link list for every page on every iteration. Per-source order
# follows the order of `links`, so ranks are accumulated in the same order
# as a direct scan would produce.
receivers = set(to_ids)
outgoing = dict()
for (from_id, to_id) in links:
    if to_id in receivers:
        outgoing.setdefault(from_id, []).append(to_id)

# With zero iterations the ranks are simply left unchanged; binding the name
# here keeps `next_ranks` defined for the code that stores the result.
next_ranks = prev_ranks

for i in range(many):
    # Every page starts the round at zero; `total` is the rank mass that
    # existed before redistribution.
    next_ranks = dict()
    total = 0.0
    for (node, old_rank) in prev_ranks.items():
        total += old_rank
        next_ranks[node] = 0.0

    # Distribute each page's rank equally across its outgoing links.
    for (node, old_rank) in prev_ranks.items():
        give_ids = outgoing.get(node)
        # A page with no outgoing links gives nothing away
        if not give_ids:
            continue
        amount = old_rank / len(give_ids)
        for target in give_ids:
            next_ranks[target] += amount

    # Rank that was not passed along (pages without outgoing links) is
    # spread evenly over all pages so the total stays constant.
    newtot = sum(next_ranks.values())
    evap = (total - newtot) / len(next_ranks)
    for node in next_ranks:
        next_ranks[node] += evap

    # Average absolute change per page, printed as a convergence indicator.
    totdiff = 0
    for (node, old_rank) in prev_ranks.items():
        totdiff += abs(old_rank - next_ranks[node])

    avediff = totdiff / len(prev_ranks)
    print(i + 1, avediff)

    # The ranks just computed become the input of the next round.
    prev_ranks = next_ranks

# After the iterations, store the final ranks in the database.
# Show a small sample of the result first.
print(list(next_ranks.items())[:5])

try:
    # Keep the previous values in old_rank, then write the new ones.
    cur.execute('''UPDATE Pages SET old_rank=new_rank''')
    cur.executemany(
        '''UPDATE Pages SET new_rank=? WHERE id=?''',
        [(rank, page_id) for (page_id, rank) in next_ranks.items()],
    )
    conn.commit()
except sqlite3.Error:
    # Undo the partial write so this connection does not keep an open
    # transaction around. NOTE(review): a transaction left open by an
    # earlier failed run (e.g. a re-executed notebook cell) is a likely
    # source of "database is locked" errors -- confirm no other process
    # has the file open.
    conn.rollback()
    raise
finally:
    # Always release the cursor and the connection, even on failure.
    cur.close()
    conn.close()


How many iterations: 20


1 0.05715873015873008
2 0.04023703703703708
3 0.03835654320987669
4 0.03612944855967074
5 0.034624070233196164
6 0.03296941593049842
7 0.031525544637707584
8 0.030103972132378178
9 0.028766080690713167
10 0.027483067490167542
11 0.026259007583579685
12 0.025089434873060205
13 0.0239719013290044
14 0.022904270351874485
15 0.021884130649832547
16 0.020909462372051733
17 0.01997819036822622
18 0.01908840166791842
19 0.018238240547953814
20 0.017425944646716304
[(1, 0.41620182608098744), (2, 0.8934731572457999), (4, 0.23413818822582355), (9, 0.42660830742122186), (14, 0.20336115730115156)]


OperationalError: database is locked