<a href="https://colab.research.google.com/github/AlecTraas/computational-geo-lab/blob/main/Colab/Matthew/Page_Rank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Pagerank Algorithm


- Want to take in a directed graph, and give a probability distribution that details the probability of landing on a given node
- Output this in a way such that:
  - Node A: 50%
  - Node B: 20%
  - Could possibly be done by iterating through the node list and probability distribution at the same time and print each one by one

To-Do list 3/19 and Ideas
- Finish coding single_pr
  - find out count for each node attached to the in node, then divide prob by out count, add together for each in node of a given node
- Think about convergence
  - Brunt force of checking new prob list and old prob list, if they are the same for 1/2 decimal points, end the algorithm
    - Could lead to potential complexity problems
  - Try threshold idea (i.e if |current - update| < 0.001 for every prob, end)

#### Pseudocode


- First assign the same probability to each node (1/n)
- Caution of convergence as we iterate over the algorithm (look for when probabilities start to stabilize as the time to stop)
  - Could look at stopping when all rounded probabilities (to 2 decimal places) are the same between the previous iteration and the current iteration

## Code

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import math
from sympy import Matrix

### Example Graph Used as a test

In [None]:
DG = nx.DiGraph()
edge_list = [ (1,2),(1,3), (1,4), (2,3), (2,4),(3,1), (4,1), (4,3)]
DG.add_edges_from(edge_list)

### Single PageRank update

In [None]:
def single_pr(graph, node, prob_list):

#First obtain the nodes that have an edge going into our desired node
  in_list = []
  for edge in graph.in_edges(node):
    curr = edge
    in_list.append(curr)


  update_pr = 0
  for i in range(len(in_list)):
    curr_deg = graph.out_degree(in_list[i][0])
    index = list(graph.nodes()).index(in_list[i][0])
    curr_prob = prob_list[index]
    update_pr = update_pr + (curr_prob / curr_deg)


  return update_pr


In [2]:
#Create an update function
#Could use this to update a single pagerank
#Given a node at the current iteration, output the updated page_rank
def single_prd(graph, node, prob_list, d):

#First obtain the nodes that have an edge going into our desired node
  in_list = []
  for edge in graph.in_edges(node):
    curr = edge
    in_list.append(curr)


  update_pr = 0
  for i in range(len(in_list)):
    curr_deg = graph.out_degree(in_list[i][0])
    index = list(graph.nodes()).index(in_list[i][0])
    curr_prob = prob_list[index]
    update_pr = update_pr + curr_prob / curr_deg


  final_update = (1-d)/ len(np.array(graph.nodes())) + d* update_pr

  return final_update


In [None]:
#Building Alg with example graph
G = nx.DiGraph()
edge_list = [ (1,3), (2,3)]
prob_list = [1/3,1/3,1/3]
G.add_edges_from(edge_list)
G.in_edges(3)
in_list = []
for edge in G.in_edges(3):
  #Store edges in a list that can be accessed
  curr = edge
  in_list.append(curr)


new_prob = 0
for i in range(len(in_list)):

  curr_out = G.out_degree(in_list[i][0])
  index = list(G.nodes()).index(in_list[i][0])
  curr_prob = prob_list[index]
  new_prob = new_prob + curr_prob / curr_out

new_prob


single_pr(G, 3, prob_list)





0.6666666666666666

### PageRank completed algorithm

In [None]:
def PageRank(digraph):
  node_list = np.array(digraph.nodes())
  n = len(node_list)
  current_prob_dist = np.repeat((1/n), n) #Probabilities of each node (indexes should correspond to above node list) (should st)
  updated_prob_dist = np.repeat(0.0, n) # placeholder list to throw in new values during the update sequence
  difference = 1
  while difference > 0.00001:   #Iterate through multiple page rank updates (multiple p-distribution updates)
    for  i in range(n):  #Single probability distribution update (single page rank update for all points)
      update_pr = single_pr(digraph, node_list[i], current_prob_dist)
      updated_prob_dist[i] = update_pr

      difference = np.max(abs(updated_prob_dist - current_prob_dist))
      current_prob_dist = np.array(updated_prob_dist)

  return updated_prob_dist

In [51]:
def PageRank_d(digraph, d):
  node_list = np.array(digraph.nodes())
  n = len(node_list)
  current_prob_dist = np.repeat((1/n), n) #Probabilities of each node (indexes should correspond to above node list) (should st)
  updated_prob_dist = np.repeat(0.0, n) # placeholder list to throw in new values during the update sequence
  difference = 1
  while difference > 0.00001:   #Iterate through multiple page rank updates (multiple p-distribution updates)
    for  i in range(n):  #Single probability distribution update (single page rank update for all points)
      update_pr = single_prd(digraph, node_list[i], current_prob_dist, d)
      updated_prob_dist[i] = update_pr

      difference = np.max(abs(updated_prob_dist - current_prob_dist))
      current_prob_dist = np.array(updated_prob_dist)


  return updated_prob_dist / sum(updated_prob_dist)

#### Testing algorithm iteration by iteration

In [None]:
DG.add_edges_from(edge_list)

node_list = np.array(DG.nodes())
n = len(np.array(DG.nodes()))
current_prob_dist = np.repeat((1/n), n)
update_prob_dist = np.zeros(n, float)

for i in range(n):
  update_pr = single_pr(DG, node_list[i], current_prob_dist)
  update_prob_dist[i] = update_pr

difference = np.max(abs(update_prob_dist - current_prob_dist))

print(update_prob_dist)
current_prob_dist = np.array(update_prob_dist)

#Setting current_prob_dist = update_prob_dist is causing some issues :(
for i in range(n):
  update_pr = single_pr(DG, node_list[i], current_prob_dist)
  update_prob_dist[i] = update_pr

current_prob_dist = np.array(update_prob_dist)

for i in range(n):
  update_pr = single_pr(DG, node_list[i], current_prob_dist)
  update_prob_dist[i] = update_pr


sum(update_prob_dist)



[0.05 0.25 0.35 0.1  0.25]


0.09999266751110554

# Linear Algebra Approach

#### Stochastic Matrix Function Given a Digraph

In [80]:
def stochastic(DG):
  A = nx.to_numpy_array(DG)
  for i in range(len(A)):
    count = 0
    for j in range(len(A)):
      if A[i][j] == 1:
        count = count + 1

    A[i] = A[i] / count



  return A.transpose()

##### Scrap work for Stochastic Matrix function

In [54]:
#Calculate the stochastic matrix from the adjacency matrix
A = nx.to_numpy_array(DG)
A_t = np.transpose(A)
for i in range(len(A_t)):
  count = 0
  for j in range(len(A_t)):
    if A_t[i][j] == 1:
      count = count + 1

  A_t[i] = A_t[i] / count

A

array([[0.        , 1.        , 0.33333333, 0.5       ],
       [0.        , 0.        , 0.33333333, 0.5       ],
       [0.        , 0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.33333333, 0.        ]])

In [67]:
A = nx.to_numpy_array(DG)

for i in range(len(A)):
  count = 0
  for j in range(len(A)):
    if A[i][j] == 1:
      count = count + 1

  A[i] = A[i] / count



A.transpose()

array([[0.        , 0.        , 1.        , 0.5       ],
       [0.33333333, 0.        , 0.        , 0.        ],
       [0.33333333, 0.5       , 0.        , 0.5       ],
       [0.33333333, 0.5       , 0.        , 0.        ]])

#### Finalized PageRank Algorithm w/ Linear Algebra

In [90]:
def linalg_PageRank(DG, d):
  n = len(np.array(DG.nodes()))
  A = stochastic(DG)
  B = np.ones((n,n))

  M = d*A + ((1-d)/n)*B

  vals, vecs = np.linalg.eig(M)
  prob_vecs = vecs.real
  return abs(prob_vecs.transpose()[0]) / sum(abs(prob_vecs.transpose()[0]))

##### Scrap work for Algorithm

In [87]:
A = stochastic(DG)

n = len(np.array(DG.nodes()))
B = np.ones((n,n))
d = 0.85
M= d*A + ((1-d)/n)*B
M

array([[0.0375    , 0.0375    , 0.8875    , 0.4625    ],
       [0.32083333, 0.0375    , 0.0375    , 0.0375    ],
       [0.32083333, 0.4625    , 0.0375    , 0.4625    ],
       [0.32083333, 0.4625    , 0.0375    , 0.0375    ]])

In [85]:
vals, vecs = np.linalg.eig(M)
prob_vecs = vecs.real
prob_vecs.transpose()

prob_vecs.transpose()[0] / sum(prob_vecs.transpose()[0])

array([0.36815068, 0.14180936, 0.28796163, 0.20207834])

In [91]:
linalg_PageRank(DG, 0.85)
#IT WORKS LFG!!

array([0.36815068, 0.14180936, 0.28796163, 0.20207834])

##Testing

### Example Test Graph

In [92]:
DG = nx.DiGraph()
edge_list = [ (1,2),(1,3), (1,4), (2,3), (2,4),(3,1), (4,1), (4,3)]
DG.add_edges_from(edge_list)

### Comparison between brunt force, linear algebra, and networkX package

In [97]:
print(PageRank_d(DG, 0.85)) # Brunt Force
print(linalg_PageRank(DG, 0.85)) # Linear Algebra
nx.pagerank(DG, 0.85) # Built-in networkX

[0.36814208 0.1418122  0.28796333 0.20208239]
[0.36815068 0.14180936 0.28796163 0.20207834]


{1: 0.3681509531104541,
 2: 0.14180962694364857,
 3: 0.28796129337481535,
 4: 0.20207812657108176}